com.stratio.ingestion.deserializer.xmlxpath.XmlXpathDeserializer.java Source code

Java tutorial

Introduction

Here is the source code for com.stratio.ingestion.deserializer.xmlxpath.XmlXpathDeserializer.java

Source

/**
 * Copyright (C) 2014 Stratio (http://stratio.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.stratio.ingestion.deserializer.xmlxpath;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Map.Entry;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.conf.ConfigurationException;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.serialization.EventDeserializer;
import org.apache.flume.serialization.ResettableInputStream;
import org.apache.flume.serialization.Seekable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;

//@formatter:off
/**
 * <p>XML XPath deserializer. Reads the whole input stream as one XML document, evaluates an XPath
 * expression against it and produces one event per matched node.</p>
 * <ul>
 * <li><em>expression</em>: XPath expression selecting the nodes that become events. Required.</li>
 * <li><em>outputBody</em>: when true, each matched node (serialized as XML) is the event body.
 * Default: true.</li>
 * <li><em>outputHeader</em>: name of the header that receives each matched node when
 * <em>outputBody</em> is false; in that mode the whole XML document is kept in the event body.
 * Required when <em>outputBody</em> is false.</li>
 * </ul>
 *
 * <p>The original documentation also describes a special option to evaluate an XPath expression
 * for each event and add the result as a header, for example
 * <code>headers.author = &lt;XPathExpression&gt;</code> to put the result of the expression in the
 * <em>author</em> header. NOTE(review): the helper for this ({@code evaluateStaticFields}) exists
 * but is not invoked anywhere in this class - confirm whether the option is wired elsewhere.</p>
 *
 * <p>The document is parsed and all events are materialized in the constructor; the input stream
 * is closed before the constructor returns. {@link #mark()} and {@link #reset()} therefore operate
 * on the in-memory list of events, not on the stream.</p>
 */
//@formatter:on
public class XmlXpathDeserializer implements EventDeserializer {

    private static final Logger log = LoggerFactory.getLogger(XmlXpathDeserializer.class);

    private static final String CONF_XPATH_EXPRESSION = "expression";
    private static final String CONF_OUTPUT_HEADER = "outputHeader";
    private static final String CONF_OUTPUT_BODY = "outputBody";

    private static final boolean DEFAULT_OUTPUT_BODY = true;

    private boolean isOpen;
    private String outputHeader;
    private boolean outputBody;
    private String body;
    private final XPath xpath;
    private Document doc = null;
    private List<String> list = null;
    private ListIterator<String> markIt, currentIt;
    // Created lazily and reused by nodeToString/documentToString; see getTransformer().
    private Transformer transformer;

    /**
     * Parses the stream, evaluates the configured XPath expression and buffers one serialized
     * string per matched node. The stream is always closed before this constructor returns.
     *
     * @param context deserializer configuration (expression, outputBody, outputHeader)
     * @param in stream containing the XML document
     * @throws IOException if the document cannot be parsed or serialized, or the XPath expression
     *         cannot be compiled or evaluated as a node set
     * @throws ConfigurationException if <em>expression</em> is missing, or <em>outputBody</em> is
     *         false and <em>outputHeader</em> is not defined
     */
    XmlXpathDeserializer(Context context, ResettableInputStream in) throws IOException {
        try {
            final String expression = context.getString(CONF_XPATH_EXPRESSION);
            if (expression == null) {
                // Fail with a configuration error instead of a NullPointerException from
                // xpath.compile(null) after the whole document has already been parsed.
                throw new ConfigurationException(String.format("%s must be defined", CONF_XPATH_EXPRESSION));
            }

            outputBody = context.getBoolean(CONF_OUTPUT_BODY, DEFAULT_OUTPUT_BODY);
            if (!outputBody) {
                if (!context.containsKey(CONF_OUTPUT_HEADER)) {
                    throw new ConfigurationException(String.format("Either %s must be true or %s must be defined",
                            CONF_OUTPUT_BODY, CONF_OUTPUT_HEADER));
                }
                outputHeader = context.getString(CONF_OUTPUT_HEADER);
            }

            xpath = XPathFactory.newInstance().newXPath();
            doc = parseDocument(in);

            // Keep the full XML around; it becomes the event body when outputBody is false.
            try {
                body = documentToString(doc);
            } catch (TransformerException e) {
                throw new IOException("Cannot serialize XML", e);
            }

            final NodeList nodeList;
            try {
                final XPathExpression expr = xpath.compile(expression);
                nodeList = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
            } catch (XPathExpressionException e) {
                throw new IOException("Applying XPath expression failed", e);
            }

            final int matched = nodeList.getLength();
            log.debug("XPath expression matched {} elements", matched);

            list = new ArrayList<String>(matched);
            for (int i = 0; i < matched; i++) {
                list.add(nodeToString(nodeList.item(i)));
            }

            markIt = list.listIterator();
            currentIt = list.listIterator();
            isOpen = true;
        } finally {
            // Everything needed has been buffered in memory, so the stream is no longer required.
            try {
                in.close();
            } catch (IOException ex) {
                log.warn("Error while closing input stream", ex);
            }
        }
    }

    /**
     * Parses the given stream into a DOM document.
     *
     * @throws IOException if the parser cannot be created or the content is not well-formed XML
     */
    private static Document parseDocument(ResettableInputStream in) throws IOException {
        // NOTE(review): the parser is used with default settings, so DTDs and external entities
        // are processed. If the input can come from untrusted sources, consider disabling them
        // (XXE hardening); left unchanged here to avoid altering parsing behaviour.
        final DocumentBuilder docBuilder;
        try {
            docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            throw new IOException("Creating DocumentBuilder failed", e);
        }

        try {
            return docBuilder.parse(new ResettableInputStreamInputStream(in));
        } catch (SAXException e) {
            throw new IOException("Cannot parse body", e);
        }
    }

    /**
     * Returns the next buffered event, or {@code null} when all matched nodes have been read.
     * With outputBody enabled the node is the body; otherwise the whole document is the body and
     * the node is stored in the configured header.
     */
    @Override
    public Event readEvent() throws IOException {
        ensureOpen();
        if (!currentIt.hasNext()) {
            return null;
        }

        final String node = currentIt.next();
        if (outputBody) {
            return EventBuilder.withBody(node, Charsets.UTF_8);
        }

        final Event event = EventBuilder.withBody(body, Charsets.UTF_8);
        event.getHeaders().put(outputHeader, node);
        return event;
    }

    /**
     * Reads up to {@code numEvents} events.
     *
     * @return the events read; shorter than {@code numEvents} (possibly empty) when the buffered
     *         nodes run out
     */
    @Override
    public List<Event> readEvents(int numEvents) throws IOException {
        ensureOpen();
        List<Event> events = Lists.newArrayList();
        for (int i = 0; i < numEvents; i++) {
            Event event = readEvent();
            if (event == null) {
                // Exhausted: further calls would only return null again.
                break;
            }
            events.add(event);
        }
        return events;
    }

    /**
     * Remembers the current read position so that a later {@link #reset()} returns to it.
     */
    @Override
    public void mark() throws IOException {
        ensureOpen();
        // nextIndex() is the index of the next event to be read, i.e. the exact position,
        // including 0 when nothing has been read yet.
        markIt = list.listIterator(currentIt.nextIndex());
    }

    /**
     * Moves the read position back to the last marked position (the start if never marked).
     */
    @Override
    public void reset() throws IOException {
        ensureOpen();
        currentIt = list.listIterator(markIt.nextIndex());
    }

    /**
     * Marks the deserializer as closed; subsequent read/mark/reset calls throw
     * {@link IllegalStateException}. The input stream was already closed by the constructor.
     */
    @Override
    public void close() throws IOException {
        isOpen = false;
    }

    private void ensureOpen() {
        if (!isOpen) {
            throw new IllegalStateException("Serializer has been closed");
        }
    }

    /**
     * Returns the shared identity transformer, creating it on first use. It is configured to omit
     * the XML declaration. Creating a TransformerFactory per node is expensive, so one instance
     * is reused; like the iterators of this class it is not synchronized.
     */
    private Transformer getTransformer() throws TransformerException {
        if (transformer == null) {
            final Transformer created = TransformerFactory.newInstance().newTransformer();
            created.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
            transformer = created;
        }
        return transformer;
    }

    /**
     * Serializes a DOM node to XML text without an XML declaration.
     *
     * @param node node to serialize
     * @return the serialized node; if serialization fails the error is logged and whatever was
     *         written so far (possibly an empty string) is returned
     */
    public String nodeToString(Node node) {
        StringWriter writer = new StringWriter();
        try {
            getTransformer().transform(new DOMSource(node), new StreamResult(writer));
        } catch (TransformerException e) {
            // Best effort, as before: report the failure but do not abort the caller.
            log.error("Cannot serialize XML node", e);
        }
        return writer.toString();
    }

    /**
     * Serializes a whole document to XML text without an XML declaration, with every newline,
     * carriage return and tab character removed.
     *
     * @param document document to serialize
     * @return the serialized document
     * @throws TransformerException if the document cannot be serialized
     */
    public String documentToString(Document document) throws TransformerException {
        StringWriter writer = new StringWriter();
        getTransformer().transform(new DOMSource(document), new StreamResult(writer));
        return writer.toString().replaceAll("\n|\r|\t", "");
    }

    /**
     * Evaluates every XPath expression found in the values of {@code properties} against the
     * parsed document and returns the string results under the same keys. Entries whose
     * expression cannot be compiled or evaluated are logged and left out of the result.
     *
     * @param properties map from header name to XPath expression
     * @return map from header name to the evaluated string value
     */
    private Map<String, String> evaluateStaticFields(ImmutableMap<String, String> properties) {
        Map<String, String> headers = new HashMap<String, String>();
        for (Entry<String, String> entry : properties.entrySet()) {
            try {
                XPathExpression expression = xpath.compile(entry.getValue());
                String value = (String) expression.evaluate(doc, XPathConstants.STRING);
                headers.put(entry.getKey(), value);
            } catch (XPathExpressionException e) {
                log.warn("Cannot evaluate XPath expression for header {}", entry.getKey(), e);
            }
        }

        return headers;
    }

    /**
     * Builder used to create {@link XmlXpathDeserializer} instances.
     */
    public static class Builder implements EventDeserializer.Builder {

        /**
         * @throws IllegalArgumentException if {@code in} is not {@link Seekable}
         * @throws RuntimeException wrapping the IOException if the deserializer cannot be created
         */
        @Override
        public EventDeserializer build(Context context, ResettableInputStream in) {
            if (!(in instanceof Seekable)) {
                throw new IllegalArgumentException("Cannot use this deserializer without a Seekable input stream");
            }
            try {
                return new XmlXpathDeserializer(context, in);
            } catch (IOException ex) {
                throw new RuntimeException(ex);
            }
        }
    }

}