org.voyanttools.trombone.input.expand.XmlExpander.java Source code

Introduction

Here is the source code for org.voyanttools.trombone.input.expand.XmlExpander.java
Source

/*******************************************************************************
 * Trombone is a flexible text processing and analysis library used
 * primarily by Voyant Tools (voyant-tools.org).
 * 
 * Copyright (c) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
 * 
 * This file is part of Trombone.
 * 
 * Trombone is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Trombone is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Trombone.  If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package org.voyanttools.trombone.input.expand;

import it.svario.xpathapi.jaxp.XPathAPI;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;

import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathException;

import net.sf.saxon.lib.NamespaceConstant;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.detect.XmlRootExtractor;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.input.source.Source;
import org.voyanttools.trombone.input.source.StringInputSource;
import org.voyanttools.trombone.model.DocumentFormat;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.util.FlexibleParameters;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

/**
 * An expander that looks for sub-documents within an XML document, especially
 * if the {@code xmlDocumentsXpath} parameter is set to a valid XPath
 * expression. The XPath expression should now support namespaces declared in
 * the root element. When a single XPath expression is provided, documents are
 * created from each of the matching {@link Node}s. When multiple XPath
 * expressions are provided, all the nodes matching each XPath expression are
 * combined into one document (so one document per XPath expression). To simply
 * extract all of the content from one XML document into one source document,
 * don't use an expander, use the xmlContentXpath parameter instead (which will
 * be handled by the XML parser).
 * 
 * @author "Stfan Sinclair"
 */
class XmlExpander implements Expander {

    /**
     * all parameters sent, only some of which may be relevant to some expanders
     */
    private FlexibleParameters parameters;

    /**
     * the stored document storage strategy
     */
    private StoredDocumentSourceStorage storedDocumentSourceStorage;

    /**
     * the Transformer used to produce XML output from nodes
     */
    private Transformer transformer;

    /**
     * Create a new instance of this expander (this should only be done by
     * {@link StoredDocumentSourceExpander}.
     * 
     * @param storedDocumentSourceStorage
     *            a stored storage strategy
     * @param parameters
     *            that may be relevant to this expander, including
     *            {@code xmlDocumentsXapth}
     */
    XmlExpander(StoredDocumentSourceStorage storedDocumentSourceStorage, FlexibleParameters parameters) {
        this.storedDocumentSourceStorage = storedDocumentSourceStorage;
        this.parameters = parameters;
        try {
            transformer = TransformerFactory.newInstance().newTransformer();
        } catch (TransformerConfigurationException e) {
            throw new IllegalStateException("Unable to create XML transformer.", e);
        }
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
    }

    /*
     * (non-Javadoc)
     * 
     * @see org.voyanttools.trombone.input.expand.Expander#
     * getExpandedStoredDocumentSources
     * (org.voyanttools.trombone.document.StoredDocumentSource)
     */
    public List<StoredDocumentSource> getExpandedStoredDocumentSources(StoredDocumentSource storedDocumentSource)
            throws IOException {

        List<StoredDocumentSource> childStoredDocumentSources = new ArrayList<StoredDocumentSource>();

        String xmlDocumentsXpath = parameters.getParameterValue("xmlDocumentsXpath", "");

        // no format specified, so let's have a peek at the contents to see if we can determine a sub-format
        DocumentFormat guessedFormat = DocumentFormat.UNKNOWN;
        if (parameters.getParameterValue("inputFormat", "").isEmpty()) {
            InputStream is = null;
            try {
                is = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(storedDocumentSource.getId());
                XmlRootExtractor xmlRootExtractor = new XmlRootExtractor();
                QName qname = xmlRootExtractor.extractRootElement(is);
                String name = qname.getLocalPart();
                if (name.equals("feed") && qname.getNamespaceURI().toLowerCase().contains("atom"))
                    guessedFormat = DocumentFormat.ATOM;
                else if (name.equals("TEI"))
                    guessedFormat = DocumentFormat.TEI;
                else if (name.equals("teiCorpus"))
                    guessedFormat = DocumentFormat.TEICORPUS;
                else if (name.equals("rss"))
                    guessedFormat = DocumentFormat.RSS;
                else if (name.equals("EEBO"))
                    guessedFormat = DocumentFormat.EEBODREAM;
            } finally {
                if (is != null)
                    is.close();
            }
        }

        // check to see if we need to set xmlDocumentsXpath using defaults for format
        if (xmlDocumentsXpath.isEmpty() && (parameters.getParameterValue("inputFormat", "").isEmpty() == false
                || guessedFormat != DocumentFormat.UNKNOWN)) {

            if (guessedFormat == DocumentFormat.UNKNOWN) {
                guessedFormat = DocumentFormat
                        .valueOf(parameters.getParameterValue("inputFormat", "").toUpperCase());
            }
            Properties properties = new Properties();
            String resourcePath = "/org/voyanttools/trombone/input-formats/" + guessedFormat.name().toLowerCase()
                    + ".xml";
            URL url = this.getClass().getResource(resourcePath);
            if (url != null) {
                File file = new File(url.getPath());
                if (file.exists()) {
                    FileInputStream in = null;
                    try {
                        in = new FileInputStream(file);
                        properties.loadFromXML(in);

                    } finally {
                        if (in != null) {
                            in.close();
                        }
                    }
                }
                if (properties.containsKey("xmlDocumentsXpath")) {
                    xmlDocumentsXpath = properties.getProperty("xmlDocumentsXpath");
                }
            }

        }

        String xmlGroupByXpath = parameters.getParameterValue("xmlGroupByXpath", "");

        if (xmlDocumentsXpath.isEmpty()) {
            childStoredDocumentSources.add(storedDocumentSource);
            return childStoredDocumentSources;
        }

        DocumentMetadata parentMetadata = storedDocumentSource.getMetadata();
        String parentId = storedDocumentSource.getId();
        String multipleExpandedStoredDocumentSourcesPrefix = DigestUtils
                .md5Hex(xmlDocumentsXpath + xmlGroupByXpath);
        childStoredDocumentSources = storedDocumentSourceStorage.getMultipleExpandedStoredDocumentSources(parentId,
                multipleExpandedStoredDocumentSourcesPrefix);
        if (childStoredDocumentSources != null && childStoredDocumentSources.isEmpty() == false) {
            return childStoredDocumentSources;
        }

        // for some reason XPathAPI doesn't work properly with the default
        // XPathFactory, so we'll use Saxon
        System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.OBJECT_MODEL_SAXON,
                "net.sf.saxon.xpath.XPathFactoryImpl");

        InputStream inputStream = null;
        Document doc;
        try {

            inputStream = storedDocumentSourceStorage
                    .getStoredDocumentSourceInputStream(storedDocumentSource.getId());
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            factory.setFeature("http://xml.org/sax/features/validation", false);
            factory.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
            factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
            factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
            factory.setIgnoringComments(true);
            DocumentBuilder builder = factory.newDocumentBuilder();
            doc = builder.parse(inputStream);

        } catch (ParserConfigurationException e) {
            throw new IOException("Error with XML parser configuration for " + storedDocumentSource, e);
        } catch (SAXException e) {
            throw new IOException("Error with XML parsing for " + storedDocumentSource, e);
        } finally {
            if (inputStream != null)
                inputStream.close();
        }

        List<NodeInputSource> nodeInputSources = getChildStoredDocumentSources(doc, xmlDocumentsXpath, parentId,
                parentMetadata);

        if (nodeInputSources.isEmpty() == false) {
            if (xmlGroupByXpath.isEmpty() == false) {
                Map<String, List<NodeInputSource>> groupedNodeInputSources = new HashMap<String, List<NodeInputSource>>();
                for (NodeInputSource nodeInputSource : nodeInputSources) {
                    List<String> keys;
                    try {
                        Node fragment = doc.createDocumentFragment();
                        fragment.appendChild(nodeInputSource.node);
                        keys = XPathAPI.selectNodeListAsStrings(fragment, xmlGroupByXpath);
                    } catch (XPathException e) {
                        throw new IllegalArgumentException("Unable to use this XPath: " + xmlGroupByXpath, e);
                    }
                    if (keys.isEmpty() == false) {
                        String key = StringUtils.join(keys, " ");
                        if (groupedNodeInputSources.containsKey(key) == false) {
                            groupedNodeInputSources.put(key, new ArrayList<NodeInputSource>());
                        }
                        groupedNodeInputSources.get(key).add(nodeInputSource);
                    }
                }
                for (Map.Entry<String, List<NodeInputSource>> mappedNodeInputSources : groupedNodeInputSources
                        .entrySet()) {
                    List<NodeInputSource> mappedNodeInputSourcesList = mappedNodeInputSources.getValue();
                    //               if (mappedNodeInputSourcesList.size()==1) { // just one, so use it
                    //                  childStoredDocumentSources.add(getStoredDocumentSource(mappedNodeInputSourcesList.get(0)));
                    //               }
                    //               else { // multiple, we need to wrap with root node
                    String key = mappedNodeInputSources.getKey();
                    Node newParentNode = doc.getDocumentElement().cloneNode(false);
                    for (NodeInputSource nodeInputSource : mappedNodeInputSourcesList) {
                        newParentNode.appendChild(nodeInputSource.node);
                    }
                    NodeInputSource newNodeInputSource = getChildStoredDocumentSource(newParentNode, parentId,
                            parentMetadata, parentId + ";group:" + key);
                    newNodeInputSource.documentMetadata.setTitle(key);
                    childStoredDocumentSources.add(getStoredDocumentSource(newNodeInputSource));
                    //               }
                }
            } else {
                for (NodeInputSource nodeInputSource : nodeInputSources) {
                    childStoredDocumentSources.add(getStoredDocumentSource(nodeInputSource));
                }
            }

        }
        // each node is a separate document
        //      if (xmlDocumentsXpaths.length == 1) {
        //         childStoredDocumentSources.addAll(getChildStoredDocumentSources(
        //               doc, xmlDocumentsXpaths[0], parentId, parentMetadata));
        //      }
        //
        //      // each xpath is a separate document
        //      else {
        //         childStoredDocumentSources.addAll(getChildStoredDocumentSources(
        //               doc, xmlDocumentsXpaths, parentId, parentMetadata));
        //      }

        storedDocumentSourceStorage.setMultipleExpandedStoredDocumentSources(parentId, childStoredDocumentSources,
                multipleExpandedStoredDocumentSourcesPrefix);

        return childStoredDocumentSources;
    }

    /**
     * Get a list of stored document sources. Matching nodes for each XPath
     * expression are concatenated into a single document (one document per
     * XPath).
     * 
     * @param doc
     *            the {@link Document} to be searched
     * @param xmlDocumentsXpaths
     *            the list of XPath expressions to find nodes
     * @param parentId
     *            the ID of the stored parent document
     * @param parentMetadata
     *            the metadata of the stored parent document
     * @return a list of {@link StoredDocumentSource}s
     * @throws IOException
     *             an exception that occurs during processing
     */
    /*
    private List<StoredDocumentSource> getChildStoredDocumentSources(
     Document doc, String[] xmlDocumentsXpaths, String parentId,
     DocumentMetadata parentMetadata) throws IOException {
       List<StoredDocumentSource> childStoredDocumentSources = new ArrayList<StoredDocumentSource>();
       for (int i = 0, len = xmlDocumentsXpaths.length; i < len; i++) {
        
     List<Node> docs;
     try {
        docs = XPathAPI.selectListOfNodes(doc.getDocumentElement(),
              xmlDocumentsXpaths[i], doc.getDocumentElement());
     } catch (XPathException e) {
        throw new IllegalArgumentException(
              "A problem was encountered proccesing this XPath query: "
                    + xmlDocumentsXpaths[i], e);
     }
     if (docs.isEmpty()) {
        continue;
     }
     Node newParentNode = doc.getDocumentElement().cloneNode(false);
     for (Node node : docs) {
        newParentNode.appendChild(node);
     }
     StoredDocumentSource childStoredDocumentSource = getChildStoredDocumentSource(
           newParentNode, parentId, parentMetadata,
           xmlDocumentsXpaths[i] + "[" + (i) + "]");
     childStoredDocumentSources.add(childStoredDocumentSource);
       }
       return childStoredDocumentSources;
    }
    */

    /**
     * Get a list of stored document sources. Each node matching the specified
     * XPath expression becomes a separate document.
     * 
     * @param doc
     *            the {@link Document} to be searched
     * @param xmlDocumentsXpath
     *            the XPath expressions to find nodes
     * @param parentId
     *            the ID of the stored parent document
     * @param parentMetadata
     *            the metadata of the stored parent document
     * @return a list of {@link StoredDocumentSource}s
     * @throws IOException
     *             an exception that occurs during processing
     */
    private List<NodeInputSource> getChildStoredDocumentSources(Document doc, String xmlDocumentsXpath,
            String parentId, DocumentMetadata parentMetadata) throws IOException {
        List<NodeInputSource> childNodeInputSources = new ArrayList<NodeInputSource>();
        List<Node> docs;
        try {
            docs = XPathAPI.selectListOfNodes(doc.getDocumentElement(), xmlDocumentsXpath,
                    doc.getDocumentElement());
        } catch (XPathException e) {
            throw new IllegalArgumentException(
                    "A problem was encountered proccesing this XPath query: " + xmlDocumentsXpath, e);
        }
        for (int i = 0, len = docs.size(); i < len; i++) {
            NodeInputSource childStoredDocumentSource = getChildStoredDocumentSource(docs.get(i), parentId,
                    parentMetadata, xmlDocumentsXpath + "[" + (i) + "]");
            childNodeInputSources.add(childStoredDocumentSource);

        }
        return childNodeInputSources;
    }

    /**
     * Get a {@link StoredDocumentSource} from the specified {@link Node} and
     * parent information.
     * 
     * @param node
     *            the {@link Node} from with to produce an XML document
     * @param parentId
     *            the ID of the stored parent document
     * @param parentMetadata
     *            the metadata of the stored parent document
     * @param location
     *            the approximate XPath location that can help generate a unique
     *            identifier
     * @return a {@link StoredDocumentSource}
     * @throws IOException
     *             an exception that occurs during IO processing
     */
    private NodeInputSource getChildStoredDocumentSource(Node node, String parentId,
            DocumentMetadata parentMetadata, String location) throws IOException {
        DocumentMetadata metadata = parentMetadata.asParent(parentId, DocumentMetadata.ParentType.EXPANSION);
        metadata.setModified(parentMetadata.getModified());
        metadata.setSource(Source.STRING);
        metadata.setLocation(location);
        metadata.setDocumentFormat(DocumentFormat.XML);
        String id = DigestUtils.md5Hex(parentId + location);
        return new NodeInputSource(id, node, metadata);
    }

    private StoredDocumentSource getStoredDocumentSource(NodeInputSource nodeInputSource) throws IOException {
        StringWriter sw = new StringWriter(); // no need to close
        Result streamResult = new StreamResult(sw);
        try {
            transformer.transform(new DOMSource(nodeInputSource.node), streamResult);
        } catch (TransformerException e) {
            throw new IOException(
                    "Unable to transform node from stored document: " + nodeInputSource.documentMetadata);
        }
        InputSource inputSource = new StringInputSource(nodeInputSource.id, nodeInputSource.documentMetadata,
                sw.toString());
        return storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
    }

    private class NodeInputSource {
        private Node node;
        private String id;
        private DocumentMetadata documentMetadata;

        private NodeInputSource(String id, Node node, DocumentMetadata documentMetadata) {
            this.node = node;
            this.id = id;
            this.documentMetadata = documentMetadata;
        }
    }
}