org.ala.documentmapper.XMLDocumentMapper.java Source code

Introduction

Here is the source code for org.ala.documentmapper.XMLDocumentMapper.java
Source

/***************************************************************************
 * Copyright (C) 2009 Atlas of Living Australia
 * All Rights Reserved.
 *
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 ***************************************************************************/
package org.ala.documentmapper;

import java.io.StringReader;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathException;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.ala.model.Licence;
import org.ala.repository.ParsedDocument;
import org.ala.repository.Predicates;
import org.ala.repository.Triple;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.jdom.input.DOMBuilder;
import org.jdom.output.XMLOutputter;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

/**
 * A Document Mapper for mapping XML documents. This class is intended to be extended by
 * DocumentMapper implementations that process XML or XHTML.
 * 
 * @author Dave Martin
 */
public abstract class XMLDocumentMapper implements DocumentMapper {

    /**
     * Logger available to this class and its subclasses. It is created with
     * <code>XMLDocumentMapper.class</code>, so subclass messages are logged under this category.
     */
    protected Logger logger = Logger.getLogger(XMLDocumentMapper.class);

    /** The URI of the document most recently passed to {@link #map(String, byte[])}. */
    protected String uri = null;

    /** Mappings (XPath or regex queries) whose extracted values are stored as Dublin Core properties */
    private List<Mapping> dcMappingList = new ArrayList<Mapping>();

    /** Mappings (XPath or regex queries) whose extracted values are stored as triples */
    private List<Mapping> tripleMappingList = new ArrayList<Mapping>();

    /** The content type set on every ParsedDocument created by {@link #map(String, byte[])} */
    protected String contentType = "text/xml";

    /**
     * A map of licences, supplied through {@link #setLicencesMap(Map)}. It is not read in this
     * class; presumably subclasses consult it (the meaning of the keys is not visible here).
     */
    protected Map<String, Licence> licencesMap;

    /**
     * Parses the supplied bytes as XML and applies the configured Dublin Core and triple
     * mappings to build a single {@link ParsedDocument}.
     * <p>
     * The bytes are always decoded as UTF-8 and parsed without namespace awareness. If the
     * content cannot be parsed, a warning is logged and an empty list is returned. The GUID
     * and the Dublin Core identifier are initialised to the URL-decoded <code>uri</code>.
     *
     * @param uri the URI of the document; also stored in the <code>uri</code> field
     * @param content the raw document bytes
     * @return a list holding the one mapped document, or an empty list when parsing fails
     * @throws Exception for example if the parser cannot be created or
     *         {@link #extractProperties(List, Document)} fails
     * @see org.ala.documentmapper.DocumentMapper#map(java.lang.String, byte[])
     */
    public List<ParsedDocument> map(String uri, byte[] content) throws Exception {

        this.uri = uri;

        // NOTE(review): the factory keeps its default DTD/external-entity settings. If the
        // content comes from untrusted sources, consider restricting external entity resolution.
        DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
        builderFactory.setNamespaceAware(false);

        String xmlText = new String(content, "UTF-8");
        InputSource source = new InputSource(new StringReader(xmlText));
        DocumentBuilder builder = builderFactory.newDocumentBuilder();

        Document xmlDocument;
        try {
            xmlDocument = builder.parse(source);
        } catch (Exception e) {
            // Unparseable content is not fatal: report it and hand back no documents.
            logger.warn("Unable to process document. Message:" + e.getMessage(), e);
            return new ArrayList<ParsedDocument>();
        }

        ParsedDocument parsed = new ParsedDocument();
        String decodedUri = URLDecoder.decode(uri, "UTF-8");
        parsed.setGuid(decodedUri);
        parsed.getDublinCore().put(Predicates.DC_IDENTIFIER.toString(), decodedUri);
        parsed.setContent(content);
        parsed.setContentType(this.contentType);

        // Dublin Core properties first, then the triples.
        doMapping(dcMappingList, xmlDocument, parsed, true);
        doMapping(tripleMappingList, xmlDocument, parsed, false);

        List<ParsedDocument> results = new ArrayList<ParsedDocument>();
        results.add(parsed);

        // Give subclasses the chance to post-process the mapped document.
        extractProperties(results, xmlDocument);

        return results;
    }

    /**
     * Applies every mapping in <code>mappingList</code> to the document, dispatching on the
     * mapping type. Mappings whose type is neither XPATH nor REGEX are ignored.
     *
     * @param mappingList the mappings to apply, in order
     * @param document the parsed XML document to read values from
     * @param parsedDoc the document that receives the extracted values
     * @param isDublinCore true to store values as Dublin Core properties, false to store triples
     */
    private void doMapping(List<Mapping> mappingList, Document document, ParsedDocument parsedDoc,
            boolean isDublinCore) {

        // One XPath evaluator is shared by all XPath mappings in this pass.
        XPath evaluator = XPathFactory.newInstance().newXPath();

        for (Mapping entry : mappingList) {
            if (entry.mappingType == MappingType.XPATH) {
                performXPathMapping(document, parsedDoc, isDublinCore, evaluator, entry);
                continue;
            }
            if (entry.mappingType == MappingType.REGEX) {
                performRegexMapping(document, parsedDoc, isDublinCore, entry);
            }
        }
    }

    /**
     * Maps properties by applying the mapping's regular expression to the serialised document.
     * <p>
     * The DOM document is converted to a string via JDOM. Every match of the pattern yields
     * one value, made up of that match's capturing groups joined by single spaces, which is
     * passed to {@link #createTriples(ParsedDocument, boolean, Mapping, String)}. If the
     * document cannot be serialised, a warning is logged and nothing is mapped.
     *
     * @param document the parsed XML document to search
     * @param parsedDoc the document that receives the extracted values
     * @param isDublinCore true to store values as Dublin Core properties, false to store triples
     * @param mapping the mapping supplying the regular expression and the target predicates
     */
    private void performRegexMapping(Document document, ParsedDocument parsedDoc, boolean isDublinCore,
            Mapping mapping) {

        String docString;
        try {
            docString = new XMLOutputter().outputString(new DOMBuilder().build(document));
        } catch (Exception e) {
            // Without a serialised document there is nothing to match against.
            logger.warn("Unable to serialise document for regex mapping. Message:" + e.getMessage(), e);
            return;
        }

        Matcher m = Pattern.compile(mapping.getQueryString()).matcher(docString);

        // find() resumes after the previous match and steps past zero-length matches,
        // so a pattern that can match the empty string cannot loop forever.
        while (m.find()) {
            // Build the value from this match only; earlier matches must not leak into it.
            StringBuilder value = new StringBuilder();
            for (int i = 1; i <= m.groupCount(); i++) {
                String group = m.group(i);
                // Groups that did not participate in the match are null; skip them
                // rather than appending the text "null".
                if (group != null) {
                    value.append(' ').append(group);
                }
            }

            createTriples(parsedDoc, isDublinCore, mapping, value.toString());
        }
    }

    /**
     * Evaluates the XPath as a node set and returns the non-blank values of the matched nodes.
     * <p>
     * Each node is converted with {@link #extractValue(Node)}; values that are null or blank
     * after trimming are left out.
     *
     * @param document the document to evaluate the expression against
     * @param xpathAsString the XPath expression, which must select a node set
     * @return the extracted values in document order; empty if nothing matched
     * @throws Exception if the expression cannot be evaluated as a node set
     */
    protected List<String> getXPathValues(Document document, String xpathAsString) throws Exception {

        XPath evaluator = XPathFactory.newInstance().newXPath();

        List<String> results = new ArrayList<String>();
        NodeList matches = (NodeList) evaluator.evaluate(xpathAsString, document, XPathConstants.NODESET);

        int idx = 0;
        while (idx < matches.getLength()) {
            String text = StringUtils.trimToNull(extractValue(matches.item(idx)));
            if (text != null) {
                results.add(text);
            }
            idx++;
        }
        return results;
    }

    /**
     * Evaluates the XPath and returns the first non-blank value.
     * <p>
     * The expression is first evaluated as a node set; the first node whose extracted value
     * is not blank wins. If that evaluation raises an {@link XPathException} (for instance
     * because the expression yields a string rather than nodes), the expression is evaluated
     * again as a string and that result is returned as is, without trimming.
     *
     * @param document the document to evaluate the expression against
     * @param xpathAsString the XPath expression
     * @return the first non-blank node value, the raw string result of the fallback
     *         evaluation, or null if the node set holds no non-blank value
     * @throws Exception if the fallback string evaluation fails as well
     */
    protected String getXPathSingleValue(Document document, String xpathAsString) throws Exception {

        XPath evaluator = XPathFactory.newInstance().newXPath();

        try {
            NodeList matches = (NodeList) evaluator.evaluate(xpathAsString, document, XPathConstants.NODESET);

            int idx = 0;
            while (idx < matches.getLength()) {
                String text = StringUtils.trimToNull(extractValue(matches.item(idx)));
                if (text != null) {
                    return text;
                }
                idx++;
            }
            return null;
        } catch (XPathException e) {
            // Not a node-set expression: evaluate it as a plain string instead.
            return (String) evaluator.evaluate(xpathAsString, document, XPathConstants.STRING);
        }
    }

    /**
     * Maps a property using the XPath expression of the supplied mapping.
     * <p>
     * The expression is evaluated as a node set and every non-blank node value is passed to
     * {@link #createTriples(ParsedDocument, boolean, Mapping, String)}; when the mapping is
     * flagged as the GUID, the value also replaces the GUID of the parsed document. If the
     * expression cannot be evaluated as a node set (for example a string function), it is
     * evaluated as a string instead. A failure of that fallback is logged as a warning.
     *
     * @param document the parsed XML document to read values from
     * @param parsedDoc the document that receives the extracted values
     * @param isDublinCore true to store values as Dublin Core properties, false to store triples
     * @param xpath the XPath evaluator to use
     * @param mapping the mapping supplying the XPath expression and the target predicates
     */
    private void performXPathMapping(Document document, ParsedDocument parsedDoc, boolean isDublinCore, XPath xpath,
            Mapping mapping) {
        try {

            NodeList nodes = (NodeList) xpath.evaluate(mapping.getQueryString(), document, XPathConstants.NODESET);

            for (int i = 0; i < nodes.getLength(); i++) {

                // Blank values produce no triples and must never overwrite the GUID
                // with an empty string, so skip them up front.
                String value = StringUtils.trimToNull(extractValue(nodes.item(i)));
                if (value == null) {
                    continue;
                }

                if (mapping.isGuid) {
                    parsedDoc.setGuid(value);
                }
                //create the triples
                createTriples(parsedDoc, isDublinCore, mapping, value);
            }
        } catch (XPathExpressionException e) {
            // To handle the XPATH expressions which don't represent a nodelist.
            try {
                String value = trim(xpath.evaluate(mapping.getQueryString(), document));
                // Both conditions must hold; with "||" the null check offered no protection.
                if (value != null && !"".equals(value)) {
                    value = value.replaceAll("[\\s&&[^ ]]{1,}", "");
                    value = value.replaceAll("[ ]{2,}", " ");
                    //create the triples
                    createTriples(parsedDoc, isDublinCore, mapping, value);
                }
            } catch (XPathExpressionException e1) {
                // Report the failure of the fallback evaluation itself, not the original one.
                logger.warn(e1.getMessage(), e1);
            }
        }
    }

    /**
     * Stores a value under every predicate of the mapping, either as Dublin Core properties
     * or as triples. The value is trimmed first; a null or blank value stores nothing.
     * <p>
     * For Dublin Core the value is put into the Dublin Core map keyed by the predicate, so a
     * later value for the same predicate replaces an earlier one. For triples a new triple of
     * (mapping subject, predicate, value) is appended.
     *
     * @param parsedDoc the document that receives the property or triple
     * @param isDublinCore true to store Dublin Core properties, false to store triples
     * @param mapping the mapping supplying the subject and the predicates
     * @param value the extracted value; may be null or blank
     */
    private void createTriples(ParsedDocument parsedDoc, boolean isDublinCore, Mapping mapping, String value) {

        value = StringUtils.trimToNull(value);
        if (value == null) {
            return;
        }

        for (Predicates predicate : mapping.predicates) {
            if (isDublinCore) {
                parsedDoc.getDublinCore().put(predicate.toString(), value);
            } else {
                // Explicit type arguments instead of the raw type avoid an unchecked conversion.
                Triple<String, String, String> triple = new Triple<String, String, String>(mapping.subject,
                        predicate.toString(), value);
                parsedDoc.getTriples().add(triple);
            }
        }
    }

    /**
     * Returns the object of the first triple of the first document whose predicate ends with
     * the supplied local part.
     * <p>
     * Only the first document in <code>pds</code> is inspected. A null or empty list, a null
     * first document, or a document without triples yields null instead of an exception.
     *
     * @param pds the parsed documents; only element 0 is searched
     * @param triplePredicateLocalPart the suffix the predicate must end with; must not be null
     * @return the object of the first matching triple as a string, or null if none matches
     */
    public String getTripleObjectLiteral(List<ParsedDocument> pds, String triplePredicateLocalPart) {

        if (pds == null || pds.isEmpty()) {
            return null;
        }

        ParsedDocument pd = pds.get(0);
        if (pd == null) {
            return null;
        }

        List<Triple<String, String, String>> triples = pd.getTriples();
        if (triples == null) {
            return null;
        }

        for (Triple<String, String, String> triple : triples) {
            String predicate = triple.getPredicate();
            if (predicate != null && predicate.endsWith(triplePredicateLocalPart)) {
                Object object = triple.getObject();
                return object == null ? null : object.toString();
            }
        }
        return null;
    }

    /**
     * Appends the value of a node, followed by the values of all its descendants, to the
     * supplied StringBuilder. A single space is appended before each child is visited.
     * Children of attribute nodes are not visited.
     *
     * @param sb the builder that collects the text
     * @param node the node to read
     */
    private void getNodeValue(StringBuilder sb, Node node) {
        String text = node.getNodeValue();
        if (text != null && text.length() != 0) {
            sb.append(text);
        }

        // The value of an attribute node already seems to cover its children, so stop here.
        if (node.getNodeType() == Node.ATTRIBUTE_NODE) {
            return;
        }

        NodeList children = node.getChildNodes();
        int idx = 0;
        while (idx < children.getLength()) {
            sb.append(" ");
            getNodeValue(sb, children.item(idx));
            idx++;
        }
    }

    /**
     * Controls how {@link #extractValue(org.w3c.dom.Node)} reads a node. If false, only the
     * value of the node selected by the XPath is extracted. If true, the values of all of its
     * descendants are appended recursively as well. Defaults to false.
     */
    private boolean recursiveValueExtraction = false;

    /**
     * Tells whether values are extracted recursively from a selected node and its children.
     *
     * @return true if recursive value extraction is enabled, false otherwise
     */
    public boolean isRecursiveValueExtraction() {
        return this.recursiveValueExtraction;
    }

    /**
     * Enables or disables recursive extraction of values from a node selected by an XPath.
     * <p>
     * If enabled, the value of the selected node plus, recursively, the values of its
     * children are extracted. If disabled, only the value of the selected node itself is
     * extracted. Disabled by default to be backwards compatible.
     *
     * @param recursiveValueExtraction true to enable recursive extraction, false to disable it
     */
    public void setRecursiveValueExtraction(boolean recursiveValueExtraction) {
        this.recursiveValueExtraction = recursiveValueExtraction;
    }

    /**
     * Extension hook for subclasses; the default implementation does nothing.
     * <p>
     * {@link #map(String, byte[])} calls this after the Dublin Core and triple mappings have
     * been applied, passing the list that holds the mapped document, so that subclasses can
     * add or adjust properties before the list is returned.
     * 
     * @param pds the parsed documents produced by the basic mapping
     * @param xmlDocument the parsed XML document the mappings were applied to
     * @throws Exception if the subclass fails to extract its properties; the exception
     *         propagates out of <code>map</code>
     */
    protected void extractProperties(List<ParsedDocument> pds, Document xmlDocument) throws Exception {
    }

    /**
     * Registers a mapping for a Dublin Core property, leaving the mapping type to the
     * three-argument <code>Mapping</code> constructor.
     * 
     * FIXME the subject isnt used downstream - should be removed
     * 
     * @param query the query used to locate the value in the document
     * @param subject the subject recorded on the mapping (not used when storing Dublin Core values)
     * @param predicates the Dublin Core predicates the extracted value is stored under
     */
    protected void addDCMapping(String query, String subject, Predicates... predicates) {
        Mapping dcMapping = new Mapping(query, subject, predicates);
        dcMappingList.add(dcMapping);
    }

    /**
     * Registers a mapping for a Dublin Core property with an explicit mapping type.
     * 
     * @param query the XPath expression or regular expression used to locate the value
     * @param mappingType how <code>query</code> is interpreted (XPath or regex)
     * @param subject the subject recorded on the mapping (not used when storing Dublin Core values)
     * @param predicates the Dublin Core predicates the extracted value is stored under
     */
    protected void addDCMapping(String query, MappingType mappingType, String subject, Predicates... predicates) {
        Mapping dcMapping = new Mapping(query, subject, predicates, mappingType);
        dcMappingList.add(dcMapping);
    }

    /**
     * Registers a mapping for a triple, leaving the mapping type to the three-argument
     * <code>Mapping</code> constructor.
     * 
     * @param query the query used to locate the value in the document
     * @param subject the subject of the triples created from this mapping
     * @param predicates the predicates of the triples; one triple is created per predicate
     */
    protected void addTripleMapping(String query, String subject, Predicates... predicates) {
        Mapping tripleMapping = new Mapping(query, subject, predicates);
        tripleMappingList.add(tripleMapping);
    }

    /**
     * Registers a mapping for a triple, supplying the mapping type (e.g. xpath or regex).
     * 
     * @param query the XPath expression or regular expression used to locate the value
     * @param mappingType how <code>query</code> is interpreted (XPath or regex)
     * @param subject the subject of the triples created from this mapping
     * @param predicates the predicates of the triples; one triple is created per predicate
     */
    protected void addTripleMapping(String query, MappingType mappingType, String subject,
            Predicates... predicates) {
        Mapping tripleMapping = new Mapping(query, subject, predicates, mappingType);
        tripleMappingList.add(tripleMapping);
    }

    /**
     * Extracts the whitespace-normalised string value of a node.
     * <p>
     * With recursive value extraction enabled, the values of the node and all of its
     * descendants are collected; otherwise only the node's own value is read. A null or
     * blank result is returned unchanged (null or the empty string).
     * 
     * @param node the node to read; must not be null
     * @return the normalised value, or null / an empty string if the node carries no text
     */
    public String extractValue(org.w3c.dom.Node node) {

        String raw;
        if (isRecursiveValueExtraction()) {
            StringBuilder collected = new StringBuilder();
            getNodeValue(collected, node);
            raw = collected.toString();
        } else {
            raw = node.getNodeValue();
        }

        String cleaned = trim(raw);

        // Nothing left to normalise: hand back null or the empty string as is.
        if (StringUtils.trimToNull(cleaned) == null) {
            return cleaned;
        }

        // Strip whitespace other than spaces, then collapse runs of spaces.
        String withoutControlWhitespace = cleaned.replaceAll("[\\s&&[^ ]]{1,}", "");
        return withoutControlWhitespace.replaceAll("[ ]{2,}", " ");
    }

    /**
     * Collapses every run of whitespace or Unicode separator characters into a single space
     * and removes leading and trailing whitespace.
     *
     * @param value the string to normalise; may be null
     * @return the normalised string, or null if <code>value</code> is null
     */
    private String trim(String value) {
        if (value == null) {
            return null;
        }
        return value.replaceAll("[\\p{Z}\\s]+", " ").trim();
    }

    /**
     * Sets the map of licences held in the <code>licencesMap</code> field. The map is stored
     * as supplied, without copying.
     *
     * @param licencesMap the licencesMap to set
     */
    public void setLicencesMap(Map<String, Licence> licencesMap) {
        this.licencesMap = licencesMap;
    }
}