org.psikeds.knowledgebase.xml.impl.XMLParser.java Source code

Java tutorial

Introduction

Here is the source code for org.psikeds.knowledgebase.xml.impl.XMLParser.java

Source

/*******************************************************************************
 * psiKeds :- ps induced knowledge entity delivery system
 *
 * Copyright (c) 2013, 2014 Karsten Reincke, Marco Juliano, Deutsche Telekom AG
 *
 * This file is free software: you can redistribute
 * it and/or modify it under the terms of the
 * [x] GNU Affero General Public License
 * [ ] GNU General Public License
 * [ ] GNU Lesser General Public License
 * [ ] Creative Commons ShareAlike License
 *
 * For details see file LICENSING in the top project directory
 *******************************************************************************/
package org.psikeds.knowledgebase.xml.impl;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBElement;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.stream.EventFilter;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;

import org.xml.sax.SAXException;

import org.springframework.core.io.Resource;
import org.springframework.util.StringUtils;

import org.psikeds.knowledgebase.xml.KBParser;
import org.psikeds.knowledgebase.xml.KBParserCallback;

/**
 * Streaming parser suitable for big XML-Files, e.g. the psiKeds Knowledge
 * Base. It uses JAXB in combination with the Iterator-API of StAX: the
 * XML is read as an event stream and every matching element is unmarshalled
 * and handed to a {@link KBParserCallback} one at a time.<br>
 *
 * Note:
 * XML-Data can be supplied as either File, Inputstream or Spring-Resource.
 * If more than one source is configured, the Spring-Resource is used first,
 * then the Inputstream, then the Filename.<br>
 *
 * @author marco@juliano.de
 *
 */
public class XMLParser implements KBParser {

    private static final String DEFAULT_PACKAGE = "org.psikeds.knowledgebase.jaxb";
    private static final String DEFAULT_ENCODING = "UTF-8";
    private static final int DEFAULT_SKIPPED_ELEMENTS = 0;

    /**
     * Accept every Event triggered by the start of a new XML-Element.
     */
    private static final EventFilter DEFAULT_EVENT_FILTER = new EventFilter() {

        @Override
        public boolean accept(final XMLEvent event) {
            return event != null && event.isStartElement();
        }
    };

    // -------------------------------------------------------------

    private String encoding;
    private String xmlFilename;
    private InputStream xmlStream;
    private Resource xmlResource;
    private String packageName;
    private Class<?> elementClass;
    private KBParserCallback callbackHandler;
    private EventFilter eventFilter;
    private int numOfSkippedElements;

    /**
     * Default constructor: no XML source, no callback handler and no element
     * class; UTF-8 encoding, the default JAXB package, the default
     * start-element filter and no skipped elements.
     */
    public XMLParser() {
        this.encoding = DEFAULT_ENCODING;
        this.xmlFilename = null;
        this.xmlStream = null;
        this.xmlResource = null;
        this.packageName = DEFAULT_PACKAGE;
        this.elementClass = null;
        this.callbackHandler = null;
        this.eventFilter = DEFAULT_EVENT_FILTER;
        this.numOfSkippedElements = DEFAULT_SKIPPED_ELEMENTS;
    }

    /**
     * Constructor using filename and encoding type.
     *
     * @param xmlFilename
     *          Name of the XML-File
     * @param encoding
     *          Encoding type, e.g. ISO-8859-1 or UTF-8
     * @param callbackHandler
     *          Callback handler used to process every single found XML-Element,
     *          see {@link KBParserCallback#handleElement(Object)}
     */
    public XMLParser(final String xmlFilename, final String encoding, final KBParserCallback callbackHandler) {
        this();
        this.xmlFilename = xmlFilename;
        this.encoding = encoding;
        this.callbackHandler = callbackHandler;
    }

    /**
     * Constructor using filename and UTF-8 encoding. No callback handler is
     * set; one must be supplied via
     * {@link #setCallbackHandler(KBParserCallback)} before parsing.
     *
     * @param xmlFilename
     *          Name of the XML-File
     */
    public XMLParser(final String xmlFilename) {
        this(xmlFilename, DEFAULT_ENCODING, null);
    }

    /**
     * Constructor using input stream and encoding type.
     *
     * @param xmlStream
     *          Stream to the XML-Data
     * @param encoding
     *          Encoding type, e.g. ISO-8859-1 or UTF-8
     * @param callbackHandler
     *          Callback handler used to process every single found XML-Element,
     *          see {@link KBParserCallback#handleElement(Object)}
     */
    public XMLParser(final InputStream xmlStream, final String encoding, final KBParserCallback callbackHandler) {
        this();
        this.xmlStream = xmlStream;
        this.encoding = encoding;
        this.callbackHandler = callbackHandler;
    }

    /**
     * Constructor using input stream and UTF-8 encoding. No callback handler
     * is set; one must be supplied via
     * {@link #setCallbackHandler(KBParserCallback)} before parsing.
     *
     * @param xmlStream
     *          Stream to the XML-Data
     */
    public XMLParser(final InputStream xmlStream) {
        this(xmlStream, DEFAULT_ENCODING, null);
    }

    /**
     * Constructor using Spring-Resource and encoding type.
     *
     * @param xmlResource
     *          Spring-Resource for the XML-Data
     * @param encoding
     *          Encoding type, e.g. ISO-8859-1 or UTF-8
     * @param callbackHandler
     *          Callback handler used to process every single found XML-Element,
     *          see {@link KBParserCallback#handleElement(Object)}
     */
    public XMLParser(final Resource xmlResource, final String encoding, final KBParserCallback callbackHandler) {
        this();
        this.xmlResource = xmlResource;
        this.encoding = encoding;
        this.callbackHandler = callbackHandler;
    }

    /**
     * Constructor using Spring-Resource and UTF-8 encoding. No callback
     * handler is set; one must be supplied via
     * {@link #setCallbackHandler(KBParserCallback)} before parsing.
     *
     * @param xmlResource
     *          Spring-Resource for the XML-Data
     */
    public XMLParser(final Resource xmlResource) {
        this(xmlResource, DEFAULT_ENCODING, null);
    }

    // -------------------------------------------------------------

    /**
     * @return the xmlFilename
     */
    public String getXmlFilename() {
        return this.xmlFilename;
    }

    /**
     * @param xmlFilename
     *          the xmlFilename to set
     */
    public void setXmlFilename(final String xmlFilename) {
        this.xmlFilename = xmlFilename;
    }

    /**
     * @return the encoding
     */
    public String getEncoding() {
        return this.encoding;
    }

    /**
     * @param encoding
     *          the encoding to set
     */
    public void setEncoding(final String encoding) {
        this.encoding = encoding;
    }

    /**
     * @return the xmlStream
     */
    public InputStream getXmlStream() {
        return this.xmlStream;
    }

    /**
     * @param xmlStream
     *          the xmlStream to set
     */
    public void setXmlStream(final InputStream xmlStream) {
        this.xmlStream = xmlStream;
    }

    /**
     * @return the xmlResource
     */
    public Resource getXmlResource() {
        return this.xmlResource;
    }

    /**
     * @param xmlResource
     *          the xmlResource to set
     */
    public void setXmlResource(final Resource xmlResource) {
        this.xmlResource = xmlResource;
    }

    /**
     * @return the packageName
     */
    public String getPackageName() {
        return this.packageName;
    }

    /**
     * @param packageName
     *          the packageName to set
     */
    public void setPackageName(final String packageName) {
        this.packageName = packageName;
    }

    /**
     * @return the elementClass
     */
    public Class<?> getElementClass() {
        return this.elementClass;
    }

    /**
     * @param elementClass
     *          the elementClass to set
     */
    public void setElementClass(final Class<?> elementClass) {
        this.elementClass = elementClass;
    }

    /**
     * @return the callbackHandler
     */
    @Override
    public KBParserCallback getCallbackHandler() {
        return this.callbackHandler;
    }

    /**
     * @param callbackHandler
     *          the callbackHandler to set
     */
    @Override
    public void setCallbackHandler(final KBParserCallback callbackHandler) {
        this.callbackHandler = callbackHandler;
    }

    /**
     * @return the eventFilter
     */
    public EventFilter getEventFilter() {
        return this.eventFilter;
    }

    /**
     * @param eventFilter
     *          the eventFilter to set
     */
    public void setEventFilter(final EventFilter eventFilter) {
        this.eventFilter = eventFilter;
    }

    /**
     * @return the numOfSkippedElements
     */
    public int getNumOfSkippedElements() {
        return this.numOfSkippedElements;
    }

    /**
     * @param numSkipped
     *          the numOfSkippedElements to set
     */
    public void setNumOfSkippedElements(final int numSkipped) {
        this.numOfSkippedElements = numSkipped;
    }

    // -------------------------------------------------------------
    // ---- Methods for parsing XML
    // -------------------------------------------------------------

    /**
     * Parse the configured XML source and unmarshal it to
     * JAXB-Object-Structures. Suitable for big XML files by using JAXB in
     * combination with StAX.<br>
     * If an Element-Class is configured, it is used for unmarshalling;
     * otherwise all classes in the configured package can be parsed.<br>
     * <b>Note:</b> The reader created for the XML source is always closed
     * afterwards. This also closes an Inputstream that was supplied by the
     * caller.<br>
     *
     * @return Total number of unmarshalled XML-Elements
     * @throws XMLStreamException
     *           if the XML cannot be read as a StAX event stream
     * @throws SAXException
     *           declared for the {@link KBParser} contract
     * @throws JAXBException
     *           if the JAXB context cannot be created or unmarshalling fails
     * @throws IOException
     *           if the XML source cannot be opened
     * @throws IllegalArgumentException
     *           if encoding, XML source, callback handler or package/class
     *           settings are missing
     */
    @Override
    public long parseXmlElements() throws XMLStreamException, SAXException, JAXBException, IOException {
        final Reader xml = createXmlReader();
        try {
            return parseXmlElements(xml);
        } finally {
            try {
                xml.close();
            } catch (final IOException ignored) {
                // Best effort: a failure while closing must neither hide the
                // parsing result nor mask an exception thrown while parsing.
            }
        }
    }

    /**
     * Create a reader for the configured XML source using the configured
     * encoding. Sources are checked in this order: Spring-Resource,
     * Inputstream, Filename.
     */
    private Reader createXmlReader() throws UnsupportedEncodingException, IOException {
        if (!StringUtils.isEmpty(this.encoding)) {
            if (this.xmlResource != null) {
                return wrapStream(this.xmlResource.getInputStream(), true);
            }
            if (this.xmlStream != null) {
                // The stream belongs to the caller, so it is not closed here
                // if the encoding turns out to be unsupported.
                return wrapStream(this.xmlStream, false);
            }
            if (!StringUtils.isEmpty(this.xmlFilename)) {
                return wrapStream(new FileInputStream(this.xmlFilename), true);
            }
        }
        throw new IllegalArgumentException("Unsupported configuration settings!");
    }

    /**
     * Wrap a stream into a reader using the configured encoding. If the
     * encoding is not supported and the stream was opened by this parser,
     * the stream is closed before the exception is rethrown, so that no
     * file handle is leaked.
     */
    private Reader wrapStream(final InputStream stream, final boolean closeOnFailure)
            throws UnsupportedEncodingException {
        try {
            return new InputStreamReader(stream, this.encoding);
        } catch (final UnsupportedEncodingException ex) {
            if (closeOnFailure) {
                try {
                    stream.close();
                } catch (final IOException ignored) {
                    // The unsupported encoding is the error worth reporting.
                }
            }
            throw ex;
        }
    }

    /**
     * Dispatch to the matching static helper: the Element-Class wins over the
     * Package-Name, and a Callback-Handler is mandatory in both cases.
     */
    private long parseXmlElements(final Reader xml) throws XMLStreamException, JAXBException {
        if (this.callbackHandler != null) {
            if (this.elementClass != null) {
                return parseXmlElements(xml, this.elementClass, this.callbackHandler, this.eventFilter,
                        this.numOfSkippedElements);
            }
            if (!StringUtils.isEmpty(this.packageName)) {
                return parseXmlElements(xml, this.packageName, this.callbackHandler, this.eventFilter,
                        this.numOfSkippedElements);
            }
        }
        throw new IllegalArgumentException(
                "Unsupported configuration settings! Must specify a Callback-Handler and either a Package-Name or an Element-Class.");
    }

    // -------------------------------------------------------------
    // ---- Static helpers for parsing XML
    // -------------------------------------------------------------

    /**
     * Helper for parsing big XML files using JAXB in combination with StAX.<br>
     * All classes within the Package <b>org.psikeds.knowledgebase.jaxb</b> can
     * be parsed.<br>
     * The default start-element filter is used and no elements are skipped,
     * i.e. unmarshalling starts at the first element of the XML-Data.<br>
     * <b>Note:</b> The XML reader will not be closed. This must be invoked by
     * the caller afterwards!<br>
     *
     * @param xml
     *          Reader for XML-Data
     * @param handler
     *          Callback handler used to process every single found XML
     *          element, see {@link KBParserCallback#handleElement(Object)};
     *          it is not checked for null
     * @return Total number of unmarshalled XML-Elements
     * @throws XMLStreamException
     *           if the XML cannot be read as a StAX event stream
     * @throws JAXBException
     *           if the JAXB context cannot be created or unmarshalling fails
     */
    public static long parseXmlElements(final Reader xml, final KBParserCallback handler)
            throws XMLStreamException, JAXBException {

        return parseXmlElements(xml, DEFAULT_PACKAGE, handler, DEFAULT_EVENT_FILTER, DEFAULT_SKIPPED_ELEMENTS);
    }

    /**
     * Helper for parsing big XML files using JAXB in combination with StAX.<br>
     * All classes in the specified package can be parsed.<br>
     * <b>Note:</b> The XML reader will not be closed. This must be invoked by
     * the caller afterwards!<br>
     *
     * @param xml
     *          Reader for XML-Data
     * @param packageName
     *          Name of the package containing the JAXB-Classes,
     *          e.g. org.psikeds.knowledgebase.jaxb
     * @param handler
     *          Callback handler used to process every single found
     *          XML-Element, see
     *          {@link KBParserCallback#handleElement(Object)};
     *          it is not checked for null
     * @param filter
     *          EventFilter used for StAX-Parsing; if null, the unfiltered
     *          event stream is used
     * @param numSkipped
     *          Number of Elements to be skipped,
     *          e.g. numSkipped = 1 for skipping the XML-Root-Element.
     * @return Total number of unmarshalled XML-Elements
     * @throws XMLStreamException
     *           if the XML cannot be read as a StAX event stream
     * @throws JAXBException
     *           if the JAXB context cannot be created or unmarshalling fails
     */
    public static long parseXmlElements(final Reader xml, final String packageName, final KBParserCallback handler,
            final EventFilter filter, final int numSkipped) throws XMLStreamException, JAXBException {

        // NOTE(review): the factory is used with its default settings, i.e.
        // DTDs / external entities are not explicitly disabled. Consider
        // hardening it if XML from untrusted sources is ever parsed.
        final XMLInputFactory staxFactory = XMLInputFactory.newInstance();
        final XMLEventReader staxReader = staxFactory.createXMLEventReader(xml);
        try {
            final XMLEventReader filteredReader = createFilteredReader(staxFactory, staxReader, filter);
            skipXmlElements(filteredReader, numSkipped);

            // JAXB with specific package
            final Unmarshaller unmarshaller = JAXBContext.newInstance(packageName).createUnmarshaller();

            // The filtered reader is only peeked to find out whether another
            // accepted event exists; the element itself is unmarshalled from
            // the underlying reader. This relies on both readers sharing the
            // same position in the event stream.
            long counter = 0;
            while (filteredReader.peek() != null) {
                final Object element = unmarshaller.unmarshal(staxReader);
                handleElement(handler, element);
                counter++;
            }
            return counter;
        } finally {
            closeQuietly(staxReader);
        }
    }

    /**
     * Helper for parsing big XML files using JAXB in combination with StAX.<br>
     * Only XML-Elements of the specified Top-Level-Class will be parsed.<br>
     * The default start-element filter is used and no elements are skipped.<br>
     * <b>Note:</b> The XML reader will not be closed. This must be invoked by
     * the caller afterwards!<br>
     *
     * @param xml
     *          Reader for XML-Data
     * @param elemClazz
     *          Top-Level-Class used for JAXB-Unmarshalling
     * @param handler
     *          Callback handler used to process every single found XML
     *          element, see {@link KBParserCallback#handleElement(Object)};
     *          it is not checked for null
     * @return Total number of unmarshalled XML-Elements
     * @throws XMLStreamException
     *           if the XML cannot be read as a StAX event stream
     * @throws JAXBException
     *           if the JAXB context cannot be created or unmarshalling fails
     */
    public static long parseXmlElements(final Reader xml, final Class<?> elemClazz, final KBParserCallback handler)
            throws XMLStreamException, JAXBException {

        return parseXmlElements(xml, elemClazz, handler, DEFAULT_EVENT_FILTER, DEFAULT_SKIPPED_ELEMENTS);
    }

    /**
     * Helper for parsing big XML files using JAXB in combination with StAX.<br>
     * Only XML-Elements of the specified Top-Level-Class will be parsed.<br>
     * <b>Note:</b> The XML reader will not be closed. This must be invoked by
     * the caller afterwards!<br>
     *
     * @param xml
     *          Reader for XML-Data
     * @param elemClazz
     *          Top-Level-Class used for JAXB-Unmarshalling
     * @param handler
     *          Callback handler used to process every single found XML
     *          element, see {@link KBParserCallback#handleElement(Object)};
     *          it is not checked for null
     * @param filter
     *          EventFilter used for StAX-Parsing; if null, the unfiltered
     *          event stream is used
     * @param numSkipped
     *          Number of Elements to be skipped,
     *          e.g. numSkipped = 1 for skipping the XML-Root-Element.
     * @return Total number of unmarshalled XML-Elements
     * @throws XMLStreamException
     *           if the XML cannot be read as a StAX event stream
     * @throws JAXBException
     *           if the JAXB context cannot be created or unmarshalling fails
     */
    public static long parseXmlElements(final Reader xml, final Class<?> elemClazz, final KBParserCallback handler,
            final EventFilter filter, final int numSkipped) throws XMLStreamException, JAXBException {

        // NOTE(review): the factory is used with its default settings, i.e.
        // DTDs / external entities are not explicitly disabled. Consider
        // hardening it if XML from untrusted sources is ever parsed.
        final XMLInputFactory staxFactory = XMLInputFactory.newInstance();
        final XMLEventReader staxReader = staxFactory.createXMLEventReader(xml);
        try {
            final XMLEventReader filteredReader = createFilteredReader(staxFactory, staxReader, filter);
            skipXmlElements(filteredReader, numSkipped);

            // JAXB with specific top-level-class
            final Unmarshaller unmarshaller = JAXBContext.newInstance(elemClazz).createUnmarshaller();

            // The filtered reader is only peeked to find out whether another
            // accepted event exists; the element itself is unmarshalled from
            // the underlying reader. This relies on both readers sharing the
            // same position in the event stream.
            long counter = 0;
            while (filteredReader.peek() != null) {
                final Object element = unmarshaller.unmarshal(staxReader, elemClazz);
                handleElement(handler, element);
                counter++;
            }
            return counter;
        } finally {
            closeQuietly(staxReader);
        }
    }

    /**
     * Apply the filter to the event reader, or return the reader itself if
     * no filter was specified.
     */
    private static XMLEventReader createFilteredReader(final XMLInputFactory staxFactory,
            final XMLEventReader staxReader, final EventFilter filter) throws XMLStreamException {
        if (filter == null) {
            return staxReader;
        }
        return staxFactory.createFilteredReader(staxReader, filter);
    }

    /**
     * Release the resources held by the StAX event reader. According to the
     * StAX API this does not close the underlying input source, so the
     * caller stays responsible for closing the XML reader.
     */
    private static void closeQuietly(final XMLEventReader staxReader) {
        try {
            staxReader.close();
        } catch (final XMLStreamException ignored) {
            // Best effort: a failure while releasing parser resources must
            // neither hide the result nor mask an exception from parsing.
        }
    }

    /**
     * Pass an unmarshalled element to the callback handler, unwrapping a
     * JAXBElement if it carries a value.
     */
    private static void handleElement(final KBParserCallback handler, final Object element) {
        // Note1: The unmarshalled "element" is a complete JAXB-Object-Structure
        // containing not only the current XML-Element but also the Content-Tree
        // of all its Sub-Elements!

        // Note2: Depending on the XSD-Design (Venetian Blind vs. Salami Slice)
        // and the Kind of Invocation of the XML-Parser (Package vs. Class)
        // the unmarshalled Element might not be the expected JAXB-Object but
        // an Instance of JAXBElement wrapping the actual JAXB-Object:
        Object payload = element;
        if (element instanceof JAXBElement) {
            final Object value = ((JAXBElement<?>) element).getValue();
            if (value != null) {
                payload = value;
            }
        }
        handler.handleElement(payload);
    }

    /**
     * Consume the given number of events from the (filtered) reader before
     * unmarshalling starts.
     */
    private static void skipXmlElements(final XMLEventReader filteredReader, final int numSkipped)
            throws XMLStreamException {
        for (int idx = 0; idx < numSkipped; idx++) {
            filteredReader.nextEvent();
        }
    }
}