org.graphipedia.wikipedia.parser.SimpleStaxParser.java Source code

Java tutorial

Introduction

Here is the source code for org.graphipedia.wikipedia.parser.SimpleStaxParser.java

Source

//
// Copyright (c) 2012 Mirko Nasato
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
package org.graphipedia.wikipedia.parser;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.events.XMLEvent;

import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.codehaus.stax2.XMLInputFactory2;

/**
 * A simple streaming (StAX) parser of a XML file.
 *
 * <p>Subclasses declare which element names they care about and receive a
 * callback for each such element when its end tag is reached. The text passed
 * to the callbacks is the character data collected since the most recent start
 * tag, so the parser is primarily suited to leaf elements: the text of an
 * element that has child elements only covers what follows the start tag of
 * its last child.</p>
 */
public abstract class SimpleStaxParser {

    /**
     * Factory used to create the XML stream readers.
     *
     * NOTE(review): the factory is used with its default configuration. If the
     * input may come from an untrusted source, consider disabling DTD /
     * external entity support on it.
     */
    private static final XMLInputFactory XML_INPUT_FACTORY = XMLInputFactory2.newInstance();

    /**
     * Names of the elements in the XML file that are extracted.
     */
    private final List<String> interestingElements;

    /**
     * Names of the elements in the XML file that are extracted together with their attribute values.
     */
    private final List<String> interestingElementsWithAttributes;

    /**
     * Creates a new parser.
     *
     * <p>Both lists are copied, so later changes to the caller's lists do not
     * affect this parser. A {@code null} list is treated as an empty list.</p>
     *
     * @param interestingElements The list of the elements in the XML file to parse that need to be considered.
     * @param interestingElementsWithAttributes The list of the elements in the XML file to parse that need to be
     *        considered and have attributes.
     */
    public SimpleStaxParser(List<String> interestingElements, List<String> interestingElementsWithAttributes) {
        this.interestingElements = copyOrEmpty(interestingElements);
        this.interestingElementsWithAttributes = copyOrEmpty(interestingElementsWithAttributes);
    }

    /**
     * Returns a private copy of the given list of element names.
     * @param elements The list to copy, possibly {@code null}.
     * @return a new list with the same content, or a new empty list if {@code elements} is {@code null}.
     */
    private static List<String> copyOrEmpty(List<String> elements) {
        if (elements == null) {
            return new ArrayList<String>();
        }
        return new ArrayList<String>(elements);
    }

    /**
     * Handles an element of a XML file.
     * Called when the end tag of an element listed in {@code interestingElements}
     * (and not in {@code interestingElementsWithAttributes}) is reached.
     * @param element The element (XML tag, local name).
     * @param value The trimmed text value of the element.
     * @return {@code true} if parsing should be continued, {@code false} otherwise.
     * @throws XMLStreamException when some XML-related error occurs.
     */
    protected abstract boolean handleElement(String element, String value) throws XMLStreamException;

    /**
     * Handles an element of a XML file with attributes.
     * Called when the end tag of an element listed in
     * {@code interestingElementsWithAttributes} is reached.
     * @param element The element (XML tag, local name).
     * @param value The trimmed text value of the element.
     * @param attributeValues The values of the attributes of the given {@code element}, in the order
     *        in which the XML reader reports them.
     * @return {@code true} if parsing should be continued, {@code false} otherwise.
     * @throws XMLStreamException when some XML-related error occurs.
     */
    protected abstract boolean handleElement(String element, String value, List<String> attributeValues)
            throws XMLStreamException;

    /**
     * Parses a compressed XML file.
     * The file is opened through a {@code CompressorStreamFactory}, which
     * selects the decompressor for the stream.
     * @param fileName The name of the XML file to parse.
     * @throws IOException when something goes wrong while reading the input file.
     * @throws XMLStreamException when something goes wrong while parsing the XML file.
     * @throws CompressorException when something goes wrong while opening the XML file (compressed file).
     */
    public void parse(String fileName) throws IOException, XMLStreamException, CompressorException {
        BufferedInputStream fileStream = new BufferedInputStream(new FileInputStream(fileName));
        try {
            CompressorInputStream input = new CompressorStreamFactory().createCompressorInputStream(fileStream);
            // parse(InputStream) closes the stream it is given.
            parse(input);
        } finally {
            // Always release the file handle, also when the compressor stream
            // could not be created or parsing failed. Closing the buffered
            // stream closes the wrapped file stream; closing twice is harmless.
            fileStream.close();
        }
    }

    /**
     * Parses a XML stream and closes it afterwards, whether or not parsing succeeds.
     * @param inputStream The input XML stream, read as UTF-8.
     * @throws IOException when something goes wrong while reading the input file.
     * @throws XMLStreamException when something goes wrong while parsing the XML file.
     */
    private void parse(InputStream inputStream) throws IOException, XMLStreamException {
        try {
            XMLStreamReader reader = XML_INPUT_FACTORY.createXMLStreamReader(inputStream, "UTF-8");
            try {
                parseElements(reader);
            } finally {
                reader.close();
            }
        } finally {
            // Closing a StAX reader does not close the underlying input source,
            // so the stream is closed explicitly; the outer finally guarantees
            // this even if creating or closing the reader throws.
            inputStream.close();
        }
    }

    /**
     * Parses the elements in the XML file and dispatches the interesting ones
     * to the {@code handleElement} callbacks. Stops as soon as a callback
     * returns {@code false}.
     * @param reader The XML stream.
     * @throws XMLStreamException when something goes wrong while parsing the XML file.
     */
    private void parseElements(XMLStreamReader reader) throws XMLStreamException {
        LinkedList<String> elementStack = new LinkedList<String>();
        // Kept in lock-step with elementStack so that every element gets its
        // own attribute values back at its end tag, even if child elements
        // were opened in between.
        LinkedList<List<String>> attributeStack = new LinkedList<List<String>>();
        StringBuilder textBuffer = new StringBuilder();

        while (reader.hasNext()) {
            switch (reader.next()) {
            case XMLEvent.START_ELEMENT: {
                String startElement = reader.getName().getLocalPart();
                elementStack.push(startElement);
                attributeStack.push(readAttributeValues(reader, startElement));
                // Text is collected per start tag, not per nesting level.
                textBuffer.setLength(0);
                break;
            }
            case XMLEvent.END_ELEMENT: {
                String element = elementStack.pop();
                List<String> attributeValues = attributeStack.pop();
                if (isInterestingWithAttributes(element)) {
                    if (!handleElement(element, textBuffer.toString().trim(), attributeValues)) {
                        return;
                    }
                } else if (isInteresting(element)) {
                    if (!handleElement(element, textBuffer.toString().trim())) {
                        return;
                    }
                }
                break;
            }
            case XMLEvent.CHARACTERS:
                // Only buffer text that sits directly inside an interesting element.
                if (!elementStack.isEmpty() && isInteresting(elementStack.peek())) {
                    textBuffer.append(reader.getText());
                }
                break;
            default:
                // Other event types are not relevant to this parser.
                break;
            }
        }
    }

    /**
     * Reads the attribute values of the element the reader is currently positioned on.
     * Must be called while the reader is on a start-element event.
     * @param reader The XML stream.
     * @param element The local name of the current element.
     * @return a new list with the attribute values if the element is one with interesting
     *         attributes, a new empty list otherwise.
     */
    private List<String> readAttributeValues(XMLStreamReader reader, String element) {
        List<String> values = new ArrayList<String>();
        if (isInterestingWithAttributes(element)) {
            int attributeCount = reader.getAttributeCount();
            for (int i = 0; i < attributeCount; i++) {
                values.add(reader.getAttributeValue(i));
            }
        }
        return values;
    }

    /**
     * Returns whether a XML element has to be considered and has attributes.
     * @param element A XML element.
     * @return {@code true} if the given {@code element} has to be considered and has attributes,
     *         {@code false} otherwise.
     */
    private boolean isInterestingWithAttributes(String element) {
        return this.interestingElementsWithAttributes.contains(element);
    }

    /**
     * Returns whether a XML element has to be considered, with or without attributes.
     * @param element A XML element.
     * @return {@code true} if the given {@code element} has to be considered, {@code false} otherwise.
     */
    private boolean isInteresting(String element) {
        return interestingElements.contains(element) || isInterestingWithAttributes(element);
    }

}