com.stratio.parsers.XMLDumpParser.java Source code

Java tutorial

Introduction

Here is the source code for com.stratio.parsers.XMLDumpParser.java

Source

/*
 * Copyright  2010 Santiago M. Mola Velasco <cooldwind@gmail.com>
 */
package com.stratio.parsers;

import com.google.common.base.Charsets;
import static com.google.common.base.Preconditions.*;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.List;
import java.util.zip.GZIPInputStream;
import org.apache.commons.compress.bzip2.CBZip2InputStream;
import org.apache.xerces.parsers.SAXParser;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

public class XMLDumpParser {

    private String dumpFile;
    private InputStream inStream;
    private XMLDumpContentHandler ch;

    public XMLDumpParser(InputStream in, List<String> allowedNamespaces) {
        this.inStream = checkNotNull(in);
        ch = new XMLDumpContentHandler(allowedNamespaces);
    }

    public XMLDumpParser(InputStream in) {
        this(in, null);
    }

    public XMLDumpParser(String dumpFile, List<String> allowedNamespaces) {
        this.dumpFile = checkNotNull(dumpFile);
        ch = new XMLDumpContentHandler(allowedNamespaces);
    }

    public XMLDumpParser(String dumpFile) {
        this(dumpFile, null);
    }

    public void parse() throws SAXException, IOException {
        SAXParser parser = new SAXParser();
        parser.setContentHandler(ch); //TODO: Make sure that the ContentHandler is "reset".
        parser.parse(getInputSource());
    }

    public XMLDumpContentHandler getContentHandler() {
        return ch;
    }

    /**
     *
     * Based on wikixmlj by Delip Rao and Jason Smith.
     * Licensed under the Apache License 2.0.
     * Available at http://code.google.com/p/wikixmlj/
     *
     * @return
     * @throws FileNotFoundException
     * @throws IOException
     */
    protected InputSource getInputSource() throws FileNotFoundException, IOException {
        if (inStream != null) {
            return new InputSource(inStream);
        }

        FileInputStream fis = new FileInputStream(dumpFile);
        InputStream is;
        InputStreamReader isr;

        if (dumpFile.endsWith(".gz")) {
            is = new GZIPInputStream(fis);
        } else if (dumpFile.endsWith(".bz2")) {
            byte[] ignoreBytes = new byte[2];
            fis.read(ignoreBytes); //"B", "Z" bytes from commandline tools
            is = new CBZip2InputStream(fis);
        } else {
            is = fis;
        }

        CharsetDecoder decoder = Charsets.UTF_8.newDecoder();
        decoder.onUnmappableCharacter(CodingErrorAction.IGNORE);
        decoder.onMalformedInput(CodingErrorAction.IGNORE);
        decoder.replaceWith("?");
        isr = new InputStreamReader(is, decoder);

        return new InputSource(isr);
        //return new InputSource(new BufferedReader(isr));
    }
}