nlp.mediawiki.parser.SinglestreamXmlDumpParser.java Source code

Introduction

Here is the source code for nlp.mediawiki.parser.SinglestreamXmlDumpParser.java
Source

/**
 * This file is part of Wikiforia.
 *
 * Wikiforia is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Wikiforia is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Wikiforia.  If not, see <http://www.gnu.org/licenses/>.
 */
package nlp.mediawiki.parser;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import nlp.mediawiki.model.Page;
import nlp.pipeline.AbstractEmitter;
import nlp.pipeline.Source;

import java.io.*;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collections;

/**
 * Singlestream Mediawiki xml dump parser (supporting uncompressed and bzip2 compressed).
 *
 * <p>Pages are pulled one at a time from an {@link XmlDumpParser} and handed downstream via
 * {@code output(...)} in batches of {@code batchsize} pages. After the last (possibly smaller)
 * batch an empty list is emitted; presumably downstream stages treat it as the end-of-input
 * marker (confirm against the consumers of this source).</p>
 *
 * <p>Stream ownership: when constructed from a {@link File}, this class opens the stream itself
 * and closes it when {@link #run()} finishes. When constructed from an {@link InputStream}, the
 * stream belongs to the caller and is never closed here.</p>
 */
public class SinglestreamXmlDumpParser extends AbstractEmitter<Page, Void> implements Source<Page, Void> {

    /** Batch size used by the constructors that do not take an explicit one. */
    private static final int DEFAULT_BATCH_SIZE = 100;

    private final InputStream input;
    private final XmlDumpParser parser;
    private final int batchsize;

    /** Source file, or {@code null} when reading from a caller-supplied stream. */
    private final File pageInput;

    /**
     * File constructor using the default batch size.
     * @param input file to read from
     */
    public SinglestreamXmlDumpParser(File input) {
        this(input, DEFAULT_BATCH_SIZE);
    }

    /**
     * Stream constructor using the default batch size.
     * @param stream stream to read from; it is not closed by this class
     */
    public SinglestreamXmlDumpParser(InputStream stream) {
        this(stream, DEFAULT_BATCH_SIZE);
    }

    /**
     * File constructor with batchsize. Files whose absolute path ends in {@code .bz2}
     * (case-insensitive) are decompressed on the fly, anything else is read as-is.
     * @param path file to read from
     * @param batchsize the size of a batch
     * @throws IOError if the file cannot be opened or its bzip2 header cannot be read
     */
    public SinglestreamXmlDumpParser(File path, int batchsize) {
        this.pageInput = path;
        this.batchsize = batchsize;

        InputStream opened = openFile(path);
        boolean parserCreated = false;
        try {
            this.parser = new XmlDumpParser(opened);
            parserCreated = true;
        } finally {
            // Do not leave the file handle dangling if the parser could not be set up.
            if (!parserCreated) {
                closeQuietly(opened);
            }
        }
        this.input = opened;
    }

    /**
     * Stream constructor with batchsize
     * @param stream stream to read from; it is not closed by this class
     * @param batchsize the size of a batch
     */
    public SinglestreamXmlDumpParser(InputStream stream, int batchsize) {
        this.pageInput = null;
        this.input = stream;
        this.batchsize = batchsize;
        this.parser = new XmlDumpParser(input);
    }

    /**
     * Reads every page from the parser and emits them in batches, followed by a final empty
     * batch. If this instance was created from a file, the file is closed afterwards, whether
     * parsing succeeded or not, so this method is meant to be invoked once per instance.
     *
     * @throws IOError if closing the file fails after all pages were emitted successfully
     */
    @Override
    public void run() {
        boolean finished = false;
        try {
            emitAllPages();
            finished = true;
        } finally {
            releaseInput(finished);
        }
    }

    @Override
    public String toString() {
        NumberFormat nf = NumberFormat.getIntegerInstance();
        nf.setGroupingUsed(true);

        return String.format("Singlestreamed XML Dump parser { \n * Batch size: %s, \n * Input: %s \n}",
                nf.format(batchsize), pageInput == null ? "[Inputstream]" : pageInput.getAbsolutePath());
    }

    /** Drains the parser, emitting full batches, the trailing partial batch and the empty batch. */
    private void emitAllPages() {
        ArrayList<Page> pages = new ArrayList<Page>(batchsize);
        Page page;

        while ((page = parser.next()) != null) {
            pages.add(page);
            if (pages.size() == batchsize) {
                output(pages);
                // A fresh list per batch: the emitted one now belongs to the consumer.
                pages = new ArrayList<Page>(batchsize);
            }
        }

        if (!pages.isEmpty()) {
            output(pages);
        }

        output(Collections.<Page>emptyList());
    }

    /**
     * Closes the input if, and only if, this instance opened it (file constructors).
     * @param reportFailure whether a failing close should surface as {@link IOError}; pass
     *                      {@code false} while another exception is already propagating so
     *                      that it is not masked by the close failure
     */
    private void releaseInput(boolean reportFailure) {
        if (pageInput == null) {
            // Caller-supplied stream: its lifecycle is the caller's responsibility.
            return;
        }
        try {
            input.close();
        } catch (IOException e) {
            if (reportFailure) {
                throw new IOError(e);
            }
            // Otherwise the parsing failure already in flight is the more useful one to report.
        }
    }

    /**
     * Opens {@code path} for reading, adding bzip2 decompression for {@code .bz2} files.
     * The raw file stream is buffered because the Commons Compress stream classes work directly
     * on the stream they are given without buffering of their own.
     */
    private static InputStream openFile(File path) {
        InputStream raw = null;
        try {
            raw = new BufferedInputStream(new FileInputStream(path));
            if (path.getAbsolutePath().toLowerCase().endsWith(".bz2")) {
                // Second argument: decompressConcatenated (see the Commons Compress API docs).
                return new BZip2CompressorInputStream(raw, true);
            }
            return raw;
        } catch (IOException e) {
            // The bzip2 header is read eagerly; release the file if that (or the open) failed.
            closeQuietly(raw);
            throw new IOError(e);
        }
    }

    /** Best-effort close used on failure paths where a primary error is already being reported. */
    private static void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // Nothing sensible to do: the original failure is what the caller needs to see.
        }
    }
}