di.uniba.it.tee2.wiki.Wikidump2Text.java Source code

Introduction

Here is the source code for di.uniba.it.tee2.wiki.Wikidump2Text.java
Source

/**
 * Copyright (c) 2014, the TEE2 AUTHORS.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * Neither the name of the University of Bari nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007
 *
 */
package di.uniba.it.tee2.wiki;

import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

/**
 *
 * @author pierpaolo
 */
public class Wikidump2Text {

    private static String encoding = "UTF-8";

    private static final String notValidTitle = "^[A-Za-z\\s_-]+:[A-Za-z].*$";

    static final Options options;

    static CommandLineParser cmdParser = new BasicParser();

    static {
        options = new Options();
        options.addOption("l", true, "language (italian, english)").addOption("d", true, "the Wikipedia dump")
                .addOption("o", true, "the output file").addOption("e", true, "charset encoding (optional)");
    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        try {
            CommandLine cmd = cmdParser.parse(options, args);
            if (cmd.hasOption("l") && cmd.hasOption("d") && cmd.hasOption("o")) {
                encoding = cmd.getOptionValue("e", "UTF-8");
                int counter = 0;
                try {
                    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                            new GZIPOutputStream(new FileOutputStream(cmd.getOptionValue("o"))), "UTF-8"));
                    WikipediaDumpIterator it = new WikipediaDumpIterator(new File(cmd.getOptionValue("d")),
                            encoding);
                    PageCleaner cleaner = PageCleanerWrapper.getInstance(cmd.getOptionValue("l"));
                    while (it.hasNext()) {
                        WikiPage wikiPage = it.next();
                        ParsedPage parsedPage = wikiPage.getParsedPage();
                        if (parsedPage != null) {
                            String title = wikiPage.getTitle();
                            if (!title.matches(notValidTitle)) {
                                if (parsedPage.getText() != null) {
                                    writer.append(cleaner.clean(parsedPage.getText()));
                                    writer.newLine();
                                    writer.newLine();
                                    counter++;
                                    if (counter % 10000 == 0) {
                                        System.out.println(counter);
                                        writer.flush();
                                    }
                                }
                            }
                        }
                    }
                    writer.flush();
                    writer.close();
                } catch (Exception ex) {
                    Logger.getLogger(Wikidump2Text.class.getName()).log(Level.SEVERE, null, ex);
                }
                System.out.println("Indexed pages: " + counter);
            } else {
                HelpFormatter helpFormatter = new HelpFormatter();
                helpFormatter.printHelp("Wikipedia dump to text", options, true);
            }
        } catch (ParseException ex) {
            Logger.getLogger(Wikidump2Text.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

}