eu.fbk.utils.wikipedia.WikipediaPlainTextExtractor.java Source code

Introduction

Here is the source code for eu.fbk.utils.wikipedia.WikipediaPlainTextExtractor.java
Source

/*
 * Copyright (2013) Fondazione Bruno Kessler (http://www.fbk.eu/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package eu.fbk.utils.wikipedia;

import de.tudarmstadt.ukp.wikipedia.parser.Content;
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.Section;
import eu.fbk.twm.utils.Defaults;
import eu.fbk.twm.utils.ExtractorParameters;
import eu.fbk.twm.utils.WikipediaExtractor;
import eu.fbk.twm.wiki.xmldump.AbstractWikipediaExtractor;
import eu.fbk.twm.wiki.xmldump.WikipediaTextExtractor;
import eu.fbk.twm.wiki.xmldump.util.ReversePageMap;
import eu.fbk.twm.wiki.xmldump.util.WikiMarkupParser;
import org.apache.commons.cli.*;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

import java.io.*;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

/**
 * Extractor that writes the plain text of Wikipedia content pages to a single
 * UTF-8 file, one page per line.
 *
 * <p>For every content page the wiki markup is parsed with
 * {@link WikiMarkupParser}, the text of every content element of every section
 * is concatenated, runs of whitespace are collapsed to a single space and the
 * resulting line is appended to the output file. All other page kinds (file,
 * disambiguation, category, template, redirect, portal and project pages) are
 * ignored.
 *
 * <p>Writes to the output file are serialised on this instance's monitor, so
 * {@link #contentPage(String, String, int)} may be invoked concurrently.
 */
public class WikipediaPlainTextExtractor extends AbstractWikipediaExtractor implements WikipediaExtractor {

    /**
     * Define a static logger variable so that it references the
     * Logger instance named <code>WikipediaTextExtractor</code>.
     *
     * NOTE(review): the category is the one of {@link WikipediaTextExtractor}, not of
     * this class. It is kept as is because changing it could alter how an existing
     * log configuration routes these messages - confirm whether this is intended.
     */
    static Logger logger = Logger.getLogger(WikipediaTextExtractor.class.getName());

    /** Matches one or more whitespace characters; compiled once because it is applied to every page. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Destination of the extracted text; {@code null} until {@link #start(ExtractorParameters)} succeeds. */
    private PrintWriter textWriter;

    /** Loaded from the redirect file in {@link #start(ExtractorParameters)}; not read elsewhere in this class. */
    private ReversePageMap reverseRedirectPageMap;

    /** Value of the "skip title" flag; stored and exposed through its accessors, not read elsewhere in this class. */
    private boolean skipTitle = false;

    /** Created in {@link #start(ExtractorParameters)}; not read elsewhere in this class. */
    MyWikiModel wikiModel;

    /**
     * Creates the extractor; all arguments are passed unchanged to the superclass constructor.
     *
     * @param numThreads forwarded to the superclass
     * @param numPages   forwarded to the superclass
     * @param locale     forwarded to the superclass
     */
    public WikipediaPlainTextExtractor(int numThreads, int numPages, Locale locale) {
        super(numThreads, numPages, locale);
    }

    /**
     * Returns the current value of the "skip title" flag.
     *
     * @return the value last passed to {@link #setSkipTitle(boolean)}, {@code false} by default
     */
    public boolean isSkipTitle() {
        return skipTitle;
    }

    /**
     * Sets the "skip title" flag.
     *
     * @param skipTitle the new value of the flag
     */
    public void setSkipTitle(boolean skipTitle) {
        this.skipTitle = skipTitle;
    }

    /**
     * Loads the reverse redirect map, opens the UTF-8 output file and then starts
     * processing the XML dump named by the given parameters.
     *
     * <p>If any of the resources cannot be opened the error is logged together
     * with its cause and processing is <em>not</em> started: without an output
     * writer every page would fail and nothing could be written.
     *
     * @param extractorParameters supplies the redirect, output and dump file names
     */
    @Override
    public void start(ExtractorParameters extractorParameters) {
        try {
            reverseRedirectPageMap = new ReversePageMap(new File(extractorParameters.getWikipediaRedirFileName()));
            logger.info(reverseRedirectPageMap.size() + " reverse redirect pages");

            textWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(extractorParameters.getWikipediaTextFileName()), "UTF-8")));
            wikiModel = new MyWikiModel("http://www.mywiki.com/wiki/${image}",
                    "http://www.mywiki.com/wiki/${title}");
        } catch (IOException e) {
            // Pass the exception as the throwable argument so the stack trace is preserved.
            logger.error("Unable to initialise the plain text extractor, extraction aborted", e);
            closeTextWriter();
            return;
        }
        startProcess(extractorParameters.getWikipediaXmlFileName());
    }

    /** Intentionally empty: file pages are not exported. */
    @Override
    public void filePage(String text, String title, int wikiID) {
    }

    /**
     * Parses the markup of a content page and appends its plain text, normalised
     * to a single line, to the output file.
     *
     * <p>Any exception raised while handling the page is logged (with its cause)
     * and swallowed, so that one bad page does not stop the extraction.
     *
     * @param text   the wiki markup of the page
     * @param title  the page title, used only in the error message
     * @param wikiID the page identifier, used only in the error message
     */
    @Override
    public void contentPage(String text, String title, int wikiID) {
        try {
            WikiMarkupParser wikiMarkupParser = WikiMarkupParser.getInstance();
            String[] prefixes = { imagePrefix, filePrefix };
            ParsedPage parsedPage = wikiMarkupParser.parsePage(text, prefixes);

            StringBuilder sb = new StringBuilder();
            for (Section section : parsedPage.getSections()) {
                List<Content> contents = section.getContentList();
                for (Content content : contents) {
                    sb.append(content.getText()).append(' ');
                }
            }

            // Normalise outside the lock: only the write itself has to be serialised.
            String plainText = WHITESPACE.matcher(sb).replaceAll(" ").trim();

            synchronized (this) {
                textWriter.println(plainText);
            }
        } catch (Exception e) {
            logger.error("Error processing page " + title + " (" + wikiID + ")", e);
        }
    }

    /** Intentionally empty: disambiguation pages are not exported. */
    @Override
    public void disambiguationPage(String text, String title, int wikiID) {
    }

    /** Intentionally empty: category pages are not exported. */
    @Override
    public void categoryPage(String text, String title, int wikiID) {
    }

    /** Intentionally empty: template pages are not exported. */
    @Override
    public void templatePage(String text, String title, int wikiID) {
    }

    /** Intentionally empty: redirect pages are not exported. */
    @Override
    public void redirectPage(String text, String title, int wikiID) {
    }

    /** Intentionally empty: portal pages are not exported. */
    @Override
    public void portalPage(String text, String title, int wikiID) {
    }

    /** Intentionally empty: project pages are not exported. */
    @Override
    public void projectPage(String text, String title, int wikiID) {
    }

    /**
     * Runs the superclass end-of-process logic and then flushes and closes the
     * output file, if one was opened.
     */
    @Override
    public void endProcess() {
        super.endProcess();
        closeTextWriter();
    }

    /**
     * Flushes and closes the output writer if it exists. {@link PrintWriter}
     * never throws on write failures, so its error flag is checked and reported.
     */
    private void closeTextWriter() {
        if (textWriter == null) {
            return;
        }
        textWriter.flush();
        if (textWriter.checkError()) {
            logger.error("An I/O error occurred while writing the plain text output file");
        }
        textWriter.close();
    }

    /**
     * Returns the integer value of a command line option, or a default when the
     * option is absent.
     *
     * @param line         the parsed command line
     * @param name         the long name of the option
     * @param defaultValue the value returned when the option was not given
     * @return the parsed option value or {@code defaultValue}
     * @throws ParseException if the option value is not a valid integer
     */
    private static int intOption(CommandLine line, String name, int defaultValue) throws ParseException {
        if (!line.hasOption(name)) {
            return defaultValue;
        }
        String value = line.getOptionValue(name);
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            // Reported as a parse failure so the user gets the usage text instead of a stack trace.
            throw new ParseException("option --" + name + " expects an integer but got '" + value + "'");
        }
    }

    /**
     * Command line entry point.
     *
     * <p>Configures log4j from the file named by the {@code log-config} system
     * property (default {@code configuration/log-config.txt}), parses the
     * options and runs the extraction. On any option error, or when
     * {@code --help} or {@code --version} is given, the usage text is printed.
     *
     * @param args the command line arguments
     * @throws IOException declared for backward compatibility with existing callers
     */
    public static void main(String[] args) throws IOException {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "configuration/log-config.txt";
        }

        PropertyConfigurator.configure(logConfig);

        Options options = new Options();
        try {
            Option wikipediaDumpOpt = OptionBuilder.withArgName("file").hasArg()
                    .withDescription("wikipedia xml dump file").isRequired().withLongOpt("wikipedia-dump")
                    .create("d");
            Option outputDirOpt = OptionBuilder.withArgName("dir").hasArg()
                    .withDescription("output directory in which to store output files").isRequired()
                    .withLongOpt("output-dir").create("o");
            Option numThreadOpt = OptionBuilder.withArgName("int").hasArg()
                    .withDescription("number of threads (default " + Defaults.DEFAULT_THREADS_NUMBER + ")")
                    .withLongOpt("num-threads").create("t");
            Option numPageOpt = OptionBuilder.withArgName("int").hasArg()
                    .withDescription("number of pages to process (default all)").withLongOpt("num-pages")
                    .create("p");
            Option notificationPointOpt = OptionBuilder.withArgName("int").hasArg()
                    .withDescription("receive notification every n pages (default "
                            + Defaults.DEFAULT_NOTIFICATION_POINT + ")")
                    .withLongOpt("notification-point").create("b");

            options.addOption(null, "text-only", false, "skipt title in file");

            options.addOption("h", "help", false, "print this message");
            options.addOption("v", "version", false, "output version information and exit");

            options.addOption(wikipediaDumpOpt);
            options.addOption(outputDirOpt);
            options.addOption(numThreadOpt);
            options.addOption(numPageOpt);
            options.addOption(notificationPointOpt);
            CommandLineParser parser = new PosixParser();
            CommandLine line = parser.parse(options, args);
            logger.debug(line);

            if (line.hasOption("help") || line.hasOption("version")) {
                // Handled by the catch block below, which prints the usage text.
                throw new ParseException("");
            }

            int numThreads = intOption(line, "num-threads", Defaults.DEFAULT_THREADS_NUMBER);
            int numPages = intOption(line, "num-pages", Defaults.DEFAULT_NUM_PAGES);
            int notificationPoint = intOption(line, "notification-point", Defaults.DEFAULT_NOTIFICATION_POINT);

            ExtractorParameters extractorParameters = new ExtractorParameters(line.getOptionValue("wikipedia-dump"),
                    line.getOptionValue("output-dir"));
            WikipediaPlainTextExtractor wikipediaPageParser = new WikipediaPlainTextExtractor(numThreads, numPages,
                    extractorParameters.getLocale());
            wikipediaPageParser.setNotificationPoint(notificationPoint);
            // Forward the advertised --text-only flag instead of silently discarding it.
            wikipediaPageParser.setSkipTitle(line.hasOption("text-only"));
            wikipediaPageParser.start(extractorParameters);

            logger.info("extraction ended " + new Date());

        } catch (ParseException e) {
            // oops, something went wrong
            System.out.println("Parsing failed: " + e.getMessage() + "\n");
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(400,
                    "java -cp dist/thewikimachine.jar " + WikipediaPlainTextExtractor.class.getName(),
                    "\n", options, "\n", true);
        }
    }

}