ivory.app.PreprocessWikipedia.java Source code

Introduction

Here is the source code for ivory.app.PreprocessWikipedia.java, the driver class from the Ivory toolkit that preprocesses a Wikipedia collection (monolingual or cross-lingual) in any language.

Source

/*
 * Ivory: A Hadoop toolkit for web-scale information retrieval research
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.app;

import ivory.core.Constants;
import ivory.core.RetrievalEnvironment;
import ivory.core.preprocess.BuildDictionary;
import ivory.core.preprocess.BuildIntDocVectors;
import ivory.core.preprocess.BuildTargetLangWeightedIntDocVectors;
import ivory.core.preprocess.BuildTermDocVectors;
import ivory.core.preprocess.BuildTranslatedTermDocVectors;
import ivory.core.preprocess.BuildWeightedIntDocVectors;
import ivory.core.preprocess.BuildWeightedTermDocVectors;
import ivory.core.preprocess.ComputeGlobalTermStatistics;
import ivory.core.tokenize.TokenizerFactory;
import java.io.IOException;
import java.util.Arrays;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import edu.umd.cloud9.collection.wikipedia.RepackWikipedia;
import edu.umd.cloud9.collection.wikipedia.WikipediaDocnoMapping;
import edu.umd.cloud9.collection.wikipedia.WikipediaDocnoMappingBuilder;
import edu.umd.hooka.Vocab;
import edu.umd.hooka.alignment.HadoopAlign;

/**
 * Driver class that preprocesses a Wikipedia collection in any language. 
 * 
 * @author ferhanture
 *
 */
public class PreprocessWikipedia extends Configured implements Tool {
    private static final Logger LOG = Logger.getLogger(PreprocessWikipedia.class);

    /*
     * Hard-coded preprocessing parameters.
     */
    private static final int MinDF = 2, MinNumTermsPerArticle = 5, TermIndexWindow = 8;
    private static final boolean IsNormalized = true;
    private static final int MONO_LINGUAL = 0, CROSS_LINGUAL_E = 1, CROSS_LINGUAL_F = 2;
    private String indexRootPath, rawCollection, seqCollection, tokenizerClass, f_stopwordList, e_stopwordList,
            e_tokenizerModel, targetLang;
    private String collectionLang = null, tokenizerModel = null, collectionVocab = null, targetIndexPath = null,
            fVocab_f2e = null, eVocab_f2e = null, fVocab_e2f, eVocab_e2f = null, ttable_f2e = null,
            ttable_e2f = null;
    private int mode;
    private Options options;

    private void printUsage() {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getCanonicalName(), options);
    }

    /**
     * Runs this tool.
     */
    public int run(String[] args) throws Exception {
        if (parseArgs(args) < 0) {
            printUsage();
            return -1;
        }
        Configuration conf = getConf();

        conf.set(Constants.Language, collectionLang);
        conf.setBoolean(Constants.Stemming, true); // default behavior of tokenizer is currently to stem, but we shouldn't rely on that

        if (tokenizerModel != null) {
            conf.set(Constants.TokenizerData, tokenizerModel);
        }

        // user can either provide a tokenizer class as a program argument, 
        // or let the factory find an appropriate class based on language code
        try {
            Class.forName(tokenizerClass);
        } catch (Exception e) {
            tokenizerClass = TokenizerFactory.getTokenizerClass(collectionLang, tokenizerModel).getCanonicalName();
        }

        if (collectionVocab != null) {
            conf.set(Constants.CollectionVocab, collectionVocab); // vocabulary to read collection from
        }
        if (e_stopwordList != null) {
            conf.set(Constants.StopwordList, e_stopwordList);
            conf.set(Constants.StemmedStopwordList, e_stopwordList + ".stemmed");
        }
        // CROSS-LINGUAL CASE
        if (mode == CROSS_LINGUAL_E) { // English side
            conf.set("Ivory.FinalVocab", collectionVocab); // vocabulary to map terms to integers in BuildTargetLang...
            conf.set(Constants.StopwordList, e_stopwordList);
            conf.set(Constants.StemmedStopwordList, e_stopwordList + ".stemmed");
        }

        if (mode == CROSS_LINGUAL_F) { // non-English side, needs to be translated
            conf.set(Constants.TargetIndexPath, targetIndexPath);
            conf.set("Ivory.F_Vocab_F2E", fVocab_f2e);
            conf.set("Ivory.E_Vocab_F2E", eVocab_f2e);
            conf.set("Ivory.TTable_F2E", ttable_f2e);
            conf.set("Ivory.E_Vocab_E2F", eVocab_e2f);
            conf.set("Ivory.F_Vocab_E2F", fVocab_e2f);
            conf.set("Ivory.TTable_E2F", ttable_e2f);
            conf.set(Constants.CollectionVocab, fVocab_f2e); // vocabulary to read collection from
            conf.set("Ivory.FinalVocab", eVocab_f2e); // vocabulary to map terms to integers in BuildTargetLang...
            if (f_stopwordList != null) {
                conf.set(Constants.StopwordList, f_stopwordList);
                conf.set(Constants.StemmedStopwordList, f_stopwordList + ".stemmed");
            }
            if (e_stopwordList != null) {
                conf.set(Constants.TargetStopwordList, e_stopwordList);
                conf.set(Constants.TargetStemmedStopwordList, e_stopwordList + ".stemmed");
            }
            if (e_tokenizerModel != null) {
                conf.set(Constants.TargetTokenizer, e_tokenizerModel);
            }
            conf.set(Constants.TargetLanguage, targetLang);
        }

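        // Number of map and reduce tasks used by the preprocessing jobs below (hard-coded).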
        int numMappers = 100;
        int numReducers = 100;

        // Print out options
        LOG.info("Tool name: WikipediaDriver");
        LOG.info(" - Index path: " + indexRootPath);
        LOG.info(" - Raw collection path: " + rawCollection);
        LOG.info(" - Compressed collection path: " + seqCollection);
        LOG.info(" - Collection language: " + collectionLang);
        LOG.info(" - Tokenizer class: " + tokenizerClass);
        LOG.info(" - Tokenizer model: " + tokenizerModel);
        LOG.info(" - Minimum # terms per article : " + MinNumTermsPerArticle);
        LOG.info(" - Stopwords file: " + e_stopwordList);

        if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) {
            LOG.info("Cross-lingual collection : Preprocessing " + collectionLang + " side.");
            LOG.info(" - Collection vocab file: " + conf.get(Constants.CollectionVocab));
            LOG.info(" - Tokenizer model: " + tokenizerModel);

            if (mode == CROSS_LINGUAL_F) {
                LOG.info(" - TTable file " + collectionLang + " --> " + targetLang + " : " + ttable_f2e);
                LOG.info(" - Source vocab file: " + fVocab_f2e);
                LOG.info(" - Target vocab file: " + eVocab_f2e);
                LOG.info(" - TTable file " + targetLang + " --> " + collectionLang + " : " + ttable_e2f);
                LOG.info(" - Source vocab file: " + eVocab_e2f);
                LOG.info(" - Target vocab file: " + fVocab_e2f);
                LOG.info(" - Source stopwords file: " + f_stopwordList);
                LOG.info(" - Target stopwords file: " + e_stopwordList);
                LOG.info(" - Target stemmed stopwords file: " + conf.get(Constants.TargetStemmedStopwordList));
                LOG.info(" - Target tokenizer path: " + e_tokenizerModel);
            }
        }

        FileSystem fs = FileSystem.get(conf);

        Path p = new Path(indexRootPath);
        if (!fs.exists(p)) {
            LOG.info("Index path doesn't exist, creating...");
            fs.mkdirs(p);
        }
        RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

        // Build docno mapping from raw collection
        Path mappingFile = env.getDocnoMappingData();
        if (!fs.exists(mappingFile)) {
            LOG.info(mappingFile + " doesn't exist, creating...");
            String[] arr = new String[] { "-input=" + rawCollection, "-output_file=" + mappingFile.toString(),
                    "-wiki_language=" + collectionLang };
            LOG.info("Running WikipediaDocnoMappingBuilder with args " + Arrays.toString(arr));

            WikipediaDocnoMappingBuilder tool = new WikipediaDocnoMappingBuilder();
            tool.setConf(conf);
            tool.run(arr);

            fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
        } else {
            LOG.info("Docno mapping already exists at: " + mappingFile);
        }

        // Repack Wikipedia into sequential compressed block
        if (!fs.exists(new Path(seqCollection + "/part-00000"))) {
            LOG.info(seqCollection + " doesn't exist, creating...");
            String[] arr = new String[] { "-input=" + rawCollection, "-output=" + seqCollection,
                    "-mapping_file=" + mappingFile.toString(), "-compression_type=block",
                    "-wiki_language=" + collectionLang };
            LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

            RepackWikipedia tool = new RepackWikipedia();
            tool.setConf(conf);
            tool.run(arr);
        } else {
            LOG.info("Repacked collection already exists at: " + seqCollection);
        }

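        // Common job configuration shared by the preprocessing steps that follow.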
        conf.set(Constants.CollectionName, "Wikipedia-" + collectionLang);
        conf.setInt(Constants.NumMapTasks, numMappers);
        conf.setInt(Constants.NumReduceTasks, numReducers);
        conf.set(Constants.CollectionPath, seqCollection);
        conf.set(Constants.IndexPath, indexRootPath);
        conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
        conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName());
        conf.set(Constants.Tokenizer, tokenizerClass); //"ivory.tokenize.OpenNLPTokenizer"
        conf.setInt(Constants.MinDf, MinDF);
        conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
        conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
        conf.setInt(Constants.TermIndexWindow, TermIndexWindow);

        // Builds term doc vectors from document collection, and filters the terms that are not included
        // in Ivory.SrcVocab.
        long startTime = System.currentTimeMillis();
        long preprocessStartTime = System.currentTimeMillis();
        LOG.info("Building term doc vectors...");
        int exitCode = new BuildTermDocVectors(conf).run();
        if (exitCode >= 0) {
            LOG.info("Job BuildTermDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0
                    + " seconds");
        } else {
            LOG.info("Error: BuildTermDocVectors. Terminating...");
            return -1;
        }

        // Get CF and DF counts.
        startTime = System.currentTimeMillis();
        LOG.info("Counting terms...");
        exitCode = new ComputeGlobalTermStatistics(conf).run();
        LOG.info("TermCount = " + env.readCollectionTermCount());
        if (exitCode >= 0) {
            LOG.info("Job ComputeGlobalTermStatistics finished in "
                    + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        } else {
            LOG.info("Error: ComputeGlobalTermStatistics. Terminating...");
            return -1;
        }
        // Build a map from terms to sequentially generated integer term ids.
        startTime = System.currentTimeMillis();
        LOG.info("Building term-to-integer id mapping...");
        exitCode = new BuildDictionary(conf).run();
        if (exitCode >= 0) {
            LOG.info("Job BuildDictionary finished in " + (System.currentTimeMillis() - startTime) / 1000.0
                    + " seconds");
        } else {
            LOG.info("Error: BuildDictionary. Terminating...");
            return -1;
        }

        // Compute term weights, and output weighted term doc vectors.
        LOG.info("Building weighted term doc vectors...");
        startTime = System.currentTimeMillis();

        conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
        conf.setBoolean("Ivory.Normalize", IsNormalized);
        conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);

        if (mode == CROSS_LINGUAL_F) {
            // Translate term doc vectors into English.
            exitCode = new BuildTranslatedTermDocVectors(conf).run();
        } else {
            // Build weighted term doc vectors.
            exitCode = new BuildWeightedTermDocVectors(conf).run();
        }
        if (exitCode >= 0) {
            LOG.info("Job BuildTranslated/WeightedTermDocVectors finished in "
                    + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        } else {
            LOG.info("Error: BuildTranslated/WeightedTermDocVectors. Terminating...");
            return -1;
        }

        // normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency
        startTime = System.currentTimeMillis();
        LOG.info("Building weighted integer doc vectors...");
        conf.setBoolean("Ivory.Normalize", IsNormalized);
        if (mode == MONO_LINGUAL) {
            exitCode = new BuildIntDocVectors(conf).run();
            if (exitCode >= 0) {
                exitCode = new BuildWeightedIntDocVectors(conf).run();
            }
            if (exitCode >= 0) {
                LOG.info("Job BuildWeightedIntDocVectors finished in "
                        + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
            } else {
                LOG.info("Error: BuildWeightedIntDocVectors. Terminating...");
                return -1;
            }
        } else {
            BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool = new BuildTargetLangWeightedIntDocVectors(
                    conf);

            int finalNumDocs = weightedIntVectorsTool.run();

            LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in "
                    + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
            if (finalNumDocs > 0) {
                LOG.info("Changed doc count: " + env.readCollectionDocumentCount() + " => " + finalNumDocs);
                env.writeCollectionDocumentCount(finalNumDocs);
            } else {
                LOG.info("No document output! Terminating...");
                return -1;
            }
            // Set Property.CollectionTermCount to the size of the target vocab, since all docs are translated
            // into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures.
            Vocab engVocabH = null;
            try {
                engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
            } catch (IOException e) {
                LOG.info("Error loading final vocab: " + e.getMessage() + ". Terminating...");
                return -1;
            }
            LOG.info("Changed term count: " + env.readCollectionTermCount() + " => " + engVocabH.size());
            env.writeCollectionTermCount(engVocabH.size());
        }

        LOG.info("Preprocessing job finished in " + (System.currentTimeMillis() - preprocessStartTime) / 1000.0
                + " seconds");

        return 0;
    }

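    // Command-line option names.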
    private static final String MODE_OPTION = "mode";
    private static final String INDEX_PATH_OPTION = "index";
    private static final String TARGET_INDEX_PATH_OPTION = "targetindex";
    private static final String XML_PATH_OPTION = "xml";
    private static final String COMPRESSED_PATH_OPTION = "compressed";
    private static final String TOKENIZER_CLASS_OPTION = "tokenizerclass";
    private static final String TOKENIZER_MODEL_OPTION = "tokenizermodel";
    private static final String COLLECTION_VOCAB_OPTION = "collectionvocab";
    private static final String LANGUAGE_OPTION = "lang";
    private static final String E_LANGUAGE_OPTION = "target_lang";
    private static final String F_STOPWORD_OPTION = "f_stopword";
    private static final String E_STOPWORD_OPTION = "e_stopword";
    private static final String FVOCAB_F2E_OPTION = "f_f2e_vocab";
    private static final String EVOCAB_F2E_OPTION = "e_f2e_vocab";
    private static final String FVOCAB_E2F_OPTION = "f_e2f_vocab";
    private static final String EVOCAB_E2F_OPTION = "e_e2f_vocab";
    private static final String TTABLE_F2E_OPTION = "f2e_ttable";
    private static final String TTABLE_E2F_OPTION = "e2f_ttable";
    private static final String E_TOKENIZER_MODEL_OPTION = "e_tokenizermodel";
    private static final String LIBJARS_OPTION = "libjars";

    @SuppressWarnings("static-access")
    private int parseArgs(String[] args) {
        options = new Options();
        options.addOption(OptionBuilder.withDescription("preprocessing mode")
                .withArgName("mono|crosslingF|crosslingE").hasArg().isRequired().create(MODE_OPTION));
        options.addOption(OptionBuilder.withDescription("path to index directory").withArgName("path").hasArg()
                .isRequired().create(INDEX_PATH_OPTION));
        options.addOption(OptionBuilder.withDescription("path to target index directory (if processing f-side)")
                .withArgName("path").hasArg().create(TARGET_INDEX_PATH_OPTION));
        options.addOption(OptionBuilder.withDescription("path to XML file").withArgName("path").hasArg()
                .isRequired().create(XML_PATH_OPTION));
        options.addOption(OptionBuilder.withDescription("path to compressed collection").withArgName("path")
                .hasArg().isRequired().create(COMPRESSED_PATH_OPTION));
        options.addOption(OptionBuilder.withDescription("tokenizer class").withArgName("class").hasArg()
                .create(TOKENIZER_CLASS_OPTION));
        options.addOption(OptionBuilder.withDescription("path to tokenizer model file/directory")
                .withArgName("path").hasArg().create(TOKENIZER_MODEL_OPTION));
        options.addOption(OptionBuilder.withDescription("path to target-side tokenizer model file/directory")
                .withArgName("path").hasArg().create(E_TOKENIZER_MODEL_OPTION));
        options.addOption(OptionBuilder.withDescription("path to collection vocab file").withArgName("path")
                .hasArg().create(COLLECTION_VOCAB_OPTION));
        options.addOption(OptionBuilder.withDescription("two-letter collection language code")
                .withArgName("en|de|fr|zh|es|ar|tr").hasArg().isRequired().create(LANGUAGE_OPTION));
        options.addOption(OptionBuilder.withDescription("two-letter target language code")
                .withArgName("en|de|fr|zh|es|ar|tr").hasArg().create(E_LANGUAGE_OPTION));
        options.addOption(OptionBuilder.withDescription("path to f-side stopwords list").withArgName("path")
                .hasArg().create(F_STOPWORD_OPTION));
        options.addOption(OptionBuilder.withDescription("path to e-side stopwords list").withArgName("path")
                .hasArg().create(E_STOPWORD_OPTION));
        options.addOption(OptionBuilder.withDescription("path to f-side vocab file of f-to-e translation table")
                .withArgName("path").hasArg().create(FVOCAB_F2E_OPTION));
        options.addOption(OptionBuilder.withDescription("path to e-side vocab file of f-to-e translation table")
                .withArgName("path").hasArg().create(EVOCAB_F2E_OPTION));
        options.addOption(OptionBuilder.withDescription("path to f-side vocab file of e-to-f translation table")
                .withArgName("path").hasArg().create(FVOCAB_E2F_OPTION));
        options.addOption(OptionBuilder.withDescription("path to e-side vocab file of e-to-f translation table")
                .withArgName("path").hasArg().create(EVOCAB_E2F_OPTION));
        options.addOption(OptionBuilder.withDescription("path to f-to-e translation table").withArgName("path")
                .hasArg().create(TTABLE_F2E_OPTION));
        options.addOption(OptionBuilder.withDescription("path to e-to-f translation table").withArgName("path")
                .hasArg().create(TTABLE_E2F_OPTION));
        options.addOption(OptionBuilder.withDescription("Hadoop option to load external jars")
                .withArgName("jar packages").hasArg().create(LIBJARS_OPTION));

        CommandLine cmdline;
        CommandLineParser parser = new GnuParser();
        try {
            cmdline = parser.parse(options, args);
        } catch (ParseException exp) {
            System.err.println("Error parsing command line: " + exp.getMessage());
            return -1;
        }

        String m = cmdline.getOptionValue(MODE_OPTION);
        mode = m.equals("mono") ? MONO_LINGUAL
                : (m.equals("crosslingF") ? CROSS_LINGUAL_F : (m.equals("crosslingE")) ? CROSS_LINGUAL_E : -1);
        if (mode < 0)
            throw new RuntimeException("Incorrect mode selection!");
        if (mode == CROSS_LINGUAL_F) {
            // Check the parsed command line, not the option definitions.
            if (!cmdline.hasOption(FVOCAB_F2E_OPTION) || !cmdline.hasOption(FVOCAB_E2F_OPTION)
                    || !cmdline.hasOption(EVOCAB_F2E_OPTION) || !cmdline.hasOption(EVOCAB_E2F_OPTION)
                    || !cmdline.hasOption(TTABLE_F2E_OPTION) || !cmdline.hasOption(TTABLE_E2F_OPTION)
                    || !cmdline.hasOption(E_LANGUAGE_OPTION)) {
                System.err.println(
                        "Error, missing translation table arguments: " + FVOCAB_F2E_OPTION + "," + EVOCAB_F2E_OPTION
                                + "," + FVOCAB_E2F_OPTION + "," + EVOCAB_E2F_OPTION + "," + TTABLE_F2E_OPTION + ","
                                + TTABLE_E2F_OPTION + "," + E_LANGUAGE_OPTION);
                return -1;
            }
        }
        if (mode == CROSS_LINGUAL_E) {
            if (!cmdline.hasOption(COLLECTION_VOCAB_OPTION)) {
                System.err.println("Error, missing collection vocab argument: " + COLLECTION_VOCAB_OPTION);
                return -1;
            }
            if (cmdline.hasOption(FVOCAB_F2E_OPTION) || cmdline.hasOption(FVOCAB_E2F_OPTION)
                    || cmdline.hasOption(EVOCAB_F2E_OPTION) || cmdline.hasOption(EVOCAB_E2F_OPTION)
                    || cmdline.hasOption(TTABLE_F2E_OPTION) || cmdline.hasOption(TTABLE_E2F_OPTION)) {
                System.err.println("Warning, translation table arguments are ignored in this mode!");
            }
        }
        indexRootPath = cmdline.getOptionValue(INDEX_PATH_OPTION);
        targetIndexPath = cmdline.getOptionValue(TARGET_INDEX_PATH_OPTION);
        rawCollection = cmdline.getOptionValue(XML_PATH_OPTION);
        seqCollection = cmdline.getOptionValue(COMPRESSED_PATH_OPTION);
        tokenizerClass = cmdline.getOptionValue(TOKENIZER_CLASS_OPTION);
        tokenizerModel = cmdline.getOptionValue(TOKENIZER_MODEL_OPTION);
        collectionVocab = cmdline.getOptionValue(COLLECTION_VOCAB_OPTION);
        collectionLang = cmdline.getOptionValue(LANGUAGE_OPTION);
        f_stopwordList = cmdline.getOptionValue(F_STOPWORD_OPTION);
        e_stopwordList = cmdline.getOptionValue(E_STOPWORD_OPTION);
        fVocab_f2e = cmdline.getOptionValue(FVOCAB_F2E_OPTION);
        eVocab_f2e = cmdline.getOptionValue(EVOCAB_F2E_OPTION);
        fVocab_e2f = cmdline.getOptionValue(FVOCAB_E2F_OPTION);
        eVocab_e2f = cmdline.getOptionValue(EVOCAB_E2F_OPTION);
        ttable_f2e = cmdline.getOptionValue(TTABLE_F2E_OPTION);
        ttable_e2f = cmdline.getOptionValue(TTABLE_E2F_OPTION);
        e_tokenizerModel = cmdline.getOptionValue(E_TOKENIZER_MODEL_OPTION);
        targetLang = cmdline.getOptionValue(E_LANGUAGE_OPTION);

        return 1;
    }

    /**
     * Dispatches command-line arguments to the tool via the {@code ToolRunner}.
     */
    public static void main(String[] args) throws Exception {
        ToolRunner.run(new PreprocessWikipedia(), args);
    }
}
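
Usage

The driver is normally launched through Hadoop's ToolRunner, as in the main() method above. The sketch below shows one possible programmatic invocation for a monolingual run; the HDFS paths and the language code are placeholder values, not part of the original source, while the required option names (-mode, -index, -xml, -compressed, -lang) come from parseArgs() above.

import org.apache.hadoop.util.ToolRunner;

import ivory.app.PreprocessWikipedia;

public class PreprocessWikipediaExample {
    public static void main(String[] args) throws Exception {
        // Placeholder HDFS paths -- substitute the real locations of the Wikipedia XML dump,
        // the block-compressed SequenceFile collection, and the index directory.
        String[] toolArgs = new String[] {
                "-mode", "mono",                              // monolingual preprocessing
                "-index", "/wiki/en/index",                   // index directory (created if missing)
                "-xml", "/wiki/en/enwiki-pages-articles.xml", // raw Wikipedia XML dump
                "-compressed", "/wiki/en/compressed.block",   // repacked, block-compressed collection
                "-lang", "en"                                 // two-letter collection language code
        };
        int exitCode = ToolRunner.run(new PreprocessWikipedia(), toolArgs);
        System.exit(exitCode);
    }
}

For a cross-lingual collection, the same driver would instead be run with -mode=crosslingE or -mode=crosslingF together with the vocabulary, translation table, and target-language options listed in parseArgs().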