Java tutorial: PreprocessWikipedia, Ivory's Hadoop driver for preprocessing a Wikipedia collection
/*
 * Ivory: A Hadoop toolkit for Web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.core.driver;

import ivory.core.Constants;
import ivory.core.RetrievalEnvironment;
import ivory.core.preprocess.BuildDictionary;
import ivory.core.preprocess.BuildIntDocVectors;
import ivory.core.preprocess.BuildTargetLangWeightedIntDocVectors;
import ivory.core.preprocess.BuildTermDocVectors;
import ivory.core.preprocess.BuildTranslatedTermDocVectors;
import ivory.core.preprocess.BuildWeightedIntDocVectors;
import ivory.core.preprocess.BuildWeightedTermDocVectors;
import ivory.core.preprocess.ComputeGlobalTermStatistics;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import edu.umd.cloud9.collection.wikipedia.BuildWikipediaDocnoMapping;
import edu.umd.cloud9.collection.wikipedia.RepackWikipedia;
import edu.umd.cloud9.collection.wikipedia.WikipediaDocnoMapping;
import edu.umd.hooka.Vocab;
import edu.umd.hooka.alignment.HadoopAlign;

/**
 * Driver class that preprocesses a Wikipedia collection in any language.
 *
 * @author ferhanture
 */
public class PreprocessWikipedia extends Configured implements Tool {
  private static final Logger LOG = Logger.getLogger(PreprocessWikipedia.class);

  /*
   * DEFINED PARAMETERS HERE:
   */
  static final int MinDF = 2, MinNumTermsPerArticle = 5, TermIndexWindow = 8;
  static final boolean IsNormalized = true;
  static final int MONO_LINGUAL = 4, CROSS_LINGUAL_E = 7, CROSS_LINGUAL_F = 12;

  private static int printUsage() {
    System.out.println("\nThis program can be run in three different \"modes\":\n====================="
        + "\nInput: English Wikipedia collection\nOutput: English weighted document vectors"
        + "\nusage: [index-path] [raw-path] [compressed-path] [tokenizer-class]"
        + "\n\nInput: English side of cross-lingual Wikipedia collection"
        + "\nOutput: English weighted document vectors (comparable with the document vectors generated from non-English side)"
        + "\nusage: [index-path] [raw-path] [compressed-path] [tokenizer-class] [collection-lang] [tokenizer-model] [collection-vocab]"
        + "\n\nInput: Non-English side of cross-lingual Wikipedia collection"
        + "\nOutput: English weighted document vectors (comparable with the document vectors generated from English side)"
        + "\nusage: [index-path] [raw-path] [compressed-path] [tokenizer-class] [collection-lang] [tokenizer-model] [src-vocab_f] [trg-vocab_e] [prob-table_f-->e] [src-vocab_e] [trg-vocab_f] [prob-table_e-->f]");
    return -1;
  }

  /**
   * Runs this tool.
   */
  public int run(String[] args) throws Exception {
    // The mode is determined by the number of command-line arguments.
    int mode = args.length;
    if (mode != MONO_LINGUAL && mode != CROSS_LINGUAL_E && mode != CROSS_LINGUAL_F) {
      printUsage();
      return -1;
    }

    String indexRootPath = args[0];
    String rawCollection = args[1];   // e.g., "/shared/Wikipedia/raw/dewiki-20100117-pages-articles.xml"
    String seqCollection = args[2];   // e.g., "/umd-lin/fture/pwsim/de-wikipedia/compressed.block/de-20100117"
    String tokenizerClass = args[3];

    Configuration conf = new Configuration();

    String collectionLang = null, tokenizerModel = null, collectionVocab = null;
    String fVocab_f2e = null, eVocab_f2e = null, fVocab_e2f = null, eVocab_e2f = null,
        ttable_f2e = null, ttable_e2f = null;

    if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) { // CROSS-LINGUAL CASE
      collectionLang = args[4];
      tokenizerModel = args[5];
      collectionVocab = args[6];
      conf.set("Ivory.Lang", collectionLang);
      conf.set("Ivory.TokenizerModel", tokenizerModel);
      conf.set("Ivory.CollectionVocab", collectionVocab);
      conf.set("Ivory.FinalVocab", collectionVocab);

      if (mode == CROSS_LINGUAL_F) { // non-English side, needs to be translated
        fVocab_f2e = args[6];   // same as collection vocab
        eVocab_f2e = args[7];
        ttable_f2e = args[8];
        eVocab_e2f = args[9];
        fVocab_e2f = args[10];
        ttable_e2f = args[11];
        conf.set("Ivory.F_Vocab_F2E", fVocab_f2e);
        conf.set("Ivory.E_Vocab_F2E", eVocab_f2e);
        conf.set("Ivory.TTable_F2E", ttable_f2e);
        conf.set("Ivory.E_Vocab_E2F", eVocab_e2f);
        conf.set("Ivory.F_Vocab_E2F", fVocab_e2f);
        conf.set("Ivory.TTable_E2F", ttable_e2f);
        conf.set("Ivory.FinalVocab", eVocab_e2f);
      }
    }

    int numMappers = 100;
    int numReducers = 100;

    LOG.info("Tool name: WikipediaDriver");
    LOG.info(" - Index path: " + indexRootPath);
    LOG.info(" - Raw collection path: " + rawCollection);
    LOG.info(" - Compressed collection path: " + seqCollection);
    LOG.info(" - Tokenizer class: " + tokenizerClass);
    LOG.info(" - Minimum # terms per article : " + MinNumTermsPerArticle);

    if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) {
      LOG.info("Cross-lingual collection : Preprocessing " + collectionLang + " side.");
      LOG.info(" - Collection vocab file: " + collectionVocab);
      LOG.info(" - Tokenizer model: " + tokenizerModel);

      if (mode == CROSS_LINGUAL_F) {
        LOG.info(" - TTable file " + collectionLang + " --> English : " + ttable_f2e);
        LOG.info(" - Source vocab file: " + fVocab_f2e);
        LOG.info(" - Target vocab file: " + eVocab_f2e);
        LOG.info(" - TTable file " + "English --> " + collectionLang + " : " + ttable_e2f);
        LOG.info(" - Source vocab file: " + eVocab_e2f);
        LOG.info(" - Target vocab file: " + fVocab_e2f);
      }
    }

    LOG.info("Launching with " + numMappers + " mappers, " + numReducers + " reducers...");

    FileSystem fs = FileSystem.get(conf);

    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
      LOG.info("Index path doesn't exist, creating...");
      fs.mkdirs(p);
    }
    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Build docno mapping from raw collection
    Path mappingFile = env.getDocnoMappingData();
    if (!fs.exists(mappingFile)) {
      LOG.info(mappingFile + " doesn't exist, creating...");
      String[] arr = new String[] {
          "-input=" + rawCollection,
          "-output_path=" + indexRootPath + "/wiki-docid-tmp",
          "-output_file=" + mappingFile.toString(),
          "-keep_all=false" };
      BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
      tool.setConf(conf);
      tool.run(arr);

      fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    } else {
      LOG.info(p + " exists");
    }

    // Repack Wikipedia into sequential compressed block
    p = new Path(seqCollection);
    if (!fs.exists(p)) {
      LOG.info(seqCollection + " doesn't exist, creating...");
      String[] arr = new String[] {
          "-input=" + rawCollection,
          "-output=" + seqCollection,
          "-mapping_file=" + mappingFile.toString(),
          "-compression_type=block",
          "-wiki_language=" + collectionLang };
      RepackWikipedia tool = new RepackWikipedia();
      tool.setConf(conf);
      tool.run(arr);
    }

    conf.set(Constants.CollectionName, "Wikipedia-" + collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName());
    conf.set(Constants.Tokenizer, tokenizerClass); // "ivory.tokenize.OpenNLPTokenizer"
    conf.setInt(Constants.MinDf, MinDF);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.TermIndexWindow, TermIndexWindow);

    // Builds term doc vectors from document collection, and filters the terms
    // that are not included in Ivory.SrcVocab.
    long startTime = System.currentTimeMillis();
    long preprocessStartTime = System.currentTimeMillis();
    LOG.info("Building term doc vectors...");
    new BuildTermDocVectors(conf).run();
    LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Get CF and DF counts
    startTime = System.currentTimeMillis();
    LOG.info("Counting terms...");
    new ComputeGlobalTermStatistics(conf).run();
    LOG.info("TermCount = " + env.readCollectionTermCount() + "\nJob finished in "
        + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Build a map from terms to sequentially generated integer term ids
    startTime = System.currentTimeMillis();
    LOG.info("Building term-to-integer id mapping...");
    new BuildDictionary(conf).run();
    LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Compute term weights, and output weighted term doc vectors
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted term doc vectors...");
    conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
    if (mode == CROSS_LINGUAL_F) {
      // translate term doc vectors into English
      conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);
      conf.setBoolean("Ivory.Normalize", true);
      new BuildTranslatedTermDocVectors(conf).run();
    } else {
      // get weighted term doc vectors
      conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);
      conf.setBoolean("Ivory.Normalize", true);
      new BuildWeightedTermDocVectors(conf).run();
    }
    LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted integer doc vectors...");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    if (mode == MONO_LINGUAL) {
      new BuildIntDocVectors(conf).run();
      new BuildWeightedIntDocVectors(conf).run();
      LOG.info("Job BuildWeightedIntDocVectors finished in "
          + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
      BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool =
          new BuildTargetLangWeightedIntDocVectors(conf);
      int finalNumDocs = weightedIntVectorsTool.run();
      LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in "
          + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
      if (finalNumDocs > 0) {
        LOG.info("Changed doc count from " + env.readCollectionDocumentCount() + " to " + finalNumDocs);
        env.writeCollectionDocumentCount(finalNumDocs);
      }

      // Set the collection term count to the size of the target vocab, since all docs are
      // translated into that vocab. This property is read by WriteRandomVectors via
      // RunComputeSignatures.
      Vocab engVocabH = null;
      try {
        engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
      } catch (IOException e) {
        e.printStackTrace();
      }
      LOG.info("Changed term count from " + env.readCollectionTermCount() + " to " + engVocabH.size());
      env.writeCollectionTermCount(engVocabH.size());
    }

    LOG.info("Preprocessing job finished in "
        + (System.currentTimeMillis() - preprocessStartTime) / 1000.0 + " seconds");

    return 0;
  }

  /**
   * Dispatches command-line arguments to the tool via the <code>ToolRunner</code>.
   */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new PreprocessWikipedia(), args);
    System.exit(res);
  }
}
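To give a sense of how the driver is wired up, here is a minimal sketch of launching the four-argument mono-lingual mode programmatically. The class name PreprocessWikipediaExample and all HDFS paths are placeholders invented for illustration; the tokenizer class is the OpenNLP tokenizer hinted at in the comment next to Constants.Tokenizer above. In practice the same four arguments would normally be passed on the command line to a hadoop jar invocation with the Ivory and Cloud9 jars on the classpath.

// Hypothetical launcher for the mono-lingual mode; not part of Ivory itself.
// All paths below are placeholders -- adjust them to your HDFS layout.
import ivory.core.driver.PreprocessWikipedia;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class PreprocessWikipediaExample {
  public static void main(String[] args) throws Exception {
    String[] driverArgs = new String[] {
        "/index/enwiki",                       // [index-path]      -- placeholder
        "/data/enwiki-pages-articles.xml",     // [raw-path]        -- placeholder
        "/data/enwiki-compressed-block",       // [compressed-path] -- placeholder
        "ivory.tokenize.OpenNLPTokenizer"      // [tokenizer-class], as suggested in the driver source
    };
    // ToolRunner strips generic Hadoop options and hands the remaining
    // arguments to PreprocessWikipedia.run().
    int exitCode = ToolRunner.run(new Configuration(), new PreprocessWikipedia(), driverArgs);
    System.exit(exitCode);
  }
}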