List of usage examples for org.apache.hadoop.conf.Configuration setInt
public void setInt(String name, int value)
Set the value of the name property to an int.
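Before the full driver examples below, here is a minimal sketch of the basic round trip: setInt stores the value under the given property name, and getInt reads it back with a default. The property key "my.job.window" and the values used are made-up for illustration and do not come from any of the projects listed here.

import org.apache.hadoop.conf.Configuration;

public class SetIntExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();

    // Store an int under a hypothetical property key; Configuration keeps it internally as a String.
    conf.setInt("my.job.window", 8);

    // Read it back with getInt(), supplying a default in case the key is unset.
    int window = conf.getInt("my.job.window", 1);
    System.out.println("my.job.window = " + window); // prints 8
  }
}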
From source file:ivory.core.driver.PreprocessAquaint2.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    printUsage();
    return -1;
  }
  String collection = args[0];
  String indexRootPath = args[1];

  LOG.info("Tool name: " + PreprocessAquaint2.class.getCanonicalName());
  LOG.info(" - Collection path: " + collection);
  LOG.info(" - Index path: " + indexRootPath);

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  // Create the index directory if it doesn't already exist.
  Path p = new Path(indexRootPath);
  if (!fs.exists(p)) {
    fs.mkdirs(p);
  } else {
    LOG.info("Index directory already exists, skipping!");
  }

  RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

  conf.set(XMLInputFormat.START_TAG_KEY, Aquaint2Document.getXmlStartTag(fs, collection));
  conf.set(XMLInputFormat.END_TAG_KEY, Aquaint2Document.getXmlEndTag());

  // Look for the docno mapping, which maps from docid (String) to docno
  // (sequentially-number integer). If it doesn't exist create it.
  Path mappingFile = env.getDocnoMappingData();
  Path mappingDir = env.getDocnoMappingDirectory();

  if (!fs.exists(mappingFile)) {
    String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString() };
    NumberAquaint2Documents2 tool = new NumberAquaint2Documents2();
    tool.setConf(conf);
    tool.run(arr);
    fs.delete(mappingDir, true);
  } else {
    LOG.info("DocnoMapping already exists, skipping!");
  }

  Aquaint2DocnoMapping dm = new Aquaint2DocnoMapping();
  dm.loadMapping(mappingFile, fs);

  int docno;
  int expectedDocno;
  String expectedDocid;
  String docid;

  boolean testAquaint2 = false;
  if (testAquaint2) {
    docno = 500;
    expectedDocid = "AFP_ENG_20041001.0500";
    docid = dm.getDocid(docno);
    System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
        + (expectedDocid.equals(docid)));
    docno = 600;
    expectedDocid = "AFP_ENG_20041001.0600";
    docid = dm.getDocid(docno);
    System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
        + (expectedDocid.equals(docid)));
    docno = 700;
    expectedDocid = "AFP_ENG_20041001.0701";
    docid = dm.getDocid(docno);
    System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
        + (expectedDocid.equals(docid)));
    docno = 800;
    expectedDocid = "AFP_ENG_20041003.0019";
    docid = dm.getDocid(docno);
    System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
        + (expectedDocid.equals(docid)));

    expectedDocno = 500;
    docid = "AFP_ENG_20041001.0500";
    docno = dm.getDocno(docid);
    System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
        + (expectedDocno == docno));
    expectedDocno = 600;
    docid = "AFP_ENG_20041001.0600";
    docno = dm.getDocno(docid);
    System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
        + (expectedDocno == docno));
    expectedDocno = 700;
    docid = "AFP_ENG_20041001.0701";
    docno = dm.getDocno(docid);
    System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
        + (expectedDocno == docno));
    expectedDocno = 800;
    docid = "AFP_ENG_20041003.0019";
    docno = dm.getDocno(docid);
    System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
        + (expectedDocno == docno));

    System.out.println("finished testing, now exiting");
    return 0;
  }

  boolean testGigaword = false;
  if (testGigaword) {
    for (int i = 1; i < 301; i++) {
      docno = i * 1000;
      docid = dm.getDocid(docno);
      System.out.println("dm.getDocid(" + docno + "): " + docid);
    }
    System.out.println("finished testing, now exiting");
    return 0;
  }

  conf.set(Constants.CollectionName, "Aquaint2");
  conf.set(Constants.CollectionPath, collection);
  conf.set(Constants.IndexPath, indexRootPath);
  conf.set(Constants.InputFormat, Aquaint2DocumentInputFormat2.class.getCanonicalName());
  conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
  conf.set(Constants.DocnoMappingClass, Aquaint2DocnoMapping.class.getCanonicalName());
  conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

  conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
  conf.setInt(Constants.MinDf, 2); // toss away singleton terms
  conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
  conf.setInt(Constants.TermIndexWindow, 8);

  new BuildTermDocVectors(conf).run();
  new ComputeGlobalTermStatistics(conf).run();
  new BuildDictionary(conf).run();
  new BuildIntDocVectors(conf).run();
  new BuildIntDocVectorsForwardIndex(conf).run();
  //new BuildTermDocVectorsForwardIndex(conf).run();

  new BuildIPInvertedIndexDocSorted(conf).run();

  conf.set(Constants.ScoringModel, "ivory.pwsim.score.TfIdf");
  conf.setBoolean(Constants.Normalize, true);
  new BuildIntPostingsForwardIndex(conf).run();

  boolean buildingVectors = true;
  //boolean buildingVectors = false;
  if (buildingVectors) {
    //new BuildWeightedIntDocVectors(conf).run();
    //conf.setBoolean(Constants.BuildWeighted, true);
    //new BuildIntDocVectorsForwardIndex(conf).run();

    String findexDirPath = indexRootPath + "/findex";
    String findexFilePath = indexRootPath + "/findex.dat";
    if (fs.exists(new Path(findexDirPath))) {
      LOG.info("ForwardIndex already exists: Skipping!");
    } else {
      new BuildAquaint2ForwardIndex().runTool(conf, collection, findexDirPath, findexFilePath,
          mappingFile.toString());
    }
  }

  return 0;
}
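In the driver above, setInt only writes the values into the job Configuration; the downstream Ivory jobs (BuildTermDocVectors and the rest) recover them with matching getInt calls, as the later BuildIntPostingsForwardIndex and BuildIPInvertedIndexDocSorted examples show for other keys. A hedged sketch of that read side follows; the surrounding job class and the default values are illustrative assumptions, only the Constants keys come from the example above.

// Inside a downstream job that received the Configuration built by the driver above.
// The defaults passed to getInt() here are illustrative assumptions.
int docnoOffset = conf.getInt(Constants.DocnoOffset, 0);
int minDf = conf.getInt(Constants.MinDf, 1);
int maxDf = conf.getInt(Constants.MaxDf, Integer.MAX_VALUE);
int window = conf.getInt(Constants.TermIndexWindow, 8);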
From source file:ivory.core.driver.PreprocessClueWebEnglish.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
  if (args.length != 3) {
    printUsage();
    return -1;
  }
  String collection = args[0];
  String indexPath = args[1];
  int segment = Integer.parseInt(args[2]);

  LOG.info("Tool name: " + PreprocessClueWebEnglish.class.getCanonicalName());
  LOG.info(" - Collection path: " + collection);
  LOG.info(" - Index path: " + indexPath);
  LOG.info(" - segement number: " + segment);

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

  Path p = new Path(indexPath);
  if (!fs.exists(p)) {
    LOG.error("Error: index path doesn't exist!");
    return 0;
  }

  if (!fs.exists(env.getDocnoMappingData())) {
    LOG.error("Error: docno mapping data doesn't exist!");
    return 0;
  }

  conf.set(Constants.CollectionName, "ClueWeb:English:Segment" + segment);
  conf.set(Constants.CollectionPath, collection);
  conf.set(Constants.IndexPath, indexPath);
  conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
  conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
  conf.set(Constants.DocnoMappingClass, ClueWarcDocnoMapping.class.getCanonicalName());
  conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

  conf.setInt(Constants.DocnoOffset, DocnoOffsets[segment]);
  conf.setInt(Constants.MinDf, 10);
  conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
  conf.setInt(Constants.TermIndexWindow, 8);

  new BuildTermDocVectors(conf).run();
  new ComputeGlobalTermStatistics(conf).run();
  new BuildDictionary(conf).run();
  new BuildIntDocVectors(conf).run();

  new BuildIntDocVectorsForwardIndex(conf).run();
  new BuildTermDocVectorsForwardIndex(conf).run();

  return 0;
}
From source file:ivory.core.driver.PreprocessClueWebEnglishMultipleSegments.java
License:Apache License
public int run(String[] args) throws Exception {
  if (args.length < 4) {
    System.out.println("usage: [index-path] [num-of-mappers] [num-of-reducers] [input-path]...");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  String indexPath = args[0];
  int numMappers = Integer.parseInt(args[1]);
  int numReducers = Integer.parseInt(args[2]);

  StringBuilder sb = new StringBuilder(args[3]);
  if (args.length > 4) {
    for (int i = 4; i < args.length; i++) {
      sb.append(",");
      sb.append(args[i]);
    }
  }
  String collection = sb.toString();

  LOG.info("Tool name: PreprocessClueWebEnglishMultipleSegments");
  LOG.info(" - Index path: " + indexPath);
  LOG.info(" - Collections: " + collection);

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);

  RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

  Path p = new Path(indexPath);
  if (!fs.exists(p)) {
    LOG.error("Error: index path doesn't exist!");
    return 0;
  }

  if (!fs.exists(env.getDocnoMappingData())) {
    LOG.error("Error: docno mapping data doesn't exist!");
    return 0;
  }

  conf.setInt(Constants.NumMapTasks, numMappers);
  conf.setInt(Constants.NumReduceTasks, numReducers);

  conf.set(Constants.CollectionName, "ClueWeb:English");
  conf.set(Constants.CollectionPath, collection);
  conf.set(Constants.IndexPath, indexPath);
  conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
  conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
  conf.set(Constants.DocnoMappingClass, ClueWarcDocnoMapping.class.getCanonicalName());
  conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

  conf.setInt(Constants.DocnoOffset, 0);
  conf.setInt(Constants.MinDf, 50);
  conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
  conf.setInt(Constants.TermIndexWindow, 8);

  new BuildTermDocVectors(conf).run();
  new ComputeGlobalTermStatistics(conf).run();
  new BuildDictionary(conf).run();
  new BuildIntDocVectors(conf).run();

  new BuildIntDocVectorsForwardIndex(conf).run();
  new BuildTermDocVectorsForwardIndex(conf).run();

  return 0;
}
From source file:ivory.core.driver.PreprocessGov2.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    printUsage();
    return -1;
  }
  String collection = args[0];
  String indexRootPath = args[1];

  LOG.info("Tool name: " + PreprocessGov2.class.getCanonicalName());
  LOG.info(" - Collection path: " + collection);
  LOG.info(" - Index path: " + indexRootPath);

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  // Create the index directory if it doesn't already exist.
  Path p = new Path(indexRootPath);
  if (!fs.exists(p)) {
    LOG.info("index directory doesn't exist, creating...");
    fs.mkdirs(p);
  }

  RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

  // Look for the docno mapping, which maps from docid (String) to docno
  // (sequentially-number integer). If it doesn't exist create it.
  Path mappingFile = env.getDocnoMappingData();
  Path mappingDir = env.getDocnoMappingDirectory();

  if (!fs.exists(mappingFile)) {
    LOG.info("docno-mapping.dat doesn't exist, creating...");
    String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString(), "100" };
    NumberTrecWebDocuments tool = new NumberTrecWebDocuments();
    tool.setConf(conf);
    tool.run(arr);

    fs.delete(mappingDir, true);
  }

  conf.set(Constants.CollectionName, "Gov2");
  conf.set(Constants.CollectionPath, collection);
  conf.set(Constants.IndexPath, indexRootPath);
  conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
  conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
  conf.set(Constants.DocnoMappingClass, Gov2DocnoMapping.class.getCanonicalName());
  conf.set(Constants.DocnoMappingFile, mappingFile.toString());

  conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
  conf.setInt(Constants.MinDf, 10);
  conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
  conf.setInt(Constants.TermIndexWindow, 8);

  new BuildTermDocVectors(conf).run();
  new ComputeGlobalTermStatistics(conf).run();
  new BuildDictionary(conf).run();
  new BuildIntDocVectors(conf).run();

  new BuildIntDocVectorsForwardIndex(conf).run();
  new BuildTermDocVectorsForwardIndex(conf).run();

  return 0;
}
From source file:ivory.core.driver.PreprocessMedline.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    printUsage();
    return -1;
  }
  String collection = args[0];
  String indexPath = args[1];

  LOG.info("Tool name: ProcessMedline");
  LOG.info(" - Collection path: " + collection);
  LOG.info(" - Index path: " + indexPath);

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  // Create the index directory if it doesn't already exist.
  Path p = new Path(indexPath);
  if (!fs.exists(p)) {
    LOG.info("index path doesn't exist, creating...");
    fs.mkdirs(p);
  }

  RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

  // Look for the docno mapping, which maps from docid (String) to docno (sequentially-number
  // integer). If it doesn't exist create it.
  Path mappingFile = env.getDocnoMappingData();
  Path mappingDir = env.getDocnoMappingDirectory();

  if (!fs.exists(mappingFile)) {
    LOG.info(mappingFile + " doesn't exist, creating...");
    String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString() };
    NumberMedlineCitations2 tool = new NumberMedlineCitations2();
    tool.setConf(conf);
    tool.run(arr);

    fs.delete(mappingDir, true);
  }

  conf.set(Constants.CollectionName, "Medline");
  conf.set(Constants.CollectionPath, collection);
  conf.set(Constants.IndexPath, indexPath);
  conf.set(Constants.InputFormat, MedlineCitationInputFormat2.class.getCanonicalName());
  conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
  conf.set(Constants.DocnoMappingClass, MedlineDocnoMapping.class.getCanonicalName());
  conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

  conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
  conf.setInt(Constants.MinDf, 2); // toss away singleton terms
  conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);

  new BuildTermDocVectors(conf).run();
  new ComputeGlobalTermStatistics(conf).run();
  new BuildDictionary(conf).run();
  new BuildIntDocVectors(conf).run();

  new BuildIntDocVectorsForwardIndex(conf).run();
  new BuildTermDocVectorsForwardIndex(conf).run();

  return 0;
}
From source file:ivory.core.driver.PreprocessTREC.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    printUsage();
    return -1;
  }
  String collection = args[0];
  String indexRootPath = args[1];

  LOG.info("Tool name: " + PreprocessTREC.class.getCanonicalName());
  LOG.info(" - Collection path: " + collection);
  LOG.info(" - Index path: " + indexRootPath);

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  // Create the index directory if it doesn't already exist.
  Path p = new Path(indexRootPath);
  if (!fs.exists(p)) {
    LOG.info("index directory doesn't exist, creating...");
    fs.mkdirs(p);
  }

  RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

  // Look for the docno mapping, which maps from docid (String) to docno (sequentially-number
  // integer). If it doesn't exist create it.
  Path mappingFile = env.getDocnoMappingData();
  Path mappingDir = env.getDocnoMappingDirectory();

  if (!fs.exists(mappingFile)) {
    LOG.info("docno-mapping.dat doesn't exist, creating...");
    String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString() };
    NumberTrecDocuments2 tool = new NumberTrecDocuments2();
    tool.setConf(conf);
    tool.run(arr);

    fs.delete(mappingDir, true);
  }

  conf.set(Constants.CollectionName, "TREC_vol45");
  conf.set(Constants.CollectionPath, collection);
  conf.set(Constants.IndexPath, indexRootPath);
  conf.set(Constants.InputFormat, TrecDocumentInputFormat2.class.getCanonicalName());
  conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
  conf.set(Constants.DocnoMappingClass, TrecDocnoMapping.class.getCanonicalName());
  conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

  conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
  conf.setInt(Constants.MinDf, 2); // toss away singleton terms
  conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
  conf.setInt(Constants.TermIndexWindow, 8);

  new BuildTermDocVectors(conf).run();
  new ComputeGlobalTermStatistics(conf).run();
  new BuildDictionary(conf).run();
  new BuildIntDocVectors(conf).run();

  new BuildIntDocVectorsForwardIndex(conf).run();
  new BuildTermDocVectorsForwardIndex(conf).run();

  return 0;
}
From source file:ivory.core.driver.PreprocessWikipedia.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
  int mode = args.length;
  if (mode != MONO_LINGUAL && mode != CROSS_LINGUAL_E && mode != CROSS_LINGUAL_F) {
    printUsage();
    return -1;
  }

  String indexRootPath = args[0];
  String rawCollection = args[1]; // "/shared/Wikipedia/raw/dewiki-20100117-pages-articles.xml";
  String seqCollection = args[2]; // "/umd-lin/fture/pwsim/de-wikipedia/compressed.block/de-20100117";
  String tokenizerClass = args[3];

  Configuration conf = new Configuration();

  String collectionLang = null, tokenizerModel = null, collectionVocab = null;
  String fVocab_f2e = null, eVocab_f2e = null, fVocab_e2f, eVocab_e2f = null, ttable_f2e = null,
      ttable_e2f = null;
  if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) { // CROSS-LINGUAL CASE
    collectionLang = args[4];
    tokenizerModel = args[5];
    collectionVocab = args[6];
    conf.set("Ivory.Lang", collectionLang);
    conf.set("Ivory.TokenizerModel", tokenizerModel);
    conf.set("Ivory.CollectionVocab", collectionVocab);
    conf.set("Ivory.FinalVocab", collectionVocab);

    if (mode == CROSS_LINGUAL_F) { // non-English side, needs to be translated
      fVocab_f2e = args[6]; // same as collection vocab
      eVocab_f2e = args[7];
      ttable_f2e = args[8];
      eVocab_e2f = args[9];
      fVocab_e2f = args[10];
      ttable_e2f = args[11];

      conf.set("Ivory.F_Vocab_F2E", fVocab_f2e);
      conf.set("Ivory.E_Vocab_F2E", eVocab_f2e);
      conf.set("Ivory.TTable_F2E", ttable_f2e);

      conf.set("Ivory.E_Vocab_E2F", eVocab_e2f);
      conf.set("Ivory.F_Vocab_E2F", fVocab_e2f);
      conf.set("Ivory.TTable_E2F", ttable_e2f);

      conf.set("Ivory.FinalVocab", eVocab_e2f);
    }
  }

  int numMappers = 100;
  int numReducers = 100;

  LOG.info("Tool name: WikipediaDriver");
  LOG.info(" - Index path: " + indexRootPath);
  LOG.info(" - Raw collection path: " + rawCollection);
  LOG.info(" - Compressed collection path: " + seqCollection);
  LOG.info(" - Tokenizer class: " + tokenizerClass);
  LOG.info(" - Minimum # terms per article : " + MinNumTermsPerArticle);

  if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) {
    LOG.info("Cross-lingual collection : Preprocessing " + collectionLang + " side.");
    LOG.info(" - Collection vocab file: " + collectionVocab);
    LOG.info(" - Tokenizer model: " + tokenizerModel);

    if (mode == CROSS_LINGUAL_F) {
      LOG.info(" - TTable file " + collectionLang + " --> English : " + ttable_f2e);
      LOG.info(" - Source vocab file: " + fVocab_f2e);
      LOG.info(" - Target vocab file: " + eVocab_f2e);
      LOG.info(" - TTable file " + "English --> " + collectionLang + " : " + ttable_e2f);
      LOG.info(" - Source vocab file: " + fVocab_f2e);
      LOG.info(" - Target vocab file: " + eVocab_f2e);
    }
  }
  LOG.info("Launching with " + numMappers + " mappers, " + numReducers + " reducers...");

  FileSystem fs = FileSystem.get(conf);

  Path p = new Path(indexRootPath);
  if (!fs.exists(p)) {
    LOG.info("Index path doesn't exist, creating...");
    fs.mkdirs(p);
  }
  RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

  // Build docno mapping from raw collection
  Path mappingFile = env.getDocnoMappingData();
  if (!fs.exists(mappingFile)) {
    LOG.info(mappingFile + " doesn't exist, creating...");
    String[] arr = new String[] { "-input=" + rawCollection,
        "-output_path=" + indexRootPath + "/wiki-docid-tmp",
        "-output_file=" + mappingFile.toString(), "-keep_all=false" };
    BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
    tool.setConf(conf);
    tool.run(arr);

    fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
  } else {
    LOG.info(p + " exists");
  }

  // Repack Wikipedia into sequential compressed block
  p = new Path(seqCollection);
  if (!fs.exists(p)) {
    LOG.info(seqCollection + " doesn't exist, creating...");
    String[] arr = new String[] { "-input=" + rawCollection, "-output=" + seqCollection,
        "-mapping_file=" + mappingFile.toString(), "-compression_type=block",
        "-wiki_language=" + collectionLang };
    RepackWikipedia tool = new RepackWikipedia();
    tool.setConf(conf);
    tool.run(arr);
  }

  conf.set(Constants.CollectionName, "Wikipedia-" + collectionLang);
  conf.setInt(Constants.NumMapTasks, numMappers);
  conf.setInt(Constants.NumReduceTasks, numReducers);
  conf.set(Constants.CollectionPath, seqCollection);
  conf.set(Constants.IndexPath, indexRootPath);
  conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
  conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName());
  conf.set(Constants.Tokenizer, tokenizerClass); // "ivory.tokenize.OpenNLPTokenizer"
  conf.setInt(Constants.MinDf, MinDF);
  conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
  conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
  conf.setInt(Constants.TermIndexWindow, TermIndexWindow);

  // Builds term doc vectors from document collection, and filters the terms that are not
  // included in Ivory.SrcVocab
  long startTime = System.currentTimeMillis();
  long preprocessStartTime = System.currentTimeMillis();
  LOG.info("Building term doc vectors...");
  new BuildTermDocVectors(conf).run();
  LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  // Get CF and DF counts
  startTime = System.currentTimeMillis();
  LOG.info("Counting terms...");
  new ComputeGlobalTermStatistics(conf).run();
  LOG.info("TermCount = " + env.readCollectionTermCount() + "\nJob finished in "
      + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  // Build a map from terms to sequentially generated integer term ids
  startTime = System.currentTimeMillis();
  LOG.info("Building term-to-integer id mapping...");
  new BuildDictionary(conf).run();
  LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  // Compute term weights, and output weighted term doc vectors
  startTime = System.currentTimeMillis();
  LOG.info("Building weighted term doc vectors...");
  conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
  if (mode == CROSS_LINGUAL_F) {
    conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);

    // translate term doc vectors into English.
    conf.setBoolean("Ivory.Normalize", true);
    new BuildTranslatedTermDocVectors(conf).run();
  } else {
    conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);

    // get weighted term doc vectors
    conf.setBoolean("Ivory.Normalize", true);
    new BuildWeightedTermDocVectors(conf).run();
  }
  LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  // normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency
  startTime = System.currentTimeMillis();
  LOG.info("Building weighted integer doc vectors...");
  conf.setBoolean("Ivory.Normalize", IsNormalized);
  if (mode == MONO_LINGUAL) {
    new BuildIntDocVectors(conf).run();
    new BuildWeightedIntDocVectors(conf).run();
    LOG.info("Job BuildWeightedIntDocVectors finished in "
        + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
  } else {
    BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool =
        new BuildTargetLangWeightedIntDocVectors(conf);
    LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in "
        + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    int finalNumDocs = weightedIntVectorsTool.run();
    if (finalNumDocs > 0) {
      LOG.info("Changed doc count from " + env.readCollectionDocumentCount() + " to = " + finalNumDocs);
      env.writeCollectionDocumentCount(finalNumDocs);
    }
    // set Property.CollectionTermCount to the size of the target vocab. since all docs are
    // translated into that vocab. This property is read by WriteRandomVectors via
    // RunComputeSignatures.
    Vocab engVocabH = null;
    try {
      engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
    } catch (IOException e) {
      e.printStackTrace();
    }
    LOG.info("Changed term count to : " + env.readCollectionTermCount() + " = " + engVocabH.size());
    env.writeCollectionTermCount(engVocabH.size());
  }

  LOG.info("Preprocessing job finished in "
      + (System.currentTimeMillis() - preprocessStartTime) / 1000.0 + " seconds");

  return 0;
}
From source file:ivory.core.driver.PreprocessWt10g.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    printUsage();
    return -1;
  }
  String collection = args[0];
  String indexRootPath = args[1];

  LOG.info("Tool name: " + PreprocessWt10g.class.getCanonicalName());
  LOG.info(" - Collection path: " + collection);
  LOG.info(" - Index path: " + indexRootPath);

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  // Create the index directory if it doesn't already exist.
  Path p = new Path(indexRootPath);
  if (!fs.exists(p)) {
    LOG.info("index directory doesn't exist, creating...");
    fs.mkdirs(p);
  }

  RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

  // Look for the docno mapping, which maps from docid (String) to docno (sequentially-number
  // integer). If it doesn't exist create it.
  Path mappingFile = env.getDocnoMappingData();
  Path mappingDir = env.getDocnoMappingDirectory();

  if (!fs.exists(mappingFile)) {
    LOG.info("docno-mapping.dat doesn't exist, creating...");
    String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString(), "100" };
    NumberTrecWebDocuments tool = new NumberTrecWebDocuments();
    tool.setConf(conf);
    tool.run(arr);

    fs.delete(mappingDir, true);
  }

  conf.set(Constants.CollectionName, "Wt10g");
  conf.set(Constants.CollectionPath, collection);
  conf.set(Constants.IndexPath, indexRootPath);
  conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
  conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
  conf.set(Constants.DocnoMappingClass, Wt10gDocnoMapping.class.getCanonicalName());
  conf.set(Constants.DocnoMappingFile, mappingFile.toString());

  conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
  conf.setInt(Constants.MinDf, 10);
  conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
  conf.setInt(Constants.TermIndexWindow, 8);

  new BuildTermDocVectors(conf).run();
  new ComputeGlobalTermStatistics(conf).run();
  new BuildDictionary(conf).run();
  new BuildIntDocVectors(conf).run();

  new BuildIntDocVectorsForwardIndex(conf).run();
  new BuildTermDocVectorsForwardIndex(conf).run();

  return 0;
}
From source file:ivory.core.index.BuildIntPostingsForwardIndex.java
License:Apache License
public int runTool() throws Exception {
  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
  String indexPath = conf.get(Constants.IndexPath);

  RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
  String collectionName = env.readCollectionName();

  LOG.info("Tool: " + BuildIntPostingsForwardIndex.class.getCanonicalName());
  LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
  LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));

  Job job = new Job(getConf(), BuildIntPostingsForwardIndex.class.getSimpleName() + ":" + collectionName);
  job.setJarByClass(BuildIntPostingsForwardIndex.class);

  Path inputPath = new Path(env.getPostingsDirectory());
  FileInputFormat.setInputPaths(job, inputPath);

  Path postingsIndexPath = new Path(env.getPostingsIndexData());

  if (fs.exists(postingsIndexPath)) {
    LOG.info("Postings forward index path already exists!");
    return 0;
  }
  job.setNumReduceTasks(1);

  conf.setInt("mapred.min.split.size", minSplitSize);
  conf.set("mapred.child.java.opts", "-Xmx2048m");

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setOutputFormatClass(NullOutputFormat.class);

  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);

  job.waitForCompletion(true);

  return 0;
}
From source file:ivory.core.index.BuildIPInvertedIndexDocSorted.java
License:Apache License
public int runTool() throws Exception {
  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  String indexPath = conf.get(Constants.IndexPath);
  RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

  String collectionName = env.readCollectionName();

  int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);
  int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
  int collectionDocCnt = env.readCollectionDocumentCount();

  String postingsType = conf.get(Constants.PostingsListsType,
      ivory.core.data.index.PostingsListDocSortedPositional.class.getCanonicalName());
  @SuppressWarnings("unchecked")
  Class<? extends PostingsList> postingsClass = (Class<? extends PostingsList>) Class.forName(postingsType);

  LOG.info("PowerTool: " + BuildIPInvertedIndexDocSorted.class.getCanonicalName());
  LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
  LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
  LOG.info(String.format(" - %s: %s", Constants.CollectionDocumentCount, collectionDocCnt));
  LOG.info(String.format(" - %s: %s", Constants.PostingsListsType, postingsClass.getCanonicalName()));
  LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));
  LOG.info(String.format(" - %s: %s", Constants.MinSplitSize, minSplitSize));

  if (!fs.exists(new Path(indexPath))) {
    fs.mkdirs(new Path(indexPath));
  }

  Path inputPath = new Path(env.getIntDocVectorsDirectory());
  Path postingsPath = new Path(env.getPostingsDirectory());

  if (fs.exists(postingsPath)) {
    LOG.info("Postings already exist: no indexing will be performed.");
    return 0;
  }

  conf.setInt(Constants.CollectionDocumentCount, collectionDocCnt);

  conf.setInt("mapred.min.split.size", minSplitSize);
  conf.set("mapred.child.java.opts", "-Xmx2048m");

  Job job = new Job(conf, BuildIPInvertedIndexDocSorted.class.getSimpleName() + ":" + collectionName);
  job.setJarByClass(BuildIPInvertedIndexDocSorted.class);

  job.setNumReduceTasks(reduceTasks);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, postingsPath);

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  job.setMapOutputKeyClass(PairOfInts.class);
  job.setMapOutputValueClass(TermPositions.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(postingsClass);

  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);
  job.setPartitionerClass(MyPartitioner.class);

  long startTime = System.currentTimeMillis();
  job.waitForCompletion(true);
  LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  env.writePostingsType(postingsClass.getCanonicalName());

  return 0;
}