List of usage examples for org.apache.hadoop.conf Configuration setInt
public void setInt(String name, int value)
Sets the value of the name property to an int.
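Before the examples from real projects below, here is a minimal, self-contained sketch of the basic pattern (the property name and values are made up for illustration). The value is stored internally as a string and can be read back with getInt, which takes a default used when the property is unset.

import org.apache.hadoop.conf.Configuration;

public class SetIntExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Store an int under a property name (kept internally as a string).
        conf.setInt("example.num.threads", 4);
        // Read it back; the second argument is the default if the property is unset.
        int threads = conf.getInt("example.num.threads", 1);
        System.out.println(threads); // prints 4
    }
}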
From source file: io.svectors.hbase.sink.HbaseTestUtil.java
License: Apache License
/**
 * Returns a new HBaseTestingUtility instance.
 */
private static HBaseTestingUtility createTestingUtility() {
    final Configuration hbaseConf = HBaseConfiguration.create();
    hbaseConf.setInt("replication.stats.thread.period.seconds", 5);
    hbaseConf.setLong("replication.sleep.before.failover", 2000);
    hbaseConf.setInt("replication.source.maxretriesmultiplier", 10);
    return new HBaseTestingUtility(hbaseConf);
}
From source file: it.crs4.features.GetMeta.java
License: Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    String[] otherArgs = parser.getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: <prog> IN OUT");
        System.exit(2);
    }
    conf.setInt(JobContext.NUM_REDUCES, 0);
    Job job = Job.getInstance(conf, "get bioimg meta");
    job.setInputFormatClass(NLineInputFormat.class);
    job.setJarByClass(GetMeta.class);
    job.setMapperClass(GetMetaMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    NLineInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file: it.crs4.seal.prq.PrqOptionParser.java
License: Open Source License
@Override
protected CommandLine parseOptions(Configuration conf, String[] args) throws IOException, ParseException {
    conf.setInt(MinBasesThresholdConfigName, DefaultMinBasesThreshold);
    conf.setBoolean(DropFailedFilterConfigName, DropFailedFilterDefault);
    conf.setBoolean(WarningOnlyIfUnpairedConfigName, WarningOnlyIfUnpairedDefault);
    conf.setInt(NumReadsExpectedConfigName, NumReadsExpectedDefault);

    CommandLine line = super.parseOptions(conf, args);

    /* **** handle deprecated properties **** */
    if (conf.get(PrqOptionParser.OLD_INPUT_FORMAT_CONF) != null) {
        throw new ParseException("The property " + PrqOptionParser.OLD_INPUT_FORMAT_CONF
                + " is no longer supported.\n"
                + "Please use the command line option --input-format instead.");
    }

    Utils.checkDeprecatedProp(conf, LOG, MinBasesThresholdConfigName_deprecated, MinBasesThresholdConfigName);
    Utils.checkDeprecatedProp(conf, LOG, DropFailedFilterConfigName_deprecated, DropFailedFilterConfigName);
    Utils.checkDeprecatedProp(conf, LOG, WarningOnlyIfUnpairedConfigName_deprecated,
            WarningOnlyIfUnpairedConfigName);

    // Let the deprecated properties override the new ones, unless the new ones have a non-default value.
    // If the new property has a non-default value, it must have been set by the user.
    // If, on the other hand, the deprecated property has a value, it must have been set by the user since
    // we're not setting them here.
    if (conf.get(MinBasesThresholdConfigName_deprecated) != null
            && conf.getInt(MinBasesThresholdConfigName, DefaultMinBasesThreshold) == DefaultMinBasesThreshold) {
        conf.setInt(MinBasesThresholdConfigName,
                conf.getInt(MinBasesThresholdConfigName_deprecated, DefaultMinBasesThreshold));
    }

    if (conf.get(DropFailedFilterConfigName_deprecated) != null
            && conf.getBoolean(DropFailedFilterConfigName, DropFailedFilterDefault) == DropFailedFilterDefault) {
        conf.setBoolean(DropFailedFilterConfigName,
                conf.getBoolean(DropFailedFilterConfigName_deprecated, DropFailedFilterDefault));
    }

    if (conf.get(WarningOnlyIfUnpairedConfigName_deprecated) != null
            && conf.getBoolean(WarningOnlyIfUnpairedConfigName, WarningOnlyIfUnpairedDefault) == WarningOnlyIfUnpairedDefault) {
        conf.setBoolean(WarningOnlyIfUnpairedConfigName,
                conf.getBoolean(WarningOnlyIfUnpairedConfigName_deprecated, WarningOnlyIfUnpairedDefault));
    }
    /* **** end handle deprecated properties **** */

    if (line.hasOption(opt_traditionalIds.getOpt()))
        conf.setBoolean(PairReadsQSeq.PRQ_CONF_TRADITIONAL_IDS, true);

    if (line.hasOption(opt_numReads.getOpt())) {
        int numReads;
        try {
            numReads = Integer.valueOf(line.getOptionValue(opt_numReads.getOpt()));
            if (numReads <= 0)
                throw new ParseException("Number of reads per fragment must be > 0 (got " + numReads + ")");
            if (numReads > 2) {
                throw new ParseException(
                        "Working with more than two reads per template is not supported at the moment.\n"
                                + "If you're interested in seeing this feature implemented contact the Seal developers.");
            }
        } catch (NumberFormatException e) {
            throw new ParseException(e.getMessage());
        }
        conf.setInt(NumReadsExpectedConfigName, numReads);
    }

    // Set the number of reduce tasks to use.
    conf.set(ClusterUtils.NUM_RED_TASKS_PROPERTY, String.valueOf(getNReduceTasks()));

    return line;
}
From source file: ivory.app.BuildIndex.java
License: Apache License
@SuppressWarnings({ "static-access" })
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(new Option(POSITIONAL_INDEX_IP, "build positional index (IP algorithm)"));
    options.addOption(new Option(POSITIONAL_INDEX_LP, "build positional index (LP algorithm)"));
    options.addOption(new Option(NONPOSITIONAL_INDEX_IP, "build nonpositional index (IP algorithm)"));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) index path")
            .create(INDEX_PATH));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
            .withDescription("(optional) number of index partitions: 64 default").create(INDEX_PARTITIONS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INDEX_PATH)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String indexPath = cmdline.getOptionValue(INDEX_PATH);
    int indexPartitions = cmdline.hasOption(INDEX_PARTITIONS)
            ? Integer.parseInt(cmdline.getOptionValue(INDEX_PARTITIONS))
            : 64;

    Configuration conf = getConf();
    LOG.info("Tool name: " + this.getClass().getSimpleName());
    LOG.info(String.format(" -%s %s", INDEX_PATH, indexPath));
    LOG.info(String.format(" -%s %d", INDEX_PARTITIONS, indexPartitions));

    if (cmdline.hasOption(POSITIONAL_INDEX_IP)) {
        LOG.info(String.format(" -%s", POSITIONAL_INDEX_IP));
        conf.set(Constants.IndexPath, indexPath);
        conf.setInt(Constants.NumReduceTasks, indexPartitions);
        conf.set(Constants.PostingsListsType,
                ivory.core.data.index.PostingsListDocSortedPositional.class.getCanonicalName());
        new BuildIPInvertedIndexDocSorted(conf).run();
        new BuildIntPostingsForwardIndex(conf).run();
    } else if (cmdline.hasOption(POSITIONAL_INDEX_LP)) {
        LOG.info(String.format(" -%s", POSITIONAL_INDEX_LP));
        conf.set(Constants.IndexPath, indexPath);
        conf.setInt(Constants.NumReduceTasks, indexPartitions);
        conf.set(Constants.PostingsListsType,
                ivory.core.data.index.PostingsListDocSortedPositional.class.getCanonicalName());
        conf.setFloat("Ivory.IndexingMapMemoryThreshold", 0.9f);
        conf.setFloat("Ivory.IndexingReduceMemoryThreshold", 0.9f);
        conf.setInt("Ivory.MaxHeap", 2048);
        conf.setInt("Ivory.MaxNDocsBeforeFlush", 50000);
        new BuildLPInvertedIndexDocSorted(conf).run();
        new BuildIntPostingsForwardIndex(conf).run();
    } else if (cmdline.hasOption(NONPOSITIONAL_INDEX_IP)) {
        LOG.info(String.format(" -%s", NONPOSITIONAL_INDEX_IP));
        conf.set(Constants.IndexPath, indexPath);
        conf.setInt(Constants.NumReduceTasks, indexPartitions);
        conf.set(Constants.PostingsListsType,
                ivory.core.data.index.PostingsListDocSortedNonPositional.class.getCanonicalName());
        new BuildIPInvertedIndexDocSorted(conf).run();
        new BuildIntPostingsForwardIndex(conf).run();
    } else {
        LOG.info(String.format("Nothing to do. Specify one of the following: %s, %s, %s", POSITIONAL_INDEX_IP,
                POSITIONAL_INDEX_LP, NONPOSITIONAL_INDEX_IP));
    }

    return 0;
}
From source file: ivory.app.PreprocessClueWebEnglish.java
License: Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path")
            .create(PreprocessCollection.COLLECTION_PATH));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) index path")
            .create(PreprocessCollection.INDEX_PATH));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("(required) segment").create(SEGMENT));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(PreprocessCollection.COLLECTION_PATH)
            || !cmdline.hasOption(PreprocessCollection.INDEX_PATH) || !cmdline.hasOption(SEGMENT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String collection = cmdline.getOptionValue(PreprocessCollection.COLLECTION_PATH);
    String indexPath = cmdline.getOptionValue(PreprocessCollection.INDEX_PATH);
    int segment = Integer.parseInt(cmdline.getOptionValue(SEGMENT));

    LOG.info("Tool name: " + PreprocessClueWebEnglish.class.getSimpleName());
    LOG.info(" - collection path: " + collection);
    LOG.info(" - index path: " + indexPath);
    LOG.info(" - segment: " + segment);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        LOG.info("index path doesn't exist, creating...");
        fs.mkdirs(p);
    } else {
        LOG.info("Index directory " + p + " already exists!");
        return -1;
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    Path mappingFile = env.getDocnoMappingData();
    new ClueWarcDocnoMappingBuilder().build(new Path(collection), mappingFile, conf);

    conf.set(Constants.CollectionName, "ClueWeb:English:Segment" + segment);
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, ClueWarcDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());
    conf.setInt(Constants.DocnoOffset, DOCNO_OFFSETS[segment]);
    conf.setInt(Constants.MinDf, 10);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();
    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}
From source file: ivory.app.PreprocessCollection.java
License: Apache License
/**
 * Runs this tool.
 */
@Override
public int run(String[] args) throws Exception {
    Options options = createOptions();
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(COLLECTION_PATH) || !cmdline.hasOption(COLLECTION_NAME)
            || !cmdline.hasOption(INDEX_PATH) || !cmdline.hasOption(DOCNO_MAPPING)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String collection = cmdline.getOptionValue(COLLECTION_PATH);
    String collectionName = cmdline.getOptionValue(COLLECTION_NAME);
    String indexPath = cmdline.getOptionValue(INDEX_PATH);

    int docnoOffset = 0;
    if (cmdline.hasOption(DOCNO_OFFSET)) {
        docnoOffset = Integer.parseInt(cmdline.getOptionValue(DOCNO_OFFSET));
    }

    Class<? extends DocnoMapping> docnoMappingClass = null;
    try {
        docnoMappingClass = (Class<? extends DocnoMapping>) Class
                .forName(cmdline.getOptionValue(DOCNO_MAPPING));
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    }

    @SuppressWarnings("rawtypes")
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    if (cmdline.hasOption(INPUTFORMAT)) {
        try {
            inputFormatClass = (Class<? extends InputFormat<?, ?>>) Class
                    .forName(cmdline.getOptionValue(INPUTFORMAT));
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    Class<? extends Tokenizer> tokenizerClass = GalagoTokenizer.class;
    if (cmdline.hasOption(TOKENIZER)) {
        try {
            tokenizerClass = (Class<? extends Tokenizer>) Class.forName(cmdline.getOptionValue(TOKENIZER));
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    int minDf = 2;
    if (cmdline.hasOption(MIN_DF)) {
        minDf = Integer.parseInt(cmdline.getOptionValue(MIN_DF));
    }

    LOG.info("Tool name: " + this.getClass().getSimpleName());
    LOG.info(String.format(" -%s %s", COLLECTION_PATH, collection));
    LOG.info(String.format(" -%s %s", COLLECTION_NAME, collectionName));
    LOG.info(String.format(" -%s %s", INDEX_PATH, indexPath));
    LOG.info(String.format(" -%s %s", DOCNO_MAPPING, docnoMappingClass.getCanonicalName()));
    LOG.info(String.format(" -%s %s", INPUTFORMAT, inputFormatClass.getCanonicalName()));
    LOG.info(String.format(" -%s %s", TOKENIZER, tokenizerClass.getCanonicalName()));
    LOG.info(String.format(" -%s %d", MIN_DF, minDf));

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        LOG.info("Index directory " + p + " doesn't exist, creating.");
        fs.mkdirs(p);
    } else {
        LOG.info("Index directory " + p + " already exists!");
        return -1;
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    conf.set(Constants.CollectionName, collectionName);
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat, inputFormatClass.getCanonicalName());
    conf.set(Constants.Tokenizer, tokenizerClass.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, docnoMappingClass.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());
    conf.setInt(Constants.DocnoOffset, docnoOffset);
    conf.setInt(Constants.MinDf, minDf);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);

    Path mappingFile = env.getDocnoMappingData();
    docnoMappingClass.newInstance().getBuilder().build(new Path(collection), mappingFile, conf);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();
    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}
From source file: ivory.app.PreprocessTrecForeign.java
License: Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    Configuration conf = parseArgs(args);
    FileSystem fs = FileSystem.get(conf);
    String indexRootPath = conf.get(Constants.IndexPath);
    String collection = conf.get(Constants.CollectionPath);

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);
    Path mappingFile = env.getDocnoMappingData();
    new TrecDocnoMappingBuilder().build(new Path(collection), mappingFile, conf);

    conf.set(Constants.DocnoMappingClass, TrecDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());
    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.MinDf, 2); // toss away singleton terms
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);
    conf.set(Constants.InputFormat, TrecDocumentInputFormat.class.getCanonicalName());

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();
    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}
From source file: ivory.app.PreprocessWikipedia.java
License: Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (parseArgs(args) < 0) {
        printUsage();
        return -1;
    }
    Configuration conf = getConf();
    conf.set(Constants.Language, collectionLang);
    // Default behavior of the tokenizer is currently to stem, but we shouldn't rely on that.
    conf.setBoolean(Constants.Stemming, true);
    if (tokenizerModel != null) {
        conf.set(Constants.TokenizerData, tokenizerModel);
    }
    // The user can either provide a tokenizer class as a program argument,
    // or let the factory find an appropriate class based on the language code.
    try {
        Class.forName(tokenizerClass);
    } catch (Exception e) {
        tokenizerClass = TokenizerFactory.getTokenizerClass(collectionLang, tokenizerModel).getCanonicalName();
    }
    if (collectionVocab != null) {
        conf.set(Constants.CollectionVocab, collectionVocab); // vocabulary to read collection from
    }
    if (e_stopwordList != null) {
        conf.set(Constants.StopwordList, e_stopwordList);
        conf.set(Constants.StemmedStopwordList, e_stopwordList + ".stemmed");
    }

    // CROSS-LINGUAL CASE
    if (mode == CROSS_LINGUAL_E) { // English side
        conf.set("Ivory.FinalVocab", collectionVocab); // vocabulary to map terms to integers in BuildTargetLang...
        conf.set(Constants.StopwordList, e_stopwordList);
        conf.set(Constants.StemmedStopwordList, e_stopwordList + ".stemmed");
    }
    if (mode == CROSS_LINGUAL_F) { // non-English side, needs to be translated
        conf.set(Constants.TargetIndexPath, targetIndexPath);
        conf.set("Ivory.F_Vocab_F2E", fVocab_f2e);
        conf.set("Ivory.E_Vocab_F2E", eVocab_f2e);
        conf.set("Ivory.TTable_F2E", ttable_f2e);
        conf.set("Ivory.E_Vocab_E2F", eVocab_e2f);
        conf.set("Ivory.F_Vocab_E2F", fVocab_e2f);
        conf.set("Ivory.TTable_E2F", ttable_e2f);
        conf.set(Constants.CollectionVocab, fVocab_f2e); // vocabulary to read collection from
        conf.set("Ivory.FinalVocab", eVocab_f2e); // vocabulary to map terms to integers in BuildTargetLang...
        if (f_stopwordList != null) {
            conf.set(Constants.StopwordList, f_stopwordList);
            conf.set(Constants.StemmedStopwordList, f_stopwordList + ".stemmed");
        }
        if (e_stopwordList != null) {
            conf.set(Constants.TargetStopwordList, e_stopwordList);
            conf.set(Constants.TargetStemmedStopwordList, e_stopwordList + ".stemmed");
        }
        if (e_tokenizerModel != null) {
            conf.set(Constants.TargetTokenizer, e_tokenizerModel);
        }
        conf.set(Constants.TargetLanguage, targetLang);
    }

    int numMappers = 100;
    int numReducers = 100;

    // Print out options
    LOG.info("Tool name: WikipediaDriver");
    LOG.info(" - Index path: " + indexRootPath);
    LOG.info(" - Raw collection path: " + rawCollection);
    LOG.info(" - Compressed collection path: " + seqCollection);
    LOG.info(" - Collection language: " + collectionLang);
    LOG.info(" - Tokenizer class: " + tokenizerClass);
    LOG.info(" - Tokenizer model: " + tokenizerModel);
    LOG.info(" - Minimum # terms per article : " + MinNumTermsPerArticle);
    LOG.info(" - Stopwords file: " + e_stopwordList);
    if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) {
        LOG.info("Cross-lingual collection : Preprocessing " + collectionLang + " side.");
        LOG.info(" - Collection vocab file: " + conf.get(Constants.CollectionVocab));
        LOG.info(" - Tokenizer model: " + tokenizerModel);
        if (mode == CROSS_LINGUAL_F) {
            LOG.info(" - TTable file " + collectionLang + " --> " + targetLang + " : " + ttable_f2e);
            LOG.info(" - Source vocab file: " + fVocab_f2e);
            LOG.info(" - Target vocab file: " + eVocab_f2e);
            LOG.info(" - TTable file " + targetLang + " --> " + collectionLang + " : " + ttable_e2f);
            LOG.info(" - Source vocab file: " + eVocab_e2f);
            LOG.info(" - Target vocab file: " + fVocab_e2f);
            LOG.info(" - Source stopwords file: " + f_stopwordList);
            LOG.info(" - Target stopwords file: " + e_stopwordList);
            LOG.info(" - Target stemmed stopwords file: " + conf.get(Constants.TargetStemmedStopwordList));
            LOG.info(" - Target tokenizer path: " + e_tokenizerModel);
        }
    }

    FileSystem fs = FileSystem.get(conf);
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        LOG.info("Index path doesn't exist, creating...");
        fs.mkdirs(p);
    }
    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Build docno mapping from raw collection
    Path mappingFile = env.getDocnoMappingData();
    if (!fs.exists(mappingFile)) {
        LOG.info(mappingFile + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection, "-output_file=" + mappingFile.toString(),
                "-wiki_language=" + collectionLang };
        LOG.info("Running WikipediaDocnoMappingBuilder with args " + Arrays.toString(arr));
        WikipediaDocnoMappingBuilder tool = new WikipediaDocnoMappingBuilder();
        tool.setConf(conf);
        tool.run(arr);
        fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    } else {
        LOG.info("Docno mapping already exists at: " + mappingFile);
    }

    // Repack Wikipedia into sequential compressed block
    if (!fs.exists(new Path(seqCollection + "/part-00000"))) {
        LOG.info(seqCollection + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection, "-output=" + seqCollection,
                "-mapping_file=" + mappingFile.toString(), "-compression_type=block",
                "-wiki_language=" + collectionLang };
        LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));
        RepackWikipedia tool = new RepackWikipedia();
        tool.setConf(conf);
        tool.run(arr);
    } else {
        LOG.info("Repacked collection already exists at: " + seqCollection);
    }

    conf.set(Constants.CollectionName, "Wikipedia-" + collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName());
    conf.set(Constants.Tokenizer, tokenizerClass); // "ivory.tokenize.OpenNLPTokenizer"
    conf.setInt(Constants.MinDf, MinDF);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.TermIndexWindow, TermIndexWindow);

    // Builds term doc vectors from the document collection, and filters out terms
    // that are not included in Ivory.SrcVocab.
    long startTime = System.currentTimeMillis();
    long preprocessStartTime = System.currentTimeMillis();
    LOG.info("Building term doc vectors...");
    int exitCode = new BuildTermDocVectors(conf).run();
    if (exitCode >= 0) {
        LOG.info("Job BuildTermDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0
                + " seconds");
    } else {
        LOG.info("Error: BuildTermDocVectors. Terminating...");
        return -1;
    }

    // Get CF and DF counts.
    startTime = System.currentTimeMillis();
    LOG.info("Counting terms...");
    exitCode = new ComputeGlobalTermStatistics(conf).run();
    LOG.info("TermCount = " + env.readCollectionTermCount());
    if (exitCode >= 0) {
        LOG.info("Job ComputeGlobalTermStatistics finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
        LOG.info("Error: ComputeGlobalTermStatistics. Terminating...");
        return -1;
    }

    // Build a map from terms to sequentially generated integer term ids.
    startTime = System.currentTimeMillis();
    LOG.info("Building term-to-integer id mapping...");
    exitCode = new BuildDictionary(conf).run();
    if (exitCode >= 0) {
        LOG.info("Job BuildDictionary finished in " + (System.currentTimeMillis() - startTime) / 1000.0
                + " seconds");
    } else {
        LOG.info("Error: BuildDictionary. Terminating...");
        return -1;
    }

    // Compute term weights, and output weighted term doc vectors.
    LOG.info("Building weighted term doc vectors...");
    startTime = System.currentTimeMillis();
    conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);
    if (mode == CROSS_LINGUAL_F) {
        // Translate term doc vectors into English.
        exitCode = new BuildTranslatedTermDocVectors(conf).run();
    } else {
        // Build weighted term doc vectors.
        exitCode = new BuildWeightedTermDocVectors(conf).run();
    }
    if (exitCode >= 0) {
        LOG.info("Job BuildTranslated/WeightedTermDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
        LOG.info("Error: BuildTranslated/WeightedTermDocVectors. Terminating...");
        return -1;
    }

    // Normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency.
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted integer doc vectors...");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    if (mode == MONO_LINGUAL) {
        exitCode = new BuildIntDocVectors(conf).run();
        exitCode = new BuildWeightedIntDocVectors(conf).run();
        if (exitCode >= 0) {
            LOG.info("Job BuildWeightedIntDocVectors finished in "
                    + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        } else {
            LOG.info("Error: BuildWeightedIntDocVectors. Terminating...");
            return -1;
        }
    } else {
        BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool =
                new BuildTargetLangWeightedIntDocVectors(conf);
        int finalNumDocs = weightedIntVectorsTool.run();
        LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        if (finalNumDocs > 0) {
            LOG.info("Changed doc count: " + env.readCollectionDocumentCount() + " => " + finalNumDocs);
            env.writeCollectionDocumentCount(finalNumDocs);
        } else {
            LOG.info("No document output! Terminating...");
            return -1;
        }
        // Set Property.CollectionTermCount to the size of the target vocab, since all docs are
        // translated into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures.
        Vocab engVocabH = null;
        try {
            engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
        LOG.info("Changed term count: " + env.readCollectionTermCount() + " => " + engVocabH.size());
        env.writeCollectionTermCount(engVocabH.size());
    }

    LOG.info("Preprocessing job finished in " + (System.currentTimeMillis() - preprocessStartTime) / 1000.0
            + " seconds");
    return 0;
}
From source file: ivory.core.driver.BuildNonPositionalIndexIP.java
License: Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = args[0];
    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        LOG.warn("Index path doesn't exist...");
        return -1;
    }
    int numReducers = Integer.parseInt(args[1]);

    // The original source logged BuildPositionalIndexIP here; corrected to this driver's own class.
    LOG.info("Tool name: " + BuildNonPositionalIndexIP.class.getCanonicalName());
    LOG.info(" - Index path: " + indexPath);

    conf.set(Constants.IndexPath, indexPath);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.PostingsListsType,
            ivory.core.data.index.PostingsListDocSortedNonPositional.class.getCanonicalName());

    new BuildIPInvertedIndexDocSorted(conf).run();
    new BuildIntPostingsForwardIndex(conf).run();

    return 0;
}
From source file: ivory.core.driver.BuildPositionalIndexIP.java
License: Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = args[0];
    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        LOG.warn("Index path doesn't exist...");
        return -1;
    }
    int numReducers = Integer.parseInt(args[1]);

    LOG.info("Tool name: " + BuildPositionalIndexIP.class.getCanonicalName());
    LOG.info(" - Index path: " + indexPath);

    conf.set(Constants.IndexPath, indexPath);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.PostingsListsType,
            ivory.core.data.index.PostingsListDocSortedPositional.class.getCanonicalName());

    new BuildIPInvertedIndexDocSorted(conf).run();
    new BuildIntPostingsForwardIndex(conf).run();

    return 0;
}