List of usage examples for org.apache.hadoop.conf Configuration setBoolean
public void setBoolean(String name, boolean value)

Set the value of the name property to a boolean.
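Before the examples collected from real projects below, here is a minimal, self-contained sketch of the call in isolation. The property name "my.feature.enabled" and the standalone main method are illustrative assumptions, not taken from any of the source files listed on this page:

import org.apache.hadoop.conf.Configuration;

public class SetBooleanExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Set the value of the "my.feature.enabled" property to a boolean.
        conf.setBoolean("my.feature.enabled", true);

        // Read it back; the second argument is the default returned when the
        // property is unset or cannot be parsed as a boolean.
        boolean enabled = conf.getBoolean("my.feature.enabled", false);
        System.out.println("my.feature.enabled = " + enabled);
    }
}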
From source file:it.crs4.pydoop.mapreduce.pipes.TestPipeApplication.java
License:Apache License
/**
 * test org.apache.hadoop.mapreduce.pipes.PipesReducer
 * test the transfer of data: key and value
 *
 * @throws Exception
 */
@Test
public void testPipesReducer() throws Exception {
    System.err.println("testPipesReducer");
    File[] psw = cleanTokenPasswordFile();
    try {
        JobID jobId = new JobID("201408272347", 0);
        TaskID taskId = new TaskID(jobId, TaskType.MAP, 0);
        TaskAttemptID taskAttemptid = new TaskAttemptID(taskId, 0);
        Job job = new Job(new Configuration());
        job.setJobID(jobId);
        Configuration conf = job.getConfiguration();
        conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskAttemptid.toString());
        FileSystem fs = new RawLocalFileSystem();
        fs.setConf(conf);
        File fCommand = getFileCommand("it.crs4.pydoop.mapreduce.pipes.PipeReducerStub");
        conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath());
        System.err.println("fCommand" + fCommand.getAbsolutePath());
        Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>("user".getBytes(),
                "password".getBytes(), new Text("kind"), new Text("service"));
        TokenCache.setJobToken(token, job.getCredentials());
        conf.setBoolean(MRJobConfig.SKIP_RECORDS, true);
        TestReporter reporter = new TestReporter();
        DummyInputFormat input_format = new DummyInputFormat();
        List<InputSplit> isplits = input_format.getSplits(job);
        InputSplit isplit = isplits.get(0);
        TaskAttemptContextImpl tcontext = new TaskAttemptContextImpl(conf, taskAttemptid);
        RecordWriter<IntWritable, Text> writer = new TestRecordWriter(
                new FileOutputStream(workSpace.getAbsolutePath() + File.separator + "outfile"));
        BooleanWritable bw = new BooleanWritable(true);
        List<Text> texts = new ArrayList<Text>();
        texts.add(new Text("first"));
        texts.add(new Text("second"));
        texts.add(new Text("third"));
        DummyRawKeyValueIterator kvit = new DummyRawKeyValueIterator();
        ReduceContextImpl<BooleanWritable, Text, IntWritable, Text> context =
                new ReduceContextImpl<BooleanWritable, Text, IntWritable, Text>(conf, taskAttemptid, kvit,
                        null, null, writer, null, null, null, BooleanWritable.class, Text.class);
        PipesReducer<BooleanWritable, Text, IntWritable, Text> reducer =
                new PipesReducer<BooleanWritable, Text, IntWritable, Text>();
        reducer.setup(context);
        initStdOut(conf);
        reducer.reduce(bw, texts, context);
        reducer.cleanup(context);
        String stdOut = readStdOut(conf);
        // test data: key
        assertTrue(stdOut.contains("reducer key :true"));
        // and values
        assertTrue(stdOut.contains("reduce value :first"));
        assertTrue(stdOut.contains("reduce value :second"));
        assertTrue(stdOut.contains("reduce value :third"));
    } finally {
        if (psw != null) {
            // remove password files
            for (File file : psw) {
                file.deleteOnExit();
            }
        }
    }
}
From source file:it.crs4.seal.demux.Demux.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    LOG.info("starting");

    Configuration conf = getConf();
    DemuxOptionParser parser = new DemuxOptionParser();
    parser.parse(conf, args);

    conf.setBoolean(CONF_NO_INDEX_READS, parser.getNoIndexReads());
    conf.setBoolean(CONF_SEPARATE_READS, parser.getSeparateReads());

    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");
    if (parser.getNoIndexReads())
        LOG.info("Not expecting to find any index reads. Will demultiplex based only on lane.");

    // load sample sheet to fail early in case of problems
    DemuxUtils.loadSampleSheet(parser.getSampleSheetPath(), conf);

    // must be called before creating the job, since the job
    // *copies* the Configuration.
    distributeSampleSheet(parser.getSampleSheetPath());

    // Create a Job using the processed conf
    Job job = new Job(getConf(), makeJobName(parser.getInputPaths().get(0)));

    job.setJarByClass(Demux.class);

    // input paths
    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName("qseq")));

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(SequenceId.class);
    job.setMapOutputValueClass(SequencedFragment.class);

    job.setPartitionerClass(SequenceIdLocationPartitioner.class);
    job.setGroupingComparatorClass(GroupByLocationComparator.class);
    job.setSortComparatorClass(TwoOneThreeSortComparator.class);

    job.setReducerClass(Red.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SequencedFragment.class);

    // output
    job.setOutputFormatClass(DemuxOutputFormat.class);
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    // Submit the job, then poll for progress until the job is complete
    boolean result = job.waitForCompletion(true);
    if (result) {
        LOG.info("done");
        if (parser.getCreateLaneContent())
            createLaneContentFiles(parser.getOutputPath(), parser.getSampleSheetPath());
        return 0;
    } else {
        LOG.fatal(this.getClass().getName() + " failed!");
        return 1;
    }
}
From source file:it.crs4.seal.prq.PrqOptionParser.java
License:Open Source License
@Override
protected CommandLine parseOptions(Configuration conf, String[] args) throws IOException, ParseException {
    conf.setInt(MinBasesThresholdConfigName, DefaultMinBasesThreshold);
    conf.setBoolean(DropFailedFilterConfigName, DropFailedFilterDefault);
    conf.setBoolean(WarningOnlyIfUnpairedConfigName, WarningOnlyIfUnpairedDefault);
    conf.setInt(NumReadsExpectedConfigName, NumReadsExpectedDefault);

    CommandLine line = super.parseOptions(conf, args);

    /* **** handle deprecated properties **** */
    if (conf.get(PrqOptionParser.OLD_INPUT_FORMAT_CONF) != null) {
        throw new ParseException("The property " + PrqOptionParser.OLD_INPUT_FORMAT_CONF
                + " is no longer supported.\n"
                + "Please use the command line option --input-format instead.");
    }

    Utils.checkDeprecatedProp(conf, LOG, MinBasesThresholdConfigName_deprecated, MinBasesThresholdConfigName);
    Utils.checkDeprecatedProp(conf, LOG, DropFailedFilterConfigName_deprecated, DropFailedFilterConfigName);
    Utils.checkDeprecatedProp(conf, LOG, WarningOnlyIfUnpairedConfigName_deprecated,
            WarningOnlyIfUnpairedConfigName);

    // Let the deprecated properties override the new ones, unless the new ones have a non-default value.
    // If the new property has a non-default value, it must have been set by the user.
    // If, on the other hand, the deprecated property has a value, it must have been set by the user since
    // we're not setting them here.
    if (conf.get(MinBasesThresholdConfigName_deprecated) != null
            && conf.getInt(MinBasesThresholdConfigName, DefaultMinBasesThreshold) == DefaultMinBasesThreshold) {
        conf.setInt(MinBasesThresholdConfigName,
                conf.getInt(MinBasesThresholdConfigName_deprecated, DefaultMinBasesThreshold));
    }

    if (conf.get(DropFailedFilterConfigName_deprecated) != null
            && conf.getBoolean(DropFailedFilterConfigName, DropFailedFilterDefault) == DropFailedFilterDefault) {
        conf.setBoolean(DropFailedFilterConfigName,
                conf.getBoolean(DropFailedFilterConfigName_deprecated, DropFailedFilterDefault));
    }

    if (conf.get(WarningOnlyIfUnpairedConfigName_deprecated) != null
            && conf.getBoolean(WarningOnlyIfUnpairedConfigName, WarningOnlyIfUnpairedDefault) == WarningOnlyIfUnpairedDefault) {
        conf.setBoolean(WarningOnlyIfUnpairedConfigName,
                conf.getBoolean(WarningOnlyIfUnpairedConfigName_deprecated, WarningOnlyIfUnpairedDefault));
    }
    /* **** end handle deprecated properties **** */

    if (line.hasOption(opt_traditionalIds.getOpt()))
        conf.setBoolean(PairReadsQSeq.PRQ_CONF_TRADITIONAL_IDS, true);

    if (line.hasOption(opt_numReads.getOpt())) {
        int numReads;
        try {
            numReads = Integer.valueOf(line.getOptionValue(opt_numReads.getOpt()));
            if (numReads <= 0)
                throw new ParseException("Number of reads per fragment must be >= 0 (got " + numReads + ")");
            if (numReads > 2) {
                throw new ParseException(
                        "Working with more than two reads per template is not supported at the moment.\n"
                                + "If you're interested in seeing this feature implemented contact the Seal developers.");
            }
        } catch (NumberFormatException e) {
            throw new ParseException(e.getMessage());
        }
        conf.setInt(NumReadsExpectedConfigName, numReads);
    }

    // set number of reduce tasks to use
    conf.set(ClusterUtils.NUM_RED_TASKS_PROPERTY, String.valueOf(getNReduceTasks()));
    return line;
}
From source file:ivory.app.PreprocessWikipedia.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (parseArgs(args) < 0) {
        printUsage();
        return -1;
    }
    Configuration conf = getConf();

    conf.set(Constants.Language, collectionLang);
    conf.setBoolean(Constants.Stemming, true); // default behavior of tokenizer is currently to stem, but we shouldn't rely on that
    if (tokenizerModel != null) {
        conf.set(Constants.TokenizerData, tokenizerModel);
    }

    // user can either provide a tokenizer class as a program argument,
    // or let the factory find an appropriate class based on language code
    try {
        Class.forName(tokenizerClass);
    } catch (Exception e) {
        tokenizerClass = TokenizerFactory.getTokenizerClass(collectionLang, tokenizerModel).getCanonicalName();
    }

    if (collectionVocab != null) {
        conf.set(Constants.CollectionVocab, collectionVocab); // vocabulary to read collection from
    }
    if (e_stopwordList != null) {
        conf.set(Constants.StopwordList, e_stopwordList);
        conf.set(Constants.StemmedStopwordList, e_stopwordList + ".stemmed");
    }

    // CROSS-LINGUAL CASE
    if (mode == CROSS_LINGUAL_E) { // English side
        conf.set("Ivory.FinalVocab", collectionVocab); // vocabulary to map terms to integers in BuildTargetLang...
        conf.set(Constants.StopwordList, e_stopwordList);
        conf.set(Constants.StemmedStopwordList, e_stopwordList + ".stemmed");
    }

    if (mode == CROSS_LINGUAL_F) { // non-English side, needs to be translated
        conf.set(Constants.TargetIndexPath, targetIndexPath);
        conf.set("Ivory.F_Vocab_F2E", fVocab_f2e);
        conf.set("Ivory.E_Vocab_F2E", eVocab_f2e);
        conf.set("Ivory.TTable_F2E", ttable_f2e);
        conf.set("Ivory.E_Vocab_E2F", eVocab_e2f);
        conf.set("Ivory.F_Vocab_E2F", fVocab_e2f);
        conf.set("Ivory.TTable_E2F", ttable_e2f);
        conf.set(Constants.CollectionVocab, fVocab_f2e); // vocabulary to read collection from
        conf.set("Ivory.FinalVocab", eVocab_f2e); // vocabulary to map terms to integers in BuildTargetLang...

        if (f_stopwordList != null) {
            conf.set(Constants.StopwordList, f_stopwordList);
            conf.set(Constants.StemmedStopwordList, f_stopwordList + ".stemmed");
        }
        if (e_stopwordList != null) {
            conf.set(Constants.TargetStopwordList, e_stopwordList);
            conf.set(Constants.TargetStemmedStopwordList, e_stopwordList + ".stemmed");
        }
        if (e_tokenizerModel != null) {
            conf.set(Constants.TargetTokenizer, e_tokenizerModel);
        }
        conf.set(Constants.TargetLanguage, targetLang);
    }

    int numMappers = 100;
    int numReducers = 100;

    // Print out options
    LOG.info("Tool name: WikipediaDriver");
    LOG.info(" - Index path: " + indexRootPath);
    LOG.info(" - Raw collection path: " + rawCollection);
    LOG.info(" - Compressed collection path: " + seqCollection);
    LOG.info(" - Collection language: " + collectionLang);
    LOG.info(" - Tokenizer class: " + tokenizerClass);
    LOG.info(" - Tokenizer model: " + tokenizerModel);
    LOG.info(" - Minimum # terms per article : " + MinNumTermsPerArticle);
    LOG.info(" - Stopwords file: " + e_stopwordList);

    if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) {
        LOG.info("Cross-lingual collection : Preprocessing " + collectionLang + " side.");
        LOG.info(" - Collection vocab file: " + conf.get(Constants.CollectionVocab));
        LOG.info(" - Tokenizer model: " + tokenizerModel);

        if (mode == CROSS_LINGUAL_F) {
            LOG.info(" - TTable file " + collectionLang + " --> " + targetLang + " : " + ttable_f2e);
            LOG.info(" - Source vocab file: " + fVocab_f2e);
            LOG.info(" - Target vocab file: " + eVocab_f2e);
            LOG.info(" - TTable file " + targetLang + " --> " + collectionLang + " : " + ttable_e2f);
            LOG.info(" - Source vocab file: " + eVocab_e2f);
            LOG.info(" - Target vocab file: " + fVocab_e2f);
            LOG.info(" - Source stopwords file: " + f_stopwordList);
            LOG.info(" - Target stopwords file: " + e_stopwordList);
            LOG.info(" - Target stemmed stopwords file: " + conf.get(Constants.TargetStemmedStopwordList));
            LOG.info(" - Target tokenizer path: " + e_tokenizerModel);
        }
    }

    FileSystem fs = FileSystem.get(conf);

    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        LOG.info("Index path doesn't exist, creating...");
        fs.mkdirs(p);
    }
    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Build docno mapping from raw collection
    Path mappingFile = env.getDocnoMappingData();
    if (!fs.exists(mappingFile)) {
        LOG.info(mappingFile + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection, "-output_file=" + mappingFile.toString(),
                "-wiki_language=" + collectionLang };
        LOG.info("Running WikipediaDocnoMappingBuilder with args " + Arrays.toString(arr));

        WikipediaDocnoMappingBuilder tool = new WikipediaDocnoMappingBuilder();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    } else {
        LOG.info("Docno mapping already exists at: " + mappingFile);
    }

    // Repack Wikipedia into sequential compressed block
    if (!fs.exists(new Path(seqCollection + "/part-00000"))) {
        LOG.info(seqCollection + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection, "-output=" + seqCollection,
                "-mapping_file=" + mappingFile.toString(), "-compression_type=block",
                "-wiki_language=" + collectionLang };
        LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

        RepackWikipedia tool = new RepackWikipedia();
        tool.setConf(conf);
        tool.run(arr);
    } else {
        LOG.info("Repacked collection already exists at: " + seqCollection);
    }

    conf.set(Constants.CollectionName, "Wikipedia-" + collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName());
    conf.set(Constants.Tokenizer, tokenizerClass); //"ivory.tokenize.OpenNLPTokenizer"
    conf.setInt(Constants.MinDf, MinDF);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.TermIndexWindow, TermIndexWindow);

    // Builds term doc vectors from document collection, and filters the terms that are not included
    // in Ivory.SrcVocab.
    long startTime = System.currentTimeMillis();
    long preprocessStartTime = System.currentTimeMillis();
    LOG.info("Building term doc vectors...");
    int exitCode = new BuildTermDocVectors(conf).run();
    if (exitCode >= 0) {
        LOG.info("Job BuildTermDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
        LOG.info("Error: BuildTermDocVectors. Terminating...");
        return -1;
    }

    // Get CF and DF counts.
    startTime = System.currentTimeMillis();
    LOG.info("Counting terms...");
    exitCode = new ComputeGlobalTermStatistics(conf).run();
    LOG.info("TermCount = " + env.readCollectionTermCount());
    if (exitCode >= 0) {
        LOG.info("Job ComputeGlobalTermStatistics finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
        LOG.info("Error: ComputeGlobalTermStatistics. Terminating...");
        return -1;
    }

    // Build a map from terms to sequentially generated integer term ids.
    startTime = System.currentTimeMillis();
    LOG.info("Building term-to-integer id mapping...");
    exitCode = new BuildDictionary(conf).run();
    if (exitCode >= 0) {
        LOG.info("Job BuildDictionary finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
        LOG.info("Error: BuildDictionary. Terminating...");
        return -1;
    }

    // Compute term weights, and output weighted term doc vectors.
    LOG.info("Building weighted term doc vectors...");
    startTime = System.currentTimeMillis();

    conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);

    if (mode == CROSS_LINGUAL_F) {
        // Translate term doc vectors into English.
        exitCode = new BuildTranslatedTermDocVectors(conf).run();
    } else {
        // Build weighted term doc vectors.
        exitCode = new BuildWeightedTermDocVectors(conf).run();
    }
    if (exitCode >= 0) {
        LOG.info("Job BuildTranslated/WeightedTermDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
        LOG.info("Error: BuildTranslated/WeightedTermDocVectors. Terminating...");
        return -1;
    }

    // normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted integer doc vectors...");
    conf.setBoolean("Ivory.Normalize", IsNormalized);

    if (mode == MONO_LINGUAL) {
        exitCode = new BuildIntDocVectors(conf).run();
        exitCode = new BuildWeightedIntDocVectors(conf).run();
        if (exitCode >= 0) {
            LOG.info("Job BuildWeightedIntDocVectors finished in "
                    + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        } else {
            LOG.info("Error: BuildWeightedIntDocVectors. Terminating...");
            return -1;
        }
    } else {
        BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool =
                new BuildTargetLangWeightedIntDocVectors(conf);

        int finalNumDocs = weightedIntVectorsTool.run();

        LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        if (finalNumDocs > 0) {
            LOG.info("Changed doc count: " + env.readCollectionDocumentCount() + " => " + finalNumDocs);
            env.writeCollectionDocumentCount(finalNumDocs);
        } else {
            LOG.info("No document output! Terminating...");
            return -1;
        }

        // set Property.CollectionTermCount to the size of the target vocab. since all docs are translated
        // into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures.
        Vocab engVocabH = null;
        try {
            engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
        LOG.info("Changed term count: " + env.readCollectionTermCount() + " => " + engVocabH.size());
        env.writeCollectionTermCount(engVocabH.size());
    }

    LOG.info("Preprocessing job finished in "
            + (System.currentTimeMillis() - preprocessStartTime) / 1000.0 + " seconds");

    return 0;
}
From source file:ivory.core.driver.PreprocessAquaint2.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }
    String collection = args[0];
    String indexRootPath = args[1];

    LOG.info("Tool name: " + PreprocessAquaint2.class.getCanonicalName());
    LOG.info(" - Collection path: " + collection);
    LOG.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        fs.mkdirs(p);
    } else {
        LOG.info("Index directory already exists, skipping!");
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    conf.set(XMLInputFormat.START_TAG_KEY, Aquaint2Document.getXmlStartTag(fs, collection));
    conf.set(XMLInputFormat.END_TAG_KEY, Aquaint2Document.getXmlEndTag());

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-number integer). If it doesn't exist create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString() };
        NumberAquaint2Documents2 tool = new NumberAquaint2Documents2();
        tool.setConf(conf);
        tool.run(arr);
        fs.delete(mappingDir, true);
    } else {
        LOG.info("DocnoMapping already exists, skipping!");
    }

    Aquaint2DocnoMapping dm = new Aquaint2DocnoMapping();
    dm.loadMapping(mappingFile, fs);

    int docno;
    int expectedDocno;
    String expectedDocid;
    String docid;

    boolean testAquaint2 = false;
    if (testAquaint2) {
        docno = 500;
        expectedDocid = "AFP_ENG_20041001.0500";
        docid = dm.getDocid(docno);
        System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
                + (expectedDocid.equals(docid)));
        docno = 600;
        expectedDocid = "AFP_ENG_20041001.0600";
        docid = dm.getDocid(docno);
        System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
                + (expectedDocid.equals(docid)));
        docno = 700;
        expectedDocid = "AFP_ENG_20041001.0701";
        docid = dm.getDocid(docno);
        System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
                + (expectedDocid.equals(docid)));
        docno = 800;
        expectedDocid = "AFP_ENG_20041003.0019";
        docid = dm.getDocid(docno);
        System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
                + (expectedDocid.equals(docid)));

        expectedDocno = 500;
        docid = "AFP_ENG_20041001.0500";
        docno = dm.getDocno(docid);
        System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
                + (expectedDocno == docno));
        expectedDocno = 600;
        docid = "AFP_ENG_20041001.0600";
        docno = dm.getDocno(docid);
        System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
                + (expectedDocno == docno));
        expectedDocno = 700;
        docid = "AFP_ENG_20041001.0701";
        docno = dm.getDocno(docid);
        System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
                + (expectedDocno == docno));
        expectedDocno = 800;
        docid = "AFP_ENG_20041003.0019";
        docno = dm.getDocno(docid);
        System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
                + (expectedDocno == docno));

        System.out.println("finished testing, now exiting");
        return 0;
    }

    boolean testGigaword = false;
    if (testGigaword) {
        for (int i = 1; i < 301; i++) {
            docno = i * 1000;
            docid = dm.getDocid(docno);
            System.out.println("dm.getDocid(" + docno + "): " + docid);
        }
        System.out.println("finished testing, now exiting");
        return 0;
    }

    conf.set(Constants.CollectionName, "Aquaint2");
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, Aquaint2DocumentInputFormat2.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, Aquaint2DocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.MinDf, 2); // toss away singleton terms
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    //new BuildTermDocVectorsForwardIndex(conf).run();
    new BuildIPInvertedIndexDocSorted(conf).run();

    conf.set(Constants.ScoringModel, "ivory.pwsim.score.TfIdf");
    conf.setBoolean(Constants.Normalize, true);

    new BuildIntPostingsForwardIndex(conf).run();

    boolean buildingVectors = true;
    //boolean buildingVectors = false;
    if (buildingVectors) {
        //new BuildWeightedIntDocVectors(conf).run();
        //conf.setBoolean(Constants.BuildWeighted, true);
        //new BuildIntDocVectorsForwardIndex(conf).run();

        String findexDirPath = indexRootPath + "/findex";
        String findexFilePath = indexRootPath + "/findex.dat";
        if (fs.exists(new Path(findexDirPath))) {
            LOG.info("ForwardIndex already exists: Skipping!");
        } else {
            new BuildAquaint2ForwardIndex().runTool(conf, collection, findexDirPath, findexFilePath,
                    mappingFile.toString());
        }
    }

    return 0;
}
From source file:ivory.core.driver.PreprocessWikipedia.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    int mode = args.length;
    if (mode != MONO_LINGUAL && mode != CROSS_LINGUAL_E && mode != CROSS_LINGUAL_F) {
        printUsage();
        return -1;
    }

    String indexRootPath = args[0];
    String rawCollection = args[1]; //"/shared/Wikipedia/raw/dewiki-20100117-pages-articles.xml";
    String seqCollection = args[2]; //"/umd-lin/fture/pwsim/de-wikipedia/compressed.block/de-20100117";
    String tokenizerClass = args[3];

    Configuration conf = new Configuration();

    String collectionLang = null, tokenizerModel = null, collectionVocab = null;
    String fVocab_f2e = null, eVocab_f2e = null, fVocab_e2f, eVocab_e2f = null, ttable_f2e = null,
            ttable_e2f = null;
    if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) { // CROSS-LINGUAL CASE
        collectionLang = args[4];
        tokenizerModel = args[5];
        collectionVocab = args[6];
        conf.set("Ivory.Lang", collectionLang);
        conf.set("Ivory.TokenizerModel", tokenizerModel);
        conf.set("Ivory.CollectionVocab", collectionVocab);
        conf.set("Ivory.FinalVocab", collectionVocab);

        if (mode == CROSS_LINGUAL_F) { // non-English side, needs to be translated
            fVocab_f2e = args[6]; // same as collection vocab
            eVocab_f2e = args[7];
            ttable_f2e = args[8];
            eVocab_e2f = args[9];
            fVocab_e2f = args[10];
            ttable_e2f = args[11];
            conf.set("Ivory.F_Vocab_F2E", fVocab_f2e);
            conf.set("Ivory.E_Vocab_F2E", eVocab_f2e);
            conf.set("Ivory.TTable_F2E", ttable_f2e);
            conf.set("Ivory.E_Vocab_E2F", eVocab_e2f);
            conf.set("Ivory.F_Vocab_E2F", fVocab_e2f);
            conf.set("Ivory.TTable_E2F", ttable_e2f);
            conf.set("Ivory.FinalVocab", eVocab_e2f);
        }
    }

    int numMappers = 100;
    int numReducers = 100;

    LOG.info("Tool name: WikipediaDriver");
    LOG.info(" - Index path: " + indexRootPath);
    LOG.info(" - Raw collection path: " + rawCollection);
    LOG.info(" - Compressed collection path: " + seqCollection);
    LOG.info(" - Tokenizer class: " + tokenizerClass);
    LOG.info(" - Minimum # terms per article : " + MinNumTermsPerArticle);

    if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) {
        LOG.info("Cross-lingual collection : Preprocessing " + collectionLang + " side.");
        LOG.info(" - Collection vocab file: " + collectionVocab);
        LOG.info(" - Tokenizer model: " + tokenizerModel);

        if (mode == CROSS_LINGUAL_F) {
            LOG.info(" - TTable file " + collectionLang + " --> English : " + ttable_f2e);
            LOG.info(" - Source vocab file: " + fVocab_f2e);
            LOG.info(" - Target vocab file: " + eVocab_f2e);
            LOG.info(" - TTable file " + "English --> " + collectionLang + " : " + ttable_e2f);
            LOG.info(" - Source vocab file: " + fVocab_f2e);
            LOG.info(" - Target vocab file: " + eVocab_f2e);
        }
    }
    LOG.info("Launching with " + numMappers + " mappers, " + numReducers + " reducers...");

    FileSystem fs = FileSystem.get(conf);

    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        LOG.info("Index path doesn't exist, creating...");
        fs.mkdirs(p);
    }
    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Build docno mapping from raw collection
    Path mappingFile = env.getDocnoMappingData();
    if (!fs.exists(mappingFile)) {
        LOG.info(mappingFile + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection,
                "-output_path=" + indexRootPath + "/wiki-docid-tmp",
                "-output_file=" + mappingFile.toString(), "-keep_all=false" };
        BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    } else {
        LOG.info(p + " exists");
    }

    // Repack Wikipedia into sequential compressed block
    p = new Path(seqCollection);
    if (!fs.exists(p)) {
        LOG.info(seqCollection + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection, "-output=" + seqCollection,
                "-mapping_file=" + mappingFile.toString(), "-compression_type=block",
                "-wiki_language=" + collectionLang };
        RepackWikipedia tool = new RepackWikipedia();
        tool.setConf(conf);
        tool.run(arr);
    }

    conf.set(Constants.CollectionName, "Wikipedia-" + collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName());
    conf.set(Constants.Tokenizer, tokenizerClass); //"ivory.tokenize.OpenNLPTokenizer"
    conf.setInt(Constants.MinDf, MinDF);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.TermIndexWindow, TermIndexWindow);

    // Builds term doc vectors from document collection, and filters the terms that are not included in Ivory.SrcVocab
    long startTime = System.currentTimeMillis();
    long preprocessStartTime = System.currentTimeMillis();
    LOG.info("Building term doc vectors...");
    new BuildTermDocVectors(conf).run();
    LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Get CF and DF counts
    startTime = System.currentTimeMillis();
    LOG.info("Counting terms...");
    new ComputeGlobalTermStatistics(conf).run();
    LOG.info("TermCount = " + env.readCollectionTermCount() + "\nJob finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Build a map from terms to sequentially generated integer term ids
    startTime = System.currentTimeMillis();
    LOG.info("Building term-to-integer id mapping...");
    new BuildDictionary(conf).run();
    LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Compute term weights, and output weighted term doc vectors
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted term doc vectors...");
    conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
    if (mode == CROSS_LINGUAL_F) {
        conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);

        // translate term doc vectors into English.
        conf.setBoolean("Ivory.Normalize", true);
        new BuildTranslatedTermDocVectors(conf).run();
    } else {
        conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);

        // get weighted term doc vectors
        conf.setBoolean("Ivory.Normalize", true);
        new BuildWeightedTermDocVectors(conf).run();
    }
    LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted integer doc vectors...");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    if (mode == MONO_LINGUAL) {
        new BuildIntDocVectors(conf).run();
        new BuildWeightedIntDocVectors(conf).run();
        LOG.info("Job BuildWeightedIntDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
        BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool =
                new BuildTargetLangWeightedIntDocVectors(conf);
        LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        int finalNumDocs = weightedIntVectorsTool.run();
        if (finalNumDocs > 0) {
            LOG.info("Changed doc count from " + env.readCollectionDocumentCount() + " to = " + finalNumDocs);
            env.writeCollectionDocumentCount(finalNumDocs);
        }
        // set Property.CollectionTermCount to the size of the target vocab. since all docs are translated
        // into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures.
        Vocab engVocabH = null;
        try {
            engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
        LOG.info("Changed term count to : " + env.readCollectionTermCount() + " = " + engVocabH.size());
        env.writeCollectionTermCount(engVocabH.size());
    }
    LOG.info("Preprocessing job finished in "
            + (System.currentTimeMillis() - preprocessStartTime) / 1000.0 + " seconds");

    return 0;
}
From source file:ivory.core.preprocess.BuildTermDocVectors.java
License:Apache License
@SuppressWarnings("unchecked") public int runTool() throws Exception { Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get(Constants.IndexPath); String collectionName = conf.get(Constants.CollectionName); String collectionPath = conf.get(Constants.CollectionPath); String inputFormat = conf.get(Constants.InputFormat); String tokenizer = conf.get(Constants.Tokenizer); String mappingClass = conf.get(Constants.DocnoMappingClass); int docnoOffset = conf.getInt(Constants.DocnoOffset, 0); int numReducers = conf.getInt(Constants.TermDocVectorSegments, 0); LOG.info("PowerTool: " + BuildTermDocVectors.class.getCanonicalName()); LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath)); LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName)); LOG.info(String.format(" - %s: %s", Constants.CollectionPath, collectionPath)); LOG.info(String.format(" - %s: %s", Constants.InputFormat, inputFormat)); LOG.info(String.format(" - %s: %s", Constants.Tokenizer, tokenizer)); LOG.info(String.format(" - %s: %s", Constants.DocnoMappingClass, mappingClass)); LOG.info(String.format(" - %s: %s", Constants.DocnoOffset, docnoOffset)); LOG.info(String.format(" - %s: %s", Constants.TermDocVectorSegments, numReducers)); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); Path mappingFile = env.getDocnoMappingData(); if (!fs.exists(mappingFile)) { LOG.error("Error, docno mapping data file " + mappingFile + " doesn't exist!"); return 0; }//from www. j a v a 2s .c o m DistributedCache.addCacheFile(mappingFile.toUri(), conf); Path outputPath = new Path(env.getTermDocVectorsDirectory()); if (fs.exists(outputPath)) { LOG.info("TermDocVectors already exist: Skipping!"); return 0; } env.writeCollectionName(collectionName); env.writeCollectionPath(collectionPath); env.writeInputFormat(inputFormat); env.writeDocnoMappingClass(mappingClass); env.writeTokenizerClass(tokenizer); env.writeDocnoOffset(docnoOffset); conf.set("mapred.child.java.opts", "-Xmx2048m"); Job job1 = new Job(conf, BuildTermDocVectors.class.getSimpleName() + ":" + collectionName); job1.setJarByClass(BuildTermDocVectors.class); job1.setNumReduceTasks(numReducers); FileInputFormat.addInputPaths(job1, collectionPath); FileOutputFormat.setOutputPath(job1, outputPath); SequenceFileOutputFormat.setOutputCompressionType(job1, SequenceFile.CompressionType.RECORD); job1.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat)); job1.setOutputFormatClass(SequenceFileOutputFormat.class); job1.setMapOutputKeyClass(IntWritable.class); job1.setMapOutputValueClass(LazyTermDocVector.class); job1.setOutputKeyClass(IntWritable.class); job1.setOutputValueClass(LazyTermDocVector.class); job1.setMapperClass(MyMapper.class); long startTime = System.currentTimeMillis(); job1.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); // Write out number of postings. 
int collectionDocCount = (int) job1.getCounters().findCounter(Docs.Total).getValue(); env.writeCollectionDocumentCount(collectionDocCount); Path dlFile = env.getDoclengthsData(); if (fs.exists(dlFile)) { LOG.info("DocLength data exists: Skipping!"); return 0; } conf.setInt(Constants.CollectionDocumentCount, collectionDocCount); conf.set(InputPath, env.getDoclengthsDirectory().toString()); conf.set(DocLengthDataFile, dlFile.toString()); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); LOG.info("Writing doc length data to " + dlFile + "..."); Job job2 = new Job(conf, "DocLengthTable:" + collectionName); job2.setJarByClass(BuildTermDocVectors.class); job2.setNumReduceTasks(0); job2.setInputFormatClass(NullInputFormat.class); job2.setOutputFormatClass(NullOutputFormat.class); job2.setMapperClass(DocLengthDataWriterMapper.class); startTime = System.currentTimeMillis(); job2.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); long collectionSumOfDocLengths = job2.getCounters().findCounter(DocLengths.SumOfDocLengths).getValue(); env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount); return 0; }
From source file:ivory.preprocess.BuildTermDocVectors2.java
License:Apache License
@SuppressWarnings("unchecked") public int runTool() throws Exception { Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get(Constants.IndexPath); String collectionName = conf.get(Constants.CollectionName); String collectionPath = conf.get(Constants.CollectionPath); String inputFormat = conf.get(Constants.InputFormat); String tokenizer = conf.get(Constants.Tokenizer); String mappingClass = conf.get(Constants.DocnoMappingClass); int docnoOffset = conf.getInt(Constants.DocnoOffset, 0); LOG.info("PowerTool: BuildTermDocVectors2"); LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath)); LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName)); LOG.info(String.format(" - %s: %s", Constants.CollectionPath, collectionPath)); LOG.info(String.format(" - %s: %s", Constants.InputFormat, inputFormat)); LOG.info(String.format(" - %s: %s", Constants.Tokenizer, tokenizer)); LOG.info(String.format(" - %s: %s", Constants.DocnoMappingClass, mappingClass)); LOG.info(String.format(" - %s: %s", Constants.DocnoOffset, docnoOffset)); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); Path mappingFile = env.getDocnoMappingData(); if (!fs.exists(mappingFile)) { LOG.error("Error, docno mapping data file " + mappingFile + "doesn't exist!"); return 0; }/* ww w. j a v a 2s. c o m*/ DistributedCache.addCacheFile(mappingFile.toUri(), conf); Path outputPath = new Path(env.getTermDocVectorsDirectory()); if (fs.exists(outputPath)) { LOG.info("TermDocVectors already exist: Skipping!"); return 0; } env.writeCollectionName(collectionName); env.writeCollectionPath(collectionPath); env.writeInputFormat(inputFormat); env.writeDocnoMappingClass(mappingClass); env.writeTokenizerClass(tokenizer); env.writeDocnoOffset(docnoOffset); Job job1 = new Job(conf, "BuildTermDocVectors2:" + collectionName); job1.setJarByClass(BuildTermDocVectors2.class); job1.setNumReduceTasks(0); FileInputFormat.addInputPaths(job1, collectionPath); FileOutputFormat.setOutputPath(job1, outputPath); SequenceFileOutputFormat.setOutputCompressionType(job1, SequenceFile.CompressionType.RECORD); job1.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName(inputFormat)); job1.setOutputFormatClass(SequenceFileOutputFormat.class); job1.setMapOutputKeyClass(IntWritable.class); job1.setMapOutputValueClass(LazyTermDocVector.class); job1.setOutputKeyClass(IntWritable.class); job1.setOutputValueClass(LazyTermDocVector.class); job1.setMapperClass(MyMapper.class); long startTime = System.currentTimeMillis(); job1.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); // write out number of postings int collectionDocCount = (int) job1.getCounters().findCounter(Docs.Total).getValue(); env.writeCollectionDocumentCount(collectionDocCount); Path dlFile = env.getDoclengthsData(); if (fs.exists(dlFile)) { LOG.info("DocLength data exists: Skipping!"); return 0; } conf.setInt(Constants.CollectionDocumentCount, collectionDocCount); conf.set(InputPath, env.getDoclengthsDirectory().toString()); conf.set(DocLengthDataFile, dlFile.toString()); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); LOG.info("Writing doc length data to " + dlFile + "..."); Job job2 = new Job(conf, "DocLengthTable2:" + collectionName); job2.setJarByClass(BuildTermDocVectors2.class); job2.setNumReduceTasks(0); job2.setInputFormatClass(NullInputFormat.class); job2.setOutputFormatClass(NullOutputFormat.class); job2.setMapperClass(DocLengthDataWriterMapper.class); startTime = System.currentTimeMillis(); job2.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); long collectionSumOfDocLengths = job2.getCounters().findCounter(DocLengths.SumOfDocLengths).getValue(); env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount); return 0; }
From source file:jobs.MatrixBlockMult.java
License:Apache License
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    conf.setFloat("SCALAR", Float.parseFloat(args[3]));
    conf.setBoolean("LTRANS", Boolean.parseBoolean(args[4]));
    conf.setBoolean("RTRANS", Boolean.parseBoolean(args[5]));
    conf.setInt("NRL", Integer.parseInt(args[6]));
    conf.setInt("NCL", Integer.parseInt(args[7]));
    conf.setInt("NRR", Integer.parseInt(args[8]));
    conf.setInt("NCR", Integer.parseInt(args[9]));

    //set # of reducers
    conf.setInt("mapred.reduce.tasks", Integer.parseInt(args[10]));

    //Get optional blocksize parameters
    if (args.length >= 12)
        conf.setInt("SRL", Integer.parseInt(args[11]));
    if (args.length >= 13)
        conf.setInt("SCL", Integer.parseInt(args[12]));
    if (args.length >= 14)
        conf.setInt("SRR", Integer.parseInt(args[13]));
    if (args.length >= 15)
        conf.setInt("SCR", Integer.parseInt(args[14]));

    conf.set("LEFTNAME", args[0]);
    conf.set("RIGHTNAME", args[1]);
    conf.set("RESNAME", args[2]);

    //heap space - should be entered with the -D format and not dealt with by the program.
    conf.set("mapred.map.child.java.opts", "-Xmx3G");
    conf.set("mapred.reduce.child.java.opts", "-Xmx3G");

    //job
    Job job1 = new Job(conf, "MatrixBlockMult");
    job1.setJarByClass(MatrixBlockMult.class);

    // Map
    FileInputFormat.addInputPath(job1, new Path(args[0]));
    FileInputFormat.addInputPath(job1, new Path(args[1]));
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(BlockMultiplicationGroupingMapper.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(MatrixBlock.class);

    //Reduce
    job1.setReducerClass(MatrixBlockMultReducer.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(MatrixBlock.class);
    FileOutputFormat.setOutputPath(job1, new Path(args[2]));
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    //job1.setOutputFormatClass(TextOutputFormat.class);

    return job1.waitForCompletion(false) ? 0 : 1;
}
From source file:jobs.MatrixBlockTraceMult.java
License:Apache License
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    conf.setFloat("SCALAR", Float.parseFloat(args[3]));
    conf.setBoolean("LTRANS", Boolean.parseBoolean(args[4]));
    conf.setBoolean("RTRANS", Boolean.parseBoolean(args[5]));

    //set # of reducers
    conf.setInt("mapred.reduce.tasks", Integer.parseInt(args[6]));

    //Get optional blocksize parameters
    if (args.length >= 8)
        conf.setInt("SRL", Integer.parseInt(args[7]));
    if (args.length >= 9)
        conf.setInt("SCL", Integer.parseInt(args[8]));
    if (args.length >= 10)
        conf.setInt("SRR", Integer.parseInt(args[9]));
    if (args.length >= 11)
        conf.setInt("SCR", Integer.parseInt(args[10]));

    conf.set("LEFTNAME", args[0]);
    conf.set("RIGHTNAME", args[1]);

    //heap space - should be entered with the -D format and not dealt with by the program.
    conf.set("mapred.map.child.java.opts", "-Xmx3G");
    conf.set("mapred.reduce.child.java.opts", "-Xmx3G");

    //job
    Job job1 = new Job(conf, "MatrixBlockTraceMult");
    job1.setJarByClass(MatrixBlockMult.class);

    // Map
    FileInputFormat.addInputPath(job1, new Path(args[0]));
    FileInputFormat.addInputPath(job1, new Path(args[1]));
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(SquareBlockTraceMultiplicationGroupingMapper.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(MatrixBlock.class);

    //Reduce
    job1.setReducerClass(SquareMatrixBlockTraceMultReducer.class);
    job1.setOutputKeyClass(NullWritable.class);
    job1.setOutputValueClass(DoubleWritable.class);
    FileOutputFormat.setOutputPath(job1, new Path(args[2]));
    job1.setOutputFormatClass(TextOutputFormat.class);

    return job1.waitForCompletion(false) ? 0 : 1;
}