List of usage examples for org.apache.hadoop.conf.Configuration.setInt

public void setInt(String name, int value)

Set the value of the name property to an int.
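A minimal, self-contained sketch of the call before the longer examples below (the property name "my.example.num.threads" is made up purely for illustration): setInt stores the value under the given key as a string, and getInt reads it back, falling back to the supplied default when the property is unset.

import org.apache.hadoop.conf.Configuration;

public class SetIntExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Store an int-valued property; Configuration keeps it internally as a string.
        conf.setInt("my.example.num.threads", 4);
        // Read it back; the second argument is the default returned if the key is missing.
        int threads = conf.getInt("my.example.num.threads", 1);
        System.out.println(threads); // prints 4
    }
}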
From source file:ivory.driver.PreprocessTREC.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];
    int numMappers = Integer.parseInt(args[2]);
    int numReducers = Integer.parseInt(args[3]);

    sLogger.info("Tool name: PreprocessTREC");
    sLogger.info(" - Collection path: " + collection);
    sLogger.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        sLogger.info("index directory doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-numbered integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        sLogger.info("docno-mapping.dat doesn't exist, creating...");
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString(),
                new Integer(numMappers).toString() };
        NumberTrecDocuments tool = new NumberTrecDocuments();
        tool.setConf(conf);
        tool.run(arr);
        fs.delete(mappingDir, true);
    }

    // Now we're ready to start the preprocessing pipeline... set appropriate properties.
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);

    conf.set("Ivory.CollectionName", "TREC_vol45");
    conf.set("Ivory.CollectionPath", collection);
    conf.set("Ivory.IndexPath", indexRootPath);
    conf.set("Ivory.InputFormat", "edu.umd.cloud9.collection.trec.TrecDocumentInputFormat");
    conf.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer");
    conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.trec.TrecDocnoMapping");
    conf.set("Ivory.DocnoMappingFile", env.getDocnoMappingData().toString());

    conf.setInt("Ivory.DocnoOffset", 0); // docnos start at 1
    conf.setInt("Ivory.MinDf", 2); // toss away singleton terms
    conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
    conf.setInt("Ivory.TermIndexWindow", 8);

    new BuildTermDocVectors(conf).run();
    new GetTermCount(conf).run();
    new BuildTermIdMap(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}
From source file:ivory.driver.PreprocessWt10g.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];
    int numMappers = Integer.parseInt(args[2]);
    int numReducers = Integer.parseInt(args[3]);

    sLogger.info("Tool name: PreprocessWt10g");
    sLogger.info(" - Collection path: " + collection);
    sLogger.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        sLogger.info("index directory doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-numbered integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        sLogger.info("docno-mapping.dat doesn't exist, creating...");
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString(),
                new Integer(numMappers).toString() };
        NumberTrecWebDocuments tool = new NumberTrecWebDocuments();
        tool.setConf(conf);
        tool.run(arr);
        fs.delete(mappingDir, true);
    }

    // Now we're ready to start the preprocessing pipeline... set appropriate properties.
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);

    conf.set("Ivory.CollectionName", "Wt10g");
    conf.set("Ivory.CollectionPath", collection);
    conf.set("Ivory.IndexPath", indexRootPath);
    conf.set("Ivory.InputFormat", "org.apache.hadoop.mapred.SequenceFileInputFormat");
    conf.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer");
    conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.trecweb.Wt10gDocnoMapping");
    conf.set("Ivory.DocnoMappingFile", mappingFile.toString());

    conf.setInt("Ivory.DocnoOffset", 0); // docnos start at 1
    conf.setInt("Ivory.MinDf", 10);
    conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
    conf.setInt("Ivory.TermIndexWindow", 8);

    new BuildTermDocVectors(conf).run();
    new GetTermCount(conf).run();
    new BuildTermIdMap(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}
From source file:ivory.preprocess.BuildTermDocVectors2.java
License:Apache License
@SuppressWarnings("unchecked")
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    String collectionName = conf.get(Constants.CollectionName);
    String collectionPath = conf.get(Constants.CollectionPath);
    String inputFormat = conf.get(Constants.InputFormat);
    String tokenizer = conf.get(Constants.Tokenizer);
    String mappingClass = conf.get(Constants.DocnoMappingClass);
    int docnoOffset = conf.getInt(Constants.DocnoOffset, 0);

    LOG.info("PowerTool: BuildTermDocVectors2");
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionPath, collectionPath));
    LOG.info(String.format(" - %s: %s", Constants.InputFormat, inputFormat));
    LOG.info(String.format(" - %s: %s", Constants.Tokenizer, tokenizer));
    LOG.info(String.format(" - %s: %s", Constants.DocnoMappingClass, mappingClass));
    LOG.info(String.format(" - %s: %s", Constants.DocnoOffset, docnoOffset));

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    Path mappingFile = env.getDocnoMappingData();
    if (!fs.exists(mappingFile)) {
        LOG.error("Error, docno mapping data file " + mappingFile + " doesn't exist!");
        return 0;
    }

    DistributedCache.addCacheFile(mappingFile.toUri(), conf);

    Path outputPath = new Path(env.getTermDocVectorsDirectory());
    if (fs.exists(outputPath)) {
        LOG.info("TermDocVectors already exist: Skipping!");
        return 0;
    }

    env.writeCollectionName(collectionName);
    env.writeCollectionPath(collectionPath);
    env.writeInputFormat(inputFormat);
    env.writeDocnoMappingClass(mappingClass);
    env.writeTokenizerClass(tokenizer);
    env.writeDocnoOffset(docnoOffset);

    Job job1 = new Job(conf, "BuildTermDocVectors2:" + collectionName);
    job1.setJarByClass(BuildTermDocVectors2.class);
    job1.setNumReduceTasks(0);

    FileInputFormat.addInputPaths(job1, collectionPath);
    FileOutputFormat.setOutputPath(job1, outputPath);
    SequenceFileOutputFormat.setOutputCompressionType(job1, SequenceFile.CompressionType.RECORD);

    job1.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat));
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);

    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(LazyTermDocVector.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(LazyTermDocVector.class);

    job1.setMapperClass(MyMapper.class);

    long startTime = System.currentTimeMillis();
    job1.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // write out number of postings
    int collectionDocCount = (int) job1.getCounters().findCounter(Docs.Total).getValue();
    env.writeCollectionDocumentCount(collectionDocCount);

    Path dlFile = env.getDoclengthsData();
    if (fs.exists(dlFile)) {
        LOG.info("DocLength data exists: Skipping!");
        return 0;
    }

    conf.setInt(Constants.CollectionDocumentCount, collectionDocCount);
    conf.set(InputPath, env.getDoclengthsDirectory().toString());
    conf.set(DocLengthDataFile, dlFile.toString());

    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    LOG.info("Writing doc length data to " + dlFile + "...");

    Job job2 = new Job(conf, "DocLengthTable2:" + collectionName);
    job2.setJarByClass(BuildTermDocVectors2.class);
    job2.setNumReduceTasks(0);
    job2.setInputFormatClass(NullInputFormat.class);
    job2.setOutputFormatClass(NullOutputFormat.class);
    job2.setMapperClass(DocLengthDataWriterMapper.class);

    startTime = System.currentTimeMillis();
    job2.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    long collectionSumOfDocLengths = job2.getCounters().findCounter(DocLengths.SumOfDocLengths).getValue();
    env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount);

    return 0;
}
From source file:ivory.preprocess.BuildTermIdMap2.java
License:Apache License
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    String collectionName = conf.get(Constants.CollectionName);

    LOG.info("PowerTool: BuildTermIdMap2");
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    if (!fs.exists(new Path(indexPath))) {
        LOG.error("index path doesn't exist: skipping!");
        return 0;
    }

    Path termsFilePath = new Path(env.getIndexTermsData());
    Path termIDsFilePath = new Path(env.getIndexTermIdsData());
    Path idToTermFilePath = new Path(env.getIndexTermIdMappingData());
    Path dfByTermFilePath = new Path(env.getDfByTermData());
    Path cfByTermFilePath = new Path(env.getCfByTermData());
    Path dfByIntFilePath = new Path(env.getDfByIntData());
    Path cfByIntFilePath = new Path(env.getCfByIntData());

    if (fs.exists(termsFilePath) || fs.exists(termIDsFilePath) || fs.exists(idToTermFilePath)
            || fs.exists(dfByTermFilePath) || fs.exists(cfByTermFilePath) || fs.exists(dfByIntFilePath)
            || fs.exists(cfByIntFilePath)) {
        LOG.info("term and term id data exist: skipping!");
        return 0;
    }

    conf.setInt(Constants.CollectionTermCount, (int) env.readCollectionTermCount());

    Path tmpPath = new Path(env.getTempDirectory());
    fs.delete(tmpPath, true);

    Job job = new Job(conf, "BuildTermIdMap2:" + collectionName);
    job.setJarByClass(BuildTermIdMap2.class);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(env.getTermDfCfDirectory()));
    FileOutputFormat.setOutputPath(job, tmpPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    fs.delete(tmpPath, true);

    return 0;
}
From source file:ivory.ptc.driver.BuildAnchorTextInvertedIndex.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 5) {
        printUsage();
        return -1;
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Command line arguments
    String inPath = args[0];
    String outPath = args[1];
    int numReducers = Integer.parseInt(args[2]);
    String weightingSchemeClass = args[3];
    String weightingSchemeParameters = args[4];
    int numMappers = 1;

    Path inputPath = new Path(inPath);
    if (!fs.exists(inputPath)) {
        LOG.warn("Input webgraph doesn't exist...");
        return -1;
    }

    conf.set("Ivory.InputPath", inPath);
    conf.set("Ivory.OutputPath", outPath);
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);
    conf.set("Ivory.WeightingScheme", weightingSchemeClass);
    conf.set("Ivory.WeightingSchemeParameters", weightingSchemeParameters);

    AnchorTextInvertedIndex indexTool = new AnchorTextInvertedIndex(conf);
    indexTool.run();
    return 0;
}
From source file:ivory.pwsim.RunPCP.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length < 6) {
        printUsage();
        return -1;
    }

    String indexPath = args[0];
    int numMappers = Integer.parseInt(args[1]);
    int numReducers = Integer.parseInt(args[2]);

    Configuration config = new Configuration();
    config.setInt("Ivory.NumMapTasks", numMappers);
    config.setInt("Ivory.NumReduceTasks", numReducers);

    int dfCut = Integer.parseInt(args[3]);
    int blockSize = Integer.parseInt(args[4]);

    String scoringModel = args[5];
    String fn = args[5];
    int i = scoringModel.lastIndexOf(".");
    if (i >= 0)
        fn = scoringModel.substring(i + 1);

    int topN = -1;
    if (args.length == 7)
        topN = Integer.parseInt(args[6]);

    config.set("Ivory.IndexPath", indexPath);
    config.set("Ivory.OutputPath", indexPath + "/pcp-dfCut=" + dfCut + "-blk=" + blockSize + "-" + fn
            + (topN > 0 ? "-topN=" + topN : ""));
    config.set("Ivory.ScoringModel", scoringModel);

    config.setInt("Ivory.DfCut", dfCut);
    config.setInt("Ivory.BlockSize", blockSize);
    config.setInt("Ivory.TopN", topN);

    PCP pwsimTask = new PCP(config);
    pwsimTask.run();

    return 0;
}
From source file:jobs.CreateUniformDoublyStochastic.java
License:Apache License
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    int N = Integer.parseInt(args[2]);
    conf.setInt("N", N);
    int sR = Integer.parseInt(args[3]);
    conf.setInt("SR", sR);
    int sC = Integer.parseInt(args[4]);
    conf.setInt("SC", sC);
    String delim = args[5];
    conf.set("DELIM", delim);

    conf.setInt("mapred.reduce.tasks", Integer.parseInt(args[6]));
    conf.set("RESNAME", args[1]);

    // heap space - should be entered with the -D format and not dealt with by the program.
    conf.set("mapred.map.child.java.opts", "-Xmx3G");
    conf.set("mapred.reduce.child.java.opts", "-Xmx3G");

    // Create the file that we are mapping: open the file in HDFS
    Path outFile = new Path(args[0]);
    FileSystem fs = FileSystem.get(conf);
    FSDataOutputStream out = fs.create(outFile);

    // write out an entry for the block
    int nR = N / sR + (N % sR > 0 ? 1 : 0);
    int nC = N / sC + (N % sC > 0 ? 1 : 0);
    for (int r = 0; r < nR; r++) {
        for (int c = 0; c < nC; c++) {
            out.writeUTF(String.valueOf(r) + delim + String.valueOf(c) + "\n");
        }
    }

    // close file
    out.close();

    // job
    Job job1 = new Job(conf, "CreateUniformDoubleStochastic");
    job1.setJarByClass(CreateUniformDoublyStochastic.class);

    // Map
    FileInputFormat.addInputPath(job1, outFile);
    job1.setInputFormatClass(TextInputFormat.class);
    job1.setMapperClass(UniformDoublyStochasticMapper.class);

    // Reduce
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(MatrixBlock.class);
    FileOutputFormat.setOutputPath(job1, new Path(args[1]));
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    //job1.setOutputFormatClass(TextOutputFormat.class);

    return job1.waitForCompletion(false) ? 0 : 1;
}
From source file:jobs.EdgeListToMatrixBlock.java
License:Apache License
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    // get params
    conf.setInt("SR", Integer.parseInt(args[2]));
    conf.setInt("SC", Integer.parseInt(args[3]));
    conf.set("DELIM", args[4]);
    conf.setInt("ONE", Integer.parseInt(args[5]));

    // set # of reducers
    conf.setInt("mapred.reduce.tasks", Integer.parseInt(args[6]));
    conf.set("RESNAME", args[1]);

    // heap space - this should be configurable
    conf.set("mapred.map.child.java.opts", "-Xmx3G");
    conf.set("mapred.reduce.child.java.opts", "-Xmx3G");

    // job
    Job job1 = new Job(conf, "EdgeListToMatrixBlock");
    job1.setJarByClass(EdgeListToMatrixBlock.class);

    // Map
    FileInputFormat.setInputPaths(job1, new Path(args[0]));
    job1.setInputFormatClass(TextInputFormat.class);
    job1.setMapperClass(EdgeListBlockEntryMapper.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(BlockEntry.class);

    // Reduce
    job1.setReducerClass(BlockEntryMatrixBlockReducer.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(MatrixBlock.class);
    FileOutputFormat.setOutputPath(job1, new Path(args[1]));
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    //job1.setOutputFormatClass(TextOutputFormat.class);

    return job1.waitForCompletion(false) ? 0 : 1;
}
From source file:jobs.MatrixBlockAdd.java
License:Apache License
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    conf.setFloat("ALPHA", Float.parseFloat(args[3]));
    conf.setFloat("BETA", Float.parseFloat(args[4]));
    conf.setInt("mapred.reduce.tasks", Integer.parseInt(args[5]));

    if (args.length >= 7)
        conf.setInt("SR", Integer.parseInt(args[6]));
    if (args.length >= 8)
        conf.setInt("SC", Integer.parseInt(args[7]));

    conf.set("LEFTNAME", args[0]);
    conf.set("RESNAME", args[2]);

    // heap space - again - should be passed with the -D option
    conf.set("mapred.map.child.java.opts", "-Xmx3G");
    conf.set("mapred.reduce.child.java.opts", "-Xmx3G");

    // job
    Job job1 = new Job(conf, "MatrixBlockAdd");
    job1.setJarByClass(MatrixBlockAdd.class);

    // No Map
    FileInputFormat.addInputPath(job1, new Path(args[0]));
    FileInputFormat.addInputPath(job1, new Path(args[1]));
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(NoNameMapper.class);

    // Reduce
    job1.setReducerClass(MatrixBlockAddReducer.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(MatrixBlock.class);
    FileOutputFormat.setOutputPath(job1, new Path(args[2]));
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    //job1.setOutputFormatClass(TextOutputFormat.class);

    return job1.waitForCompletion(false) ? 0 : 1;
}
From source file:jobs.MatrixBlockMult.java
License:Apache License
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    conf.setFloat("SCALAR", Float.parseFloat(args[3]));
    conf.setBoolean("LTRANS", Boolean.parseBoolean(args[4]));
    conf.setBoolean("RTRANS", Boolean.parseBoolean(args[5]));
    conf.setInt("NRL", Integer.parseInt(args[6]));
    conf.setInt("NCL", Integer.parseInt(args[7]));
    conf.setInt("NRR", Integer.parseInt(args[8]));
    conf.setInt("NCR", Integer.parseInt(args[9]));

    // set # of reducers
    conf.setInt("mapred.reduce.tasks", Integer.parseInt(args[10]));

    // Get optional blocksize parameters
    if (args.length >= 12)
        conf.setInt("SRL", Integer.parseInt(args[11]));
    if (args.length >= 13)
        conf.setInt("SCL", Integer.parseInt(args[12]));
    if (args.length >= 14)
        conf.setInt("SRR", Integer.parseInt(args[13]));
    if (args.length >= 15)
        conf.setInt("SCR", Integer.parseInt(args[14]));

    conf.set("LEFTNAME", args[0]);
    conf.set("RIGHTNAME", args[1]);
    conf.set("RESNAME", args[2]);

    // heap space - should be entered with the -D format and not dealt with by the program.
    conf.set("mapred.map.child.java.opts", "-Xmx3G");
    conf.set("mapred.reduce.child.java.opts", "-Xmx3G");

    // job
    Job job1 = new Job(conf, "MatrixBlockMult");
    job1.setJarByClass(MatrixBlockMult.class);

    // Map
    FileInputFormat.addInputPath(job1, new Path(args[0]));
    FileInputFormat.addInputPath(job1, new Path(args[1]));
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(BlockMultiplicationGroupingMapper.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(MatrixBlock.class);

    // Reduce
    job1.setReducerClass(MatrixBlockMultReducer.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(MatrixBlock.class);
    FileOutputFormat.setOutputPath(job1, new Path(args[2]));
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    //job1.setOutputFormatClass(TextOutputFormat.class);

    return job1.waitForCompletion(false) ? 0 : 1;
}