List of usage examples for org.apache.hadoop.conf Configuration setInt
public void setInt(String name, int value)
Set the value of the name property to an int.
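Before the project examples below, here is a minimal self-contained sketch of the setInt/getInt round trip (the property names are arbitrary illustrations, not real Hadoop keys):

import org.apache.hadoop.conf.Configuration;

public class SetIntExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Store an int under an application-defined property name.
    conf.setInt("my.app.max.retries", 5);
    // Read it back; the second argument is the default returned when the key is absent.
    int retries = conf.getInt("my.app.max.retries", 1); // 5
    int missing = conf.getInt("my.app.unset.key", 42);  // key never set: falls back to 42
    System.out.println(retries + " " + missing);
  }
}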
From source file:ivory.core.index.BuildLPInvertedIndexDocSorted.java
License:Apache License
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();
    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);
    int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
    int collectionDocCount = env.readCollectionDocumentCount();
    String postingsType = conf.get(Constants.PostingsListsType,
        PostingsListDocSortedPositional.class.getCanonicalName());
    @SuppressWarnings("unchecked")
    Class<? extends PostingsList> postingsClass = (Class<? extends PostingsList>) Class.forName(postingsType);

    // These are the default values for the LP algorithm.
    float mapMemoryThreshold = conf.getFloat(Constants.IndexingMapMemoryThreshold, 0.9f);
    float reduceMemoryThreshold = conf.getFloat(Constants.IndexingReduceMemoryThreshold, 0.9f);
    int maxHeap = conf.getInt(Constants.MaxHeap, 2048);
    int maxNDocsBeforeFlush = conf.getInt(Constants.MaxNDocsBeforeFlush, 50000);

    LOG.info("PowerTool: " + BuildLPInvertedIndexDocSorted.class.getSimpleName());
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionDocumentCount, collectionDocCount));
    LOG.info(String.format(" - %s: %s", Constants.PostingsListsType, postingsClass.getCanonicalName()));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));
    LOG.info(String.format(" - %s: %s", Constants.MinSplitSize, minSplitSize));
    LOG.info(String.format(" - %s: %s", Constants.IndexingMapMemoryThreshold, mapMemoryThreshold));
    LOG.info(String.format(" - %s: %s", Constants.IndexingReduceMemoryThreshold, reduceMemoryThreshold));
    LOG.info(String.format(" - %s: %s", Constants.MaxHeap, maxHeap));
    LOG.info(String.format(" - %s: %s", Constants.MaxNDocsBeforeFlush, maxNDocsBeforeFlush));

    if (!fs.exists(new Path(indexPath))) {
      fs.mkdirs(new Path(indexPath));
    }

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
      LOG.info("Postings already exist: no indexing will be performed.");
      return 0;
    }

    conf.setInt(Constants.CollectionDocumentCount, collectionDocCount);
    conf.setInt("mapred.min.split.size", minSplitSize);
    //conf.set("mapred.child.java.opts", "-Xmx" + maxHeap + "m");
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");

    Job job = Job.getInstance(conf,
        BuildLPInvertedIndexDocSorted.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(BuildLPInvertedIndexDocSorted.class);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, postingsPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PostingsListDocSortedPositional.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PostingsListDocSortedPositional.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    env.writePostingsType("ivory.data.PostingsListDocSortedPositional");
    return 0;
}
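A side note on the split-size line above: "mapred.min.split.size" is the pre-Hadoop-2 property name. A minimal sketch of the non-deprecated equivalent, assuming the new-style mapreduce API (the job name and split size are arbitrary illustrations):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class MinSplitSizeSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "min-split-size-sketch");
    // Sets mapreduce.input.fileinputformat.split.minsize, the Hadoop 2 name
    // for mapred.min.split.size used in the example above.
    FileInputFormat.setMinInputSplitSize(job, 512L * 1024 * 1024); // 512 MB, arbitrary
    System.out.println(job.getConfiguration().get("mapreduce.input.fileinputformat.split.minsize"));
  }
}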
From source file:ivory.core.index.MergeGlobalStatsAcrossIndexSegments.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    if (args.length < 4) {
      System.err.println("Usage: [collection-name] [output-path] [df-threshold] [index1] [index2] ...");
      System.exit(-1);
    }

    String collectionName = args[0];
    String outputPath = args[1];
    int dfThreshold = Integer.parseInt(args[2]);

    LOG.info("Merging global statistics across index segments...");
    LOG.info(" CollectionName: " + collectionName);
    LOG.info(" OutputPath: " + outputPath);
    LOG.info(" DfThreshold: " + dfThreshold);
    LOG.info(" IndexPaths: ");

    StringBuffer sb = new StringBuffer();
    for (int i = 3; i < args.length; i++) {
      LOG.info("   Adding " + args[i]);
      sb.append(args[i]);
      if (i != args.length - 1)
        sb.append(",");
    }

    conf.set("Ivory.CollectionName", collectionName);
    conf.set("Ivory.IndexPaths", sb.toString());
    conf.set("Ivory.DataOutputPath", outputPath);
    conf.setInt("Ivory.DfThreshold", dfThreshold);

    new MergeGlobalStatsAcrossIndexSegments(conf).run();
}
From source file:ivory.core.preprocess.BuildDictionary.java
License:Apache License
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    String collectionName = conf.get(Constants.CollectionName);

    LOG.info("PowerTool: " + BuildDictionary.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    if (!fs.exists(new Path(indexPath))) {
      LOG.error("index path doesn't exist: skipping!");
      return 0;
    }

    if (fs.exists(new Path(env.getIndexTermsData())) && fs.exists(new Path(env.getIndexTermIdsData()))
        && fs.exists(new Path(env.getIndexTermIdMappingData())) && fs.exists(new Path(env.getDfByTermData()))
        && fs.exists(new Path(env.getCfByTermData())) && fs.exists(new Path(env.getDfByIntData()))
        && fs.exists(new Path(env.getCfByIntData()))) {
      LOG.info("term and term id data exist: skipping!");
      return 0;
    }

    conf.setInt(Constants.CollectionTermCount, (int) env.readCollectionTermCount());
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    Path tmpPath = new Path(env.getTempDirectory());
    fs.delete(tmpPath, true);

    Job job = new Job(conf, BuildDictionary.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(BuildDictionary.class);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(env.getTermDfCfDirectory()));
    FileOutputFormat.setOutputPath(job, tmpPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setSortComparatorClass(DictionaryTransformationStrategy.WritableComparator.class);
    job.setMapperClass(Mapper.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    fs.delete(tmpPath, true);
    return 0;
}
From source file:ivory.core.preprocess.BuildTermDocVectors.java
License:Apache License
@SuppressWarnings("unchecked") public int runTool() throws Exception { Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get(Constants.IndexPath); String collectionName = conf.get(Constants.CollectionName); String collectionPath = conf.get(Constants.CollectionPath); String inputFormat = conf.get(Constants.InputFormat); String tokenizer = conf.get(Constants.Tokenizer); String mappingClass = conf.get(Constants.DocnoMappingClass); int docnoOffset = conf.getInt(Constants.DocnoOffset, 0); int numReducers = conf.getInt(Constants.TermDocVectorSegments, 0); LOG.info("PowerTool: " + BuildTermDocVectors.class.getCanonicalName()); LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath)); LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName)); LOG.info(String.format(" - %s: %s", Constants.CollectionPath, collectionPath)); LOG.info(String.format(" - %s: %s", Constants.InputFormat, inputFormat)); LOG.info(String.format(" - %s: %s", Constants.Tokenizer, tokenizer)); LOG.info(String.format(" - %s: %s", Constants.DocnoMappingClass, mappingClass)); LOG.info(String.format(" - %s: %s", Constants.DocnoOffset, docnoOffset)); LOG.info(String.format(" - %s: %s", Constants.TermDocVectorSegments, numReducers)); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); Path mappingFile = env.getDocnoMappingData(); if (!fs.exists(mappingFile)) { LOG.error("Error, docno mapping data file " + mappingFile + " doesn't exist!"); return 0; }//from ww w .j av a 2 s. c om DistributedCache.addCacheFile(mappingFile.toUri(), conf); Path outputPath = new Path(env.getTermDocVectorsDirectory()); if (fs.exists(outputPath)) { LOG.info("TermDocVectors already exist: Skipping!"); return 0; } env.writeCollectionName(collectionName); env.writeCollectionPath(collectionPath); env.writeInputFormat(inputFormat); env.writeDocnoMappingClass(mappingClass); env.writeTokenizerClass(tokenizer); env.writeDocnoOffset(docnoOffset); conf.set("mapred.child.java.opts", "-Xmx2048m"); Job job1 = new Job(conf, BuildTermDocVectors.class.getSimpleName() + ":" + collectionName); job1.setJarByClass(BuildTermDocVectors.class); job1.setNumReduceTasks(numReducers); FileInputFormat.addInputPaths(job1, collectionPath); FileOutputFormat.setOutputPath(job1, outputPath); SequenceFileOutputFormat.setOutputCompressionType(job1, SequenceFile.CompressionType.RECORD); job1.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat)); job1.setOutputFormatClass(SequenceFileOutputFormat.class); job1.setMapOutputKeyClass(IntWritable.class); job1.setMapOutputValueClass(LazyTermDocVector.class); job1.setOutputKeyClass(IntWritable.class); job1.setOutputValueClass(LazyTermDocVector.class); job1.setMapperClass(MyMapper.class); long startTime = System.currentTimeMillis(); job1.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); // Write out number of postings. 
int collectionDocCount = (int) job1.getCounters().findCounter(Docs.Total).getValue(); env.writeCollectionDocumentCount(collectionDocCount); Path dlFile = env.getDoclengthsData(); if (fs.exists(dlFile)) { LOG.info("DocLength data exists: Skipping!"); return 0; } conf.setInt(Constants.CollectionDocumentCount, collectionDocCount); conf.set(InputPath, env.getDoclengthsDirectory().toString()); conf.set(DocLengthDataFile, dlFile.toString()); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); LOG.info("Writing doc length data to " + dlFile + "..."); Job job2 = new Job(conf, "DocLengthTable:" + collectionName); job2.setJarByClass(BuildTermDocVectors.class); job2.setNumReduceTasks(0); job2.setInputFormatClass(NullInputFormat.class); job2.setOutputFormatClass(NullOutputFormat.class); job2.setMapperClass(DocLengthDataWriterMapper.class); startTime = System.currentTimeMillis(); job2.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); long collectionSumOfDocLengths = job2.getCounters().findCounter(DocLengths.SumOfDocLengths).getValue(); env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount); return 0; }
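A note on the speculative-execution switches used above: "mapred.map.tasks.speculative.execution" and "mapred.reduce.tasks.speculative.execution" are the old-style names; on Hadoop 2+ the same switches are spelled "mapreduce.map.speculative" and "mapreduce.reduce.speculative" (the old names are, to my understanding, still honored as deprecated aliases). A minimal sketch:

import org.apache.hadoop.conf.Configuration;

public class SpeculativeExecutionSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Hadoop 2+ property names; disabling speculation matters when map tasks
    // have side effects, like the doc-length writer job in the example above.
    conf.setBoolean("mapreduce.map.speculative", false);
    conf.setBoolean("mapreduce.reduce.speculative", false);
    System.out.println(conf.getBoolean("mapreduce.map.speculative", true)); // prints false
  }
}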
From source file:ivory.driver.BuildIPIndex.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
      printUsage();
      return -1;
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = args[0];
    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
      sLogger.warn("Index path doesn't exist...");
      return -1;
    }

    int numMappers = Integer.parseInt(args[1]);
    int numReducers = Integer.parseInt(args[2]);

    sLogger.info("Tool name: BuildIPIndex");
    sLogger.info(" - Index path: " + indexPath);

    conf.set("Ivory.IndexPath", indexPath);
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);

    BuildIPInvertedIndexDocSorted indexTool = new BuildIPInvertedIndexDocSorted(conf);
    indexTool.run();

    BuildIntPostingsForwardIndex postingsIndexer = new BuildIntPostingsForwardIndex(conf);
    postingsIndexer.run();

    return 0;
}
From source file:ivory.driver.PreprocessClueWebEnglish.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 5) {
      printUsage();
      return -1;
    }

    String collection = args[0];
    String indexPath = args[1];
    int segment = Integer.parseInt(args[2]);
    int numMappers = Integer.parseInt(args[3]);
    int numReducers = Integer.parseInt(args[4]);

    sLogger.info("Tool name: BuildIndexClueWebEnglish");
    sLogger.info(" - Collection path: " + collection);
    sLogger.info(" - Index path: " + indexPath);
    sLogger.info(" - Segment number: " + segment);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
      sLogger.error("Error: index path doesn't exist!");
      return 0;
    }

    if (!fs.exists(env.getDocnoMappingData())) {
      sLogger.error("Error: docno mapping data doesn't exist!");
      return 0;
    }

    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);

    conf.set("Ivory.CollectionName", "ClueWeb:English:Segment" + segment);
    conf.set("Ivory.CollectionPath", collection);
    conf.set("Ivory.IndexPath", indexPath);
    conf.set("Ivory.InputFormat", "org.apache.hadoop.mapred.SequenceFileInputFormat");
    conf.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer");
    conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.clue.ClueWarcDocnoMapping");
    conf.set("Ivory.DocnoMappingFile", env.getDocnoMappingData().toString());

    conf.setInt("Ivory.DocnoOffset", DocnoOffsets[segment]);
    conf.setInt("Ivory.MinDf", 10);
    conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
    conf.setInt("Ivory.TermIndexWindow", 8);

    new BuildTermDocVectors(conf).run();
    new GetTermCount(conf).run();
    new BuildTermIdMap(conf).run();
    new BuildIntDocVectors(conf).run();
    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}
From source file:ivory.driver.PreprocessClueWebEnglish2.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
      printUsage();
      return -1;
    }

    String collection = args[0];
    String indexPath = args[1];
    int segment = Integer.parseInt(args[2]);

    sLogger.info("Tool name: BuildTermDocVectorTest2");
    sLogger.info(" - Collection path: " + collection);
    sLogger.info(" - Index path: " + indexPath);
    sLogger.info(" - Segment number: " + segment);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
      sLogger.error("Error: index path doesn't exist!");
      return 0;
    }

    if (!fs.exists(env.getDocnoMappingData())) {
      sLogger.error("Error: docno mapping data doesn't exist!");
      return 0;
    }

    conf.setInt(Constants.NumReduceTasks, 200);
    conf.set(Constants.CollectionName, "ClueWeb:English:Segment" + segment);
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat,
        org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.Tokenizer, ivory.tokenize.GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass,
        edu.umd.cloud9.collection.clue.ClueWarcDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

    conf.setInt(Constants.DocnoOffset, DocnoOffsets[segment]);
    conf.setInt(Constants.MinDf, 10);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

    new BuildTermDocVectors2(conf).run();
    new GetTermCount2(conf).run();
    new BuildTermIdMap2(conf).run();
    new BuildIntDocVectors2(conf).run();

    return 0;
}
From source file:ivory.driver.PreprocessClueWebEnglishMultipleSegments.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 4) {
      System.out.println("usage: [index-path] [num-of-mappers] [num-of-reducers] [input-path]...");
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
    }

    String indexPath = args[0];
    int numMappers = Integer.parseInt(args[1]);
    int numReducers = Integer.parseInt(args[2]);

    StringBuilder sb = new StringBuilder(args[3]);
    if (args.length > 4) {
      for (int i = 4; i < args.length; i++) {
        sb.append(",");
        sb.append(args[i]);
      }
    }
    String collection = sb.toString();

    LOG.info("Tool name: PreprocessClueWebEnglishMultipleSegments");
    LOG.info(" - Index path: " + indexPath);
    LOG.info(" - Collections: " + collection);

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
      LOG.error("Error: index path doesn't exist!");
      return 0;
    }

    if (!fs.exists(env.getDocnoMappingData())) {
      LOG.error("Error: docno mapping data doesn't exist!");
      return 0;
    }

    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);

    conf.set(Constants.CollectionName, "ClueWeb:English");
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat, org.apache.hadoop.mapred.SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.Tokenizer, ivory.tokenize.GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass,
        edu.umd.cloud9.collection.clue.ClueWarcDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

    conf.setInt(Constants.DocnoOffset, 0);
    conf.setInt(Constants.MinDf, 50);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

    new BuildTermDocVectors(conf).run();
    new GetTermCount(conf).run();
    new BuildTermIdMap(conf).run();
    new BuildIntDocVectors(conf).run();
    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}
From source file:ivory.driver.PreprocessGov2.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
      printUsage();
      return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];
    int numMappers = Integer.parseInt(args[2]);
    int numReducers = Integer.parseInt(args[3]);

    sLogger.info("Tool name: PreprocessGov2");
    sLogger.info(" - Collection path: " + collection);
    sLogger.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
      sLogger.info("index directory doesn't exist, creating...");
      fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-numbered integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
      sLogger.info("docno-mapping.dat doesn't exist, creating...");
      String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString(),
          Integer.toString(numMappers) };
      NumberTrecWebDocuments tool = new NumberTrecWebDocuments();
      tool.setConf(conf);
      tool.run(arr);

      fs.delete(mappingDir, true);
    }

    // Now we're ready to start the preprocessing pipeline... set appropriate properties.
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);

    conf.set("Ivory.CollectionName", "Gov2");
    conf.set("Ivory.CollectionPath", collection);
    conf.set("Ivory.IndexPath", indexRootPath);
    conf.set("Ivory.InputFormat", "org.apache.hadoop.mapred.SequenceFileInputFormat");
    conf.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer");
    conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.trecweb.Gov2DocnoMapping");
    conf.set("Ivory.DocnoMappingFile", mappingFile.toString());

    conf.setInt("Ivory.DocnoOffset", 0); // docnos start at 1
    conf.setInt("Ivory.MinDf", 10);
    conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
    conf.setInt("Ivory.TermIndexWindow", 8);

    new BuildTermDocVectors(conf).run();
    new GetTermCount(conf).run();
    new BuildTermIdMap(conf).run();
    new BuildIntDocVectors(conf).run();
    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}
From source file:ivory.driver.PreprocessMedline.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
      printUsage();
      return -1;
    }

    String collection = args[0];
    String indexPath = args[1];
    int numMappers = Integer.parseInt(args[2]);
    int numReducers = Integer.parseInt(args[3]);

    sLogger.info("Tool name: ProcessMedline");
    sLogger.info(" - Collection path: " + collection);
    sLogger.info(" - Index path: " + indexPath);

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
      sLogger.info("index path doesn't exist, creating...");
      fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-numbered integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();

    if (!fs.exists(mappingFile)) {
      sLogger.info(mappingFile + " doesn't exist, creating...");
      String[] arr = new String[] { collection, indexPath + "/medline-docid-tmp", mappingFile.toString(),
          Integer.toString(numMappers) };
      NumberMedlineCitations tool = new NumberMedlineCitations();
      tool.setConf(conf);
      tool.run(arr);

      fs.delete(new Path(indexPath + "/medline-docid-tmp"), true);
    }

    // Now we're ready to start the preprocessing pipeline... set appropriate properties.
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);

    conf.set("Ivory.CollectionName", "Medline");
    conf.set("Ivory.CollectionPath", collection);
    conf.set("Ivory.IndexPath", indexPath);
    conf.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer");
    conf.set("Ivory.InputFormat", "edu.umd.cloud9.collection.medline.MedlineCitationInputFormat");
    conf.set("Ivory.DocnoMappingFile", indexPath + "docno.mapping");
    conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.medline.MedlineDocnoMapping");

    conf.setInt("Ivory.DocnoOffset", 0); // docnos start at 1
    conf.setInt("Ivory.MinDf", 2); // toss away singleton terms
    conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
    conf.setInt("Ivory.TermIndexWindow", 8);

    new BuildTermDocVectors(conf).run();
    new GetTermCount(conf).run();
    new BuildTermIdMap(conf).run();
    new BuildIntDocVectors(conf).run();
    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}