Example usage for org.apache.hadoop.conf Configuration setInt

Introduction

On this page you can find example usages of the setInt method of org.apache.hadoop.conf.Configuration.

Prototype

public void setInt(String name, int value)

Source Link

Document

Sets the value of the property called name to the given int value.

Usage

From source file:ivory.core.index.BuildLPInvertedIndexDocSorted.java

License:Apache License

/**
 * Builds the document-sorted positional inverted index for the index located at the path
 * stored under {@code Constants.IndexPath}.
 *
 * <p>The method reads its settings from the tool configuration, logs them, creates the index
 * directory if it is missing, and submits one MapReduce job that reads the int doc vectors
 * directory and writes to the postings directory. If the postings directory already exists,
 * no job is run.
 *
 * @return 0 if the postings already exist or the indexing job succeeded; -1 if the indexing
 *         job failed
 * @throws Exception if the configured postings class cannot be loaded, or if a file system or
 *         job submission error occurs
 */
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();

    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);
    int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
    int collectionDocCount = env.readCollectionDocumentCount();

    // NOTE(review): postingsClass is resolved (so a bad class name fails fast) and logged, but
    // the job below always uses PostingsListDocSortedPositional as its value class.
    String postingsType = conf.get(Constants.PostingsListsType,
            PostingsListDocSortedPositional.class.getCanonicalName());
    @SuppressWarnings("unchecked")
    Class<? extends PostingsList> postingsClass = (Class<? extends PostingsList>) Class.forName(postingsType);

    // These are the default values for the LP algorithm.
    float mapMemoryThreshold = conf.getFloat(Constants.IndexingMapMemoryThreshold, 0.9f);
    float reduceMemoryThreshold = conf.getFloat(Constants.IndexingReduceMemoryThreshold, 0.9f);
    // NOTE(review): maxHeap is only logged; the memory settings further down are hard-coded
    // to 2048 MB regardless of this value.
    int maxHeap = conf.getInt(Constants.MaxHeap, 2048);
    int maxNDocsBeforeFlush = conf.getInt(Constants.MaxNDocsBeforeFlush, 50000);

    LOG.info("PowerTool: " + BuildLPInvertedIndexDocSorted.class.getSimpleName());
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionDocumentCount, collectionDocCount));
    LOG.info(String.format(" - %s: %s", Constants.PostingsListsType, postingsClass.getCanonicalName()));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));
    LOG.info(String.format(" - %s: %s", Constants.MinSplitSize, minSplitSize));
    LOG.info(String.format(" - %s: %s", Constants.IndexingMapMemoryThreshold, mapMemoryThreshold));
    LOG.info(String.format(" - %s: %s", Constants.IndexingReduceMemoryThreshold, reduceMemoryThreshold));
    LOG.info(String.format(" - %s: %s", Constants.MaxHeap, maxHeap));
    LOG.info(String.format(" - %s: %s", Constants.MaxNDocsBeforeFlush, maxNDocsBeforeFlush));

    Path indexDir = new Path(indexPath);
    if (!fs.exists(indexDir)) {
        fs.mkdirs(indexDir);
    }

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;
    }

    // Pass the document count to the job through the configuration.
    conf.setInt(Constants.CollectionDocumentCount, collectionDocCount);

    conf.setInt("mapred.min.split.size", minSplitSize);
    //conf.set("mapred.child.java.opts", "-Xmx" + maxHeap + "m");
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");

    Job job = Job.getInstance(conf, BuildLPInvertedIndexDocSorted.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(BuildLPInvertedIndexDocSorted.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, postingsPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PostingsListDocSortedPositional.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PostingsListDocSortedPositional.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    boolean succeeded = job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    if (!succeeded) {
        // Do not record a postings type for an index whose job did not complete successfully.
        LOG.error("Indexing job failed: postings type not written.");
        return -1;
    }

    env.writePostingsType("ivory.data.PostingsListDocSortedPositional");

    return 0;
}

From source file:ivory.core.index.MergeGlobalStatsAcrossIndexSegments.java

License:Apache License

/**
 * Command-line entry point that merges global statistics across several index segments.
 *
 * <p>Expected arguments: {@code [collection-name] [output-path] [df-threshold] [index1]
 * [index2] ...}; at least one index path is required. The index paths are joined with commas
 * and passed, together with the other arguments, to {@code MergeGlobalStatsAcrossIndexSegments}
 * through a fresh {@code Configuration}.
 *
 * @param args command-line arguments as described above
 * @throws Exception if {@code df-threshold} is not a valid integer or the merge fails
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    if (args.length < 4) {
        System.err.println("Usage: [collection-name] [output-path] [df-threshold] [index1] [index2] ...");
        System.exit(-1);
    }

    String collectionName = args[0];
    String outputPath = args[1];
    int dfThreshold = Integer.parseInt(args[2]);

    LOG.info("Merging global statistics across index segments...");
    LOG.info(" CollectionName: " + collectionName);
    LOG.info(" OutputPath: " + outputPath);
    LOG.info(" DfThreshold: " + dfThreshold);

    LOG.info(" IndexPaths: ");
    // Single-threaded use: StringBuilder avoids StringBuffer's needless synchronization.
    StringBuilder indexPaths = new StringBuilder();
    for (int i = 3; i < args.length; i++) {
        // Separate the word from the path so the log line stays readable.
        LOG.info("    Adding " + args[i]);
        if (i > 3) {
            indexPaths.append(',');
        }
        indexPaths.append(args[i]);
    }

    conf.set("Ivory.CollectionName", collectionName);
    conf.set("Ivory.IndexPaths", indexPaths.toString());
    conf.set("Ivory.DataOutputPath", outputPath);
    conf.setInt("Ivory.DfThreshold", dfThreshold);

    new MergeGlobalStatsAcrossIndexSegments(conf).run();
}

From source file:ivory.core.preprocess.BuildDictionary.java

License:Apache License

/**
 * Runs the dictionary-building job for the index found under {@code Constants.IndexPath}.
 *
 * <p>Nothing is done when the index path is missing or when all seven term / term-id / df / cf
 * data files already exist. Otherwise a single-reducer job is run over the term df/cf
 * directory, using the environment's temp directory as job output; that directory is deleted
 * both before and after the job.
 *
 * @return always 0, including when the tool decides to skip
 * @throws Exception on file system or job errors
 */
public int runTool() throws Exception {
    final Configuration config = getConf();
    final FileSystem fileSystem = FileSystem.get(config);

    final String indexRoot = config.get(Constants.IndexPath);
    final String collection = config.get(Constants.CollectionName);

    LOG.info("PowerTool: " + BuildDictionary.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collection));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexRoot));

    final RetrievalEnvironment environment = new RetrievalEnvironment(indexRoot, fileSystem);

    final boolean indexRootPresent = fileSystem.exists(new Path(indexRoot));
    if (!indexRootPresent) {
        LOG.error("index path doesn't existing: skipping!");
        return 0;
    }

    // Each step is only evaluated while all earlier files were found, like a chained &&.
    boolean dictionaryComplete = fileSystem.exists(new Path(environment.getIndexTermsData()));
    dictionaryComplete = dictionaryComplete
            && fileSystem.exists(new Path(environment.getIndexTermIdsData()));
    dictionaryComplete = dictionaryComplete
            && fileSystem.exists(new Path(environment.getIndexTermIdMappingData()));
    dictionaryComplete = dictionaryComplete
            && fileSystem.exists(new Path(environment.getDfByTermData()));
    dictionaryComplete = dictionaryComplete
            && fileSystem.exists(new Path(environment.getCfByTermData()));
    dictionaryComplete = dictionaryComplete
            && fileSystem.exists(new Path(environment.getDfByIntData()));
    dictionaryComplete = dictionaryComplete
            && fileSystem.exists(new Path(environment.getCfByIntData()));
    if (dictionaryComplete) {
        LOG.info("term and term id data exist: skipping!");
        return 0;
    }

    config.setInt(Constants.CollectionTermCount, (int) environment.readCollectionTermCount());
    config.set("mapred.child.java.opts", "-Xmx2048m");

    // Start from a clean scratch directory.
    final Path scratchDir = new Path(environment.getTempDirectory());
    fileSystem.delete(scratchDir, true);

    final String jobName = BuildDictionary.class.getSimpleName() + ":" + collection;
    final Job dictionaryJob = new Job(config, jobName);
    dictionaryJob.setJarByClass(BuildDictionary.class);
    dictionaryJob.setNumReduceTasks(1);

    dictionaryJob.setMapperClass(Mapper.class);
    dictionaryJob.setReducerClass(MyReducer.class);

    dictionaryJob.setInputFormatClass(SequenceFileInputFormat.class);
    dictionaryJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(dictionaryJob, new Path(environment.getTermDfCfDirectory()));
    FileOutputFormat.setOutputPath(dictionaryJob, scratchDir);

    dictionaryJob.setMapOutputKeyClass(Text.class);
    dictionaryJob.setMapOutputValueClass(PairOfIntLong.class);
    dictionaryJob.setOutputKeyClass(Text.class);
    dictionaryJob.setSortComparatorClass(DictionaryTransformationStrategy.WritableComparator.class);

    final long begin = System.currentTimeMillis();
    dictionaryJob.waitForCompletion(true);
    final double elapsedSeconds = (System.currentTimeMillis() - begin) / 1000.0;
    LOG.info("Job Finished in " + elapsedSeconds + " seconds");

    // The job output directory is only scratch space; remove it again.
    fileSystem.delete(scratchDir, true);

    return 0;
}

From source file:ivory.core.preprocess.BuildTermDocVectors.java

License:Apache License

/**
 * Builds term doc vectors for a collection and then writes the document-length data.
 *
 * <p>Two jobs are run in sequence on the same {@code Configuration} object:
 * <ol>
 *   <li>a job over the collection path that writes {@code LazyTermDocVector} values to the
 *       term doc vectors directory; its {@code Docs.Total} counter is stored as the collection
 *       document count;</li>
 *   <li>a zero-reducer job using {@code DocLengthDataWriterMapper} with null input/output
 *       formats; its {@code DocLengths.SumOfDocLengths} counter is divided by the document
 *       count and stored as the average document length.</li>
 * </ol>
 *
 * <p>The method returns early (still with 0) when the docno mapping file is missing, when the
 * term doc vectors directory already exists, or, after the first job, when the doc-length
 * data file already exists.
 *
 * @return always 0
 * @throws Exception if the configured input format class cannot be loaded, or on file system
 *         or job errors
 */
@SuppressWarnings("unchecked")
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Settings supplied by the driver through the configuration.
    String indexPath = conf.get(Constants.IndexPath);
    String collectionName = conf.get(Constants.CollectionName);
    String collectionPath = conf.get(Constants.CollectionPath);
    String inputFormat = conf.get(Constants.InputFormat);
    String tokenizer = conf.get(Constants.Tokenizer);
    String mappingClass = conf.get(Constants.DocnoMappingClass);
    int docnoOffset = conf.getInt(Constants.DocnoOffset, 0);
    int numReducers = conf.getInt(Constants.TermDocVectorSegments, 0);

    LOG.info("PowerTool: " + BuildTermDocVectors.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionPath, collectionPath));
    LOG.info(String.format(" - %s: %s", Constants.InputFormat, inputFormat));
    LOG.info(String.format(" - %s: %s", Constants.Tokenizer, tokenizer));
    LOG.info(String.format(" - %s: %s", Constants.DocnoMappingClass, mappingClass));
    LOG.info(String.format(" - %s: %s", Constants.DocnoOffset, docnoOffset));
    LOG.info(String.format(" - %s: %s", Constants.TermDocVectorSegments, numReducers));

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    Path mappingFile = env.getDocnoMappingData();

    // The docno mapping must already exist; note that this error path still returns 0.
    if (!fs.exists(mappingFile)) {
        LOG.error("Error, docno mapping data file " + mappingFile + " doesn't exist!");
        return 0;
    }

    // Registered on conf before job1 is created from it.
    DistributedCache.addCacheFile(mappingFile.toUri(), conf);

    Path outputPath = new Path(env.getTermDocVectorsDirectory());
    if (fs.exists(outputPath)) {
        LOG.info("TermDocVectors already exist: Skipping!");
        return 0;
    }

    // Persist the collection settings through the retrieval environment.
    env.writeCollectionName(collectionName);
    env.writeCollectionPath(collectionPath);
    env.writeInputFormat(inputFormat);
    env.writeDocnoMappingClass(mappingClass);
    env.writeTokenizerClass(tokenizer);
    env.writeDocnoOffset(docnoOffset);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    // ---- Job 1: build the term doc vectors. ----
    Job job1 = new Job(conf, BuildTermDocVectors.class.getSimpleName() + ":" + collectionName);
    job1.setJarByClass(BuildTermDocVectors.class);

    job1.setNumReduceTasks(numReducers);

    FileInputFormat.addInputPaths(job1, collectionPath);
    FileOutputFormat.setOutputPath(job1, outputPath);
    SequenceFileOutputFormat.setOutputCompressionType(job1, SequenceFile.CompressionType.RECORD);

    // Unchecked cast: the input format class is only known by name at runtime.
    job1.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat));
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);

    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(LazyTermDocVector.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(LazyTermDocVector.class);

    // Only a mapper is configured; no reducer class is set for this job.
    job1.setMapperClass(MyMapper.class);

    long startTime = System.currentTimeMillis();
    // NOTE(review): the boolean result of waitForCompletion is ignored here and for job2.
    job1.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Record the number of documents reported by job1's Docs.Total counter
    // (the long counter value is narrowed to int).
    int collectionDocCount = (int) job1.getCounters().findCounter(Docs.Total).getValue();
    env.writeCollectionDocumentCount(collectionDocCount);

    Path dlFile = env.getDoclengthsData();
    if (fs.exists(dlFile)) {
        LOG.info("DocLength data exists: Skipping!");
        return 0;
    }

    // ---- Job 2: write the document-length table. ----
    // These settings are applied to the same conf object that job1 was created from.
    conf.setInt(Constants.CollectionDocumentCount, collectionDocCount);
    conf.set(InputPath, env.getDoclengthsDirectory().toString());
    conf.set(DocLengthDataFile, dlFile.toString());

    // Same value as already set before job1; speculative execution is switched off for job2.
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    LOG.info("Writing doc length data to " + dlFile + "...");

    Job job2 = new Job(conf, "DocLengthTable:" + collectionName);
    job2.setJarByClass(BuildTermDocVectors.class);

    // No reducers and null input/output formats: the mapper does all the work itself.
    job2.setNumReduceTasks(0);
    job2.setInputFormatClass(NullInputFormat.class);
    job2.setOutputFormatClass(NullOutputFormat.class);
    job2.setMapperClass(DocLengthDataWriterMapper.class);

    startTime = System.currentTimeMillis();
    job2.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Average document length = sum of document lengths / document count.
    // The division is done in float, so a zero document count yields NaN or Infinity
    // instead of throwing.
    long collectionSumOfDocLengths = job2.getCounters().findCounter(DocLengths.SumOfDocLengths).getValue();
    env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount);

    return 0;
}

From source file:ivory.driver.BuildIPIndex.java

License:Apache License

/**
 * Builds the inverted index and the postings forward index for an existing index directory.
 *
 * @param args exactly three values: index path, number of mappers, number of reducers
 * @return -1 if the argument count is wrong or the index path does not exist; 0 otherwise
 * @throws Exception if a numeric argument cannot be parsed or an indexing tool fails
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    final Configuration config = getConf();
    final FileSystem fileSystem = FileSystem.get(config);

    final String indexRoot = args[0];

    // The index directory must have been created by an earlier preprocessing step.
    final Path indexDir = new Path(indexRoot);
    if (!fileSystem.exists(indexDir)) {
        sLogger.warn("Index path doesn't exist...");
        return -1;
    }

    final int mapTasks = Integer.parseInt(args[1]);
    final int reduceTasks = Integer.parseInt(args[2]);

    sLogger.info("Tool name: BuildIPIndex");
    sLogger.info(" - Index path: " + indexRoot);

    config.set("Ivory.IndexPath", indexRoot);
    config.setInt("Ivory.NumMapTasks", mapTasks);
    config.setInt("Ivory.NumReduceTasks", reduceTasks);

    // Postings first, then the forward index over them.
    new BuildIPInvertedIndexDocSorted(config).run();
    new BuildIntPostingsForwardIndex(config).run();

    return 0;
}

From source file:ivory.driver.PreprocessClueWebEnglish.java

License:Apache License

/**
 * Runs this tool./*from w w  w. ja  v  a2  s.com*/
 */
public int run(String[] args) throws Exception {
    if (args.length != 5) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexPath = args[1];
    int segment = Integer.parseInt(args[2]);
    int numMappers = Integer.parseInt(args[3]);
    int numReducers = Integer.parseInt(args[4]);

    sLogger.info("Tool name: BuildIndexClueWebEnglish");
    sLogger.info(" - Collection path: " + collection);
    sLogger.info(" - Index path: " + indexPath);
    sLogger.info(" - segement number: " + segment);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        sLogger.error("Error: index path doesn't exist!");
        return 0;
    }

    if (!fs.exists(env.getDocnoMappingData())) {
        sLogger.error("Error: docno mapping data doesn't exist!");
        return 0;
    }

    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);

    conf.set("Ivory.CollectionName", "ClueWeb:English:Segment" + segment);
    conf.set("Ivory.CollectionPath", collection);
    conf.set("Ivory.IndexPath", indexPath);
    conf.set("Ivory.InputFormat", "org.apache.hadoop.mapred.SequenceFileInputFormat");
    conf.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer");
    conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.clue.ClueWarcDocnoMapping");
    conf.set("Ivory.DocnoMappingFile", env.getDocnoMappingData().toString());

    conf.setInt("Ivory.DocnoOffset", DocnoOffsets[segment]);
    conf.setInt("Ivory.MinDf", 10);
    conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
    conf.setInt("Ivory.TermIndexWindow", 8);

    new BuildTermDocVectors(conf).run();
    new GetTermCount(conf).run();
    new BuildTermIdMap(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.driver.PreprocessClueWebEnglish2.java

License:Apache License

/**
 * Runs the second-generation preprocessing pipeline for one segment of the English ClueWeb
 * collection, using a fixed 200 reduce tasks.
 *
 * <p>The index path and its docno mapping data must already exist. The method fills in the
 * configuration properties named by {@code Constants} and then runs the "2" variants of the
 * preprocessing tools in sequence.
 *
 * @param args exactly three values: collection path, index path, segment number
 * @return -1 if the argument count is wrong or the segment number is not a valid index into
 *         {@code DocnoOffsets}; 0 otherwise (including when the index path or the docno
 *         mapping data is missing, which is only logged)
 * @throws Exception if the segment number cannot be parsed or one of the tools fails
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexPath = args[1];
    int segment = Integer.parseInt(args[2]);

    sLogger.info("Tool name: BuildTermDocVectorTest2");
    sLogger.info(" - Collection path: " + collection);
    sLogger.info(" - Index path: " + indexPath);
    sLogger.info(" - segement number: " + segment);

    // Reject an out-of-range segment up front instead of failing further down with an
    // ArrayIndexOutOfBoundsException when DocnoOffsets is indexed.
    if (segment < 0 || segment >= DocnoOffsets.length) {
        sLogger.error("Error: invalid segment number " + segment + "!");
        printUsage();
        return -1;
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        sLogger.error("Error: index path doesn't exist!");
        return 0;
    }

    if (!fs.exists(env.getDocnoMappingData())) {
        sLogger.error("Error: docno mapping data doesn't exist!");
        return 0;
    }

    conf.setInt(Constants.NumReduceTasks, 200);

    conf.set(Constants.CollectionName, "ClueWeb:English:Segment" + segment);
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat,
            org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.Tokenizer, ivory.tokenize.GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass,
            edu.umd.cloud9.collection.clue.ClueWarcDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

    // Each segment has its own docno offset.
    conf.setInt(Constants.DocnoOffset, DocnoOffsets[segment]);
    conf.setInt(Constants.MinDf, 10);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

    new BuildTermDocVectors2(conf).run();
    new GetTermCount2(conf).run();
    new BuildTermIdMap2(conf).run();
    new BuildIntDocVectors2(conf).run();

    return 0;
}

From source file:ivory.driver.PreprocessClueWebEnglishMultipleSegments.java

License:Apache License

/**
 * Runs the preprocessing pipeline over several English ClueWeb input paths at once.
 *
 * <p>All input paths (fourth argument onwards) are joined with commas into one collection
 * path. The index path and its docno mapping data must already exist.
 *
 * @param args index path, number of mappers, number of reducers, then one or more input paths
 * @return -1 if fewer than four arguments are given; 0 otherwise (including when the index
 *         path or the docno mapping data is missing, which is only logged)
 * @throws Exception if a numeric argument cannot be parsed or one of the tools fails
 */
public int run(String[] args) throws Exception {
    if (args.length < 4) {
        System.out.println("usage: [index-path] [num-of-mappers] [num-of-reducers] [input-path]...");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    final String indexRoot = args[0];
    final int mapTasks = Integer.parseInt(args[1]);
    final int reduceTasks = Integer.parseInt(args[2]);

    // Comma-join args[3..]; the loop body simply never runs when there is one input path.
    final StringBuilder joinedInputs = new StringBuilder(args[3]);
    for (int argIndex = 4; argIndex < args.length; argIndex++) {
        joinedInputs.append(",").append(args[argIndex]);
    }
    final String collectionPaths = joinedInputs.toString();

    LOG.info("Tool name: PreprocessClueWebEnglishMultipleSegments");
    LOG.info(" - Index path: " + indexRoot);
    LOG.info(" - Collections: " + collectionPaths);

    final Configuration config = new Configuration();
    final FileSystem fileSystem = FileSystem.get(config);
    final RetrievalEnvironment environment = new RetrievalEnvironment(indexRoot, fileSystem);

    final Path indexDir = new Path(indexRoot);
    if (!fileSystem.exists(indexDir)) {
        LOG.error("Error: index path doesn't exist!");
        return 0;
    }

    if (!fileSystem.exists(environment.getDocnoMappingData())) {
        LOG.error("Error: docno mapping data doesn't exist!");
        return 0;
    }

    // Task counts.
    config.setInt(Constants.NumMapTasks, mapTasks);
    config.setInt(Constants.NumReduceTasks, reduceTasks);

    // Collection description.
    config.set(Constants.CollectionName, "ClueWeb:English");
    config.set(Constants.CollectionPath, collectionPaths);
    config.set(Constants.IndexPath, indexRoot);
    config.set(Constants.InputFormat, org.apache.hadoop.mapred.SequenceFileInputFormat.class.getCanonicalName());
    config.set(Constants.Tokenizer, ivory.tokenize.GalagoTokenizer.class.getCanonicalName());
    config.set(Constants.DocnoMappingClass,
            edu.umd.cloud9.collection.clue.ClueWarcDocnoMapping.class.getCanonicalName());
    config.set(Constants.DocnoMappingFile, environment.getDocnoMappingData().toString());

    // Indexing parameters.
    config.setInt(Constants.DocnoOffset, 0);
    config.setInt(Constants.MinDf, 50);
    config.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    config.setInt(Constants.TermIndexWindow, 8);

    // Preprocessing stages, in order.
    new BuildTermDocVectors(config).run();
    new GetTermCount(config).run();
    new BuildTermIdMap(config).run();
    new BuildIntDocVectors(config).run();

    // Forward indexes over the vectors built above.
    new BuildIntDocVectorsForwardIndex(config).run();
    new BuildTermDocVectorsForwardIndex(config).run();

    return 0;
}

From source file:ivory.driver.PreprocessGov2.java

License:Apache License

/**
 * Runs the preprocessing pipeline for the Gov2 collection.
 *
 * <p>The index directory is created when missing. If the docno mapping file is absent it is
 * produced with {@code NumberTrecWebDocuments}, after which the mapping directory is deleted.
 * The method then fills in the {@code Ivory.*} configuration properties and runs the
 * preprocessing tools in sequence.
 *
 * @param args exactly four values: collection path, index path, number of mappers, number of
 *        reducers
 * @return -1 if the argument count is wrong; 0 otherwise
 * @throws Exception if a numeric argument cannot be parsed or one of the tools fails
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    final String collectionPath = args[0];
    final String indexRoot = args[1];
    final int mapTasks = Integer.parseInt(args[2]);
    final int reduceTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool name: PreprocessGov2");
    sLogger.info(" - Collection path: " + collectionPath);
    sLogger.info(" - Index path: " + indexRoot);

    final Configuration config = getConf();
    final FileSystem fileSystem = FileSystem.get(config);

    // Make sure the index directory is there before anything is written below it.
    final Path indexDir = new Path(indexRoot);
    if (!fileSystem.exists(indexDir)) {
        sLogger.info("index directory doesn't exist, creating...");
        fileSystem.mkdirs(indexDir);
    }

    final RetrievalEnvironment environment = new RetrievalEnvironment(indexRoot, fileSystem);

    // The docno mapping translates docids (strings) into docnos (sequential integers);
    // generate it on demand.
    final Path docnoMappingFile = environment.getDocnoMappingData();
    final Path docnoMappingDir = environment.getDocnoMappingDirectory();

    if (!fileSystem.exists(docnoMappingFile)) {
        sLogger.info("docno-mapping.dat doesn't exist, creating...");
        final String[] numberingArgs = {
            collectionPath,
            docnoMappingDir.toString(),
            docnoMappingFile.toString(),
            Integer.toString(mapTasks)
        };
        final NumberTrecWebDocuments numberingTool = new NumberTrecWebDocuments();
        numberingTool.setConf(config);
        numberingTool.run(numberingArgs);

        fileSystem.delete(docnoMappingDir, true);
    }

    // Everything is in place: describe the collection for the pipeline stages.
    config.setInt("Ivory.NumMapTasks", mapTasks);
    config.setInt("Ivory.NumReduceTasks", reduceTasks);

    config.set("Ivory.CollectionName", "Gov2");
    config.set("Ivory.CollectionPath", collectionPath);
    config.set("Ivory.IndexPath", indexRoot);
    config.set("Ivory.InputFormat", "org.apache.hadoop.mapred.SequenceFileInputFormat");
    config.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer");
    config.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.trecweb.Gov2DocnoMapping");
    config.set("Ivory.DocnoMappingFile", docnoMappingFile.toString());

    // With an offset of 0, docnos start at 1.
    config.setInt("Ivory.DocnoOffset", 0);
    config.setInt("Ivory.MinDf", 10);
    config.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
    config.setInt("Ivory.TermIndexWindow", 8);

    // Preprocessing stages, in order.
    new BuildTermDocVectors(config).run();
    new GetTermCount(config).run();
    new BuildTermIdMap(config).run();
    new BuildIntDocVectors(config).run();

    // Forward indexes over the vectors built above.
    new BuildIntDocVectorsForwardIndex(config).run();
    new BuildTermDocVectorsForwardIndex(config).run();

    return 0;
}

From source file:ivory.driver.PreprocessMedline.java

License:Apache License

/**
 * Runs the preprocessing pipeline for the Medline collection.
 *
 * <p>The index directory is created when missing. If the docno mapping file is absent it is
 * produced with {@code NumberMedlineCitations}, using a temporary directory under the index
 * path that is deleted afterwards. The method then fills in the {@code Ivory.*} configuration
 * properties and runs the preprocessing tools in sequence.
 *
 * @param args exactly four values: collection path, index path, number of mappers, number of
 *        reducers
 * @return -1 if the argument count is wrong; 0 otherwise
 * @throws Exception if a numeric argument cannot be parsed or one of the tools fails
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexPath = args[1];
    int numMappers = Integer.parseInt(args[2]);
    int numReducers = Integer.parseInt(args[3]);

    sLogger.info("Tool name: ProcessMedline");
    sLogger.info(" - Collection path: " + collection);
    sLogger.info(" - Index path: " + indexPath);

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        sLogger.info("index path doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-numbered integer). If it doesn't exist create it.
    Path mappingFile = env.getDocnoMappingData();
    if (!fs.exists(mappingFile)) {
        sLogger.info(mappingFile + " doesn't exist, creating...");
        // Scratch directory for the numbering tool; named once so the path handed to the
        // tool and the path deleted afterwards cannot drift apart.
        String tmpMappingDir = indexPath + "/medline-docid-tmp";
        String[] arr = new String[] { collection, tmpMappingDir, mappingFile.toString(),
                Integer.toString(numMappers) };
        NumberMedlineCitations tool = new NumberMedlineCitations();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(new Path(tmpMappingDir), true);
    }

    // Now we're ready to start the preprocessing pipeline... set
    // appropriate properties.
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);

    conf.set("Ivory.CollectionName", "Medline");
    conf.set("Ivory.CollectionPath", collection);
    conf.set("Ivory.IndexPath", indexPath);
    conf.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer");
    conf.set("Ivory.InputFormat", "edu.umd.cloud9.collection.medline.MedlineCitationInputFormat");
    // Point at the mapping file that was just checked or created above. The previous value,
    // indexPath + "docno.mapping", had no path separator and named a different file.
    conf.set("Ivory.DocnoMappingFile", mappingFile.toString());
    conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.medline.MedlineDocnoMapping");

    conf.setInt("Ivory.DocnoOffset", 0); // docnos start at 1
    conf.setInt("Ivory.MinDf", 2); // toss away singleton terms
    conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
    conf.setInt("Ivory.TermIndexWindow", 8);

    new BuildTermDocVectors(conf).run();
    new GetTermCount(conf).run();
    new BuildTermIdMap(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}