Example usage for org.apache.hadoop.conf Configuration setInt

List of usage examples for org.apache.hadoop.conf Configuration setInt

Introduction

On this page you can find example usages of org.apache.hadoop.conf.Configuration.setInt.

Prototype

public void setInt(String name, int value) 

Source Link

Document

Set the value of the `name` property to an `int`.

Usage

From source file:io.svectors.hbase.sink.HbaseTestUtil.java

License:Apache License

/**
 * Creates an {@link HBaseTestingUtility} on top of a freshly created HBase
 * configuration in which three replication-related properties are overridden.
 *
 * @return a new testing utility wrapping the customized configuration
 */
private static HBaseTestingUtility createTestingUtility() {
    final Configuration testConf = HBaseConfiguration.create();

    // Replication overrides applied on top of the default HBase configuration.
    testConf.setInt("replication.stats.thread.period.seconds", 5);
    testConf.setLong("replication.sleep.before.failover", 2000);
    testConf.setInt("replication.source.maxretriesmultiplier", 10);

    final HBaseTestingUtility utility = new HBaseTestingUtility(testConf);
    return utility;
}

From source file:it.crs4.features.GetMeta.java

License:Apache License

/**
 * Command-line entry point: configures a job that reads its input through
 * {@code NLineInputFormat}, maps it with {@code GetMetaMapper} and emits
 * {@code Text} keys and values.
 *
 * <p>After the generic Hadoop options are stripped, two positional arguments
 * are expected: the input path and the output path. The JVM exits with status
 * 2 when fewer than two are given, 0 when the job completes successfully and
 * 1 otherwise.
 *
 * @param args raw command-line arguments
 * @throws Exception if setting up or running the job fails
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] positional = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (positional.length < 2) {
        System.err.println("Usage: <prog> IN OUT");
        System.exit(2);
    }

    // The reducer-count property is written before the job is created from this configuration.
    conf.setInt(JobContext.NUM_REDUCES, 0);

    Job metaJob = Job.getInstance(conf, "get bioimg meta");
    metaJob.setInputFormatClass(NLineInputFormat.class);
    metaJob.setJarByClass(GetMeta.class);
    metaJob.setMapperClass(GetMetaMapper.class);
    metaJob.setOutputKeyClass(Text.class);
    metaJob.setOutputValueClass(Text.class);

    Path inputPath = new Path(positional[0]);
    NLineInputFormat.addInputPath(metaJob, inputPath);
    Path outputPath = new Path(positional[1]);
    FileOutputFormat.setOutputPath(metaJob, outputPath);

    boolean succeeded = metaJob.waitForCompletion(true);
    System.exit(succeeded ? 0 : 1);
}

From source file:it.crs4.seal.prq.PrqOptionParser.java

License:Open Source License

/**
 * Parses the command line and records the resulting settings in {@code conf}.
 *
 * <p>In order, this method:
 * <ol>
 *   <li>seeds {@code conf} with the default value of each tool property;</li>
 *   <li>delegates the actual parsing to the superclass;</li>
 *   <li>rejects the no-longer-supported input-format property;</li>
 *   <li>copies each deprecated property onto its replacement when the deprecated
 *       one is set and the replacement still holds its default value;</li>
 *   <li>applies the traditional-ids and number-of-reads command-line options;</li>
 *   <li>stores the number of reduce tasks.</li>
 * </ol>
 *
 * @param conf configuration to populate
 * @param args command-line arguments
 * @return the parsed command line produced by the superclass
 * @throws IOException propagated from the superclass parser
 * @throws ParseException if the unsupported input-format property is set, or if the
 *         number-of-reads option is not an integer equal to 1 or 2
 */
@Override
protected CommandLine parseOptions(Configuration conf, String[] args) throws IOException, ParseException {
    conf.setInt(MinBasesThresholdConfigName, DefaultMinBasesThreshold);
    conf.setBoolean(DropFailedFilterConfigName, DropFailedFilterDefault);
    conf.setBoolean(WarningOnlyIfUnpairedConfigName, WarningOnlyIfUnpairedDefault);
    conf.setInt(NumReadsExpectedConfigName, NumReadsExpectedDefault);

    CommandLine line = super.parseOptions(conf, args);

    /* **** handle deprecated properties **** */
    if (conf.get(PrqOptionParser.OLD_INPUT_FORMAT_CONF) != null) {
        throw new ParseException("The property " + PrqOptionParser.OLD_INPUT_FORMAT_CONF
                + " is no longer supported.\n" + "Please use the command line option --input-format instead.");
    }

    Utils.checkDeprecatedProp(conf, LOG, MinBasesThresholdConfigName_deprecated, MinBasesThresholdConfigName);
    Utils.checkDeprecatedProp(conf, LOG, DropFailedFilterConfigName_deprecated, DropFailedFilterConfigName);
    Utils.checkDeprecatedProp(conf, LOG, WarningOnlyIfUnpairedConfigName_deprecated,
            WarningOnlyIfUnpairedConfigName);

    // Let the deprecated properties override the new ones, unless the new ones have a non-default value.
    // If the new property has a non-default value, it must have been set by the user.
    // If, on the other hand, the deprecated property has a value, it must have been set by the user since
    // we're not setting them here.
    if (conf.get(MinBasesThresholdConfigName_deprecated) != null
            && conf.getInt(MinBasesThresholdConfigName, DefaultMinBasesThreshold) == DefaultMinBasesThreshold) {
        conf.setInt(MinBasesThresholdConfigName,
                conf.getInt(MinBasesThresholdConfigName_deprecated, DefaultMinBasesThreshold));
    }

    if (conf.get(DropFailedFilterConfigName_deprecated) != null && conf.getBoolean(DropFailedFilterConfigName,
            DropFailedFilterDefault) == DropFailedFilterDefault) {
        conf.setBoolean(DropFailedFilterConfigName,
                conf.getBoolean(DropFailedFilterConfigName_deprecated, DropFailedFilterDefault));
    }

    if (conf.get(WarningOnlyIfUnpairedConfigName_deprecated) != null
            && conf.getBoolean(WarningOnlyIfUnpairedConfigName,
                    WarningOnlyIfUnpairedDefault) == WarningOnlyIfUnpairedDefault) {
        conf.setBoolean(WarningOnlyIfUnpairedConfigName,
                conf.getBoolean(WarningOnlyIfUnpairedConfigName_deprecated, WarningOnlyIfUnpairedDefault));
    }

    /* **** end handle deprecated properties **** */

    if (line.hasOption(opt_traditionalIds.getOpt())) {
        conf.setBoolean(PairReadsQSeq.PRQ_CONF_TRADITIONAL_IDS, true);
    }

    if (line.hasOption(opt_numReads.getOpt())) {
        final int numReads;
        try {
            // parseInt avoids the needless boxing of Integer.valueOf.
            numReads = Integer.parseInt(line.getOptionValue(opt_numReads.getOpt()));
        } catch (NumberFormatException e) {
            // Keep the original message but attach the cause so the stack trace is not lost.
            ParseException wrapped = new ParseException(e.getMessage());
            wrapped.initCause(e);
            throw wrapped;
        }

        // Zero is rejected too, so the message must say "> 0" rather than ">= 0".
        if (numReads <= 0) {
            throw new ParseException("Number of reads per fragment must be > 0 (got " + numReads + ")");
        }
        if (numReads > 2) {
            throw new ParseException(
                    "Working with more than two reads per template is not supported at the moment.\n"
                            + "If you're interested in seeing this feature implemented contact the Seal developers.");
        }
        conf.setInt(NumReadsExpectedConfigName, numReads);
    }

    // set number of reduce tasks to use
    conf.set(ClusterUtils.NUM_RED_TASKS_PROPERTY, String.valueOf(getNReduceTasks()));
    return line;
}

From source file:ivory.app.BuildIndex.java

License:Apache License

/**
 * Builds an inverted index of the kind selected on the command line.
 *
 * <p>The index path option is required; the number of index partitions is
 * optional and is 64 when absent. Exactly one of the three mode flags is acted
 * upon, checked in this order: positional (IP), positional (LP),
 * non-positional (IP). When none is given, a message is logged and nothing is built.
 *
 * @param args command-line arguments
 * @return -1 if the arguments cannot be parsed or the index path is missing;
 *         0 otherwise, including when no mode flag was supplied
 * @throws Exception if one of the index-building tools throws
 */
@SuppressWarnings({ "static-access" })
@Override
public int run(String[] args) throws Exception {
    final Options options = new Options();
    options.addOption(new Option(POSITIONAL_INDEX_IP, "build positional index (IP algorithm)"));
    options.addOption(new Option(POSITIONAL_INDEX_LP, "build positional index (LP algorithm)"));
    options.addOption(new Option(NONPOSITIONAL_INDEX_IP, "build nonpositional index (IP algorithm)"));

    final Option indexPathOption = OptionBuilder.withArgName("path").hasArg()
            .withDescription("(required) index path").create(INDEX_PATH);
    options.addOption(indexPathOption);
    final Option partitionsOption = OptionBuilder.withArgName("num").hasArg()
            .withDescription("(optional) number of index partitions: 64 default").create(INDEX_PARTITIONS);
    options.addOption(partitionsOption);

    final CommandLine commandLine;
    try {
        commandLine = new GnuParser().parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    // Without an index path there is nothing to work on: show the usage and bail out.
    if (!commandLine.hasOption(INDEX_PATH)) {
        final HelpFormatter help = new HelpFormatter();
        help.setWidth(120);
        help.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    final String indexPath = commandLine.getOptionValue(INDEX_PATH);

    int indexPartitions = 64;
    if (commandLine.hasOption(INDEX_PARTITIONS)) {
        indexPartitions = Integer.parseInt(commandLine.getOptionValue(INDEX_PARTITIONS));
    }

    final Configuration conf = getConf();

    LOG.info("Tool name: " + this.getClass().getSimpleName());
    LOG.info(String.format(" -%s %s", INDEX_PATH, indexPath));
    LOG.info(String.format(" -%s %d", INDEX_PARTITIONS, indexPartitions));

    if (commandLine.hasOption(POSITIONAL_INDEX_IP)) {
        LOG.info(String.format(" -%s", POSITIONAL_INDEX_IP));

        final String postingsType = ivory.core.data.index.PostingsListDocSortedPositional.class
                .getCanonicalName();
        conf.set(Constants.IndexPath, indexPath);
        conf.setInt(Constants.NumReduceTasks, indexPartitions);
        conf.set(Constants.PostingsListsType, postingsType);

        new BuildIPInvertedIndexDocSorted(conf).run();
        new BuildIntPostingsForwardIndex(conf).run();
    } else if (commandLine.hasOption(POSITIONAL_INDEX_LP)) {
        LOG.info(String.format(" -%s", POSITIONAL_INDEX_LP));

        final String postingsType = ivory.core.data.index.PostingsListDocSortedPositional.class
                .getCanonicalName();
        conf.set(Constants.IndexPath, indexPath);
        conf.setInt(Constants.NumReduceTasks, indexPartitions);
        conf.set(Constants.PostingsListsType, postingsType);

        // Extra settings written only for the LP algorithm.
        conf.setFloat("Ivory.IndexingMapMemoryThreshold", 0.9f);
        conf.setFloat("Ivory.IndexingReduceMemoryThreshold", 0.9f);
        conf.setInt("Ivory.MaxHeap", 2048);
        conf.setInt("Ivory.MaxNDocsBeforeFlush", 50000);

        new BuildLPInvertedIndexDocSorted(conf).run();
        new BuildIntPostingsForwardIndex(conf).run();
    } else if (commandLine.hasOption(NONPOSITIONAL_INDEX_IP)) {
        LOG.info(String.format(" -%s", NONPOSITIONAL_INDEX_IP));

        final String postingsType = ivory.core.data.index.PostingsListDocSortedNonPositional.class
                .getCanonicalName();
        conf.set(Constants.IndexPath, indexPath);
        conf.setInt(Constants.NumReduceTasks, indexPartitions);
        conf.set(Constants.PostingsListsType, postingsType);

        new BuildIPInvertedIndexDocSorted(conf).run();
        new BuildIntPostingsForwardIndex(conf).run();
    } else {
        LOG.info(String.format("Nothing to do. Specify one of the following: %s, %s, %s", POSITIONAL_INDEX_IP,
                POSITIONAL_INDEX_LP, NONPOSITIONAL_INDEX_IP));
    }

    return 0;
}

From source file:ivory.app.PreprocessClueWebEnglish.java

License:Apache License

/**
 * Runs this tool: preprocesses one segment of the English ClueWeb collection.
 *
 * <p>After validating the arguments it creates the index directory (which must
 * not exist yet), builds the docno mapping, writes the collection settings into
 * the configuration and then runs the preprocessing tools in sequence.
 *
 * @param args command-line arguments; the collection path, index path and
 *        segment options are all required
 * @return 0 on completion; -1 if the arguments cannot be parsed, a required
 *         option is missing, the segment number is outside the range of
 *         {@code DOCNO_OFFSETS}, or the index directory already exists
 * @throws Exception if the segment is not a valid integer or any of the
 *         invoked tools throws
 */
@SuppressWarnings({ "static-access" })
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path")
            .create(PreprocessCollection.COLLECTION_PATH));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) index path")
            .create(PreprocessCollection.INDEX_PATH));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("(required) segment").create(SEGMENT));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(PreprocessCollection.COLLECTION_PATH)
            || !cmdline.hasOption(PreprocessCollection.INDEX_PATH) || !cmdline.hasOption(SEGMENT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String collection = cmdline.getOptionValue(PreprocessCollection.COLLECTION_PATH);
    String indexPath = cmdline.getOptionValue(PreprocessCollection.INDEX_PATH);
    int segment = Integer.parseInt(cmdline.getOptionValue(SEGMENT));

    // The segment indexes DOCNO_OFFSETS further down. Reject an out-of-range value
    // here, before the index directory is created and the docno mapping is built,
    // instead of failing with an ArrayIndexOutOfBoundsException after that work.
    if (segment < 0 || segment >= DOCNO_OFFSETS.length) {
        System.err.println("Invalid segment " + segment + ": expected a value between 0 and "
                + (DOCNO_OFFSETS.length - 1));
        return -1;
    }

    LOG.info("Tool name: " + PreprocessClueWebEnglish.class.getSimpleName());
    LOG.info(" - collection path: " + collection);
    LOG.info(" - index path: " + indexPath);
    LOG.info(" - segement: " + segment);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory; an already existing one is treated as an error.
    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        LOG.info("index path doesn't exist, creating...");
        fs.mkdirs(p);
    } else {
        LOG.info("Index directory " + p + " already exists!");
        return -1;
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    Path mappingFile = env.getDocnoMappingData();
    new ClueWarcDocnoMappingBuilder().build(new Path(collection), mappingFile, conf);

    conf.set(Constants.CollectionName, "ClueWeb:English:Segment" + segment);
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, ClueWarcDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

    conf.setInt(Constants.DocnoOffset, DOCNO_OFFSETS[segment]);
    conf.setInt(Constants.MinDf, 10);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.app.PreprocessCollection.java

License:Apache License

/**
 * Runs this tool./*from ww w . j  a  va2 s .  co  m*/
 */
@Override
public int run(String[] args) throws Exception {
    Options options = createOptions();

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(COLLECTION_PATH) || !cmdline.hasOption(COLLECTION_NAME)
            || !cmdline.hasOption(INDEX_PATH) || !cmdline.hasOption(DOCNO_MAPPING)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String collection = cmdline.getOptionValue(COLLECTION_PATH);
    String collectionName = cmdline.getOptionValue(COLLECTION_NAME);
    String indexPath = cmdline.getOptionValue(INDEX_PATH);
    int docnoOffset = 0;

    if (cmdline.hasOption(DOCNO_OFFSET)) {
        docnoOffset = Integer.parseInt(cmdline.getOptionValue(DOCNO_OFFSET));
    }

    Class<? extends DocnoMapping> docnoMappingClass = null;
    try {
        docnoMappingClass = (Class<? extends DocnoMapping>) Class
                .forName(cmdline.getOptionValue(DOCNO_MAPPING));
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    }

    @SuppressWarnings("rawtypes")
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    if (cmdline.hasOption(INPUTFORMAT)) {
        try {
            inputFormatClass = (Class<? extends InputFormat<?, ?>>) Class
                    .forName(cmdline.getOptionValue(INPUTFORMAT));
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    Class<? extends Tokenizer> tokenizerClass = GalagoTokenizer.class;
    if (cmdline.hasOption(TOKENIZER)) {
        try {
            tokenizerClass = (Class<? extends Tokenizer>) Class.forName(cmdline.getOptionValue(TOKENIZER));
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    int minDf = 2;
    if (cmdline.hasOption(MIN_DF)) {
        minDf = Integer.parseInt(cmdline.getOptionValue(MIN_DF));
    }

    LOG.info("Tool name: " + this.getClass().getSimpleName());
    LOG.info(String.format(" -%s %s", COLLECTION_PATH, collection));
    LOG.info(String.format(" -%s %s", COLLECTION_NAME, collectionName));
    LOG.info(String.format(" -%s %s", INDEX_PATH, indexPath));
    LOG.info(String.format(" -%s %s", DOCNO_MAPPING, docnoMappingClass.getCanonicalName()));
    LOG.info(String.format(" -%s %s", INPUTFORMAT, inputFormatClass.getCanonicalName()));
    LOG.info(String.format(" -%s %s", TOKENIZER, tokenizerClass.getCanonicalName()));
    LOG.info(String.format(" -%s %d", MIN_DF, minDf));

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        LOG.info("Index directory " + p + " doesn't exist, creating.");
        fs.mkdirs(p);
    } else {
        LOG.info("Index directory " + p + " already exists!");
        return -1;
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    conf.set(Constants.CollectionName, collectionName);
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat, inputFormatClass.getCanonicalName());
    conf.set(Constants.Tokenizer, tokenizerClass.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, docnoMappingClass.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

    conf.setInt(Constants.DocnoOffset, docnoOffset);
    conf.setInt(Constants.MinDf, minDf);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);

    Path mappingFile = env.getDocnoMappingData();
    docnoMappingClass.newInstance().getBuilder().build(new Path(collection), mappingFile, conf);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.app.PreprocessTrecForeign.java

License:Apache License

/**
 * Runs this tool: builds the TREC docno mapping for the configured collection,
 * writes the indexing settings into the configuration, and then runs the
 * preprocessing tools one after another.
 *
 * @param args command-line arguments, turned into a configuration by {@code parseArgs}
 * @return always 0 when the method completes normally
 * @throws Exception if argument parsing or any of the invoked tools throws
 */
public int run(String[] args) throws Exception {
    final Configuration conf = parseArgs(args);
    final FileSystem fileSystem = FileSystem.get(conf);
    final String indexRoot = conf.get(Constants.IndexPath);
    final String collectionPath = conf.get(Constants.CollectionPath);

    final RetrievalEnvironment env = new RetrievalEnvironment(indexRoot, fileSystem);

    // The mapping is built before any of the settings below are written.
    final Path docnoMappingPath = env.getDocnoMappingData();
    final TrecDocnoMappingBuilder mappingBuilder = new TrecDocnoMappingBuilder();
    mappingBuilder.build(new Path(collectionPath), docnoMappingPath, conf);

    conf.set(Constants.DocnoMappingClass, TrecDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());
    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.MinDf, 2); // toss away singleton terms
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);
    conf.set(Constants.InputFormat, TrecDocumentInputFormat.class.getCanonicalName());

    // Term-level preprocessing stages.
    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    // Forward indexes over the vectors built above.
    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.app.PreprocessWikipedia.java

License:Apache License

/**
 * Runs this tool./* www. j  av a2 s  .  c  o m*/
 */
public int run(String[] args) throws Exception {
    if (parseArgs(args) < 0) {
        printUsage();
        return -1;
    }
    Configuration conf = getConf();

    conf.set(Constants.Language, collectionLang);
    conf.setBoolean(Constants.Stemming, true); // default behavior of tokenizer is currently to stem, but we shouldnt rely on that

    if (tokenizerModel != null) {
        conf.set(Constants.TokenizerData, tokenizerModel);
    }

    // user can either provide a tokenizer class as a program argument, 
    // or let the factory find an appropriate class based on language code
    try {
        Class.forName(tokenizerClass);
    } catch (Exception e) {
        tokenizerClass = TokenizerFactory.getTokenizerClass(collectionLang, tokenizerModel).getCanonicalName();
    }

    if (collectionVocab != null) {
        conf.set(Constants.CollectionVocab, collectionVocab); // vocabulary to read collection from
    }
    if (e_stopwordList != null) {
        conf.set(Constants.StopwordList, e_stopwordList);
        conf.set(Constants.StemmedStopwordList, e_stopwordList + ".stemmed");
    }
    // CROSS-LINGUAL CASE
    if (mode == CROSS_LINGUAL_E) { // English side
        conf.set("Ivory.FinalVocab", collectionVocab); // vocabulary to map terms to integers in BuildTargetLang...
        conf.set(Constants.StopwordList, e_stopwordList);
        conf.set(Constants.StemmedStopwordList, e_stopwordList + ".stemmed");
    }

    if (mode == CROSS_LINGUAL_F) { // non-English side, needs to be translated
        conf.set(Constants.TargetIndexPath, targetIndexPath);
        conf.set("Ivory.F_Vocab_F2E", fVocab_f2e);
        conf.set("Ivory.E_Vocab_F2E", eVocab_f2e);
        conf.set("Ivory.TTable_F2E", ttable_f2e);
        conf.set("Ivory.E_Vocab_E2F", eVocab_e2f);
        conf.set("Ivory.F_Vocab_E2F", fVocab_e2f);
        conf.set("Ivory.TTable_E2F", ttable_e2f);
        conf.set(Constants.CollectionVocab, fVocab_f2e); // vocabulary to read collection from
        conf.set("Ivory.FinalVocab", eVocab_f2e); // vocabulary to map terms to integers in BuildTargetLang...
        if (f_stopwordList != null) {
            conf.set(Constants.StopwordList, f_stopwordList);
            conf.set(Constants.StemmedStopwordList, f_stopwordList + ".stemmed");
        }
        if (e_stopwordList != null) {
            conf.set(Constants.TargetStopwordList, e_stopwordList);
            conf.set(Constants.TargetStemmedStopwordList, e_stopwordList + ".stemmed");
        }
        if (e_tokenizerModel != null) {
            conf.set(Constants.TargetTokenizer, e_tokenizerModel);
        }
        conf.set(Constants.TargetLanguage, targetLang);
    }

    int numMappers = 100;
    int numReducers = 100;

    // Print out options
    LOG.info("Tool name: WikipediaDriver");
    LOG.info(" - Index path: " + indexRootPath);
    LOG.info(" - Raw collection path: " + rawCollection);
    LOG.info(" - Compressed collection path: " + seqCollection);
    LOG.info(" - Collection language: " + collectionLang);
    LOG.info(" - Tokenizer class: " + tokenizerClass);
    LOG.info(" - Tokenizer model: " + tokenizerModel);
    LOG.info(" - Minimum # terms per article : " + MinNumTermsPerArticle);
    LOG.info(" - Stopwords file: " + e_stopwordList);

    if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) {
        LOG.info("Cross-lingual collection : Preprocessing " + collectionLang + " side.");
        LOG.info(" - Collection vocab file: " + conf.get(Constants.CollectionVocab));
        LOG.info(" - Tokenizer model: " + tokenizerModel);

        if (mode == CROSS_LINGUAL_F) {
            LOG.info(" - TTable file " + collectionLang + " --> " + targetLang + " : " + ttable_f2e);
            LOG.info(" - Source vocab file: " + fVocab_f2e);
            LOG.info(" - Target vocab file: " + eVocab_f2e);
            LOG.info(" - TTable file " + targetLang + " --> " + collectionLang + " : " + ttable_e2f);
            LOG.info(" - Source vocab file: " + eVocab_e2f);
            LOG.info(" - Target vocab file: " + fVocab_e2f);
            LOG.info(" - Source stopwords file: " + f_stopwordList);
            LOG.info(" - Target stopwords file: " + e_stopwordList);
            LOG.info(" - Target stemmed stopwords file: " + conf.get(Constants.TargetStemmedStopwordList));
            LOG.info(" - Target tokenizer path: " + e_tokenizerModel);
        }
    }

    FileSystem fs = FileSystem.get(conf);

    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        LOG.info("Index path doesn't exist, creating...");
        fs.mkdirs(p);
    }
    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Build docno mapping from raw collection
    Path mappingFile = env.getDocnoMappingData();
    if (!fs.exists(mappingFile)) {
        LOG.info(mappingFile + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection, "-output_file=" + mappingFile.toString(),
                "-wiki_language=" + collectionLang };
        LOG.info("Running WikipediaDocnoMappingBuilder with args " + Arrays.toString(arr));

        WikipediaDocnoMappingBuilder tool = new WikipediaDocnoMappingBuilder();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    } else {
        LOG.info("Docno mapping already exists at: " + mappingFile);
    }

    // Repack Wikipedia into sequential compressed block
    if (!fs.exists(new Path(seqCollection + "/part-00000"))) {
        LOG.info(seqCollection + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection, "-output=" + seqCollection,
                "-mapping_file=" + mappingFile.toString(), "-compression_type=block",
                "-wiki_language=" + collectionLang };
        LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

        RepackWikipedia tool = new RepackWikipedia();
        tool.setConf(conf);
        tool.run(arr);
    } else {
        LOG.info("Repacked collection already exists at: " + seqCollection);
    }

    conf.set(Constants.CollectionName, "Wikipedia-" + collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName());
    conf.set(Constants.Tokenizer, tokenizerClass); //"ivory.tokenize.OpenNLPTokenizer"
    conf.setInt(Constants.MinDf, MinDF);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.TermIndexWindow, TermIndexWindow);

    // Builds term doc vectors from document collection, and filters the terms that are not included
    // in Ivory.SrcVocab.
    long startTime = System.currentTimeMillis();
    long preprocessStartTime = System.currentTimeMillis();
    LOG.info("Building term doc vectors...");
    int exitCode = new BuildTermDocVectors(conf).run();
    if (exitCode >= 0) {
        LOG.info("Job BuildTermDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0
                + " seconds");
    } else {
        LOG.info("Error: BuildTermDocVectors. Terminating...");
        return -1;
    }

    // Get CF and DF counts.
    startTime = System.currentTimeMillis();
    LOG.info("Counting terms...");
    exitCode = new ComputeGlobalTermStatistics(conf).run();
    LOG.info("TermCount = " + env.readCollectionTermCount());
    if (exitCode >= 0) {
        LOG.info("Job ComputeGlobalTermStatistics finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
        LOG.info("Error: ComputeGlobalTermStatistics. Terminating...");
        return -1;
    }
    // Build a map from terms to sequentially generated integer term ids.
    startTime = System.currentTimeMillis();
    LOG.info("Building term-to-integer id mapping...");
    exitCode = new BuildDictionary(conf).run();
    if (exitCode >= 0) {
        LOG.info("Job BuildDictionary finished in " + (System.currentTimeMillis() - startTime) / 1000.0
                + " seconds");
    } else {
        LOG.info("Error: BuildDictionary. Terminating...");
        return -1;
    }

    // Compute term weights, and output weighted term doc vectors.
    LOG.info("Building weighted term doc vectors...");
    startTime = System.currentTimeMillis();

    conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);

    if (mode == CROSS_LINGUAL_F) {
        // Translate term doc vectors into English.
        exitCode = new BuildTranslatedTermDocVectors(conf).run();
    } else {
        // Build weighted term doc vectors.
        exitCode = new BuildWeightedTermDocVectors(conf).run();
    }
    if (exitCode >= 0) {
        LOG.info("Job BuildTranslated/WeightedTermDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
        LOG.info("Error: BuildTranslated/WeightedTermDocVectors. Terminating...");
        return -1;
    }

    // normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted integer doc vectors...");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    if (mode == MONO_LINGUAL) {
        exitCode = new BuildIntDocVectors(conf).run();
        exitCode = new BuildWeightedIntDocVectors(conf).run();
        if (exitCode >= 0) {
            LOG.info("Job BuildWeightedIntDocVectors finished in "
                    + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        } else {
            LOG.info("Error: BuildWeightedIntDocVectors. Terminating...");
            return -1;
        }
    } else {
        BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool = new BuildTargetLangWeightedIntDocVectors(
                conf);

        int finalNumDocs = weightedIntVectorsTool.run();

        LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        if (finalNumDocs > 0) {
            LOG.info("Changed doc count: " + env.readCollectionDocumentCount() + " => " + finalNumDocs);
            env.writeCollectionDocumentCount(finalNumDocs);
        } else {
            LOG.info("No document output! Terminating...");
            return -1;
        }
        // set Property.CollectionTermCount to the size of the target vocab. since all docs are translated into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures.
        Vocab engVocabH = null;
        try {
            engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
        LOG.info("Changed term count: " + env.readCollectionTermCount() + " => " + engVocabH.size());
        env.writeCollectionTermCount(engVocabH.size());
    }

    LOG.info("Preprocessing job finished in " + (System.currentTimeMillis() - preprocessStartTime) / 1000.0
            + " seconds");

    return 0;
}

From source file:ivory.core.driver.BuildNonPositionalIndexIP.java

License:Apache License

/**
 * Runs this tool: builds a non-positional inverted index (IP algorithm) in an
 * existing index directory.
 *
 * @param args exactly two arguments: the index path and the number of reducers
 * @return 0 on completion; -1 if the argument count is wrong or the index path
 *         does not exist
 * @throws Exception if the reducer count is not a valid integer or one of the
 *         index-building tools throws
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = args[0];

    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        LOG.warn("Index path doesn't exist...");
        return -1;
    }

    int numReducers = Integer.parseInt(args[1]);

    // Report the running tool's own class; this used to name BuildPositionalIndexIP,
    // a copy-paste slip that mislabelled the log output.
    LOG.info("Tool name: " + this.getClass().getCanonicalName());
    LOG.info(" - Index path: " + indexPath);

    conf.set(Constants.IndexPath, indexPath);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.PostingsListsType,
            ivory.core.data.index.PostingsListDocSortedNonPositional.class.getCanonicalName());

    new BuildIPInvertedIndexDocSorted(conf).run();
    new BuildIntPostingsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.core.driver.BuildPositionalIndexIP.java

License:Apache License

/**
 * Runs this tool: builds a positional inverted index (IP algorithm) in an
 * existing index directory.
 *
 * @param args exactly two arguments: the index path and the number of reducers
 * @return 0 on completion; -1 if the argument count is wrong or the index path
 *         does not exist
 * @throws Exception if the reducer count is not a valid integer or one of the
 *         index-building tools throws
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    final Configuration conf = getConf();
    final FileSystem fileSystem = FileSystem.get(conf);

    // The index directory must already be there; this tool does not create it.
    final String indexPath = args[0];
    final Path indexDir = new Path(indexPath);
    final boolean indexDirExists = fileSystem.exists(indexDir);
    if (!indexDirExists) {
        LOG.warn("Index path doesn't exist...");
        return -1;
    }

    final int reducerCount = Integer.parseInt(args[1]);

    LOG.info("Tool name: " + BuildPositionalIndexIP.class.getCanonicalName());
    LOG.info(" - Index path: " + indexPath);

    final String postingsType = ivory.core.data.index.PostingsListDocSortedPositional.class
            .getCanonicalName();
    conf.set(Constants.IndexPath, indexPath);
    conf.setInt(Constants.NumReduceTasks, reducerCount);
    conf.set(Constants.PostingsListsType, postingsType);

    new BuildIPInvertedIndexDocSorted(conf).run();
    new BuildIntPostingsForwardIndex(conf).run();

    return 0;
}