Example usage for org.apache.hadoop.conf Configuration setInt

List of usage examples for org.apache.hadoop.conf Configuration setInt

Introduction

This page shows example usage of org.apache.hadoop.conf.Configuration.setInt.

Prototype

public void setInt(String name, int value) 

Document

Set the value of the name property to an int.
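
The minimal sketch below pairs setInt with the matching getInt read-back; the property key "example.max.retries" is a hypothetical name used only for illustration, not a Hadoop-defined property.

import org.apache.hadoop.conf.Configuration;

public class SetIntExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store an int under a property name (hypothetical key); Configuration keeps it as a string internally.
        conf.setInt("example.max.retries", 5);

        // Read it back; the second argument is the default returned if the property is unset.
        int retries = conf.getInt("example.max.retries", 1);
        System.out.println("retries = " + retries); // prints: retries = 5
    }
}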

Usage

From source file:ivory.core.driver.PreprocessAquaint2.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];

    LOG.info("Tool name: " + PreprocessAquaint2.class.getCanonicalName());
    LOG.info(" - Collection path: " + collection);
    LOG.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        fs.mkdirs(p);
    } else {
        LOG.info("Index directory already exists, skipping!");
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);
    conf.set(XMLInputFormat.START_TAG_KEY, Aquaint2Document.getXmlStartTag(fs, collection));
    conf.set(XMLInputFormat.END_TAG_KEY, Aquaint2Document.getXmlEndTag());

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-numbered integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString() };
        NumberAquaint2Documents2 tool = new NumberAquaint2Documents2();
        tool.setConf(conf);
        tool.run(arr);
        fs.delete(mappingDir, true);
    } else {
        LOG.info("DocnoMapping already exists, skipping!");
    }
    Aquaint2DocnoMapping dm = new Aquaint2DocnoMapping();
    dm.loadMapping(mappingFile, fs);

    int docno;
    int expectedDocno;
    String expectedDocid;
    String docid;
    boolean testAquaint2 = false;
    if (testAquaint2) {
        docno = 500;
        expectedDocid = "AFP_ENG_20041001.0500";
        docid = dm.getDocid(docno);
        System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
                + (expectedDocid.equals(docid)));
        docno = 600;
        expectedDocid = "AFP_ENG_20041001.0600";
        docid = dm.getDocid(docno);
        System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
                + (expectedDocid.equals(docid)));
        docno = 700;
        expectedDocid = "AFP_ENG_20041001.0701";
        docid = dm.getDocid(docno);
        System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
                + (expectedDocid.equals(docid)));
        docno = 800;
        expectedDocid = "AFP_ENG_20041003.0019";
        docid = dm.getDocid(docno);
        System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
                + (expectedDocid.equals(docid)));
        expectedDocno = 500;
        docid = "AFP_ENG_20041001.0500";
        docno = dm.getDocno(docid);
        System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
                + (expectedDocno == docno));
        expectedDocno = 600;
        docid = "AFP_ENG_20041001.0600";
        docno = dm.getDocno(docid);
        System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
                + (expectedDocno == docno));
        expectedDocno = 700;
        docid = "AFP_ENG_20041001.0701";
        docno = dm.getDocno(docid);
        System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
                + (expectedDocno == docno));
        expectedDocno = 800;
        docid = "AFP_ENG_20041003.0019";
        docno = dm.getDocno(docid);
        System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
                + (expectedDocno == docno));
        System.out.println("finished testing, now exiting");
        return 0;
    }
    boolean testGigaword = false;
    if (testGigaword) {
        for (int i = 1; i < 301; i++) {
            docno = i * 1000;
            docid = dm.getDocid(docno);
            System.out.println("dm.getDocid(" + docno + "): " + docid);
        }
        System.out.println("finished testing, now exiting");
        return 0;
    }

    conf.set(Constants.CollectionName, "Aquaint2");
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, Aquaint2DocumentInputFormat2.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, Aquaint2DocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.MinDf, 2); // toss away singleton terms
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

    new BuildTermDocVectors(conf).run();

    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    //new BuildTermDocVectorsForwardIndex(conf).run();

    new BuildIPInvertedIndexDocSorted(conf).run();

    conf.set(Constants.ScoringModel, "ivory.pwsim.score.TfIdf");
    conf.setBoolean(Constants.Normalize, true);

    new BuildIntPostingsForwardIndex(conf).run();

    boolean buildingVectors = true;
    //boolean buildingVectors = false;
    if (buildingVectors) {
        //new BuildWeightedIntDocVectors(conf).run();

        //conf.setBoolean(Constants.BuildWeighted, true);
        //new BuildIntDocVectorsForwardIndex(conf).run();

        String findexDirPath = indexRootPath + "/findex";
        String findexFilePath = indexRootPath + "/findex.dat";
        if (fs.exists(new Path(findexDirPath))) {
            LOG.info("ForwardIndex already exists: Skipping!");
        } else {
            new BuildAquaint2ForwardIndex().runTool(conf, collection, findexDirPath, findexFilePath,
                    mappingFile.toString());
        }
    }

    return 0;
}

From source file:ivory.core.driver.PreprocessClueWebEnglish.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexPath = args[1];
    int segment = Integer.parseInt(args[2]);

    LOG.info("Tool name: " + PreprocessClueWebEnglish.class.getCanonicalName());
    LOG.info(" - Collection path: " + collection);
    LOG.info(" - Index path: " + indexPath);
    LOG.info(" - segement number: " + segment);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        LOG.error("Error: index path doesn't exist!");
        return 0;
    }

    if (!fs.exists(env.getDocnoMappingData())) {
        LOG.error("Error: docno mapping data doesn't exist!");
        return 0;
    }

    conf.set(Constants.CollectionName, "ClueWeb:English:Segment" + segment);
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, ClueWarcDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

    conf.setInt(Constants.DocnoOffset, DocnoOffsets[segment]);
    conf.setInt(Constants.MinDf, 10);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.core.driver.PreprocessClueWebEnglishMultipleSegments.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length < 4) {
        System.out.println("usage: [index-path] [num-of-mappers] [num-of-reducers] [input-path]...");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String indexPath = args[0];
    int numMappers = Integer.parseInt(args[1]);
    int numReducers = Integer.parseInt(args[2]);

    StringBuilder sb = new StringBuilder(args[3]);
    if (args.length > 4) {
        for (int i = 4; i < args.length; i++) {
            sb.append(",");
            sb.append(args[i]);
        }
    }
    String collection = sb.toString();

    LOG.info("Tool name: PreprocessClueWebEnglishMultipleSegments");
    LOG.info(" - Index path: " + indexPath);
    LOG.info(" - Collections: " + collection);

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        LOG.error("Error: index path doesn't exist!");
        return 0;
    }

    if (!fs.exists(env.getDocnoMappingData())) {
        LOG.error("Error: docno mapping data doesn't exist!");
        return 0;
    }

    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);

    conf.set(Constants.CollectionName, "ClueWeb:English");
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, ClueWarcDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

    conf.setInt(Constants.DocnoOffset, 0);
    conf.setInt(Constants.MinDf, 50);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.core.driver.PreprocessGov2.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];

    LOG.info("Tool name: " + PreprocessGov2.class.getCanonicalName());
    LOG.info(" - Collection path: " + collection);
    LOG.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        LOG.info("index directory doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-number integer). If it doesn't exist create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        LOG.info("docno-mapping.dat doesn't exist, creating...");
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString(), "100" };
        NumberTrecWebDocuments tool = new NumberTrecWebDocuments();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(mappingDir, true);
    }

    conf.set(Constants.CollectionName, "Gov2");
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, Gov2DocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, mappingFile.toString());

    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.MinDf, 10);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.core.driver.PreprocessMedline.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexPath = args[1];

    LOG.info("Tool name: ProcessMedline");
    LOG.info(" - Collection path: " + collection);
    LOG.info(" - Index path: " + indexPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
        LOG.info("index path doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno (sequentially-numbered
    // integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        LOG.info(mappingFile + " doesn't exist, creating...");
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString() };
        NumberMedlineCitations2 tool = new NumberMedlineCitations2();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(mappingDir, true);
    }

    conf.set(Constants.CollectionName, "Medline");
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat, MedlineCitationInputFormat2.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, MedlineDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.MinDf, 2); // toss away singleton terms
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.core.driver.PreprocessTREC.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];

    LOG.info("Tool name: " + PreprocessTREC.class.getCanonicalName());
    LOG.info(" - Collection path: " + collection);
    LOG.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        LOG.info("index directory doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno (sequentially-numbered
    // integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        LOG.info("docno-mapping.dat doesn't exist, creating...");
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString() };
        NumberTrecDocuments2 tool = new NumberTrecDocuments2();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(mappingDir, true);
    }

    conf.set(Constants.CollectionName, "TREC_vol45");
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, TrecDocumentInputFormat2.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, TrecDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.MinDf, 2); // toss away singleton terms
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.core.driver.PreprocessWikipedia.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    int mode = args.length;
    if (mode != MONO_LINGUAL && mode != CROSS_LINGUAL_E && mode != CROSS_LINGUAL_F) {
        printUsage();
        return -1;
    }

    String indexRootPath = args[0];
    String rawCollection = args[1]; //"/shared/Wikipedia/raw/dewiki-20100117-pages-articles.xml";
    String seqCollection = args[2]; //"/umd-lin/fture/pwsim/de-wikipedia/compressed.block/de-20100117";
    String tokenizerClass = args[3];

    Configuration conf = new Configuration();

    String collectionLang = null, tokenizerModel = null, collectionVocab = null;
    String fVocab_f2e = null, eVocab_f2e = null, fVocab_e2f, eVocab_e2f = null, ttable_f2e = null,
            ttable_e2f = null;
    if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) { // CROSS-LINGUAL CASE
        collectionLang = args[4];
        tokenizerModel = args[5];
        collectionVocab = args[6];
        conf.set("Ivory.Lang", collectionLang);
        conf.set("Ivory.TokenizerModel", tokenizerModel);
        conf.set("Ivory.CollectionVocab", collectionVocab);
        conf.set("Ivory.FinalVocab", collectionVocab);

        if (mode == CROSS_LINGUAL_F) { // non-English side, needs to be translated
            fVocab_f2e = args[6]; //  same as collection vocab
            eVocab_f2e = args[7];
            ttable_f2e = args[8];
            eVocab_e2f = args[9];
            fVocab_e2f = args[10];
            ttable_e2f = args[11];

            conf.set("Ivory.F_Vocab_F2E", fVocab_f2e);
            conf.set("Ivory.E_Vocab_F2E", eVocab_f2e);
            conf.set("Ivory.TTable_F2E", ttable_f2e);
            conf.set("Ivory.E_Vocab_E2F", eVocab_e2f);
            conf.set("Ivory.F_Vocab_E2F", fVocab_e2f);
            conf.set("Ivory.TTable_E2F", ttable_e2f);
            conf.set("Ivory.FinalVocab", eVocab_e2f);
        }
    }

    int numMappers = 100;
    int numReducers = 100;

    LOG.info("Tool name: WikipediaDriver");
    LOG.info(" - Index path: " + indexRootPath);
    LOG.info(" - Raw collection path: " + rawCollection);
    LOG.info(" - Compressed collection path: " + seqCollection);
    LOG.info(" - Tokenizer class: " + tokenizerClass);
    LOG.info(" - Minimum # terms per article : " + MinNumTermsPerArticle);

    if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) {
        LOG.info("Cross-lingual collection : Preprocessing " + collectionLang + " side.");
        LOG.info(" - Collection vocab file: " + collectionVocab);
        LOG.info(" - Tokenizer model: " + tokenizerModel);

        if (mode == CROSS_LINGUAL_F) {
            LOG.info(" - TTable file " + collectionLang + " --> English : " + ttable_f2e);
            LOG.info(" - Source vocab file: " + fVocab_f2e);
            LOG.info(" - Target vocab file: " + eVocab_f2e);
            LOG.info(" - TTable file " + "English --> " + collectionLang + " : " + ttable_e2f);
            LOG.info(" - Source vocab file: " + fVocab_f2e);
            LOG.info(" - Target vocab file: " + eVocab_f2e);
        }
    }
    LOG.info("Launching with " + numMappers + " mappers, " + numReducers + " reducers...");

    FileSystem fs = FileSystem.get(conf);

    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        LOG.info("Index path doesn't exist, creating...");
        fs.mkdirs(p);
    }
    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Build docno mapping from raw collection
    Path mappingFile = env.getDocnoMappingData();
    if (!fs.exists(mappingFile)) {
        LOG.info(mappingFile + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection,
                "-output_path=" + indexRootPath + "/wiki-docid-tmp", "-output_file=" + mappingFile.toString(),
                "-keep_all=false" };

        BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    } else {
        LOG.info(p + " exists");
    }

    // Repack Wikipedia into sequential compressed block
    p = new Path(seqCollection);
    if (!fs.exists(p)) {
        LOG.info(seqCollection + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection, "-output=" + seqCollection,
                "-mapping_file=" + mappingFile.toString(), "-compression_type=block",
                "-wiki_language=" + collectionLang };
        RepackWikipedia tool = new RepackWikipedia();
        tool.setConf(conf);
        tool.run(arr);
    }

    conf.set(Constants.CollectionName, "Wikipedia-" + collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName());
    conf.set(Constants.Tokenizer, tokenizerClass); //"ivory.tokenize.OpenNLPTokenizer"
    conf.setInt(Constants.MinDf, MinDF);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.TermIndexWindow, TermIndexWindow);

    // Builds term doc vectors from document collection, and filters the terms that are not included in Ivory.SrcVocab
    long startTime = System.currentTimeMillis();
    long preprocessStartTime = System.currentTimeMillis();
    LOG.info("Building term doc vectors...");
    new BuildTermDocVectors(conf).run();
    LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Get CF and DF counts
    startTime = System.currentTimeMillis();
    LOG.info("Counting terms...");
    new ComputeGlobalTermStatistics(conf).run();
    LOG.info("TermCount = " + env.readCollectionTermCount() + "\nJob finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Build a map from terms to sequentially generated integer term ids
    startTime = System.currentTimeMillis();
    LOG.info("Building term-to-integer id mapping...");
    new BuildDictionary(conf).run();
    LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Compute term weights, and output weighted term doc vectors
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted term doc vectors...");
    conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
    if (mode == CROSS_LINGUAL_F) {
        conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);

        // translate term doc vectors into English. 
        conf.setBoolean("Ivory.Normalize", true);
        new BuildTranslatedTermDocVectors(conf).run();
    } else {
        conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);

        // get weighted term doc vectors
        conf.setBoolean("Ivory.Normalize", true);
        new BuildWeightedTermDocVectors(conf).run();
    }
    LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted integer doc vectors...");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    if (mode == MONO_LINGUAL) {
        new BuildIntDocVectors(conf).run();
        new BuildWeightedIntDocVectors(conf).run();
        LOG.info("Job BuildWeightedIntDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
        BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool = new BuildTargetLangWeightedIntDocVectors(
                conf);
        int finalNumDocs = weightedIntVectorsTool.run();
        LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

        if (finalNumDocs > 0) {
            LOG.info("Changed doc count from " + env.readCollectionDocumentCount() + " to = " + finalNumDocs);
            env.writeCollectionDocumentCount(finalNumDocs);
        }
        // Set Property.CollectionTermCount to the size of the target vocab, since all docs are translated
        // into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures.
        Vocab engVocabH = null;
        try {
            engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
        LOG.info("Changed term count to : " + env.readCollectionTermCount() + " = " + engVocabH.size());
        env.writeCollectionTermCount(engVocabH.size());
    }

    LOG.info("Preprocessing job finished in " + (System.currentTimeMillis() - preprocessStartTime) / 1000.0
            + " seconds");

    return 0;
}

From source file:ivory.core.driver.PreprocessWt10g.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];

    LOG.info("Tool name: " + PreprocessWt10g.class.getCanonicalName());
    LOG.info(" - Collection path: " + collection);
    LOG.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        LOG.info("index directory doesn't exist, creating...");
        fs.mkdirs(p);
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Look for the docno mapping, which maps from docid (String) to docno (sequentially-numbered
    // integer). If it doesn't exist, create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        LOG.info("docno-mapping.dat doesn't exist, creating...");
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString(), "100" };
        NumberTrecWebDocuments tool = new NumberTrecWebDocuments();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(mappingDir, true);
    }

    conf.set(Constants.CollectionName, "Wt10g");
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, Wt10gDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, mappingFile.toString());

    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.MinDf, 10);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

    new BuildTermDocVectors(conf).run();
    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    new BuildTermDocVectorsForwardIndex(conf).run();

    return 0;
}

From source file:ivory.core.index.BuildIntPostingsForwardIndex.java

License:Apache License

public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
    String indexPath = conf.get(Constants.IndexPath);

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String collectionName = env.readCollectionName();

    LOG.info("Tool: " + BuildIntPostingsForwardIndex.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));

    Job job = new Job(getConf(), BuildIntPostingsForwardIndex.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(BuildIntPostingsForwardIndex.class);

    Path inputPath = new Path(env.getPostingsDirectory());
    FileInputFormat.setInputPaths(job, inputPath);

    Path postingsIndexPath = new Path(env.getPostingsIndexData());

    if (fs.exists(postingsIndexPath)) {
        LOG.info("Postings forward index path already exists!");
        return 0;
    }
    job.setNumReduceTasks(1);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(NullOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    return 0;
}

From source file:ivory.core.index.BuildIPInvertedIndexDocSorted.java

License:Apache License

public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();

    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);
    int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
    int collectionDocCnt = env.readCollectionDocumentCount();

    String postingsType = conf.get(Constants.PostingsListsType,
            ivory.core.data.index.PostingsListDocSortedPositional.class.getCanonicalName());
    @SuppressWarnings("unchecked")
    Class<? extends PostingsList> postingsClass = (Class<? extends PostingsList>) Class.forName(postingsType);

    LOG.info("PowerTool: " + BuildIPInvertedIndexDocSorted.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionDocumentCount, collectionDocCnt));
    LOG.info(String.format(" - %s: %s", Constants.PostingsListsType, postingsClass.getCanonicalName()));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));
    LOG.info(String.format(" - %s: %s", Constants.MinSplitSize, minSplitSize));

    if (!fs.exists(new Path(indexPath))) {
        fs.mkdirs(new Path(indexPath));
    }

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;
    }

    conf.setInt(Constants.CollectionDocumentCount, collectionDocCnt);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    Job job = new Job(conf, BuildIPInvertedIndexDocSorted.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(BuildIPInvertedIndexDocSorted.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, postingsPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(PairOfInts.class);
    job.setMapOutputValueClass(TermPositions.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(postingsClass);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    env.writePostingsType(postingsClass.getCanonicalName());

    return 0;
}