Example usage for org.apache.hadoop.conf Configuration setBoolean

List of usage examples for org.apache.hadoop.conf Configuration setBoolean

Introduction

On this page you can find example usages of org.apache.hadoop.conf Configuration setBoolean.

Prototype

public void setBoolean(String name, boolean value) 

Document

Set the value of the name property to a boolean.
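
For orientation, here is a minimal, self-contained sketch of the typical pattern: setBoolean writes the flag into the Configuration, and getBoolean reads it back, falling back to a default when the property is absent. The property name my.example.flag is only an illustrative placeholder, not a real Hadoop key.

import org.apache.hadoop.conf.Configuration;

public class SetBooleanExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Set the value of the "my.example.flag" property to a boolean.
        conf.setBoolean("my.example.flag", true);

        // Read it back; the second argument is the default returned when the
        // property is unset or cannot be parsed as a boolean.
        boolean flag = conf.getBoolean("my.example.flag", false);
        System.out.println("my.example.flag = " + flag);
    }
}

The real-world examples below follow the same pattern, setting job properties such as MRJobConfig.SKIP_RECORDS or the speculative-execution flags on a Configuration before it is handed to (and copied by) a Job.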

Usage

From source file: it.crs4.pydoop.mapreduce.pipes.TestPipeApplication.java

License: Apache License

/**
 * test org.apache.hadoop.mapreduce.pipes.PipesReducer
 * test the transfer of data: key and value
 *
 * @throws Exception
 */
@Test
public void testPipesReducer() throws Exception {
    System.err.println("testPipesReducer");

    File[] psw = cleanTokenPasswordFile();
    try {
        JobID jobId = new JobID("201408272347", 0);
        TaskID taskId = new TaskID(jobId, TaskType.MAP, 0);
        TaskAttemptID taskAttemptid = new TaskAttemptID(taskId, 0);

        Job job = new Job(new Configuration());
        job.setJobID(jobId);
        Configuration conf = job.getConfiguration();
        conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskAttemptid.toString());
        FileSystem fs = new RawLocalFileSystem();
        fs.setConf(conf);

        File fCommand = getFileCommand("it.crs4.pydoop.mapreduce.pipes.PipeReducerStub");
        conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath());
        System.err.println("fCommand" + fCommand.getAbsolutePath());

        Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>("user".getBytes(),
                "password".getBytes(), new Text("kind"), new Text("service"));
        TokenCache.setJobToken(token, job.getCredentials());
        conf.setBoolean(MRJobConfig.SKIP_RECORDS, true);

        TestReporter reporter = new TestReporter();
        DummyInputFormat input_format = new DummyInputFormat();
        List<InputSplit> isplits = input_format.getSplits(job);
        InputSplit isplit = isplits.get(0);
        TaskAttemptContextImpl tcontext = new TaskAttemptContextImpl(conf, taskAttemptid);

        RecordWriter<IntWritable, Text> writer = new TestRecordWriter(
                new FileOutputStream(workSpace.getAbsolutePath() + File.separator + "outfile"));

        BooleanWritable bw = new BooleanWritable(true);
        List<Text> texts = new ArrayList<Text>();
        texts.add(new Text("first"));
        texts.add(new Text("second"));
        texts.add(new Text("third"));

        DummyRawKeyValueIterator kvit = new DummyRawKeyValueIterator();

        ReduceContextImpl<BooleanWritable, Text, IntWritable, Text> context = new ReduceContextImpl<BooleanWritable, Text, IntWritable, Text>(
                conf, taskAttemptid, kvit, null, null, writer, null, null, null, BooleanWritable.class,
                Text.class);

        PipesReducer<BooleanWritable, Text, IntWritable, Text> reducer = new PipesReducer<BooleanWritable, Text, IntWritable, Text>();
        reducer.setup(context);

        initStdOut(conf);
        reducer.reduce(bw, texts, context);
        reducer.cleanup(context);
        String stdOut = readStdOut(conf);

        // test data: key
        assertTrue(stdOut.contains("reducer key :true"));
        // and values
        assertTrue(stdOut.contains("reduce value  :first"));
        assertTrue(stdOut.contains("reduce value  :second"));
        assertTrue(stdOut.contains("reduce value  :third"));

    } finally {
        if (psw != null) {
            // remove password files
            for (File file : psw) {
                file.deleteOnExit();
            }
        }
    }

}

From source file: it.crs4.seal.demux.Demux.java

License: Open Source License

@Override
public int run(String[] args) throws Exception {
    LOG.info("starting");

    Configuration conf = getConf();
    DemuxOptionParser parser = new DemuxOptionParser();
    parser.parse(conf, args);

    conf.setBoolean(CONF_NO_INDEX_READS, parser.getNoIndexReads());
    conf.setBoolean(CONF_SEPARATE_READS, parser.getSeparateReads());

    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");
    if (parser.getNoIndexReads())
        LOG.info("Not expecting to find any index reads.  Will demultiplex based only on lane.");

    // load sample sheet to fail early in case of problems
    DemuxUtils.loadSampleSheet(parser.getSampleSheetPath(), conf);

    // must be called before creating the job, since the job
    // *copies* the Configuration.
    distributeSampleSheet(parser.getSampleSheetPath());

    // Create a Job using the processed conf
    Job job = new Job(getConf(), makeJobName(parser.getInputPaths().get(0)));

    job.setJarByClass(Demux.class);

    // input paths
    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName("qseq")));

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(SequenceId.class);
    job.setMapOutputValueClass(SequencedFragment.class);

    job.setPartitionerClass(SequenceIdLocationPartitioner.class);
    job.setGroupingComparatorClass(GroupByLocationComparator.class);
    job.setSortComparatorClass(TwoOneThreeSortComparator.class);

    job.setReducerClass(Red.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SequencedFragment.class);

    // output
    job.setOutputFormatClass(DemuxOutputFormat.class);
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    // Submit the job, then poll for progress until the job is complete
    boolean result = job.waitForCompletion(true);
    if (result) {
        LOG.info("done");
        if (parser.getCreateLaneContent())
            createLaneContentFiles(parser.getOutputPath(), parser.getSampleSheetPath());
        return 0;
    } else {
        LOG.fatal(this.getClass().getName() + " failed!");
        return 1;
    }
}

From source file: it.crs4.seal.prq.PrqOptionParser.java

License: Open Source License

@Override
protected CommandLine parseOptions(Configuration conf, String[] args) throws IOException, ParseException {
    conf.setInt(MinBasesThresholdConfigName, DefaultMinBasesThreshold);
    conf.setBoolean(DropFailedFilterConfigName, DropFailedFilterDefault);
    conf.setBoolean(WarningOnlyIfUnpairedConfigName, WarningOnlyIfUnpairedDefault);
    conf.setInt(NumReadsExpectedConfigName, NumReadsExpectedDefault);

    CommandLine line = super.parseOptions(conf, args);

    /* **** handle deprecated properties **** */
    if (conf.get(PrqOptionParser.OLD_INPUT_FORMAT_CONF) != null) {
        throw new ParseException("The property " + PrqOptionParser.OLD_INPUT_FORMAT_CONF
                + " is no longer supported.\n" + "Please use the command line option --input-format instead.");
    }

    Utils.checkDeprecatedProp(conf, LOG, MinBasesThresholdConfigName_deprecated, MinBasesThresholdConfigName);
    Utils.checkDeprecatedProp(conf, LOG, DropFailedFilterConfigName_deprecated, DropFailedFilterConfigName);
    Utils.checkDeprecatedProp(conf, LOG, WarningOnlyIfUnpairedConfigName_deprecated,
            WarningOnlyIfUnpairedConfigName);

    // Let the deprecated properties override the new ones, unless the new ones have a non-default value.
    // If the new property has a non-default value, it must have been set by the user.
    // If, on the other hand, the deprecated property has a value, it must have been set by the user since
    // we're not setting them here.
    if (conf.get(MinBasesThresholdConfigName_deprecated) != null
            && conf.getInt(MinBasesThresholdConfigName, DefaultMinBasesThreshold) == DefaultMinBasesThreshold) {
        conf.setInt(MinBasesThresholdConfigName,
                conf.getInt(MinBasesThresholdConfigName_deprecated, DefaultMinBasesThreshold));
    }

    if (conf.get(DropFailedFilterConfigName_deprecated) != null && conf.getBoolean(DropFailedFilterConfigName,
            DropFailedFilterDefault) == DropFailedFilterDefault) {
        conf.setBoolean(DropFailedFilterConfigName,
                conf.getBoolean(DropFailedFilterConfigName_deprecated, DropFailedFilterDefault));
    }

    if (conf.get(WarningOnlyIfUnpairedConfigName_deprecated) != null
            && conf.getBoolean(WarningOnlyIfUnpairedConfigName,
                    WarningOnlyIfUnpairedDefault) == WarningOnlyIfUnpairedDefault) {
        conf.setBoolean(WarningOnlyIfUnpairedConfigName,
                conf.getBoolean(WarningOnlyIfUnpairedConfigName_deprecated, WarningOnlyIfUnpairedDefault));
    }

    /* **** end handle deprecated properties **** */

    if (line.hasOption(opt_traditionalIds.getOpt()))
        conf.setBoolean(PairReadsQSeq.PRQ_CONF_TRADITIONAL_IDS, true);

    if (line.hasOption(opt_numReads.getOpt())) {
        int numReads;
        try {
            numReads = Integer.valueOf(line.getOptionValue(opt_numReads.getOpt()));
            if (numReads <= 0)
                throw new ParseException("Number of reads per fragment must be >= 0 (got " + numReads + ")");
            if (numReads > 2) {
                throw new ParseException(
                        "Working with more than two reads per template is not supported at the moment.\n"
                                + "If you're interested in seeing this feature implemented contact the Seal developers.");
            }
        } catch (NumberFormatException e) {
            throw new ParseException(e.getMessage());
        }
        conf.setInt(NumReadsExpectedConfigName, numReads);
    }

    // set number of reduce tasks to use
    conf.set(ClusterUtils.NUM_RED_TASKS_PROPERTY, String.valueOf(getNReduceTasks()));
    return line;
}

From source file: ivory.app.PreprocessWikipedia.java

License: Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (parseArgs(args) < 0) {
        printUsage();
        return -1;
    }
    Configuration conf = getConf();

    conf.set(Constants.Language, collectionLang);
    conf.setBoolean(Constants.Stemming, true); // default behavior of tokenizer is currently to stem, but we shouldn't rely on that

    if (tokenizerModel != null) {
        conf.set(Constants.TokenizerData, tokenizerModel);
    }

    // user can either provide a tokenizer class as a program argument, 
    // or let the factory find an appropriate class based on language code
    try {
        Class.forName(tokenizerClass);
    } catch (Exception e) {
        tokenizerClass = TokenizerFactory.getTokenizerClass(collectionLang, tokenizerModel).getCanonicalName();
    }

    if (collectionVocab != null) {
        conf.set(Constants.CollectionVocab, collectionVocab); // vocabulary to read collection from
    }
    if (e_stopwordList != null) {
        conf.set(Constants.StopwordList, e_stopwordList);
        conf.set(Constants.StemmedStopwordList, e_stopwordList + ".stemmed");
    }
    // CROSS-LINGUAL CASE
    if (mode == CROSS_LINGUAL_E) { // English side
        conf.set("Ivory.FinalVocab", collectionVocab); // vocabulary to map terms to integers in BuildTargetLang...
        conf.set(Constants.StopwordList, e_stopwordList);
        conf.set(Constants.StemmedStopwordList, e_stopwordList + ".stemmed");
    }

    if (mode == CROSS_LINGUAL_F) { // non-English side, needs to be translated
        conf.set(Constants.TargetIndexPath, targetIndexPath);
        conf.set("Ivory.F_Vocab_F2E", fVocab_f2e);
        conf.set("Ivory.E_Vocab_F2E", eVocab_f2e);
        conf.set("Ivory.TTable_F2E", ttable_f2e);
        conf.set("Ivory.E_Vocab_E2F", eVocab_e2f);
        conf.set("Ivory.F_Vocab_E2F", fVocab_e2f);
        conf.set("Ivory.TTable_E2F", ttable_e2f);
        conf.set(Constants.CollectionVocab, fVocab_f2e); // vocabulary to read collection from
        conf.set("Ivory.FinalVocab", eVocab_f2e); // vocabulary to map terms to integers in BuildTargetLang...
        if (f_stopwordList != null) {
            conf.set(Constants.StopwordList, f_stopwordList);
            conf.set(Constants.StemmedStopwordList, f_stopwordList + ".stemmed");
        }
        if (e_stopwordList != null) {
            conf.set(Constants.TargetStopwordList, e_stopwordList);
            conf.set(Constants.TargetStemmedStopwordList, e_stopwordList + ".stemmed");
        }
        if (e_tokenizerModel != null) {
            conf.set(Constants.TargetTokenizer, e_tokenizerModel);
        }
        conf.set(Constants.TargetLanguage, targetLang);
    }

    int numMappers = 100;
    int numReducers = 100;

    // Print out options
    LOG.info("Tool name: WikipediaDriver");
    LOG.info(" - Index path: " + indexRootPath);
    LOG.info(" - Raw collection path: " + rawCollection);
    LOG.info(" - Compressed collection path: " + seqCollection);
    LOG.info(" - Collection language: " + collectionLang);
    LOG.info(" - Tokenizer class: " + tokenizerClass);
    LOG.info(" - Tokenizer model: " + tokenizerModel);
    LOG.info(" - Minimum # terms per article : " + MinNumTermsPerArticle);
    LOG.info(" - Stopwords file: " + e_stopwordList);

    if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) {
        LOG.info("Cross-lingual collection : Preprocessing " + collectionLang + " side.");
        LOG.info(" - Collection vocab file: " + conf.get(Constants.CollectionVocab));
        LOG.info(" - Tokenizer model: " + tokenizerModel);

        if (mode == CROSS_LINGUAL_F) {
            LOG.info(" - TTable file " + collectionLang + " --> " + targetLang + " : " + ttable_f2e);
            LOG.info(" - Source vocab file: " + fVocab_f2e);
            LOG.info(" - Target vocab file: " + eVocab_f2e);
            LOG.info(" - TTable file " + targetLang + " --> " + collectionLang + " : " + ttable_e2f);
            LOG.info(" - Source vocab file: " + eVocab_e2f);
            LOG.info(" - Target vocab file: " + fVocab_e2f);
            LOG.info(" - Source stopwords file: " + f_stopwordList);
            LOG.info(" - Target stopwords file: " + e_stopwordList);
            LOG.info(" - Target stemmed stopwords file: " + conf.get(Constants.TargetStemmedStopwordList));
            LOG.info(" - Target tokenizer path: " + e_tokenizerModel);
        }
    }

    FileSystem fs = FileSystem.get(conf);

    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        LOG.info("Index path doesn't exist, creating...");
        fs.mkdirs(p);
    }
    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Build docno mapping from raw collection
    Path mappingFile = env.getDocnoMappingData();
    if (!fs.exists(mappingFile)) {
        LOG.info(mappingFile + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection, "-output_file=" + mappingFile.toString(),
                "-wiki_language=" + collectionLang };
        LOG.info("Running WikipediaDocnoMappingBuilder with args " + Arrays.toString(arr));

        WikipediaDocnoMappingBuilder tool = new WikipediaDocnoMappingBuilder();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    } else {
        LOG.info("Docno mapping already exists at: " + mappingFile);
    }

    // Repack Wikipedia into sequential compressed block
    if (!fs.exists(new Path(seqCollection + "/part-00000"))) {
        LOG.info(seqCollection + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection, "-output=" + seqCollection,
                "-mapping_file=" + mappingFile.toString(), "-compression_type=block",
                "-wiki_language=" + collectionLang };
        LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

        RepackWikipedia tool = new RepackWikipedia();
        tool.setConf(conf);
        tool.run(arr);
    } else {
        LOG.info("Repacked collection already exists at: " + seqCollection);
    }

    conf.set(Constants.CollectionName, "Wikipedia-" + collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName());
    conf.set(Constants.Tokenizer, tokenizerClass); //"ivory.tokenize.OpenNLPTokenizer"
    conf.setInt(Constants.MinDf, MinDF);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.TermIndexWindow, TermIndexWindow);

    // Builds term doc vectors from document collection, and filters the terms that are not included
    // in Ivory.SrcVocab.
    long startTime = System.currentTimeMillis();
    long preprocessStartTime = System.currentTimeMillis();
    LOG.info("Building term doc vectors...");
    int exitCode = new BuildTermDocVectors(conf).run();
    if (exitCode >= 0) {
        LOG.info("Job BuildTermDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0
                + " seconds");
    } else {
        LOG.info("Error: BuildTermDocVectors. Terminating...");
        return -1;
    }

    // Get CF and DF counts.
    startTime = System.currentTimeMillis();
    LOG.info("Counting terms...");
    exitCode = new ComputeGlobalTermStatistics(conf).run();
    LOG.info("TermCount = " + env.readCollectionTermCount());
    if (exitCode >= 0) {
        LOG.info("Job ComputeGlobalTermStatistics finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
        LOG.info("Error: ComputeGlobalTermStatistics. Terminating...");
        return -1;
    }
    // Build a map from terms to sequentially generated integer term ids.
    startTime = System.currentTimeMillis();
    LOG.info("Building term-to-integer id mapping...");
    exitCode = new BuildDictionary(conf).run();
    if (exitCode >= 0) {
        LOG.info("Job BuildDictionary finished in " + (System.currentTimeMillis() - startTime) / 1000.0
                + " seconds");
    } else {
        LOG.info("Error: BuildDictionary. Terminating...");
        return -1;
    }

    // Compute term weights, and output weighted term doc vectors.
    LOG.info("Building weighted term doc vectors...");
    startTime = System.currentTimeMillis();

    conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);

    if (mode == CROSS_LINGUAL_F) {
        // Translate term doc vectors into English.
        exitCode = new BuildTranslatedTermDocVectors(conf).run();
    } else {
        // Build weighted term doc vectors.
        exitCode = new BuildWeightedTermDocVectors(conf).run();
    }
    if (exitCode >= 0) {
        LOG.info("Job BuildTranslated/WeightedTermDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
        LOG.info("Error: BuildTranslated/WeightedTermDocVectors. Terminating...");
        return -1;
    }

    // normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted integer doc vectors...");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    if (mode == MONO_LINGUAL) {
        exitCode = new BuildIntDocVectors(conf).run();
        exitCode = new BuildWeightedIntDocVectors(conf).run();
        if (exitCode >= 0) {
            LOG.info("Job BuildWeightedIntDocVectors finished in "
                    + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        } else {
            LOG.info("Error: BuildWeightedIntDocVectors. Terminating...");
            return -1;
        }
    } else {
        BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool = new BuildTargetLangWeightedIntDocVectors(
                conf);

        int finalNumDocs = weightedIntVectorsTool.run();

        LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        if (finalNumDocs > 0) {
            LOG.info("Changed doc count: " + env.readCollectionDocumentCount() + " => " + finalNumDocs);
            env.writeCollectionDocumentCount(finalNumDocs);
        } else {
            LOG.info("No document output! Terminating...");
            return -1;
        }
        // Set Property.CollectionTermCount to the size of the target vocab, since all docs are translated into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures.
        Vocab engVocabH = null;
        try {
            engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
        LOG.info("Changed term count: " + env.readCollectionTermCount() + " => " + engVocabH.size());
        env.writeCollectionTermCount(engVocabH.size());
    }

    LOG.info("Preprocessing job finished in " + (System.currentTimeMillis() - preprocessStartTime) / 1000.0
            + " seconds");

    return 0;
}

From source file: ivory.core.driver.PreprocessAquaint2.java

License: Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    String collection = args[0];
    String indexRootPath = args[1];

    LOG.info("Tool name: " + PreprocessAquaint2.class.getCanonicalName());
    LOG.info(" - Collection path: " + collection);
    LOG.info(" - Index path: " + indexRootPath);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Create the index directory if it doesn't already exist.
    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        fs.mkdirs(p);
    } else {
        LOG.info("Index directory already exists, skipping!");
    }

    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);
    conf.set(XMLInputFormat.START_TAG_KEY, Aquaint2Document.getXmlStartTag(fs, collection));
    conf.set(XMLInputFormat.END_TAG_KEY, Aquaint2Document.getXmlEndTag());

    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-number integer). If it doesn't exist create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();

    if (!fs.exists(mappingFile)) {
        String[] arr = new String[] { collection, mappingDir.toString(), mappingFile.toString() };
        NumberAquaint2Documents2 tool = new NumberAquaint2Documents2();
        tool.setConf(conf);
        tool.run(arr);
        fs.delete(mappingDir, true);
    } else {
        LOG.info("DocnoMapping already exists, skipping!");
    }
    Aquaint2DocnoMapping dm = new Aquaint2DocnoMapping();
    dm.loadMapping(mappingFile, fs);

    int docno;
    int expectedDocno;
    String expectedDocid;
    String docid;
    boolean testAquaint2 = false;
    if (testAquaint2) {
        docno = 500;
        expectedDocid = "AFP_ENG_20041001.0500";
        docid = dm.getDocid(docno);
        System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
                + (expectedDocid.equals(docid)));
        docno = 600;
        expectedDocid = "AFP_ENG_20041001.0600";
        docid = dm.getDocid(docno);
        System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
                + (expectedDocid.equals(docid)));
        docno = 700;
        expectedDocid = "AFP_ENG_20041001.0701";
        docid = dm.getDocid(docno);
        System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
                + (expectedDocid.equals(docid)));
        docno = 800;
        expectedDocid = "AFP_ENG_20041003.0019";
        docid = dm.getDocid(docno);
        System.out.println("dm.getDocid(" + docno + "): " + docid + ", should be: " + expectedDocid + ", "
                + (expectedDocid.equals(docid)));
        expectedDocno = 500;
        docid = "AFP_ENG_20041001.0500";
        docno = dm.getDocno(docid);
        System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
                + (expectedDocno == docno));
        expectedDocno = 600;
        docid = "AFP_ENG_20041001.0600";
        docno = dm.getDocno(docid);
        System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
                + (expectedDocno == docno));
        expectedDocno = 700;
        docid = "AFP_ENG_20041001.0701";
        docno = dm.getDocno(docid);
        System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
                + (expectedDocno == docno));
        expectedDocno = 800;
        docid = "AFP_ENG_20041003.0019";
        docno = dm.getDocno(docid);
        System.out.println("dm.getDocno(" + docid + "): " + docno + ", should be: " + expectedDocno + ", "
                + (expectedDocno == docno));
        System.out.println("finished testing, now exiting");
        return 0;
    }
    boolean testGigaword = false;
    if (testGigaword) {
        for (int i = 1; i < 301; i++) {
            docno = i * 1000;
            docid = dm.getDocid(docno);
            System.out.println("dm.getDocid(" + docno + "): " + docid);
        }
        System.out.println("finished testing, now exiting");
        return 0;
    }

    conf.set(Constants.CollectionName, "Aquaint2");
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, Aquaint2DocumentInputFormat2.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, Aquaint2DocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.MinDf, 2); // toss away singleton terms
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

    new BuildTermDocVectors(conf).run();

    new ComputeGlobalTermStatistics(conf).run();
    new BuildDictionary(conf).run();
    new BuildIntDocVectors(conf).run();

    new BuildIntDocVectorsForwardIndex(conf).run();
    //new BuildTermDocVectorsForwardIndex(conf).run();

    new BuildIPInvertedIndexDocSorted(conf).run();

    conf.set(Constants.ScoringModel, "ivory.pwsim.score.TfIdf");
    conf.setBoolean(Constants.Normalize, true);

    new BuildIntPostingsForwardIndex(conf).run();

    boolean buildingVectors = true;
    //boolean buildingVectors = false;
    if (buildingVectors) {
        //new BuildWeightedIntDocVectors(conf).run();

        //conf.setBoolean(Constants.BuildWeighted, true);
        //new BuildIntDocVectorsForwardIndex(conf).run();

        String findexDirPath = indexRootPath + "/findex";
        String findexFilePath = indexRootPath + "/findex.dat";
        if (fs.exists(new Path(findexDirPath))) {
            LOG.info("ForwardIndex already exists: Skipping!");
        } else {
            new BuildAquaint2ForwardIndex().runTool(conf, collection, findexDirPath, findexFilePath,
                    mappingFile.toString());
        }
    }

    return 0;
}

From source file: ivory.core.driver.PreprocessWikipedia.java

License: Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    int mode = args.length;
    if (mode != MONO_LINGUAL && mode != CROSS_LINGUAL_E && mode != CROSS_LINGUAL_F) {
        printUsage();
        return -1;
    }

    String indexRootPath = args[0];
    String rawCollection = args[1]; //"/shared/Wikipedia/raw/dewiki-20100117-pages-articles.xml";
    String seqCollection = args[2]; //"/umd-lin/fture/pwsim/de-wikipedia/compressed.block/de-20100117";
    String tokenizerClass = args[3];

    Configuration conf = new Configuration();

    String collectionLang = null, tokenizerModel = null, collectionVocab = null;
    String fVocab_f2e = null, eVocab_f2e = null, fVocab_e2f, eVocab_e2f = null, ttable_f2e = null,
            ttable_e2f = null;
    if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) { // CROSS-LINGUAL CASE
        collectionLang = args[4];
        tokenizerModel = args[5];
        collectionVocab = args[6];
        conf.set("Ivory.Lang", collectionLang);
        conf.set("Ivory.TokenizerModel", tokenizerModel);
        conf.set("Ivory.CollectionVocab", collectionVocab);
        conf.set("Ivory.FinalVocab", collectionVocab);

        if (mode == CROSS_LINGUAL_F) { // non-English side, needs to be translated
            fVocab_f2e = args[6]; //  same as collection vocab
            eVocab_f2e = args[7];
            ttable_f2e = args[8];
            eVocab_e2f = args[9];
            fVocab_e2f = args[10];
            ttable_e2f = args[11];

            conf.set("Ivory.F_Vocab_F2E", fVocab_f2e);
            conf.set("Ivory.E_Vocab_F2E", eVocab_f2e);
            conf.set("Ivory.TTable_F2E", ttable_f2e);
            conf.set("Ivory.E_Vocab_E2F", eVocab_e2f);
            conf.set("Ivory.F_Vocab_E2F", fVocab_e2f);
            conf.set("Ivory.TTable_E2F", ttable_e2f);
            conf.set("Ivory.FinalVocab", eVocab_e2f);
        }
    }

    int numMappers = 100;
    int numReducers = 100;

    LOG.info("Tool name: WikipediaDriver");
    LOG.info(" - Index path: " + indexRootPath);
    LOG.info(" - Raw collection path: " + rawCollection);
    LOG.info(" - Compressed collection path: " + seqCollection);
    LOG.info(" - Tokenizer class: " + tokenizerClass);
    LOG.info(" - Minimum # terms per article : " + MinNumTermsPerArticle);

    if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) {
        LOG.info("Cross-lingual collection : Preprocessing " + collectionLang + " side.");
        LOG.info(" - Collection vocab file: " + collectionVocab);
        LOG.info(" - Tokenizer model: " + tokenizerModel);

        if (mode == CROSS_LINGUAL_F) {
            LOG.info(" - TTable file " + collectionLang + " --> English : " + ttable_f2e);
            LOG.info(" - Source vocab file: " + fVocab_f2e);
            LOG.info(" - Target vocab file: " + eVocab_f2e);
            LOG.info(" - TTable file " + "English --> " + collectionLang + " : " + ttable_e2f);
            LOG.info(" - Source vocab file: " + fVocab_f2e);
            LOG.info(" - Target vocab file: " + eVocab_f2e);
        }
    }
    LOG.info("Launching with " + numMappers + " mappers, " + numReducers + " reducers...");

    FileSystem fs = FileSystem.get(conf);

    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
        LOG.info("Index path doesn't exist, creating...");
        fs.mkdirs(p);
    }
    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

    // Build docno mapping from raw collection
    Path mappingFile = env.getDocnoMappingData();
    if (!fs.exists(mappingFile)) {
        LOG.info(mappingFile + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection,
                "-output_path=" + indexRootPath + "/wiki-docid-tmp", "-output_file=" + mappingFile.toString(),
                "-keep_all=false" };

        BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
        tool.setConf(conf);
        tool.run(arr);

        fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    } else {
        LOG.info(p + " exists");
    }

    // Repack Wikipedia into sequential compressed block
    p = new Path(seqCollection);
    if (!fs.exists(p)) {
        LOG.info(seqCollection + " doesn't exist, creating...");
        String[] arr = new String[] { "-input=" + rawCollection, "-output=" + seqCollection,
                "-mapping_file=" + mappingFile.toString(), "-compression_type=block",
                "-wiki_language=" + collectionLang };
        RepackWikipedia tool = new RepackWikipedia();
        tool.setConf(conf);
        tool.run(arr);
    }

    conf.set(Constants.CollectionName, "Wikipedia-" + collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName());
    conf.set(Constants.Tokenizer, tokenizerClass); //"ivory.tokenize.OpenNLPTokenizer"
    conf.setInt(Constants.MinDf, MinDF);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.TermIndexWindow, TermIndexWindow);

    // Builds term doc vectors from document collection, and filters the terms that are not included in Ivory.SrcVocab
    long startTime = System.currentTimeMillis();
    long preprocessStartTime = System.currentTimeMillis();
    LOG.info("Building term doc vectors...");
    new BuildTermDocVectors(conf).run();
    LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Get CF and DF counts
    startTime = System.currentTimeMillis();
    LOG.info("Counting terms...");
    new ComputeGlobalTermStatistics(conf).run();
    LOG.info("TermCount = " + env.readCollectionTermCount() + "\nJob finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Build a map from terms to sequentially generated integer term ids
    startTime = System.currentTimeMillis();
    LOG.info("Building term-to-integer id mapping...");
    new BuildDictionary(conf).run();
    LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Compute term weights, and output weighted term doc vectors
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted term doc vectors...");
    conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
    if (mode == CROSS_LINGUAL_F) {
        conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);

        // translate term doc vectors into English. 
        conf.setBoolean("Ivory.Normalize", true);
        new BuildTranslatedTermDocVectors(conf).run();
    } else {
        conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);

        // get weighted term doc vectors
        conf.setBoolean("Ivory.Normalize", true);
        new BuildWeightedTermDocVectors(conf).run();
    }
    LOG.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted integer doc vectors...");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    if (mode == MONO_LINGUAL) {
        new BuildIntDocVectors(conf).run();
        new BuildWeightedIntDocVectors(conf).run();
        LOG.info("Job BuildWeightedIntDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    } else {
        BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool = new BuildTargetLangWeightedIntDocVectors(
                conf);

        int finalNumDocs = weightedIntVectorsTool.run();
        LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        if (finalNumDocs > 0) {
            LOG.info("Changed doc count from " + env.readCollectionDocumentCount() + " to = " + finalNumDocs);
            env.writeCollectionDocumentCount(finalNumDocs);
        }
        // Set Property.CollectionTermCount to the size of the target vocab, since all docs are translated into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures.
        Vocab engVocabH = null;
        try {
            engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
        LOG.info("Changed term count to : " + env.readCollectionTermCount() + " = " + engVocabH.size());
        env.writeCollectionTermCount(engVocabH.size());
    }

    LOG.info("Preprocessing job finished in " + (System.currentTimeMillis() - preprocessStartTime) / 1000.0
            + " seconds");

    return 0;
}

From source file: ivory.core.preprocess.BuildTermDocVectors.java

License: Apache License

@SuppressWarnings("unchecked")
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    String collectionName = conf.get(Constants.CollectionName);
    String collectionPath = conf.get(Constants.CollectionPath);
    String inputFormat = conf.get(Constants.InputFormat);
    String tokenizer = conf.get(Constants.Tokenizer);
    String mappingClass = conf.get(Constants.DocnoMappingClass);
    int docnoOffset = conf.getInt(Constants.DocnoOffset, 0);
    int numReducers = conf.getInt(Constants.TermDocVectorSegments, 0);

    LOG.info("PowerTool: " + BuildTermDocVectors.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionPath, collectionPath));
    LOG.info(String.format(" - %s: %s", Constants.InputFormat, inputFormat));
    LOG.info(String.format(" - %s: %s", Constants.Tokenizer, tokenizer));
    LOG.info(String.format(" - %s: %s", Constants.DocnoMappingClass, mappingClass));
    LOG.info(String.format(" - %s: %s", Constants.DocnoOffset, docnoOffset));
    LOG.info(String.format(" - %s: %s", Constants.TermDocVectorSegments, numReducers));

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    Path mappingFile = env.getDocnoMappingData();

    if (!fs.exists(mappingFile)) {
        LOG.error("Error, docno mapping data file " + mappingFile + " doesn't exist!");
        return 0;
    }

    DistributedCache.addCacheFile(mappingFile.toUri(), conf);

    Path outputPath = new Path(env.getTermDocVectorsDirectory());
    if (fs.exists(outputPath)) {
        LOG.info("TermDocVectors already exist: Skipping!");
        return 0;
    }

    env.writeCollectionName(collectionName);
    env.writeCollectionPath(collectionPath);
    env.writeInputFormat(inputFormat);
    env.writeDocnoMappingClass(mappingClass);
    env.writeTokenizerClass(tokenizer);
    env.writeDocnoOffset(docnoOffset);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    Job job1 = new Job(conf, BuildTermDocVectors.class.getSimpleName() + ":" + collectionName);
    job1.setJarByClass(BuildTermDocVectors.class);

    job1.setNumReduceTasks(numReducers);

    FileInputFormat.addInputPaths(job1, collectionPath);
    FileOutputFormat.setOutputPath(job1, outputPath);
    SequenceFileOutputFormat.setOutputCompressionType(job1, SequenceFile.CompressionType.RECORD);

    job1.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat));
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);

    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(LazyTermDocVector.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(LazyTermDocVector.class);

    job1.setMapperClass(MyMapper.class);

    long startTime = System.currentTimeMillis();
    job1.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Write out number of postings.
    int collectionDocCount = (int) job1.getCounters().findCounter(Docs.Total).getValue();
    env.writeCollectionDocumentCount(collectionDocCount);

    Path dlFile = env.getDoclengthsData();
    if (fs.exists(dlFile)) {
        LOG.info("DocLength data exists: Skipping!");
        return 0;
    }

    conf.setInt(Constants.CollectionDocumentCount, collectionDocCount);
    conf.set(InputPath, env.getDoclengthsDirectory().toString());
    conf.set(DocLengthDataFile, dlFile.toString());

    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    LOG.info("Writing doc length data to " + dlFile + "...");

    Job job2 = new Job(conf, "DocLengthTable:" + collectionName);
    job2.setJarByClass(BuildTermDocVectors.class);

    job2.setNumReduceTasks(0);
    job2.setInputFormatClass(NullInputFormat.class);
    job2.setOutputFormatClass(NullOutputFormat.class);
    job2.setMapperClass(DocLengthDataWriterMapper.class);

    startTime = System.currentTimeMillis();
    job2.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    long collectionSumOfDocLengths = job2.getCounters().findCounter(DocLengths.SumOfDocLengths).getValue();
    env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount);

    return 0;
}

From source file: ivory.preprocess.BuildTermDocVectors2.java

License: Apache License

@SuppressWarnings("unchecked")
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    String collectionName = conf.get(Constants.CollectionName);
    String collectionPath = conf.get(Constants.CollectionPath);
    String inputFormat = conf.get(Constants.InputFormat);
    String tokenizer = conf.get(Constants.Tokenizer);
    String mappingClass = conf.get(Constants.DocnoMappingClass);
    int docnoOffset = conf.getInt(Constants.DocnoOffset, 0);

    LOG.info("PowerTool: BuildTermDocVectors2");
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionPath, collectionPath));
    LOG.info(String.format(" - %s: %s", Constants.InputFormat, inputFormat));
    LOG.info(String.format(" - %s: %s", Constants.Tokenizer, tokenizer));
    LOG.info(String.format(" - %s: %s", Constants.DocnoMappingClass, mappingClass));
    LOG.info(String.format(" - %s: %s", Constants.DocnoOffset, docnoOffset));

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    Path mappingFile = env.getDocnoMappingData();

    if (!fs.exists(mappingFile)) {
        LOG.error("Error, docno mapping data file " + mappingFile + "doesn't exist!");
        return 0;
    }

    DistributedCache.addCacheFile(mappingFile.toUri(), conf);

    Path outputPath = new Path(env.getTermDocVectorsDirectory());
    if (fs.exists(outputPath)) {
        LOG.info("TermDocVectors already exist: Skipping!");
        return 0;
    }

    env.writeCollectionName(collectionName);
    env.writeCollectionPath(collectionPath);
    env.writeInputFormat(inputFormat);
    env.writeDocnoMappingClass(mappingClass);
    env.writeTokenizerClass(tokenizer);
    env.writeDocnoOffset(docnoOffset);

    Job job1 = new Job(conf, "BuildTermDocVectors2:" + collectionName);
    job1.setJarByClass(BuildTermDocVectors2.class);

    job1.setNumReduceTasks(0);

    FileInputFormat.addInputPaths(job1, collectionPath);
    FileOutputFormat.setOutputPath(job1, outputPath);
    SequenceFileOutputFormat.setOutputCompressionType(job1, SequenceFile.CompressionType.RECORD);

    job1.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat));
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);

    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(LazyTermDocVector.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(LazyTermDocVector.class);

    job1.setMapperClass(MyMapper.class);

    long startTime = System.currentTimeMillis();
    job1.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // write out number of postings
    int collectionDocCount = (int) job1.getCounters().findCounter(Docs.Total).getValue();
    env.writeCollectionDocumentCount(collectionDocCount);

    Path dlFile = env.getDoclengthsData();
    if (fs.exists(dlFile)) {
        LOG.info("DocLength data exists: Skipping!");
        return 0;
    }

    conf.setInt(Constants.CollectionDocumentCount, collectionDocCount);
    conf.set(InputPath, env.getDoclengthsDirectory().toString());
    conf.set(DocLengthDataFile, dlFile.toString());

    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    LOG.info("Writing doc length data to " + dlFile + "...");

    Job job2 = new Job(conf, "DocLengthTable2:" + collectionName);
    job2.setJarByClass(BuildTermDocVectors2.class);

    job2.setNumReduceTasks(0);
    job2.setInputFormatClass(NullInputFormat.class);
    job2.setOutputFormatClass(NullOutputFormat.class);
    job2.setMapperClass(DocLengthDataWriterMapper.class);

    startTime = System.currentTimeMillis();
    job2.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    long collectionSumOfDocLengths = job2.getCounters().findCounter(DocLengths.SumOfDocLengths).getValue();
    env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount);

    return 0;
}

From source file: jobs.MatrixBlockMult.java

License: Apache License

public int run(String[] args) throws Exception {

    Configuration conf = getConf();

    conf.setFloat("SCALAR", Float.parseFloat(args[3]));

    conf.setBoolean("LTRANS", Boolean.parseBoolean(args[4]));
    conf.setBoolean("RTRANS", Boolean.parseBoolean(args[5]));

    conf.setInt("NRL", Integer.parseInt(args[6]));
    conf.setInt("NCL", Integer.parseInt(args[7]));
    conf.setInt("NRR", Integer.parseInt(args[8]));
    conf.setInt("NCR", Integer.parseInt(args[9]));

    //set # of reducers
    conf.setInt("mapred.reduce.tasks", Integer.parseInt(args[10]));

    //Get optional blocksize parameters
    if (args.length >= 12)
        conf.setInt("SRL", Integer.parseInt(args[11]));

    if (args.length >= 13)
        conf.setInt("SCL", Integer.parseInt(args[12]));

    if (args.length >= 14)
        conf.setInt("SRR", Integer.parseInt(args[13]));

    if (args.length >= 15)
        conf.setInt("SCR", Integer.parseInt(args[14]));

    conf.set("LEFTNAME", args[0]);
    conf.set("RIGHTNAME", args[1]);
    conf.set("RESNAME", args[2]);

    //heap space - should be entered with the -D format and not dealt with by the program.    
    conf.set("mapred.map.child.java.opts", "-Xmx3G");
    conf.set("mapred.reduce.child.java.opts", "-Xmx3G");

    //job
    Job job1 = new Job(conf, "MatrixBlockMult");
    job1.setJarByClass(MatrixBlockMult.class);

    // Map
    FileInputFormat.addInputPath(job1, new Path(args[0]));
    FileInputFormat.addInputPath(job1, new Path(args[1]));
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(BlockMultiplicationGroupingMapper.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(MatrixBlock.class);

    //Reduce       
    job1.setReducerClass(MatrixBlockMultReducer.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(MatrixBlock.class);
    FileOutputFormat.setOutputPath(job1, new Path(args[2]));
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    //job1.setOutputFormatClass(TextOutputFormat.class);

    return job1.waitForCompletion(false) ? 0 : 1;
}

From source file: jobs.MatrixBlockTraceMult.java

License: Apache License

public int run(String[] args) throws Exception {

    Configuration conf = getConf();

    conf.setFloat("SCALAR", Float.parseFloat(args[3]));

    conf.setBoolean("LTRANS", Boolean.parseBoolean(args[4]));
    conf.setBoolean("RTRANS", Boolean.parseBoolean(args[5]));

    //set # of reducers
    conf.setInt("mapred.reduce.tasks", Integer.parseInt(args[6]));

    //Get optional blocksize parameters
    if (args.length >= 8)
        conf.setInt("SRL", Integer.parseInt(args[7]));

    if (args.length >= 9)
        conf.setInt("SCL", Integer.parseInt(args[8]));

    if (args.length >= 10)
        conf.setInt("SRR", Integer.parseInt(args[9]));

    if (args.length >= 11)
        conf.setInt("SCR", Integer.parseInt(args[10]));

    conf.set("LEFTNAME", args[0]);
    conf.set("RIGHTNAME", args[1]);

    //heap space - should be entered with the -D format and not dealt with by the program.    
    conf.set("mapred.map.child.java.opts", "-Xmx3G");
    conf.set("mapred.reduce.child.java.opts", "-Xmx3G");

    //job
    Job job1 = new Job(conf, "MatrixBlockTraceMult");
    job1.setJarByClass(MatrixBlockMult.class);

    // Map
    FileInputFormat.addInputPath(job1, new Path(args[0]));
    FileInputFormat.addInputPath(job1, new Path(args[1]));
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(SquareBlockTraceMultiplicationGroupingMapper.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(MatrixBlock.class);

    //Reduce       
    job1.setReducerClass(SquareMatrixBlockTraceMultReducer.class);
    job1.setOutputKeyClass(NullWritable.class);
    job1.setOutputValueClass(DoubleWritable.class);
    FileOutputFormat.setOutputPath(job1, new Path(args[2]));
    job1.setOutputFormatClass(TextOutputFormat.class);

    return job1.waitForCompletion(false) ? 0 : 1;
}