Example usage for org.apache.hadoop.mapreduce Job setNumReduceTasks

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job setNumReduceTasks.

Prototype

public void setNumReduceTasks(int tasks) throws IllegalStateException 

Document

Set the number of reduce tasks for the job.
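
Before the collected examples, here is a minimal driver sketch showing where setNumReduceTasks fits when a job is configured with the new mapreduce API. The PassThroughDriver class name and the args-based input/output paths are placeholders, not taken from the sources below; the identity Mapper and Reducer are used so the snippet stays self-contained.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class PassThroughDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "pass-through");
        job.setJarByClass(PassThroughDriver.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Identity mapper and reducer: records pass through unchanged.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Request four reduce tasks. This must be set before submission; the method
        // throws IllegalStateException once the job has been submitted.
        // Calling setNumReduceTasks(0) would instead make this a map-only job whose
        // mapper output is written directly to the output path, a pattern several of
        // the examples below use.
        job.setNumReduceTasks(4);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}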

Usage

From source file:com.twitter.algebra.nmf.SampleRowsJob.java

License:Apache License

public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, float sampleRate)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.setFloat(SAMPLERATE, sampleRate);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "samplerows");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(SampleRowsJob.class);
    job.setJobName(SampleRowsJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);

    job.setNumReduceTasks(0);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.algebra.nmf.XtXJob.java

License:Apache License

public void run(Configuration conf, Path matrixInputPath, int numCols, String xmPath, Path matrixOutputPath)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.setInt(MATRIXCOLS, numCols);
    //    conf.set(XMPATH, xmPath);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, new Path[] { matrixInputPath }, "xtx");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJobName("XtXJob-" + matrixOutputPath.getName());
    job.setJarByClass(XtXJob.class);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);
    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "xtx");
    job.setNumReduceTasks(numReducers);
    // ensures total order (when used with {@link MatrixOutputFormat}),
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numCols);

    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.submit();
    job.waitForCompletion(true);
}

From source file:com.twitter.algebra.TransposeJob.java

License:Apache License

/**
 * Perform transpose of A, where A refers to the path that contains a matrix
 * in {@link SequenceFileInputFormat}.
 * 
 * @param conf
 *          the initial configuration
 * @param matrixInputPath
 *          the path to the input files that we process
 * @param matrixOutputPath
 *          the path of the resulting transpose matrix
 * @param numInputRows
 *          the number of rows in the input matrix
 * @param numInputCols
 *          the number of columns in the input matrix
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int numInputRows,
        int numInputCols) throws IOException, InterruptedException, ClassNotFoundException {
    conf.setInt(NUM_ORIG_ROWS_KEY, numInputRows);
    conf.setInt(RowPartitioner.TOTAL_KEYS, numInputCols);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "transpose");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(TransposeJob.class);
    job.setJobName(TransposeJob.class.getSimpleName());

    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(TransposeMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "transpose");
    job.setNumReduceTasks(numReducers);
    //    job.setPartitionerClass(RowPartitioner.IntRowPartitioner.class);
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numInputCols);
    job.setCombinerClass(MergeVectorsCombiner.class);
    job.setReducerClass(MergeVectorsReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}

From source file:com.twitter.elephanttwin.indexing.AbstractBlockIndexingJob.java

License:Open Source License

/**
 * Sets up various job properties required for the indexing job.
 * If your implementation needs to mess with the conf, you can do so by overriding
 * this method (remember to call super.setupJob()!) or in setMapper().
 * @param conf
 * @return
 * @throws IOException
 */
protected Job setupJob(Configuration conf) throws IOException {
    Job job = new Job(new Configuration(conf));
    job.setJarByClass(getClass());
    job.setInputFormatClass(BlockIndexedFileInputFormat.class);
    job.setReducerClass(MapFileIndexingReducer.class);
    job.setMapOutputKeyClass(TextLongPairWritable.class);
    job.setMapOutputValueClass(LongPairWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ListLongPair.class);
    job.setPartitionerClass(TextLongPairWritable.Parititioner.class);
    job.setSortComparatorClass(TextLongPairWritable.PairComparator.class);
    job.setGroupingComparatorClass(TextLongPairWritable.KeyOnlyComparator.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setNumReduceTasks(getNumPartitions());
    BlockIndexedFileInputFormat.setIndexOptions(job, getInputFormat(), getValueClass(), getIndex(),
            getColumnName());
    return job;
}

From source file:com.twitter.elephanttwin.lucene.indexing.AbstractLuceneIndexingJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    LOG = Logger.getLogger(this.getClass());
    params = newIndexConfig();

    LOG.info("Starting up indexer...");
    LOG.info(" - input: " + Joiner.on(" ").join(IndexConfig.input.get()));
    LOG.info(" - index: " + IndexConfig.index);
    LOG.info(" - number of shards: " + IndexConfig.numPartitions.get());

    Configuration conf = getConf();

    conf.set(AbstractLuceneIndexingReducer.HDFS_INDEX_LOCATION, IndexConfig.index.get());
    conf.set(AbstractLuceneIndexingReducer.ANALYZER, IndexConfig.analyzer.get());
    conf.set(AbstractLuceneIndexingReducer.SIMILARITY, IndexConfig.similarity.get());
    conf.setInt(AbstractSamplingIndexingMapper.SAMPLE_PERCENTAGE, IndexConfig.samplePercentage.get());

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    Job job = new Job(conf, getJobName(params));

    // Job's constructor copies conf; we need a reference to the one the job
    // is actually using.
    conf = job.getConfiguration();

    job.setJarByClass(this.getClass());

    job.setNumReduceTasks(IndexConfig.numPartitions.get());

    for (String s : IndexConfig.input.get()) {
        Path spath = new Path(s);
        FileSystem fs = spath.getFileSystem(getConf());
        List<FileStatus> stats = Lists.newArrayList();
        addInputPathRecursively(stats, fs, spath, HdfsUtils.HIDDEN_FILE_FILTER);
        for (FileStatus foundStat : stats) {
            FileInputFormat.addInputPath(job, foundStat.getPath());
        }
    }

    FileOutputFormat.setOutputPath(job, new Path(IndexConfig.index.get()));

    setupJob(job);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(IndexConfig.index.get());
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    LOG.info("Job " + getJobName(params) + " started.");
    // TODO Jimmy has a parameter that controls whether we wait in Thud but not in ES.
    // when would we not want to wait?
    job.waitForCompletion(true);
    LOG.info("Job " + getJobName(params) + " Finished in " + (System.currentTimeMillis() - startTime) / 1000.0
            + " seconds");

    if (job.isSuccessful()) {
        writeIndexDescriptors(getIndexDescriptor());
    }
    return job.isSuccessful() ? 0 : 1;
}

From source file:com.twitter.elephanttwin.retrieval.ScanUsingIndexJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    params = new IndexConfig();

    LOG.info(" - input: " + Joiner.on(" ").join(params.getInput()));
    LOG.info(" - output: " + IndexConfig.output.get());

    Configuration conf = getConf();

    Path outputDir = new Path(params.getOutput());
    FileSystem fs = outputDir.getFileSystem(conf);
    fs.delete(outputDir, true);

    int totalInputFiles = 0;
    List<FileStatus> stats = Lists.newArrayList();
    for (String s : params.getInput()) {
        Path spath = new Path(IndexConfig.index.get() + s);
        HdfsUtils.addInputPathRecursively(stats, fs, spath, hiddenDirectoryFilter, indexDataFilter);
    }

    totalInputFiles = stats.size();
    LOG.info(totalInputFiles + " total index files to be scanned");

    conf.set(IndexScanMapper.searchColumnName, params.getColumnName());
    Job job = new Job(new Configuration(conf));
    job.setJarByClass(getClass());
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(params.getOutput()));

    for (FileStatus file : stats)
        FileInputFormat.addInputPath(job, file.getPath());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    job.setNumReduceTasks(1);

    job.setMapperClass(IndexScanMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    job.setJobName("ScanUsingIndexJob:" + IndexConfig.input.get());
    BlockIndexedFileInputFormat.setSearchOptions(job, params.getinputFormat(), params.getValueClass(),
            params.getIndex(), (String) null);
    job.waitForCompletion(true);
    return 0;
}

From source file:com.twitter.hraven.etl.JobFileProcessor.java

License:Apache License

/**
 * @param conf
 *          to use to create and run the job
 * @param scan
 *          to be used to scan the raw table.
 * @param totalJobCount
 *          the total number of jobs that need to be run in this batch. Used
 *          in job name.
 * @return The job to be submitted to the cluster.
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private Job getProcessingJob(Configuration conf, Scan scan, int totalJobCount) throws IOException {

    Configuration confClone = new Configuration(conf);

    // Turn off speculative execution.
    // Note: must be BEFORE the job construction with the new mapreduce API.
    confClone.setBoolean("mapred.map.tasks.speculative.execution", false);

    // Set up job
    Job job = new Job(confClone, getJobName(totalJobCount));

    // This is a map-only class, skip reduce step
    job.setNumReduceTasks(0);
    job.setJarByClass(JobFileProcessor.class);
    job.setOutputFormatClass(MultiTableOutputFormat.class);

    TableMapReduceUtil.initTableMapperJob(Constants.HISTORY_RAW_TABLE, scan, JobFileTableMapper.class,
            JobFileTableMapper.getOutputKeyClass(), JobFileTableMapper.getOutputValueClass(), job);

    return job;
}

From source file:com.twitter.hraven.etl.JobFileRawLoader.java

License:Apache License

/**
 * @param conf
 *          to use to create and run the job. Should be an HBase
 *          configuration.
 * @param input
 *          path to the processFile
 * @param totalJobCount
 *          the total number of jobs that need to be run in this batch. Used
 *          in job name.
 * @return whether all job confs were loaded properly.
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private boolean runRawLoaderJob(Configuration myHBaseConf, String input, int totalJobCount)
        throws IOException, InterruptedException, ClassNotFoundException {
    boolean success;

    // Turn off speculative execution.
    // Note: must be BEFORE the job construction with the new mapreduce API.
    myHBaseConf.setBoolean("mapred.map.tasks.speculative.execution", false);

    // Set up job
    Job job = new Job(myHBaseConf, getJobName(totalJobCount));
    job.setJarByClass(JobFileRawLoader.class);

    Path inputPath = new Path(input);

    if (hdfs.exists(inputPath)) {

        // Set input
        job.setInputFormatClass(SequenceFileInputFormat.class);
        SequenceFileInputFormat.setInputPaths(job, inputPath);

        job.setMapperClass(JobFileRawLoaderMapper.class);

        // Set the output format to push data into HBase.
        job.setOutputFormatClass(TableOutputFormat.class);
        TableMapReduceUtil.initTableReducerJob(Constants.HISTORY_RAW_TABLE, null, job);

        job.setOutputKeyClass(JobFileRawLoaderMapper.getOutputKeyClass());
        job.setOutputValueClass(JobFileRawLoaderMapper.getOutputValueClass());

        // This is a map-only class, skip reduce step
        job.setNumReduceTasks(0);

        // Run the job
        success = job.waitForCompletion(true);

        if (success) {
            success = hdfs.delete(inputPath, false);
        }

    } else {
        System.err.println("Unable to find processFile: " + inputPath);
        success = false;
    }
    return success;
}

From source file:com.twitter.scalding.parquet.scrooge.TestCorruptScroogeRecords.java

License:Apache License

@Override
public void setupJob(Job job, Path path) throws Exception {
    job.setInputFormatClass(ParquetScroogeInputFormat.class);
    ParquetScroogeInputFormat.setInputPaths(job, path);
    ParquetScroogeInputFormat.setThriftClass(job.getConfiguration(), StructWithUnionV2.class);

    ThriftReadSupport.setRecordConverterClass(job.getConfiguration(), ScroogeRecordConverter.class);

    job.setMapperClass(ReadMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(NullOutputFormat.class);
}

From source file:com.wipro.ats.bdre.datagen.mr.Driver.java

License:Apache License

/**
 * @param args the cli arguments
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = getConf();
    GetGeneralConfig generalConfig = new GetGeneralConfig();
    GeneralConfig gc = generalConfig.byConigGroupAndKey("imconfig", "common.default-fs-name");
    conf.set("fs.defaultFS", gc.getDefaultVal());

    String processId = args[0];
    Path outputDir = new Path(ResolvePath.replaceVars(args[1]));

    Properties dataProps = Config.getDataProperties(processId);
    Properties tableProps = Config.getTableProperties(processId);

    TableUtil tableUtil = new TableUtil();
    Table table = tableUtil.formTableFromConfig(processId);
    FileSystem fs = FileSystem.get(conf);
    LOGGER.info("Default FS =" + conf.get("fs.defaultFS"));
    //set in the conf for mappers to use
    conf.set(Config.SEPARATOR_KEY, tableProps.getProperty("separator"));
    conf.set(Config.PID_KEY, processId);
    conf.setLong(Config.NUM_ROWS_KEY, Long.parseLong(dataProps.getProperty("numRows")));
    conf.setInt(Config.NUM_SPLITS_KEY, Integer.parseInt(dataProps.getProperty("numSplits")));

    Job job = Job.getInstance(conf);
    Path mrOutputPath = new Path(outputDir.toString() + "/MROUT/" + table.getTableName());

    FileOutputFormat.setOutputPath(job, mrOutputPath);
    job.setJobName("Datagen-" + table.getTableName());
    job.setJarByClass(Driver.class);
    job.setMapperClass(RecordGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RangeInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.waitForCompletion(true);

    //merge and create a single file

    Path srcDir = mrOutputPath;
    Path destFile = new Path(outputDir.toString() + "/" + table.getTableName());
    FileUtil.copyMerge(fs, srcDir, fs, destFile, true, conf, "");

    //Return file info oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash("0");
    registerFileInfo.setFileSize(0L);
    registerFileInfo.setPath(destFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));
    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);
    return 0;
}