Example usage for org.apache.hadoop.mapreduce Job setNumReduceTasks

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job setNumReduceTasks.

Prototype

public void setNumReduceTasks(int tasks) throws IllegalStateException 

Documentation

Set the number of reduce tasks for the job.
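
Before the per-project examples below, here is a minimal sketch of the call in a typical driver. The WordCountDriver, TokenizerMapper and SumReducer names are placeholders, not taken from any example on this page; the essential points are that setNumReduceTasks must be called before the job is submitted (otherwise it throws IllegalStateException) and that passing 0 produces a map-only job.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCountDriver.class);

        // TokenizerMapper and SumReducer are assumed to be defined elsewhere.
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(SumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Must be called before submission, or an IllegalStateException is thrown.
        // Use 0 to run a map-only job that writes mapper output directly.
        job.setNumReduceTasks(4);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Note how many of the drivers below read the reducer count from a command-line argument or a configuration key and fall back to a default, which is the usual pattern when the right degree of parallelism depends on cluster size.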

Usage

From source file:clustering.mst.Driver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.err.printf("usage: %s similarity_result_dir document_count_file output_dir "
                + "[cluster_threshold] [reduce_number] [compression]\n", getClass().getSimpleName());
        System.exit(1);
    }

    Path step1_OutputDir = new Path(args[2] + "/step1");
    Path resultDir = new Path(args[2] + "/result");

    URI docCntFile = new URI(args[1] + "/part-r-00000#docCnt");

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    if (args.length > 3) {
        conf.setDouble("final.threshold", Double.valueOf(args[3]));
    } else {
        conf.setDouble("final.threshold", 0.2d);
    }
    if (args.length > 4) {
        conf.setInt("reduce.task.num", Integer.valueOf(args[4]));
    } else {
        conf.setInt("reduce.task.num", 5);
    }

    JobControl jobControl = new JobControl("mst jobs");

    /* step 1, split and calculate the child msts */

    Job childJob = Job.getInstance(conf, "mst child job");
    childJob.setJarByClass(Driver.class);

    childJob.addCacheFile(docCntFile);

    if (args.length > 5 && args[5].equals("0")) {
        FileInputFormat.addInputPath(childJob, new Path(args[0]));
        childJob.setInputFormatClass(KeyValueTextInputFormat.class);
    } else {
        SequenceFileInputFormat.addInputPath(childJob, new Path(args[0]));
        childJob.setInputFormatClass(SequenceFileAsTextInputFormat.class);
    }

    FileOutputFormat.setOutputPath(childJob, step1_OutputDir);

    childJob.setMapperClass(ChildMapper.class);
    childJob.setMapOutputKeyClass(DoubleWritable.class);
    childJob.setMapOutputValueClass(Text.class);

    childJob.setPartitionerClass(ChildPartitioner.class);

    childJob.setReducerClass(ChildReducer.class);
    childJob.setNumReduceTasks(conf.getInt("reduce.task.num", 1));
    childJob.setOutputKeyClass(DoubleWritable.class);
    childJob.setOutputValueClass(Text.class);

    ControlledJob controlledChildJob = new ControlledJob(conf);
    controlledChildJob.setJob(childJob);
    jobControl.addJob(controlledChildJob);

    /* step 2, merge step 1's output and calculate final mst */

    Job finalJob = Job.getInstance(conf, "mst final job");
    finalJob.setJarByClass(FinalReducer.class);

    finalJob.addCacheFile(docCntFile);

    FileInputFormat.addInputPath(finalJob, step1_OutputDir);
    finalJob.setInputFormatClass(KeyValueTextInputFormat.class);

    finalJob.setMapperClass(FinalMapper.class);
    finalJob.setMapOutputKeyClass(DoubleWritable.class);
    finalJob.setMapOutputValueClass(Text.class);

    finalJob.setReducerClass(FinalReducer.class);
    finalJob.setOutputKeyClass(IntWritable.class);
    finalJob.setOutputValueClass(IntWritable.class);

    FileOutputFormat.setOutputPath(finalJob, resultDir);

    ControlledJob finalControlledJob = new ControlledJob(conf);
    finalControlledJob.setJob(finalJob);
    finalControlledJob.addDependingJob(controlledChildJob);
    jobControl.addJob(finalControlledJob);

    // run jobs

    MapReduceUtils.runJobs(jobControl);

    return finalJob.waitForCompletion(true) ? 0 : 1;
}

From source file:clustering.similarity.ISimDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s simpre_dir output_dir " + "[compression_or_not] [reduce_task_number]\n",
                getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    Job job = Job.getInstance(conf, "isim job");
    job.setJarByClass(ISimDriver.class);

    if (args.length > 2 && args[2].equals("0")) {
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
    } else {
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        SequenceFileInputFormat.addInputPath(job, new Path(args[0]));

        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.GzipCodec.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));
    }

    if (args.length > 3) {
        conf.setInt("reduce.num", Integer.valueOf(args[3]));
    } else {
        conf.setInt("reduce.num", 5);
    }

    job.setMapperClass(ISimMapper.class);
    job.setMapOutputKeyClass(IntIntTupleWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setCombinerClass(ISimCombiner.class);
    job.setPartitionerClass(HashPartitioner.class);

    job.setNumReduceTasks(conf.getInt("reduce.num", 1));

    job.setReducerClass(ISimReducer.class);
    job.setOutputKeyClass(IntIntTupleWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    long starttime = System.currentTimeMillis();
    boolean complete = job.waitForCompletion(true);
    long endtime = System.currentTimeMillis();
    System.out.println("inverted similarity job finished in: " + (endtime - starttime) / 1000 + " seconds");

    return complete ? 0 : 1;
}

From source file:clustering.similarity.PreDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf(
                "usage: %s inverted_index_result_dir output_dir"
                        + " [compress_or_not] [reducer_number] [deci_number]\n",
                this.getClass().getSimpleName());
        System.exit(1);
    }
    Configuration conf = getConf();

    conf = MapReduceUtils.initConf(conf);
    conf.set("mapreduce.reduce.speculative", "false");

    // TODO: 17-4-24 calculate split number from reducer number
    conf.setInt("split.num", 8);

    if (args.length > 3) {
        conf.setInt("reducer.num", Integer.valueOf(args[3]));
    } else {
        conf.setInt("reducer.num", 29);
    }
    if (args.length > 4) {
        conf.setInt("deci.number", Integer.valueOf(args[4]));
    } else {
        conf.setInt("deci.number", 3);
    }

    Job job = Job.getInstance(conf, "pre job");
    job.setJarByClass(PreDriver.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormatClass(KeyValueTextInputFormat.class);

    job.setMapperClass(PreMapper.class);
    job.setMapOutputKeyClass(IntIntTupleWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(PrePartitioner.class);

    job.setNumReduceTasks(conf.getInt("reducer.num", 29));
    job.setReducerClass(PreReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // set default compression
    if (args.length > 2 && args[2].equals("0")) {
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
    } else {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.GzipCodec.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));
    }

    long starttime = System.currentTimeMillis();
    boolean complete = job.waitForCompletion(true);
    long endtime = System.currentTimeMillis();
    System.out.println("inverted similarity pre job finished in: " + (endtime - starttime) / 1000 + " seconds");

    return complete ? 0 : 1;
}

From source file:cmd.sampler.java

License:Apache License

/**
 * Driver for InputSampler from the command line. Configures a JobConf
 * instance and calls {@link #writePartitionFile}.
 */
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    ArrayList<String> otherArgs = new ArrayList<String>();
    Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                job.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else if ("-inFormat".equals(args[i])) {
                job.setInputFormatClass(Class.forName(args[++i]).asSubclass(InputFormat.class));
            } else if ("-keyClass".equals(args[i])) {
                job.setMapOutputKeyClass(Class.forName(args[++i]).asSubclass(WritableComparable.class));
            } else if ("-splitSample".equals(args[i])) {
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new SplitSampler<K, V>(numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    if (job.getNumReduceTasks() <= 1) {
        System.err.println("Sampler requires more than one reducer");
        return printUsage();
    }
    if (otherArgs.size() < 2) {
        System.out.println("ERROR: Wrong number of parameters: ");
        return printUsage();
    }
    if (null == sampler) {
        sampler = new SplitSampler<K, V>(1000, 10);
    }

    Path outf = new Path(otherArgs.remove(otherArgs.size() - 1));
    TotalOrderPartitioner.setPartitionFile(getConf(), outf);
    for (String s : otherArgs) {
        FileInputFormat.addInputPath(job, new Path(s));
    }
    InputSampler.<K, V>writePartitionFile(job, sampler);

    return 0;
}

From source file:cn.jpush.hdfs.mr.example.BaileyBorweinPlouffe.java

License:Apache License

/** Create and setup a job */
@SuppressWarnings("deprecation")
private static Job createJob(String name, Configuration conf) throws IOException {
    final Job job = new Job(conf, NAME + "_" + name);
    final Configuration jobconf = job.getConfiguration();
    job.setJarByClass(BaileyBorweinPlouffe.class);

    // setup mapper
    job.setMapperClass(BbpMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);

    // setup reducer
    job.setReducerClass(BbpReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(1);

    // setup input
    job.setInputFormatClass(BbpInputFormat.class);

    // disable task timeout
    jobconf.setLong(MRJobConfig.TASK_TIMEOUT, 0);

    // do not use speculative execution
    jobconf.setBoolean(MRJobConfig.MAP_SPECULATIVE, false);
    jobconf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, false);
    return job;
}

From source file:cn.lhfei.hbase.ch04.SampleUploader.java

License:Apache License

/**
 * Job configuration.
 */
public static Job configureJob(Configuration conf, String[] args) throws IOException {
    Path inputPath = new Path(args[0]);
    String tableName = args[1];
    //Job job = new Job(conf, NAME + "_" + tableName);

    Job job = Job.getInstance(conf);

    //job.setJarByClass(Uploader.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(Uploader.class);
    // No reducers. Just write straight to table. Call initTableReducerJob
    // because it sets up the TableOutputFormat.
    TableMapReduceUtil.initTableReducerJob(tableName, null, job);
    job.setNumReduceTasks(0);
    return job;
}

From source file:co.cask.cdap.data.tools.HBaseTableExporter.java

License:Apache License

/**
 * Sets up the actual MapReduce job.
 * @param tx The transaction which needs to be passed to the Scan instance. This transaction is used by
 *           coprocessors to filter out the data corresponding to invalid transactions.
 * @param tableName Name of the table which needs to be exported as HFiles.
 * @return the configured job
 * @throws IOException
 */
public Job createSubmittableJob(Transaction tx, String tableName) throws IOException {

    Job job = Job.getInstance(hConf, "HBaseTableExporter");

    job.setJarByClass(HBaseTableExporter.class);
    Scan scan = new Scan();
    scan.setCacheBlocks(false);
    // Set the transaction attribute for the scan.
    scan.setAttribute(TxConstants.TX_OPERATION_ATTRIBUTE_KEY, new TransactionCodec().encode(tx));
    job.setNumReduceTasks(0);

    TableMapReduceUtil.initTableMapperJob(tableName, scan, KeyValueImporter.class, null, null, job);

    FileSystem fs = FileSystem.get(hConf);
    Random rand = new Random();
    Path root = new Path(fs.getWorkingDirectory(), "hbasetableexporter");
    fs.mkdirs(root);
    while (true) {
        bulkloadDir = new Path(root, "" + rand.nextLong());
        if (!fs.exists(bulkloadDir)) {
            break;
        }
    }

    HFileOutputFormat2.setOutputPath(job, bulkloadDir);
    HTable hTable = new HTable(hConf, tableName);
    HFileOutputFormat2.configureIncrementalLoad(job, hTable);

    return job;
}

From source file:co.cask.cdap.etl.batch.mapreduce.ETLMapReduce.java

License:Apache License

@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    if (Boolean.valueOf(context.getSpecification().getProperty(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    CompositeFinisher.Builder finishers = CompositeFinisher.builder();

    Job job = context.getHadoopJob();
    Configuration hConf = job.getConfiguration();

    // plugin name -> runtime args for that plugin
    Map<String, Map<String, String>> runtimeArgs = new HashMap<>();

    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    PipelinePhase phase = phaseSpec.getPhase();
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, phaseSpec);

    // we checked at configure time that there is exactly one source
    String sourceName = phaseSpec.getPhase().getSources().iterator().next();

    BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(sourceName);
    batchSource = new LoggedBatchConfigurable<>(sourceName, batchSource);
    BatchSourceContext sourceContext = new MapReduceSourceContext(context, mrMetrics,
            new DatasetContextLookupProvider(context), sourceName, context.getRuntimeArguments());
    batchSource.prepareRun(sourceContext);
    runtimeArgs.put(sourceName, sourceContext.getRuntimeArguments());
    finishers.add(batchSource, sourceContext);

    Map<String, SinkOutput> sinkOutputs = new HashMap<>();

    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Constants.CONNECTOR_TYPE),
            phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        String sinkName = stageInfo.getName();
        // todo: add a better way to get info for all sinks
        if (!phase.getSinks().contains(sinkName)) {
            continue;
        }

        BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(sinkName);
        batchSink = new LoggedBatchConfigurable<>(sinkName, batchSink);
        MapReduceSinkContext sinkContext = new MapReduceSinkContext(context, mrMetrics,
                new DatasetContextLookupProvider(context), sinkName, context.getRuntimeArguments());
        batchSink.prepareRun(sinkContext);
        runtimeArgs.put(sinkName, sinkContext.getRuntimeArguments());
        finishers.add(batchSink, sinkContext);

        sinkOutputs.put(sinkName,
                new SinkOutput(sinkContext.getOutputNames(), stageInfo.getErrorDatasetName()));
    }
    finisher = finishers.build();
    hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));

    // setup time partition for each error dataset
    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Transform.PLUGIN_TYPE),
            phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        if (stageInfo.getErrorDatasetName() != null) {
            Map<String, String> args = new HashMap<>();
            args.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + "avro.schema.output.key",
                    Constants.ERROR_SCHEMA.toString());
            TimePartitionedFileSetArguments.setOutputPartitionTime(args, context.getLogicalStartTime());
            context.addOutput(Output.ofDataset(stageInfo.getErrorDatasetName(), args));
        }
    }

    job.setMapperClass(ETLMapper.class);
    Set<StageInfo> aggregators = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE);
    if (!aggregators.isEmpty()) {
        job.setReducerClass(ETLReducer.class);
        String aggregatorName = aggregators.iterator().next().getName();
        BatchAggregator aggregator = pluginInstantiator.newPluginInstance(aggregatorName);
        MapReduceAggregatorContext aggregatorContext = new MapReduceAggregatorContext(context, mrMetrics,
                new DatasetContextLookupProvider(context), aggregatorName, context.getRuntimeArguments());
        aggregator.prepareRun(aggregatorContext);
        finishers.add(aggregator, aggregatorContext);

        if (aggregatorContext.getNumPartitions() != null) {
            job.setNumReduceTasks(aggregatorContext.getNumPartitions());
        }
        // if the plugin sets the output key and value class directly, trust them
        Class<?> outputKeyClass = aggregatorContext.getGroupKeyClass();
        Class<?> outputValClass = aggregatorContext.getGroupValueClass();
        // otherwise, derive it from the plugin's parameters
        if (outputKeyClass == null) {
            outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
        }
        if (outputValClass == null) {
            outputValClass = TypeChecker.getGroupValueClass(aggregator);
        }
        hConf.set(GROUP_KEY_CLASS, outputKeyClass.getName());
        hConf.set(GROUP_VAL_CLASS, outputValClass.getName());
        // in case the classes are not a WritableComparable, but is some common type we support
        // for example, a String or a StructuredRecord
        WritableConversion writableConversion = WritableConversions.getConversion(outputKeyClass.getName());
        // if the conversion is null, it means the user is using their own object.
        if (writableConversion != null) {
            outputKeyClass = writableConversion.getWritableClass();
        }
        writableConversion = WritableConversions.getConversion(outputValClass.getName());
        if (writableConversion != null) {
            outputValClass = writableConversion.getWritableClass();
        }
        // check classes here instead of letting mapreduce do it, since mapreduce throws a cryptic error
        if (!WritableComparable.class.isAssignableFrom(outputKeyClass)) {
            throw new IllegalArgumentException(String.format(
                    "Invalid aggregator %s. The group key class %s must implement Hadoop's WritableComparable.",
                    aggregatorName, outputKeyClass));
        }
        if (!Writable.class.isAssignableFrom(outputValClass)) {
            throw new IllegalArgumentException(String.format(
                    "Invalid aggregator %s. The group value class %s must implement Hadoop's Writable.",
                    aggregatorName, outputValClass));
        }

        job.setMapOutputKeyClass(outputKeyClass);
        job.setMapOutputValueClass(outputValClass);
    } else {
        job.setNumReduceTasks(0);
    }

    hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(runtimeArgs));
}

From source file:co.cask.cdap.examples.datacleansing.DataCleansingMapReduce.java

License:Apache License

@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    partitionCommitter = PartitionBatchInput.setInput(context, DataCleansing.RAW_RECORDS,
            new KVTableStatePersistor(DataCleansing.CONSUMING_STATE, "state.key"));

    // Each run writes its output to a partition for the league
    Long timeKey = Long.valueOf(context.getRuntimeArguments().get(OUTPUT_PARTITION_KEY));
    PartitionKey outputKey = PartitionKey.builder().addLongField("time", timeKey).build();

    Map<String, String> metadataToAssign = ImmutableMap.of("source.program", "DataCleansingMapReduce");

    // set up two outputs - one for invalid records and one for valid records
    Map<String, String> invalidRecordsArgs = new HashMap<>();
    PartitionedFileSetArguments.setOutputPartitionKey(invalidRecordsArgs, outputKey);
    PartitionedFileSetArguments.setOutputPartitionMetadata(invalidRecordsArgs, metadataToAssign);
    context.addOutput(Output.ofDataset(DataCleansing.INVALID_RECORDS, invalidRecordsArgs));

    Map<String, String> cleanRecordsArgs = new HashMap<>();
    PartitionedFileSetArguments.setDynamicPartitioner(cleanRecordsArgs, TimeAndZipPartitioner.class);
    PartitionedFileSetArguments.setOutputPartitionMetadata(cleanRecordsArgs, metadataToAssign);
    context.addOutput(Output.ofDataset(DataCleansing.CLEAN_RECORDS, cleanRecordsArgs));

    Job job = context.getHadoopJob();
    job.setMapperClass(SchemaMatchingFilter.class);
    job.setNumReduceTasks(0);

    // simply propagate the schema (if any) to be used by the mapper
    String schemaJson = context.getRuntimeArguments().get(SCHEMA_KEY);
    if (schemaJson != null) {
        job.getConfiguration().set(SCHEMA_KEY, schemaJson);
    }
}

From source file:co.cask.cdap.examples.fileset.WordCount.java

License:Apache License

@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    job.setMapperClass(Tokenizer.class);
    job.setReducerClass(Counter.class);
    job.setNumReduceTasks(1);

    String inputDataset = context.getRuntimeArguments().get("input");
    inputDataset = inputDataset != null ? inputDataset : "lines";

    String outputDataset = context.getRuntimeArguments().get("output");
    outputDataset = outputDataset != null ? outputDataset : "counts";

    context.addInput(Input.ofDataset(inputDataset));
    context.addOutput(Output.ofDataset(outputDataset));
}