Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

This page lists example usages of org.apache.hadoop.mapreduce.Job#setCombinerClass collected from open-source projects.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the combiner class for the job.
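
A minimal, self-contained sketch of the pattern (the class names CombinerExample, TokenMapper, and SumReducer are illustrative and do not come from the examples below): the reducer class is registered as the combiner as well, which is safe here because summing integer counts is commutative and associative.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CombinerExample {

    // Emits (word, 1) for every token in the input line.
    public static class TokenMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, ONE);
            }
        }
    }

    // Sums the counts for each word; usable as both combiner and reducer.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "combiner example");
        job.setJarByClass(CombinerExample.class);

        job.setMapperClass(TokenMapper.class);
        // The combiner runs on map output before the shuffle, cutting down the
        // intermediate data sent to the reducers.
        job.setCombinerClass(SumReducer.class);
        job.setReducerClass(SumReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Keep in mind that the framework treats the combiner as an optimization and may invoke it zero, one, or several times for a given key, so it must not change the final result of the job.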

Usage

From source file: ca.uwaterloo.cs.bigdata2017w.assignment0.WordCount.java

License: Apache License

/**
 * Runs this tool.
 */
@Override
public int run(String[] argv) throws Exception {
    final Args args = new Args();
    CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(100));

    try {
        parser.parseArgument(argv);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        return -1;
    }

    LOG.info("Tool: " + WordCount.class.getSimpleName());
    LOG.info(" - input path: " + args.input);
    LOG.info(" - output path: " + args.output);
    LOG.info(" - number of reducers: " + args.numReducers);
    LOG.info(" - use in-mapper combining: " + args.imc);

    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJobName(WordCount.class.getSimpleName());
    job.setJarByClass(WordCount.class);

    job.setNumReduceTasks(args.numReducers);

    FileInputFormat.setInputPaths(job, new Path(args.input));
    FileOutputFormat.setOutputPath(job, new Path(args.output));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(args.imc ? MyMapperIMC.class : MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(args.output);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file: ca.uwaterloo.iss4e.hadoop.pointperrow.HistogramMain.java

License: Open Source License

public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.HistogramMain <input> <output>");
        System.exit(2);
    }
    Job job = new Job(conf, "HistogramMain");
    job.setJarByClass(HistogramMain.class);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setCombinerClass(MyCombiner.class);

    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            System.out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}

From source file: ca.uwaterloo.iss4e.hadoop.pointperrow.PARMain.java

License: Open Source License

public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.PARMain <input> <output>");
        System.exit(2);
    }
    Job job = new Job(conf, "PARMain");
    job.setJarByClass(PARMain.class);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setCombinerClass(MyCombiner.class);

    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            System.out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}

From source file: ca.uwaterloo.iss4e.hadoop.pointperrow.ThreelMain.java

License: Open Source License

public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.ThreelMain <input> <output>");
        System.exit(2);
    }

    Job job = new Job(conf, "ThreelMain");
    job.setJarByClass(ThreelMain.class);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(ArrayPrimitiveWritable.class);

    job.setCombinerClass(MyCombiner.class);

    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            System.out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}

From source file: CalculateSentiment.WordCount.java

License: Apache License

public static void main(String[] args) throws Exception {
    Path tempDir = new Path("wordcount-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: wordcount <in> <out> <category>");
        System.exit(2);
    }
    conf.set("category", otherArgs[2]);

    // try {
    //     String filePath = otherArgs[0];
    //     BufferedReader br = new BufferedReader(new FileReader(filePath));
    //     String line = br.readLine();
    //     conf.set("category", line);
    // } catch (Exception e) {
    //     e.printStackTrace();
    // }
    // conf.set("category", WordCount.read(otherArgs[2]));

    DistributedCache.createSymlink(conf);
    String path = "CalculateSentiment.obj";
    Path filePath = new Path(path);
    String uriWithLink = filePath.toUri().toString() + "#" + "object";
    DistributedCache.addCacheFile(new URI(uriWithLink), conf);

    // DistributedCache.addCacheFile(new URI("/CalculateSentiment.obj"), conf);
    Job job = new Job(conf, "Test");

    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(DoubleSumReducer.class);
    job.setReducerClass(DoubleSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(1);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file: chaohBIM.BIMGetIndex.java

License: Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "getTfidf");
    job.setJarByClass(BIMGetIndex.class);

    job.setMapperClass(tfidfMapper.class);

    job.setCombinerClass(tfidfCombiner.class);
    job.setReducerClass(tfidfdReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(ZipFileInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file: chaohParse.huangWordCount.java

License: Open Source License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");

    job.setJarByClass(huangWordCount.class);

    job.setMapperClass(WordMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setCombinerClass(WordCombiner.class);
    job.setReducerClass(WordReducer.class);

    job.setInputFormatClass(ZipFileInputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file: cloud9.ComputeCooccurrenceMatrixStripesOOM.java

License: Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];

    int window = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool: ComputeCooccurrenceMatrixStripes");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - window: " + window);
    sLogger.info(" - number of reducers: " + reduceTasks);

    Job job = new Job(getConf(), "CooccurrenceMatrixStripes");

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    job.getConfiguration().setInt("window", window);

    job.setJarByClass(ComputeCooccurrenceMatrixStripesOOM.class);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(String2IntOpenHashMapWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    job.getConfiguration().setInt("io.sort.mb", 400);

    job.getConfiguration().set("mapred.child.java.opts",
            "-Xmx1000m -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps");
    //job.getConfiguration().set("mapred.child.java.opts", "-Xmx1000m");

    job.getConfiguration().setInt("child.monitor.jstat.seconds", 2);

    job.getConfiguration().set("fs.default.name", "hdfs://master:9000");
    job.getConfiguration().set("mapred.job.tracker", "master:9001");

    //conf.set("user.name", "xulijie");

    job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", 1);

    //job.getConfiguration().setFloat("io.sort.record.percent", 0.2f);
    //job.getConfiguration().setFloat("io.sort.spill.percent", 0.95f);
    // conf.setFloat("mapred.job.shuffle.input.buffer.percent", 0.9f);
    // conf.setFloat("mapred.job.shuffle.merge.percent", 0.9f);
    //conf.setFloat("mapred.job.reduce.input.buffer.percent", 0.4f);
    //conf.set("mapred.job.tracker", "local");
    //conf.set("fs.default.name", "file:///");
    job.getConfiguration().setLong("mapred.min.split.size", 512 * 1024 * 1024L);
    job.getConfiguration().setLong("mapred.max.split.size", 512 * 1024 * 1024L);

    job.getConfiguration().setInt("mapred.map.max.attempts", 0);
    job.getConfiguration().setInt("mapred.reduce.max.attempts", 0);

    //job.getConfiguration().set("heapdump.reduce.input.groups", "3,897,853[5]");
    //job.getConfiguration().set("heapdump.reduce.input.records", "8407734;8407737;8407740;8407743;8407746;8407749;8407750");

    //job.getConfiguration().set("omit.reduce.input.records", "8407733;8407750");
    //job.getConfiguration().set("heapdump.reduce.input.records", "8407751");
    //job.getConfiguration().set("heapdump.reduce.output.records", "3897853");

    job.getConfiguration().set("heapdump.task.attempt.ids", "attempt_201404281552_0001_r_000000_0");

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file: clustering.similarity.ISimDriver.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s simpre_dir output_dir " + "[compression_or_not] [reduce_task_number]\n",
                getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    Job job = Job.getInstance(conf, "isim job");
    job.setJarByClass(ISimDriver.class);

    if (args.length > 2 && args[2].equals("0")) {
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
    } else {
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        SequenceFileInputFormat.addInputPath(job, new Path(args[0]));

        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.GzipCodec.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));
    }

    if (args.length > 3) {
        conf.setInt("reduce.num", Integer.valueOf(args[3]));
    } else {
        conf.setInt("reduce.num", 5);
    }

    job.setMapperClass(ISimMapper.class);
    job.setMapOutputKeyClass(IntIntTupleWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setCombinerClass(ISimCombiner.class);
    job.setPartitionerClass(HashPartitioner.class);

    job.setNumReduceTasks(conf.getInt("reduce.num", 1));

    job.setReducerClass(ISimReducer.class);
    job.setOutputKeyClass(IntIntTupleWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    long starttime = System.currentTimeMillis();
    boolean complete = job.waitForCompletion(true);
    long endtime = System.currentTimeMillis();
    System.out.println("inverted similarity job finished in: " + (endtime - starttime) / 1000 + " seconds");

    return complete ? 0 : 1;
}

From source file: clustering.tf_idf.DocCntDriver.java

License: Apache License

Job configJob(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s simhash_result_dir pre_step_output_dir\n", getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = initConf(conf);

    Job job = Job.getInstance(conf, "tf idf pre job");
    job.setJarByClass(WorkflowDriver.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(DocCntMapper.class);
    job.setCombinerClass(DocCntReducer.class);

    job.setReducerClass(DocCntReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(IntWritable.class);

    return job;
}