Example usage for org.apache.hadoop.mapreduce Job setMapOutputValueClass

List of usage examples for org.apache.hadoop.mapreduce Job setMapOutputValueClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setMapOutputValueClass.

Prototype

public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException 

Source Link

Document

Set the value class for the map output data.

Usage

From source file:clustering.mst.Driver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.err.printf("usage: %s similarity_result_dir document_count_file output_dir "
                + "[cluster_threshold] [reduce_number] [compression]\n", getClass().getSimpleName());
        System.exit(1);/*from   w w w.j av a  2 s  .c  o  m*/
    }

    Path step1_OutputDir = new Path(args[2] + "/step1");
    Path resultDir = new Path(args[2] + "/result");

    URI docCntFile = new URI(args[1] + "/part-r-00000#docCnt");

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    if (args.length > 3) {
        conf.setDouble("final.threshold", Double.valueOf(args[3]));
    } else {
        conf.setDouble("final.threshold", 0.2d);
    }
    if (args.length > 4) {
        conf.setInt("reduce.task.num", Integer.valueOf(args[4]));
    } else {
        conf.setInt("reduce.task.num", 5);
    }

    JobControl jobControl = new JobControl("mst jobs");

    /* step 1, split and calculate the child msts */

    Job childJob = Job.getInstance(conf, "mst child job");
    childJob.setJarByClass(Driver.class);

    childJob.addCacheFile(docCntFile);

    if (args.length > 5 && args[5].equals("0")) {
        FileInputFormat.addInputPath(childJob, new Path(args[0]));
        childJob.setInputFormatClass(KeyValueTextInputFormat.class);
    } else {
        SequenceFileInputFormat.addInputPath(childJob, new Path(args[0]));
        childJob.setInputFormatClass(SequenceFileAsTextInputFormat.class);
    }

    FileOutputFormat.setOutputPath(childJob, step1_OutputDir);

    childJob.setMapperClass(ChildMapper.class);
    childJob.setMapOutputKeyClass(DoubleWritable.class);
    childJob.setMapOutputValueClass(Text.class);

    childJob.setPartitionerClass(ChildPartitioner.class);

    childJob.setReducerClass(ChildReducer.class);
    childJob.setNumReduceTasks(conf.getInt("reduce.task.num", 1));
    childJob.setOutputKeyClass(DoubleWritable.class);
    childJob.setOutputValueClass(Text.class);

    ControlledJob controlledChildJob = new ControlledJob(conf);
    controlledChildJob.setJob(childJob);
    jobControl.addJob(controlledChildJob);

    /* step 2, merge step 1's output and calculate final mst */

    Job finalJob = Job.getInstance(conf, "mst final job");
    finalJob.setJarByClass(FinalReducer.class);

    finalJob.addCacheFile(docCntFile);

    FileInputFormat.addInputPath(finalJob, step1_OutputDir);
    finalJob.setInputFormatClass(KeyValueTextInputFormat.class);

    finalJob.setMapperClass(FinalMapper.class);
    finalJob.setMapOutputKeyClass(DoubleWritable.class);
    finalJob.setMapOutputValueClass(Text.class);

    finalJob.setReducerClass(FinalReducer.class);
    finalJob.setOutputKeyClass(IntWritable.class);
    finalJob.setOutputValueClass(IntWritable.class);

    FileOutputFormat.setOutputPath(finalJob, resultDir);

    ControlledJob finalControlledJob = new ControlledJob(conf);
    finalControlledJob.setJob(finalJob);
    finalControlledJob.addDependingJob(controlledChildJob);
    jobControl.addJob(finalControlledJob);

    // run jobs

    MapReduceUtils.runJobs(jobControl);

    return finalJob.waitForCompletion(true) ? 0 : 1;
}

From source file:clustering.simhash.Driver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s init_result_dir output_dir [simhash_threshold]\n",
                getClass().getSimpleName());
        System.exit(1);/*w ww. j ava 2 s  .  c  om*/
    }

    Path step1_outputDir = new Path(args[1] + "/step1");

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    if (args.length > 2) {
        conf.setInt("simhash.threshold", Integer.valueOf(args[2]));
    } else {
        conf.setInt("simhash.threshold", 3);
    }

    JobControl jobControl = new JobControl("simhash jobs");

    Job job1 = Job.getInstance(conf, "simhash step1 job");
    job1.setJarByClass(Driver.class);

    FileInputFormat.addInputPath(job1, new Path(args[0]));
    job1.setInputFormatClass(KeyValueTextInputFormat.class);

    job1.setMapperClass(Step1Mapper.class);
    job1.setMapOutputKeyClass(LongWritable.class);
    job1.setMapOutputValueClass(Text.class);

    job1.setReducerClass(Step1Reducer.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job1, step1_outputDir);

    ControlledJob controlledJob1 = new ControlledJob(conf);
    controlledJob1.setJob(job1);
    jobControl.addJob(controlledJob1);

    Job job2 = Job.getInstance(conf, "simhash step2 job");
    job2.setJarByClass(Driver.class);

    FileInputFormat.addInputPath(job2, step1_outputDir);
    job2.setInputFormatClass(KeyValueTextInputFormat.class);

    job2.setMapperClass(Step2Mapper.class);
    job2.setMapOutputKeyClass(IntWritable.class);
    job2.setMapOutputValueClass(Text.class);

    job2.setReducerClass(Step2Reducer.class);
    job2.setOutputKeyClass(IntWritable.class);
    job2.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job2, new Path(args[1] + "/result"));

    ControlledJob controlledJob2 = new ControlledJob(conf);
    controlledJob2.setJob(job2);
    controlledJob2.addDependingJob(controlledJob1);
    jobControl.addJob(controlledJob2);

    long starttime = System.currentTimeMillis();
    clustering.Utils.MapReduceUtils.runJobs(jobControl);

    boolean complete = job2.waitForCompletion(true);
    long endtime = System.currentTimeMillis();
    System.out.println("simhash job finished in: " + (endtime - starttime) / 1000 + " seconds");

    return complete ? 0 : 1;
}

From source file:clustering.similarity.ISimDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s simpre_dir output_dir " + "[compression_or_not] [reduce_task_number]\n",
                getClass().getSimpleName());
        System.exit(1);//from www .  j a  va 2 s . com
    }

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    Job job = Job.getInstance(conf, "isim job");
    job.setJarByClass(ISimDriver.class);

    if (args.length > 2 && args[2].equals("0")) {
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
    } else {
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        SequenceFileInputFormat.addInputPath(job, new Path(args[0]));

        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.GzipCodec.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));
    }

    if (args.length > 3) {
        conf.setInt("reduce.num", Integer.valueOf(args[3]));
    } else {
        conf.setInt("reduce.num", 5);
    }

    job.setMapperClass(ISimMapper.class);
    job.setMapOutputKeyClass(IntIntTupleWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setCombinerClass(ISimCombiner.class);
    job.setPartitionerClass(HashPartitioner.class);

    job.setNumReduceTasks(conf.getInt("reduce.num", 1));

    job.setReducerClass(ISimReducer.class);
    job.setOutputKeyClass(IntIntTupleWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    long starttime = System.currentTimeMillis();
    boolean complete = job.waitForCompletion(true);
    long endtime = System.currentTimeMillis();
    System.out.println("inverted similarity job finished in: " + (endtime - starttime) / 1000 + " seconds");

    return complete ? 0 : 1;
}

From source file:clustering.similarity.PreDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf(/*from   w  w w .  ja v a 2s  . com*/
                "usage: %s inverted_index_result_dir output_dir"
                        + " [compress_or_not] [reducer_number] [deci_number]\n",
                this.getClass().getSimpleName());
        System.exit(1);
    }
    Configuration conf = getConf();

    conf = MapReduceUtils.initConf(conf);
    conf.set("mapreduce.reduce.speculative", "false");

    // TODO: 17-4-24 calculate split number from reducer number
    conf.setInt("split.num", 8);

    if (args.length > 3) {
        conf.setInt("reducer.num", Integer.valueOf(args[3]));
    } else {
        conf.setInt("reducer.num", 29);
    }
    if (args.length > 4) {
        conf.setInt("deci.number", Integer.valueOf(args[4]));
    } else {
        conf.setInt("deci.number", 3);
    }

    Job job = Job.getInstance(conf, "pre job");
    job.setJarByClass(PreDriver.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormatClass(KeyValueTextInputFormat.class);

    job.setMapperClass(PreMapper.class);
    job.setMapOutputKeyClass(IntIntTupleWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(PrePartitioner.class);

    job.setNumReduceTasks(conf.getInt("reducer.num", 29));
    job.setReducerClass(PreReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // set default compression
    if (args.length > 2 && args[2].equals("0")) {
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
    } else {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.GzipCodec.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));
    }

    long starttime = System.currentTimeMillis();
    boolean complete = job.waitForCompletion(true);
    long endtime = System.currentTimeMillis();
    System.out.println("inverted similarity pre job finished in: " + (endtime - starttime) / 1000 + " seconds");

    return complete ? 0 : 1;
}

From source file:clustering.tf_idf.TermCntDriver.java

License:Apache License

Job configJob(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s simhash_result_dir step_1_output_dir\n", getClass().getSimpleName());
        System.exit(1);// ww w  . java  2 s  . com
    }

    Configuration conf = getConf();
    conf = initConf(conf);

    Job job = Job.getInstance(conf, "tf idf step1 job");
    job.setJarByClass(TermCntDriver.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormatClass(KeyValueTextInputFormat.class);

    job.setMapperClass(TermCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setCombinerClass(TermCountReducer.class);

    job.setReducerClass(TermCountReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job;
}

From source file:cn.edu.hfut.dmic.webcollector.crawldb.DBReader.java

public static void main(String[] args) throws Exception {
    Path crawlPath = new Path("task2");
    Path currentPath = new Path(crawlPath, "crawldb/current");
    Path output = new Path("output");

    Configuration config = CrawlerConfiguration.create();
    FileSystem fs = FileSystem.get(config);

    if (fs.exists(output)) {
        fs.delete(output);/*from ww w  .  j av a 2  s  . c om*/
    }

    Job job = new Job(config);
    job.setJobName("dbreader " + crawlPath.toString());
    job.setMapperClass(DBReaderMapper.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, currentPath);
    FileOutputFormat.setOutputPath(job, output);

    job.waitForCompletion(true);

}

From source file:cn.edu.hfut.dmic.webcollector.crawldb.Generator.java

public static String generate(Path crawlPath, Configuration conf) throws Exception {
    SegmentUtil.initSegments(crawlPath, conf);
    String segmentName = SegmentUtil.createSegment(crawlPath, conf);

    Path currentPath = new Path(crawlPath, "crawldb/current");
    Path generatePath = new Path(crawlPath, "segments/" + segmentName + "/generate");

    Job job = new Job(conf);
    job.setJobName("generate " + crawlPath.toString());
    job.setJarByClass(Generator.class);

    job.setReducerClass(GeneratorReducer.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    FileInputFormat.addInputPath(job, currentPath);
    FileOutputFormat.setOutputPath(job, generatePath);
    job.waitForCompletion(true);/*  w  w  w. j  ava2  s  . c o m*/
    long count = job.getCounters().findCounter("generator", "count").getValue();
    System.out.println("total generate:" + count);
    if (count == 0) {
        return null;
    } else {
        return segmentName;
    }

}

From source file:cn.edu.hfut.dmic.webcollector.crawldb.Merge.java

public static void merge(Path crawlPath, Path[] mergePaths, Configuration conf, String jobName)
        throws Exception {

    Job job = new Job(conf);
    job.setJobName(jobName + "  " + crawlPath.toString());
    job.setJarByClass(Merge.class);
    // job.getConfiguration().set("mapred", "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");
    Path crawldbPath = new Path(crawlPath, "crawldb");
    Path newdb = new Path(crawldbPath, "new");
    Path currentdb = new Path(crawldbPath, "current");

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }//from w w  w  .  j av a 2  s  . co  m

    if (fs.exists(newdb)) {
        fs.delete(newdb);
    }
    for (Path mergePath : mergePaths) {
        FileInputFormat.addInputPath(job, mergePath);
    }
    FileOutputFormat.setOutputPath(job, newdb);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.waitForCompletion(true);

}

From source file:cn.edu.hfut.dmic.webcollector.fetcher.Fetcher.java

public static void fetch(Path crawlPath, String segmentName, Configuration conf) throws Exception {
    Path segmentPath = new Path(crawlPath, "segments/" + segmentName);
    Path generatePath = new Path(segmentPath, "generate");

    Job job = new Job(conf);
    job.setJobName("fetch " + crawlPath.toString());
    job.setJarByClass(Fetcher.class);

    job.setReducerClass(FetcherReducer.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(FetcherOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    FileInputFormat.addInputPath(job, generatePath);
    FileOutputFormat.setOutputPath(job, segmentPath);

    job.waitForCompletion(true);/*  w  ww. j  av a  2  s.  co  m*/
}

From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Merge.java

public static Job createJob(Configuration conf, Path crawldb) throws IOException {

    Job job = new Job(conf);
    //job.setJarByClass(Merge.class);
    job.getConfiguration().set("mapred",
            "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");
    Path newdb = new Path(crawldb, "new");
    Path currentdb = new Path(crawldb, "current");

    FileSystem fs = crawldb.getFileSystem(CrawlerConfiguration.create());
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }/*from   www.j a v a 2s.  c  o m*/

    if (fs.exists(newdb)) {
        fs.delete(newdb);
    }

    FileOutputFormat.setOutputPath(job, newdb);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    return job;
}