List of usage examples for org.apache.hadoop.mapreduce Job setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException
From source file:clustering.mst.Driver.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length < 3) { System.err.printf("usage: %s similarity_result_dir document_count_file output_dir " + "[cluster_threshold] [reduce_number] [compression]\n", getClass().getSimpleName()); System.exit(1);/*from w w w.j av a 2 s .c o m*/ } Path step1_OutputDir = new Path(args[2] + "/step1"); Path resultDir = new Path(args[2] + "/result"); URI docCntFile = new URI(args[1] + "/part-r-00000#docCnt"); Configuration conf = getConf(); conf = MapReduceUtils.initConf(conf); if (args.length > 3) { conf.setDouble("final.threshold", Double.valueOf(args[3])); } else { conf.setDouble("final.threshold", 0.2d); } if (args.length > 4) { conf.setInt("reduce.task.num", Integer.valueOf(args[4])); } else { conf.setInt("reduce.task.num", 5); } JobControl jobControl = new JobControl("mst jobs"); /* step 1, split and calculate the child msts */ Job childJob = Job.getInstance(conf, "mst child job"); childJob.setJarByClass(Driver.class); childJob.addCacheFile(docCntFile); if (args.length > 5 && args[5].equals("0")) { FileInputFormat.addInputPath(childJob, new Path(args[0])); childJob.setInputFormatClass(KeyValueTextInputFormat.class); } else { SequenceFileInputFormat.addInputPath(childJob, new Path(args[0])); childJob.setInputFormatClass(SequenceFileAsTextInputFormat.class); } FileOutputFormat.setOutputPath(childJob, step1_OutputDir); childJob.setMapperClass(ChildMapper.class); childJob.setMapOutputKeyClass(DoubleWritable.class); childJob.setMapOutputValueClass(Text.class); childJob.setPartitionerClass(ChildPartitioner.class); childJob.setReducerClass(ChildReducer.class); childJob.setNumReduceTasks(conf.getInt("reduce.task.num", 1)); childJob.setOutputKeyClass(DoubleWritable.class); childJob.setOutputValueClass(Text.class); ControlledJob controlledChildJob = new ControlledJob(conf); controlledChildJob.setJob(childJob); jobControl.addJob(controlledChildJob); /* step 2, merge step 1's output and calculate final mst */ Job finalJob = Job.getInstance(conf, "mst final job"); finalJob.setJarByClass(FinalReducer.class); finalJob.addCacheFile(docCntFile); FileInputFormat.addInputPath(finalJob, step1_OutputDir); finalJob.setInputFormatClass(KeyValueTextInputFormat.class); finalJob.setMapperClass(FinalMapper.class); finalJob.setMapOutputKeyClass(DoubleWritable.class); finalJob.setMapOutputValueClass(Text.class); finalJob.setReducerClass(FinalReducer.class); finalJob.setOutputKeyClass(IntWritable.class); finalJob.setOutputValueClass(IntWritable.class); FileOutputFormat.setOutputPath(finalJob, resultDir); ControlledJob finalControlledJob = new ControlledJob(conf); finalControlledJob.setJob(finalJob); finalControlledJob.addDependingJob(controlledChildJob); jobControl.addJob(finalControlledJob); // run jobs MapReduceUtils.runJobs(jobControl); return finalJob.waitForCompletion(true) ? 0 : 1; }
From source file:clustering.simhash.Driver.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length < 2) { System.err.printf("usage: %s init_result_dir output_dir [simhash_threshold]\n", getClass().getSimpleName()); System.exit(1);/*w ww. j ava 2 s . c om*/ } Path step1_outputDir = new Path(args[1] + "/step1"); Configuration conf = getConf(); conf = MapReduceUtils.initConf(conf); if (args.length > 2) { conf.setInt("simhash.threshold", Integer.valueOf(args[2])); } else { conf.setInt("simhash.threshold", 3); } JobControl jobControl = new JobControl("simhash jobs"); Job job1 = Job.getInstance(conf, "simhash step1 job"); job1.setJarByClass(Driver.class); FileInputFormat.addInputPath(job1, new Path(args[0])); job1.setInputFormatClass(KeyValueTextInputFormat.class); job1.setMapperClass(Step1Mapper.class); job1.setMapOutputKeyClass(LongWritable.class); job1.setMapOutputValueClass(Text.class); job1.setReducerClass(Step1Reducer.class); job1.setOutputKeyClass(IntWritable.class); job1.setOutputValueClass(Text.class); FileOutputFormat.setOutputPath(job1, step1_outputDir); ControlledJob controlledJob1 = new ControlledJob(conf); controlledJob1.setJob(job1); jobControl.addJob(controlledJob1); Job job2 = Job.getInstance(conf, "simhash step2 job"); job2.setJarByClass(Driver.class); FileInputFormat.addInputPath(job2, step1_outputDir); job2.setInputFormatClass(KeyValueTextInputFormat.class); job2.setMapperClass(Step2Mapper.class); job2.setMapOutputKeyClass(IntWritable.class); job2.setMapOutputValueClass(Text.class); job2.setReducerClass(Step2Reducer.class); job2.setOutputKeyClass(IntWritable.class); job2.setOutputValueClass(Text.class); FileOutputFormat.setOutputPath(job2, new Path(args[1] + "/result")); ControlledJob controlledJob2 = new ControlledJob(conf); controlledJob2.setJob(job2); controlledJob2.addDependingJob(controlledJob1); jobControl.addJob(controlledJob2); long starttime = System.currentTimeMillis(); clustering.Utils.MapReduceUtils.runJobs(jobControl); boolean complete = job2.waitForCompletion(true); long endtime = System.currentTimeMillis(); System.out.println("simhash job finished in: " + (endtime - starttime) / 1000 + " seconds"); return complete ? 0 : 1; }
From source file:clustering.similarity.ISimDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length < 2) { System.err.printf("usage: %s simpre_dir output_dir " + "[compression_or_not] [reduce_task_number]\n", getClass().getSimpleName()); System.exit(1);//from www . j a va 2 s . com } Configuration conf = getConf(); conf = MapReduceUtils.initConf(conf); Job job = Job.getInstance(conf, "isim job"); job.setJarByClass(ISimDriver.class); if (args.length > 2 && args[2].equals("0")) { FileInputFormat.addInputPath(job, new Path(args[0])); job.setInputFormatClass(KeyValueTextInputFormat.class); FileOutputFormat.setOutputPath(job, new Path(args[1])); } else { job.setInputFormatClass(SequenceFileAsTextInputFormat.class); SequenceFileInputFormat.addInputPath(job, new Path(args[0])); conf.setBoolean("mapreduce.map.output.compress", true); conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.GzipCodec"); job.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); SequenceFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.GzipCodec.class); SequenceFileOutputFormat.setOutputPath(job, new Path(args[1])); } if (args.length > 3) { conf.setInt("reduce.num", Integer.valueOf(args[3])); } else { conf.setInt("reduce.num", 5); } job.setMapperClass(ISimMapper.class); job.setMapOutputKeyClass(IntIntTupleWritable.class); job.setMapOutputValueClass(DoubleWritable.class); job.setCombinerClass(ISimCombiner.class); job.setPartitionerClass(HashPartitioner.class); job.setNumReduceTasks(conf.getInt("reduce.num", 1)); job.setReducerClass(ISimReducer.class); job.setOutputKeyClass(IntIntTupleWritable.class); job.setOutputValueClass(DoubleWritable.class); long starttime = System.currentTimeMillis(); boolean complete = job.waitForCompletion(true); long endtime = System.currentTimeMillis(); System.out.println("inverted similarity job finished in: " + (endtime - starttime) / 1000 + " seconds"); return complete ? 0 : 1; }
From source file:clustering.similarity.PreDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length < 2) { System.err.printf(/*from w w w . ja v a 2s . com*/ "usage: %s inverted_index_result_dir output_dir" + " [compress_or_not] [reducer_number] [deci_number]\n", this.getClass().getSimpleName()); System.exit(1); } Configuration conf = getConf(); conf = MapReduceUtils.initConf(conf); conf.set("mapreduce.reduce.speculative", "false"); // TODO: 17-4-24 calculate split number from reducer number conf.setInt("split.num", 8); if (args.length > 3) { conf.setInt("reducer.num", Integer.valueOf(args[3])); } else { conf.setInt("reducer.num", 29); } if (args.length > 4) { conf.setInt("deci.number", Integer.valueOf(args[4])); } else { conf.setInt("deci.number", 3); } Job job = Job.getInstance(conf, "pre job"); job.setJarByClass(PreDriver.class); FileInputFormat.addInputPath(job, new Path(args[0])); job.setInputFormatClass(KeyValueTextInputFormat.class); job.setMapperClass(PreMapper.class); job.setMapOutputKeyClass(IntIntTupleWritable.class); job.setMapOutputValueClass(Text.class); job.setPartitionerClass(PrePartitioner.class); job.setNumReduceTasks(conf.getInt("reducer.num", 29)); job.setReducerClass(PreReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // set default compression if (args.length > 2 && args[2].equals("0")) { FileOutputFormat.setOutputPath(job, new Path(args[1])); } else { job.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); SequenceFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.GzipCodec.class); SequenceFileOutputFormat.setOutputPath(job, new Path(args[1])); } long starttime = System.currentTimeMillis(); boolean complete = job.waitForCompletion(true); long endtime = System.currentTimeMillis(); System.out.println("inverted similarity pre job finished in: " + (endtime - starttime) / 1000 + " seconds"); return complete ? 0 : 1; }
From source file:clustering.tf_idf.TermCntDriver.java
License:Apache License
Job configJob(String[] args) throws Exception { if (args.length < 2) { System.err.printf("usage: %s simhash_result_dir step_1_output_dir\n", getClass().getSimpleName()); System.exit(1);// ww w . java 2 s . com } Configuration conf = getConf(); conf = initConf(conf); Job job = Job.getInstance(conf, "tf idf step1 job"); job.setJarByClass(TermCntDriver.class); FileInputFormat.addInputPath(job, new Path(args[0])); job.setInputFormatClass(KeyValueTextInputFormat.class); job.setMapperClass(TermCountMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setCombinerClass(TermCountReducer.class); job.setReducerClass(TermCountReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileOutputFormat.setOutputPath(job, new Path(args[1])); return job; }
From source file:cn.edu.hfut.dmic.webcollector.crawldb.DBReader.java
public static void main(String[] args) throws Exception { Path crawlPath = new Path("task2"); Path currentPath = new Path(crawlPath, "crawldb/current"); Path output = new Path("output"); Configuration config = CrawlerConfiguration.create(); FileSystem fs = FileSystem.get(config); if (fs.exists(output)) { fs.delete(output);/*from ww w . j av a 2 s . c om*/ } Job job = new Job(config); job.setJobName("dbreader " + crawlPath.toString()); job.setMapperClass(DBReaderMapper.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, currentPath); FileOutputFormat.setOutputPath(job, output); job.waitForCompletion(true); }
From source file:cn.edu.hfut.dmic.webcollector.crawldb.Generator.java
public static String generate(Path crawlPath, Configuration conf) throws Exception { SegmentUtil.initSegments(crawlPath, conf); String segmentName = SegmentUtil.createSegment(crawlPath, conf); Path currentPath = new Path(crawlPath, "crawldb/current"); Path generatePath = new Path(crawlPath, "segments/" + segmentName + "/generate"); Job job = new Job(conf); job.setJobName("generate " + crawlPath.toString()); job.setJarByClass(Generator.class); job.setReducerClass(GeneratorReducer.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(CrawlDatum.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); FileInputFormat.addInputPath(job, currentPath); FileOutputFormat.setOutputPath(job, generatePath); job.waitForCompletion(true);/* w w w. j ava2 s . c o m*/ long count = job.getCounters().findCounter("generator", "count").getValue(); System.out.println("total generate:" + count); if (count == 0) { return null; } else { return segmentName; } }
From source file:cn.edu.hfut.dmic.webcollector.crawldb.Merge.java
public static void merge(Path crawlPath, Path[] mergePaths, Configuration conf, String jobName) throws Exception { Job job = new Job(conf); job.setJobName(jobName + " " + crawlPath.toString()); job.setJarByClass(Merge.class); // job.getConfiguration().set("mapred", "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar"); Path crawldbPath = new Path(crawlPath, "crawldb"); Path newdb = new Path(crawldbPath, "new"); Path currentdb = new Path(crawldbPath, "current"); FileSystem fs = FileSystem.get(conf); if (fs.exists(currentdb)) { FileInputFormat.addInputPath(job, currentdb); }//from w w w . j av a 2 s . co m if (fs.exists(newdb)) { fs.delete(newdb); } for (Path mergePath : mergePaths) { FileInputFormat.addInputPath(job, mergePath); } FileOutputFormat.setOutputPath(job, newdb); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); job.setMapperClass(MergeMap.class); job.setReducerClass(MergeReduce.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(CrawlDatum.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.waitForCompletion(true); }
From source file:cn.edu.hfut.dmic.webcollector.fetcher.Fetcher.java
public static void fetch(Path crawlPath, String segmentName, Configuration conf) throws Exception { Path segmentPath = new Path(crawlPath, "segments/" + segmentName); Path generatePath = new Path(segmentPath, "generate"); Job job = new Job(conf); job.setJobName("fetch " + crawlPath.toString()); job.setJarByClass(Fetcher.class); job.setReducerClass(FetcherReducer.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(FetcherOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(CrawlDatum.class); FileInputFormat.addInputPath(job, generatePath); FileOutputFormat.setOutputPath(job, segmentPath); job.waitForCompletion(true);/* w ww. j av a 2 s. co m*/ }
From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Merge.java
public static Job createJob(Configuration conf, Path crawldb) throws IOException { Job job = new Job(conf); //job.setJarByClass(Merge.class); job.getConfiguration().set("mapred", "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar"); Path newdb = new Path(crawldb, "new"); Path currentdb = new Path(crawldb, "current"); FileSystem fs = crawldb.getFileSystem(CrawlerConfiguration.create()); if (fs.exists(currentdb)) { FileInputFormat.addInputPath(job, currentdb); }/*from www.j a v a 2s. c o m*/ if (fs.exists(newdb)) { fs.delete(newdb); } FileOutputFormat.setOutputPath(job, newdb); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); job.setMapperClass(MergeMap.class); job.setReducerClass(MergeReduce.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(CrawlDatum.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); return job; }