Usage examples for org.apache.hadoop.mapreduce.Job#setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
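The combiner runs a local, map-side reduce over intermediate (key, value) pairs before they are shuffled to the reducers, which can significantly cut network traffic. The call throws IllegalStateException if the job has already been submitted. Below is a minimal sketch of the most common pattern, reusing the reducer as the combiner; this is only safe when the reduce function is commutative and associative and its input and output types match the map output types. Class names such as MyDriver, MyMapper, and MyReducer are placeholders, not part of the Hadoop API.

Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(MyDriver.class);

job.setMapperClass(MyMapper.class);
// Combiner input/output key-value types must match the map output types.
job.setCombinerClass(MyReducer.class);
job.setReducerClass(MyReducer.class);

job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);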
From source file:ca.uwaterloo.cs.bigdata2017w.assignment0.WordCount.java
License:Apache License
/**
 * Runs this tool.
 */
@Override
public int run(String[] argv) throws Exception {
    final Args args = new Args();
    CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(100));

    try {
        parser.parseArgument(argv);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        return -1;
    }

    LOG.info("Tool: " + WordCount.class.getSimpleName());
    LOG.info(" - input path: " + args.input);
    LOG.info(" - output path: " + args.output);
    LOG.info(" - number of reducers: " + args.numReducers);
    LOG.info(" - use in-mapper combining: " + args.imc);

    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJobName(WordCount.class.getSimpleName());
    job.setJarByClass(WordCount.class);

    job.setNumReduceTasks(args.numReducers);

    FileInputFormat.setInputPaths(job, new Path(args.input));
    FileOutputFormat.setOutputPath(job, new Path(args.output));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(args.imc ? MyMapperIMC.class : MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(args.output);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.HistogramMain.java
License:Open Source License
public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.HistogramMain <input> <output>");
        System.exit(2);
    }
    Job job = new Job(conf, "HistogramMain");
    job.setJarByClass(HistogramMain.class);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setCombinerClass(MyCombiner.class);

    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            System.out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}
From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.PARMain.java
License:Open Source License
public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.PARMain <input> <output>");
        System.exit(2);
    }
    Job job = new Job(conf, "PARMain");
    job.setJarByClass(PARMain.class);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setCombinerClass(MyCombiner.class);

    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            System.out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}
From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.ThreelMain.java
License:Open Source License
public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.ThreelMain <input> <output>");
        System.exit(2);
    }
    Job job = new Job(conf, "ThreelMain");
    job.setJarByClass(ThreelMain.class);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(ArrayPrimitiveWritable.class);

    job.setCombinerClass(MyCombiner.class);

    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            System.out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}
From source file:CalculateSentiment.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    Path tempDir = new Path("wordcount-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: wordcount <in> <out> <category>");
        System.exit(2);
    }
    conf.set("category", otherArgs[2]);

    // try {
    //     String filePath = otherArgs[0];
    //     BufferedReader br = new BufferedReader(new FileReader(filePath));
    //     String line = br.readLine();
    //     conf.set("category", line);
    // } catch (Exception e) {
    //     e.printStackTrace();
    // }
    // conf.set("category", WordCount.read(otherArgs[2]));

    DistributedCache.createSymlink(conf);
    String path = "CalculateSentiment.obj";
    Path filePath = new Path(path);
    String uriWithLink = filePath.toUri().toString() + "#" + "object";
    DistributedCache.addCacheFile(new URI(uriWithLink), conf);
    // DistributedCache.addCacheFile(new URI("/CalculateSentiment.obj"), conf);

    Job job = new Job(conf, "Test");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(DoubleSumReducer.class);
    job.setReducerClass(DoubleSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(1);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:chaohBIM.BIMGetIndex.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "getTfidf");
    job.setJarByClass(BIMGetIndex.class);

    job.setMapperClass(tfidfMapper.class);
    job.setCombinerClass(tfidfCombiner.class);
    job.setReducerClass(tfidfdReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(ZipFileInputFormat.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:chaohParse.huangWordCount.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(huangWordCount.class);

    job.setMapperClass(WordMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setCombinerClass(WordCombiner.class);
    job.setReducerClass(WordReducer.class);

    job.setInputFormatClass(ZipFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:cloud9.ComputeCooccurrenceMatrixStripesOOM.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int window = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool: ComputeCooccurrenceMatrixStripes");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - window: " + window);
    sLogger.info(" - number of reducers: " + reduceTasks);

    Job job = new Job(getConf(), "CooccurrenceMatrixStripes");

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    job.getConfiguration().setInt("window", window);

    job.setJarByClass(ComputeCooccurrenceMatrixStripesOOM.class);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(String2IntOpenHashMapWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    job.getConfiguration().setInt("io.sort.mb", 400);

    job.getConfiguration().set("mapred.child.java.opts",
            "-Xmx1000m -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps");
    //job.getConfiguration().set("mapred.child.java.opts", "-Xmx1000m");

    job.getConfiguration().setInt("child.monitor.jstat.seconds", 2);

    job.getConfiguration().set("fs.default.name", "hdfs://master:9000");
    job.getConfiguration().set("mapred.job.tracker", "master:9001");
    //conf.set("user.name", "xulijie");

    job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", 1);

    //job.getConfiguration().setFloat("io.sort.record.percent", 0.2f);
    //job.getConfiguration().setFloat("io.sort.spill.percent", 0.95f);
    // conf.setFloat("mapred.job.shuffle.input.buffer.percent", 0.9f);
    // conf.setFloat("mapred.job.shuffle.merge.percent", 0.9f);
    //conf.setFloat("mapred.job.reduce.input.buffer.percent", 0.4f);

    //conf.set("mapred.job.tracker", "local");
    //conf.set("fs.default.name", "file:///");

    job.getConfiguration().setLong("mapred.min.split.size", 512 * 1024 * 1024L);
    job.getConfiguration().setLong("mapred.max.split.size", 512 * 1024 * 1024L);

    job.getConfiguration().setInt("mapred.map.max.attempts", 0);
    job.getConfiguration().setInt("mapred.reduce.max.attempts", 0);

    //job.getConfiguration().set("heapdump.reduce.input.groups", "3,897,853[5]");
    //job.getConfiguration().set("heapdump.reduce.input.records", "8407734;8407737;8407740;8407743;8407746;8407749;8407750");
    //job.getConfiguration().set("omit.reduce.input.records", "8407733;8407750");
    //job.getConfiguration().set("heapdump.reduce.input.records", "8407751");
    //job.getConfiguration().set("heapdump.reduce.output.records", "3897853");
    job.getConfiguration().set("heapdump.task.attempt.ids", "attempt_201404281552_0001_r_000000_0");

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:clustering.similarity.ISimDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s simpre_dir output_dir " + "[compression_or_not] [reduce_task_number]\n",
                getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    Job job = Job.getInstance(conf, "isim job");
    job.setJarByClass(ISimDriver.class);

    if (args.length > 2 && args[2].equals("0")) {
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
    } else {
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        SequenceFileInputFormat.addInputPath(job, new Path(args[0]));

        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.GzipCodec.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));
    }

    if (args.length > 3) {
        conf.setInt("reduce.num", Integer.valueOf(args[3]));
    } else {
        conf.setInt("reduce.num", 5);
    }

    job.setMapperClass(ISimMapper.class);
    job.setMapOutputKeyClass(IntIntTupleWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setCombinerClass(ISimCombiner.class);
    job.setPartitionerClass(HashPartitioner.class);

    job.setNumReduceTasks(conf.getInt("reduce.num", 1));

    job.setReducerClass(ISimReducer.class);
    job.setOutputKeyClass(IntIntTupleWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    long starttime = System.currentTimeMillis();
    boolean complete = job.waitForCompletion(true);
    long endtime = System.currentTimeMillis();
    System.out.println("inverted similarity job finished in: " + (endtime - starttime) / 1000 + " seconds");

    return complete ? 0 : 1;
}
From source file:clustering.tf_idf.DocCntDriver.java
License:Apache License
Job configJob(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s simhash_result_dir pre_step_output_dir\n", getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = initConf(conf);

    Job job = Job.getInstance(conf, "tf idf pre job");
    job.setJarByClass(WorkflowDriver.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(DocCntMapper.class);
    job.setCombinerClass(DocCntReducer.class);

    job.setReducerClass(DocCntReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(IntWritable.class);

    return job;
}