List of usage examples for org.apache.hadoop.mapreduce Job getInstance
@Deprecated public static Job getInstance(Cluster ignored, Configuration conf) throws IOException
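For orientation, every example below actually calls the non-deprecated two-argument factory Job.getInstance(Configuration conf, String jobName). A minimal, self-contained sketch of that common pattern follows; it is not taken from any of the source files listed and relies on Hadoop's identity mapper/reducer defaults, so treat the class name and paths as placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MinimalJobExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Job.getInstance(Configuration, String): the job name only labels the job in the UI/logs.
        Job job = Job.getInstance(conf, "minimal example job");
        job.setJarByClass(MinimalJobExample.class);
        // No mapper/reducer set: Hadoop falls back to the identity Mapper and Reducer,
        // so this job simply copies its TextInputFormat records to the output.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));    // input directory
        FileOutputFormat.setOutputPath(job, new Path(args[1]));  // must not already exist
        System.exit(job.waitForCompletion(true) ? 0 : 1);        // submit and wait
    }
}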
From source file:clustering.link_back.step2.Driver.java
License:Apache License
public Job configJob(String[] args) throws Exception {
    if (args.length < 3) {
        System.err.printf("usage: %s pre_step_result_dir step1_result_dir output_dir\n",
                getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    Job job = Job.getInstance(conf, "link back step 2 job");
    job.setJarByClass(Driver.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileInputFormat.addInputPath(job, new Path(args[1]));
    job.setInputFormatClass(KeyValueTextInputFormat.class);

    job.setMapperClass(clustering.link_back.step2.SetKeyMapper.class);
    job.setMapOutputKeyClass(Step2KeyWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(JoinPartitioner.class);
    job.setGroupingComparatorClass(Step2GroupComparator.class);

    job.setReducerClass(clustering.link_back.step2.JoinReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(args[2]));

    return job;
}
From source file:clustering.mst.Driver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.err.printf("usage: %s similarity_result_dir document_count_file output_dir "
                + "[cluster_threshold] [reduce_number] [compression]\n", getClass().getSimpleName());
        System.exit(1);
    }

    Path step1_OutputDir = new Path(args[2] + "/step1");
    Path resultDir = new Path(args[2] + "/result");
    URI docCntFile = new URI(args[1] + "/part-r-00000#docCnt");

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    if (args.length > 3) {
        conf.setDouble("final.threshold", Double.valueOf(args[3]));
    } else {
        conf.setDouble("final.threshold", 0.2d);
    }
    if (args.length > 4) {
        conf.setInt("reduce.task.num", Integer.valueOf(args[4]));
    } else {
        conf.setInt("reduce.task.num", 5);
    }

    JobControl jobControl = new JobControl("mst jobs");

    /* step 1, split and calculate the child msts */
    Job childJob = Job.getInstance(conf, "mst child job");
    childJob.setJarByClass(Driver.class);
    childJob.addCacheFile(docCntFile);

    if (args.length > 5 && args[5].equals("0")) {
        FileInputFormat.addInputPath(childJob, new Path(args[0]));
        childJob.setInputFormatClass(KeyValueTextInputFormat.class);
    } else {
        SequenceFileInputFormat.addInputPath(childJob, new Path(args[0]));
        childJob.setInputFormatClass(SequenceFileAsTextInputFormat.class);
    }
    FileOutputFormat.setOutputPath(childJob, step1_OutputDir);

    childJob.setMapperClass(ChildMapper.class);
    childJob.setMapOutputKeyClass(DoubleWritable.class);
    childJob.setMapOutputValueClass(Text.class);

    childJob.setPartitionerClass(ChildPartitioner.class);

    childJob.setReducerClass(ChildReducer.class);
    childJob.setNumReduceTasks(conf.getInt("reduce.task.num", 1));
    childJob.setOutputKeyClass(DoubleWritable.class);
    childJob.setOutputValueClass(Text.class);

    ControlledJob controlledChildJob = new ControlledJob(conf);
    controlledChildJob.setJob(childJob);
    jobControl.addJob(controlledChildJob);

    /* step 2, merge step 1's output and calculate final mst */
    Job finalJob = Job.getInstance(conf, "mst final job");
    finalJob.setJarByClass(FinalReducer.class);
    finalJob.addCacheFile(docCntFile);

    FileInputFormat.addInputPath(finalJob, step1_OutputDir);
    finalJob.setInputFormatClass(KeyValueTextInputFormat.class);

    finalJob.setMapperClass(FinalMapper.class);
    finalJob.setMapOutputKeyClass(DoubleWritable.class);
    finalJob.setMapOutputValueClass(Text.class);

    finalJob.setReducerClass(FinalReducer.class);
    finalJob.setOutputKeyClass(IntWritable.class);
    finalJob.setOutputValueClass(IntWritable.class);

    FileOutputFormat.setOutputPath(finalJob, resultDir);

    ControlledJob finalControlledJob = new ControlledJob(conf);
    finalControlledJob.setJob(finalJob);
    finalControlledJob.addDependingJob(controlledChildJob);
    jobControl.addJob(finalControlledJob);

    // run jobs
    MapReduceUtils.runJobs(jobControl);

    return finalJob.waitForCompletion(true) ? 0 : 1;
}
From source file:clustering.simhash.Driver.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length < 2) { System.err.printf("usage: %s init_result_dir output_dir [simhash_threshold]\n", getClass().getSimpleName()); System.exit(1);//from w w w.j a va2s . c o m } Path step1_outputDir = new Path(args[1] + "/step1"); Configuration conf = getConf(); conf = MapReduceUtils.initConf(conf); if (args.length > 2) { conf.setInt("simhash.threshold", Integer.valueOf(args[2])); } else { conf.setInt("simhash.threshold", 3); } JobControl jobControl = new JobControl("simhash jobs"); Job job1 = Job.getInstance(conf, "simhash step1 job"); job1.setJarByClass(Driver.class); FileInputFormat.addInputPath(job1, new Path(args[0])); job1.setInputFormatClass(KeyValueTextInputFormat.class); job1.setMapperClass(Step1Mapper.class); job1.setMapOutputKeyClass(LongWritable.class); job1.setMapOutputValueClass(Text.class); job1.setReducerClass(Step1Reducer.class); job1.setOutputKeyClass(IntWritable.class); job1.setOutputValueClass(Text.class); FileOutputFormat.setOutputPath(job1, step1_outputDir); ControlledJob controlledJob1 = new ControlledJob(conf); controlledJob1.setJob(job1); jobControl.addJob(controlledJob1); Job job2 = Job.getInstance(conf, "simhash step2 job"); job2.setJarByClass(Driver.class); FileInputFormat.addInputPath(job2, step1_outputDir); job2.setInputFormatClass(KeyValueTextInputFormat.class); job2.setMapperClass(Step2Mapper.class); job2.setMapOutputKeyClass(IntWritable.class); job2.setMapOutputValueClass(Text.class); job2.setReducerClass(Step2Reducer.class); job2.setOutputKeyClass(IntWritable.class); job2.setOutputValueClass(Text.class); FileOutputFormat.setOutputPath(job2, new Path(args[1] + "/result")); ControlledJob controlledJob2 = new ControlledJob(conf); controlledJob2.setJob(job2); controlledJob2.addDependingJob(controlledJob1); jobControl.addJob(controlledJob2); long starttime = System.currentTimeMillis(); clustering.Utils.MapReduceUtils.runJobs(jobControl); boolean complete = job2.waitForCompletion(true); long endtime = System.currentTimeMillis(); System.out.println("simhash job finished in: " + (endtime - starttime) / 1000 + " seconds"); return complete ? 0 : 1; }
From source file:clustering.similarity.ISimDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length < 2) { System.err.printf("usage: %s simpre_dir output_dir " + "[compression_or_not] [reduce_task_number]\n", getClass().getSimpleName()); System.exit(1);//from ww w. ja v a 2 s. c o m } Configuration conf = getConf(); conf = MapReduceUtils.initConf(conf); Job job = Job.getInstance(conf, "isim job"); job.setJarByClass(ISimDriver.class); if (args.length > 2 && args[2].equals("0")) { FileInputFormat.addInputPath(job, new Path(args[0])); job.setInputFormatClass(KeyValueTextInputFormat.class); FileOutputFormat.setOutputPath(job, new Path(args[1])); } else { job.setInputFormatClass(SequenceFileAsTextInputFormat.class); SequenceFileInputFormat.addInputPath(job, new Path(args[0])); conf.setBoolean("mapreduce.map.output.compress", true); conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.GzipCodec"); job.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); SequenceFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.GzipCodec.class); SequenceFileOutputFormat.setOutputPath(job, new Path(args[1])); } if (args.length > 3) { conf.setInt("reduce.num", Integer.valueOf(args[3])); } else { conf.setInt("reduce.num", 5); } job.setMapperClass(ISimMapper.class); job.setMapOutputKeyClass(IntIntTupleWritable.class); job.setMapOutputValueClass(DoubleWritable.class); job.setCombinerClass(ISimCombiner.class); job.setPartitionerClass(HashPartitioner.class); job.setNumReduceTasks(conf.getInt("reduce.num", 1)); job.setReducerClass(ISimReducer.class); job.setOutputKeyClass(IntIntTupleWritable.class); job.setOutputValueClass(DoubleWritable.class); long starttime = System.currentTimeMillis(); boolean complete = job.waitForCompletion(true); long endtime = System.currentTimeMillis(); System.out.println("inverted similarity job finished in: " + (endtime - starttime) / 1000 + " seconds"); return complete ? 0 : 1; }
From source file:clustering.similarity.PreDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s inverted_index_result_dir output_dir"
                + " [compress_or_not] [reducer_number] [deci_number]\n", this.getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    conf.set("mapreduce.reduce.speculative", "false");

    // TODO: 17-4-24 calculate split number from reducer number
    conf.setInt("split.num", 8);

    if (args.length > 3) {
        conf.setInt("reducer.num", Integer.valueOf(args[3]));
    } else {
        conf.setInt("reducer.num", 29);
    }
    if (args.length > 4) {
        conf.setInt("deci.number", Integer.valueOf(args[4]));
    } else {
        conf.setInt("deci.number", 3);
    }

    Job job = Job.getInstance(conf, "pre job");
    job.setJarByClass(PreDriver.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormatClass(KeyValueTextInputFormat.class);

    job.setMapperClass(PreMapper.class);
    job.setMapOutputKeyClass(IntIntTupleWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(PrePartitioner.class);
    job.setNumReduceTasks(conf.getInt("reducer.num", 29));

    job.setReducerClass(PreReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // set default compression
    if (args.length > 2 && args[2].equals("0")) {
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
    } else {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.GzipCodec.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));
    }

    long starttime = System.currentTimeMillis();
    boolean complete = job.waitForCompletion(true);
    long endtime = System.currentTimeMillis();
    System.out.println("inverted similarity pre job finished in: " + (endtime - starttime) / 1000 + " seconds");

    return complete ? 0 : 1;
}
From source file:clustering.tf_idf.DocCntDriver.java
License:Apache License
Job configJob(String[] args) throws Exception { if (args.length < 2) { System.err.printf("usage: %s simhash_result_dir pre_step_output_dir\n", getClass().getSimpleName()); System.exit(1);// w w w .j av a2 s . c o m } Configuration conf = getConf(); conf = initConf(conf); Job job = Job.getInstance(conf, "tf idf pre job"); job.setJarByClass(WorkflowDriver.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(DocCntMapper.class); job.setCombinerClass(DocCntReducer.class); job.setReducerClass(DocCntReducer.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(IntWritable.class); return job; }
From source file:clustering.tf_idf.TermCntDriver.java
License:Apache License
Job configJob(String[] args) throws Exception { if (args.length < 2) { System.err.printf("usage: %s simhash_result_dir step_1_output_dir\n", getClass().getSimpleName()); System.exit(1);/*from w w w .jav a2 s . com*/ } Configuration conf = getConf(); conf = initConf(conf); Job job = Job.getInstance(conf, "tf idf step1 job"); job.setJarByClass(TermCntDriver.class); FileInputFormat.addInputPath(job, new Path(args[0])); job.setInputFormatClass(KeyValueTextInputFormat.class); job.setMapperClass(TermCountMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setCombinerClass(TermCountReducer.class); job.setReducerClass(TermCountReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileOutputFormat.setOutputPath(job, new Path(args[1])); return job; }
From source file:co.cask.cdap.data.tools.HBaseTableExporter.java
License:Apache License
/**
 * Sets up the actual MapReduce job.
 * @param tx The transaction which needs to be passed to the Scan instance. This transaction is used by
 *           coprocessors to filter out the data corresponding to invalid transactions.
 * @param tableName Name of the table which needs to be exported as HFiles.
 * @return the configured job
 * @throws IOException
 */
public Job createSubmittableJob(Transaction tx, String tableName) throws IOException {
    Job job = Job.getInstance(hConf, "HBaseTableExporter");
    job.setJarByClass(HBaseTableExporter.class);

    Scan scan = new Scan();
    scan.setCacheBlocks(false);
    // Set the transaction attribute for the scan.
    scan.setAttribute(TxConstants.TX_OPERATION_ATTRIBUTE_KEY, new TransactionCodec().encode(tx));

    job.setNumReduceTasks(0);
    TableMapReduceUtil.initTableMapperJob(tableName, scan, KeyValueImporter.class, null, null, job);

    FileSystem fs = FileSystem.get(hConf);
    Random rand = new Random();
    Path root = new Path(fs.getWorkingDirectory(), "hbasetableexporter");
    fs.mkdirs(root);
    while (true) {
        bulkloadDir = new Path(root, "" + rand.nextLong());
        if (!fs.exists(bulkloadDir)) {
            break;
        }
    }

    HFileOutputFormat2.setOutputPath(job, bulkloadDir);
    HTable hTable = new HTable(hConf, tableName);
    HFileOutputFormat2.configureIncrementalLoad(job, hTable);
    return job;
}
From source file:co.cask.cdap.hbase.wd.RowKeyDistributorTestBase.java
License:Apache License
private void testMapReduceInternal(long origKeyPrefix, Scan scan, int numValues, int startWithValue,
        int seekIntervalMinValue, int seekIntervalMaxValue)
        throws IOException, InterruptedException, ClassNotFoundException {
    int valuesCountInSeekInterval = writeTestData(origKeyPrefix, numValues, startWithValue,
            seekIntervalMinValue, seekIntervalMaxValue);

    // Reading data
    Configuration conf = new Configuration(testingUtility.getConfiguration());
    conf.set("fs.defaultFS", "file:///");
    conf.set("fs.default.name", "file:///");
    conf.setInt("mapreduce.local.map.tasks.maximum", 16);
    conf.setInt("mapreduce.local.reduce.tasks.maximum", 16);

    Job job = Job.getInstance(conf, "testMapReduceInternal()-Job");
    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, RowCounterMapper.class,
            ImmutableBytesWritable.class, Result.class, job);

    // Substituting standard TableInputFormat which was set in TableMapReduceUtil.initTableMapperJob(...)
    job.setInputFormatClass(WdTableInputFormat.class);
    keyDistributor.addInfo(job.getConfiguration());

    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    boolean succeeded = job.waitForCompletion(true);
    Assert.assertTrue(succeeded);

    long mapInputRecords = job.getCounters().findCounter(RowCounterMapper.Counters.ROWS).getValue();
    Assert.assertEquals(valuesCountInSeekInterval, mapInputRecords);

    // Need to kill the job after completion, as otherwise it can leave MRAppMaster running, not terminated.
    // Not sure what is causing this, but it may be a problem in MiniYarnCluster.
    job.killJob();
}
From source file:co.cask.cdap.internal.app.runtime.batch.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    String inputPath = otherArgs[0];
    String outputPath = otherArgs[1];

    Job job = Job.getInstance(conf, "word count");
    configureJob(job, inputPath, outputPath);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}