List of usage examples for org.apache.hadoop.mapreduce Job getInstance
public static Job getInstance(Configuration conf, String jobName) throws IOException
(The overload getInstance(Cluster ignored, Configuration conf) is deprecated; all of the examples below use the Configuration/job-name form.)
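Before the collected examples, a minimal hedged sketch of the typical call pattern. The class name, job name, and argument handling here are placeholders for illustration, not taken from the sources below:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical minimal driver: only the Job.getInstance call and job submission are shown.
public class GetInstanceExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Non-deprecated overload: pass the Configuration and a human-readable job name.
        Job job = Job.getInstance(conf, "example-job");
        job.setJarByClass(GetInstanceExample.class);
        // ... set mapper, reducer, and key/value classes here ...
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}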
From source file:com.github.ygf.pagerank.InLinks.java
License:Apache License
private void summarizeResults(Configuration conf, Path outputDir) throws Exception {
    int topResults = Integer.parseInt(conf.get("inlinks.top_results"));

    Job job = Job.getInstance(conf, "InLinks:TopN");
    job.setJarByClass(InLinks.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(InLinksTopNMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setReducerClass(InLinksTopNReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(outputDir, "inlinks"));
    FileOutputFormat.setOutputPath(job, new Path(outputDir, "inlinks-top" + topResults));

    job.setNumReduceTasks(1);

    job.waitForCompletion(true);
}
From source file:com.github.ygf.pagerank.PageRank.java
License:Apache License
private void createTransitionMatrix(Configuration conf, Path linksFile, Path outputDir) throws Exception {
    // This job reads the links-simple-sorted.txt input file and generates
    // the corresponding transition matrix. The matrix is divided into
    // square blocks and each block is represented by the nonzero entries.
    // See Section 5.2 (and 5.2.3 in particular) of Mining of Massive Datasets
    // (http://infolab.stanford.edu/~ullman/mmds.html) for details.
    // The output is written to the "M" subdir in the output dir.

    Job job = Job.getInstance(conf, "PageRank:Matrix");
    job.setJarByClass(PageRank.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(PageRankMatrixMapper.class);
    job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
    job.getConfiguration().setClass("mapreduce.map.output.compress.codec", DefaultCodec.class,
            CompressionCodec.class);
    job.setMapOutputKeyClass(ShortArrayWritable.class);
    job.setMapOutputValueClass(ShortArrayWritable.class);
    job.setReducerClass(PageRankMatrixReducer.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(ShortArrayWritable.class);
    job.setOutputValueClass(MatrixBlockWritable.class);

    FileInputFormat.addInputPath(job, linksFile);
    FileOutputFormat.setOutputPath(job, new Path(outputDir, "M"));

    job.waitForCompletion(true);
}
From source file:com.github.ygf.pagerank.PageRank.java
License:Apache License
private void pageRankIteration(int iter, Configuration conf, Path outputDir) throws Exception {
    // This job performs an iteration of the power iteration method to
    // compute PageRank. The map task processes each block M_{i,j}, loads
    // the corresponding stripe j of the vector v_{k-1} and produces the
    // partial result of the stripe i of the vector v_k. The reduce task
    // sums all the partial results of v_k and adds the teleportation factor
    // (the combiner only sums all the partial results). See Section 5.2
    // (and 5.2.3 in particular) of Mining of Massive Datasets
    // (http://infolab.stanford.edu/~ullman/mmds.html) for details. The
    // output is written in a "vk" subdir of the output dir, where k is the
    // iteration number. MapFileOutputFormat is used to keep an array of the
    // stripes of v.

    Job job = Job.getInstance(conf, "PageRank:Iteration");
    job.setJarByClass(PageRank.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(PageRankIterationMapper.class);
    job.setMapOutputKeyClass(ShortWritable.class);
    job.setMapOutputValueClass(FloatArrayWritable.class);
    job.setCombinerClass(PageRankIterationCombiner.class);
    job.setReducerClass(PageRankIterationReducer.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(ShortWritable.class);
    job.setOutputValueClass(FloatArrayWritable.class);

    FileInputFormat.addInputPath(job, new Path(outputDir, "M"));
    FileOutputFormat.setOutputPath(job, new Path(outputDir, "v" + iter));

    job.waitForCompletion(true);
}
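For clarity, a hedged sketch of the per-block work that the comment in the driver above describes. The helper name, the layout of a nonzero entry, and the beta handling are illustrative assumptions, not the project's actual mapper code:

// Illustrative only: multiplies one block M_{i,j} (given as its nonzero entries)
// by stripe j of v_{k-1}, producing a partial contribution to stripe i of v_k.
// Each entry is assumed to be {rowWithinStripeI, colWithinStripeJ, outDegreeOfCol}.
// Summing the partials over j and adding the teleportation term (1 - beta) / n
// is left to the combiner/reducer, as the comment above describes.
static float[] multiplyBlockByStripe(int[][] nonzeroEntries, float[] vStripeJ, int stripeLength, float beta) {
    float[] partial = new float[stripeLength];
    for (int[] entry : nonzeroEntries) {
        int row = entry[0];
        int col = entry[1];
        int outDegree = entry[2];
        partial[row] += beta * vStripeJ[col] / outDegree;
    }
    return partial;
}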
From source file:com.github.ygf.pagerank.PageRank.java
License:Apache License
private void summarizeResults(int iter, Configuration conf, Path outputDir) throws Exception {
    // This job creates a plain text file with the top N PageRanks and the
    // titles of the pages. Each map task emits the top N PageRanks it
    // receives, and the reduce task merges the partial results into the
    // global top N PageRanks. A single reducer is used in the job in order
    // to have access to all the individual top N PageRanks from the
    // mappers. The reducer looks up the titles in the index built by
    // TitleIndex. This job was designed considering that N is small.

    int topResults = Integer.parseInt(conf.get("pagerank.top_results"));

    Job job = Job.getInstance(conf, "PageRank:TopN");
    job.setJarByClass(PageRank.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(PageRankTopNMapper.class);
    job.setMapOutputKeyClass(FloatWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setReducerClass(PageRankTopNReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(outputDir, "v" + iter));
    FileOutputFormat.setOutputPath(job, new Path(outputDir, "v" + iter + "-top" + topResults));

    job.setNumReduceTasks(1);

    job.waitForCompletion(true);
}
From source file:com.github.ygf.pagerank.TitleIndex.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.out.println("Usage: TitleIndex <titles-sorted.txt> <output-dir>");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }

    Path titlesFile = new Path(args[0]);
    Path outputDir = new Path(args[1]);

    Configuration conf = getConf();

    // Do not create _SUCCESS files. MapFileOutputFormat.getReaders calls
    // try to read the _SUCCESS as another MapFile dir.
    conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false");

    // This job creates a MapFile of the titles indexed by the page id.
    // UnsplittableTextInputFormat is used to ensure that the same map task
    // gets all the lines in the titlesFile and it can count the line
    // numbers. The number of reduce tasks is set to 0.
    Job job = Job.getInstance(conf, "TitleIndex");
    job.setJarByClass(InLinks.class);

    job.setInputFormatClass(UnsplittableTextInputFormat.class);
    job.setMapperClass(TitleIndexMapper.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, titlesFile);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.setNumReduceTasks(0);

    job.waitForCompletion(true);

    return 0;
}
From source file:com.google.cloud.bigtable.mapreduce.Export.java
License:Apache License
/**
 * Sets up the actual job.
 *
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws java.io.IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    conf.setIfUnset("hbase.client.connection.impl", BigtableConfiguration.getConnectionClass().getName());
    conf.setIfUnset(BigtableOptionsFactory.BIGTABLE_RPC_TIMEOUT_MS_KEY, "60000");
    conf.setBoolean(TableInputFormat.SHUFFLE_MAPS, true);

    String tableName = args[0];
    Path outputDir = new Path(args[1]);

    Job job = Job.getInstance(conf, NAME + "_" + tableName);
    job.setJobName(NAME + "_" + tableName);
    job.setJarByClass(Export.class);

    // Set optional scan parameters
    Scan s = getConfiguredScanForJob(conf, args);
    TableMapReduceUtil.initTableMapperJob(tableName, s, IdentityTableMapper.class,
            ImmutableBytesWritable.class, Result.class, job, false);

    // No reducers. Just write straight to output files.
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Result.class);

    // Job conf doesn't contain the conf so doesn't have a default fs.
    FileOutputFormat.setOutputPath(job, outputDir);

    return job;
}
From source file:com.google.cloud.bigtable.mapreduce.Import.java
License:Open Source License
/**
 * Sets up the actual job.
 *
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    TableName tableName = TableName.valueOf(args[0]);
    conf.set(TABLE_NAME, tableName.getNameAsString());
    Path inputDir = new Path(args[1]);

    Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
    job.setJarByClass(Importer.class);
    FileInputFormat.setInputPaths(job, inputDir);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);

    // Make sure we get the filter in the jars
    try {
        Class<? extends Filter> filter = conf.getClass(FILTER_CLASS_CONF_KEY, null, Filter.class);
        if (filter != null) {
            TableMapReduceUtil.addDependencyJars(conf, filter);
        }
    } catch (Exception e) {
        throw new IOException(e);
    }

    if (hfileOutPath != null) {
        job.setMapperClass(KeyValueImporter.class);
        try (Connection conn = ConnectionFactory.createConnection(conf);
                Table table = conn.getTable(tableName);
                RegionLocator regionLocator = conn.getRegionLocator(tableName)) {
            job.setReducerClass(KeyValueSortReducer.class);
            Path outputDir = new Path(hfileOutPath);
            FileOutputFormat.setOutputPath(job, outputDir);
            job.setMapOutputKeyClass(ImmutableBytesWritable.class);
            job.setMapOutputValueClass(KeyValue.class);
            HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator);
            TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
                    com.google.common.base.Preconditions.class);
        }
    } else {
        // No reducers. Just write straight to table. Call initTableReducerJob
        // because it sets up the TableOutputFormat.
        job.setMapperClass(Importer.class);
        TableMapReduceUtil.initTableReducerJob(tableName.getNameAsString(), null, job);
        job.setNumReduceTasks(0);
    }
    return job;
}
From source file:com.hadoop.mapreduce.examples.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("mapreduce.app-submission.cross-platform", "true");
    String ioArgs[] = { "input", "output2" };
    String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: wordcount <in> [<in>...] <out>");
        System.exit(2);
    }

    // Job
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);

    // Map, combine, reduce
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.hortonworks.mapreduce.URLCount.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();
    conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", " ");

    Job job = Job.getInstance(conf, "URLCount");
    job.setJarByClass(getClass());

    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapperClass(URLCountM.class);
    job.setReducerClass(URLCountR.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : -1;
}
From source file:com.hzy.test.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    // String input = "hdfs://192.168.1.118:9000/user/hdfs/log_kpi/";
    // String output = "hdfs://192.168.1.118:9000/user/hdfs/log_kpi/wc/";
    String input = "/tmp/data.txt";
    // String input = args[0];
    String output = "/tmp/t1";
    // String output = args[1];

    Configuration conf = HdfsDAO.config();
    // conf.set("mapreduce.framework.name", "yarn");
    // conf.set("hbase.zookeeper.quorum", "hadoop01:2181");
    // conf.set("fs.default.name", "hdfs://hadoop01:9000");
    // conf.set("yarn.resourcemanager.resource-tracker.address", "hadoop01:8031");
    // conf.set("yarn.resourcemanager.address", "hadoop01:8032");
    // conf.set("yarn.resourcemanager.scheduler.address", "hadoop01:8030");
    // conf.set("yarn.resourcemanager.admin.address", "hadoop01:8033");
    // conf.set("mapreduce.jobhistory.address", "hadoop01:10020");
    // conf.set("mapreduce.jobhistory.webapp.address", "hadoop01:19888");

    // String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    // if (otherArgs.length < 2) {
    //     System.err.println("Usage: wordcount <in> [<in>...] <out>");
    //     System.exit(2);
    // }

    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // for (int i = 0; i < otherArgs.length - 1; ++i) {
    FileInputFormat.addInputPath(job, new Path(input));
    // }
    FileOutputFormat.setOutputPath(job, new Path(output));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}