List of usage examples for org.apache.hadoop.mapreduce Job getInstance
@Deprecated public static Job getInstance(Cluster ignored, Configuration conf) throws IOException
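The overload above, which takes a Cluster argument, is deprecated; every example below uses the non-deprecated Job.getInstance(Configuration, String) form instead. As a minimal driver sketch (the class names ExampleDriver, ExampleMapper, ExampleReducer and the argument paths are placeholders, not taken from any of the source files listed below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver showing the typical Job.getInstance(conf, jobName) pattern.
public class ExampleDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Preferred overload: Configuration plus a human-readable job name.
        Job job = Job.getInstance(conf, "example job");
        job.setJarByClass(ExampleDriver.class);
        job.setMapperClass(ExampleMapper.class);     // placeholder mapper
        job.setReducerClass(ExampleReducer.class);   // placeholder reducer
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}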
From source file:com.yassergonzalez.pagerank.PageRank.java
License:Apache License
private void pageRankIteration(int iter, Configuration conf, Path outputDir) throws Exception {
    // This job performs an iteration of the power iteration method to
    // compute PageRank. The map task processes each block M_{i,j}, loads
    // the corresponding stripe j of the vector v_{k-1} and produces the
    // partial result of the stripe i of the vector v_k. The reduce task
    // sums all the partial results of v_k and adds the teleportation factor
    // (the combiner only sums all the partial results). See Section 5.2
    // (and 5.2.3 in particular) of Mining of Massive Datasets
    // (http://infolab.stanford.edu/~ullman/mmds.html) for details. The
    // output is written in a "vk" subdir of the output dir, where k is the
    // iteration number. MapFileOutputFormat is used to keep an array of the
    // stripes of v.

    Job job = Job.getInstance(conf, "PageRank:Iteration");
    job.setJarByClass(PageRank.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(PageRankIterationMapper.class);
    job.setMapOutputKeyClass(ShortWritable.class);
    job.setMapOutputValueClass(FloatArrayWritable.class);
    job.setCombinerClass(PageRankIterationCombiner.class);
    job.setReducerClass(PageRankIterationReducer.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(ShortWritable.class);
    job.setOutputValueClass(FloatArrayWritable.class);

    FileInputFormat.addInputPath(job, new Path(outputDir, "M"));
    FileOutputFormat.setOutputPath(job, new Path(outputDir, "v" + iter));

    job.waitForCompletion(true);
}
From source file:com.zjy.mongo.util.MongoTool.java
License:Apache License
private int runMapReduceJob(final Configuration conf) throws IOException {
    final Job job = Job.getInstance(conf, getJobName());
    /**
     * Any arguments specified with -D <property>=<value>
     * on the CLI will be picked up and set here
     * They override any XML level values
     * Note that -D<space> is important - no space will
     * not work as it gets picked up by Java itself
     */
    // TODO - Do we need to set job name somehow more specifically?
    //        This may or may not be correct/sane
    job.setJarByClass(getClass());
    final Class<? extends Mapper> mapper = MongoConfigUtil.getMapper(conf);

    if (LOG.isDebugEnabled()) {
        LOG.debug("Mapper Class: " + mapper);
        LOG.debug("Input URI: " + conf.get(MongoConfigUtil.INPUT_URI));
    }
    job.setMapperClass(mapper);
    Class<? extends Reducer> combiner = MongoConfigUtil.getCombiner(conf);
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }
    job.setReducerClass(MongoConfigUtil.getReducer(conf));

    job.setOutputFormatClass(MongoConfigUtil.getOutputFormat(conf));
    job.setOutputKeyClass(MongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MongoConfigUtil.getOutputValue(conf));
    job.setInputFormatClass(MongoConfigUtil.getInputFormat(conf));

    Class mapOutputKeyClass = MongoConfigUtil.getMapperOutputKey(conf);
    Class mapOutputValueClass = MongoConfigUtil.getMapperOutputValue(conf);

    if (mapOutputKeyClass != null) {
        job.setMapOutputKeyClass(mapOutputKeyClass);
    }
    if (mapOutputValueClass != null) {
        job.setMapOutputValueClass(mapOutputValueClass);
    }

    /**
     * Determines if the job will run verbosely e.g. print debug output
     * Only works with foreground jobs
     */
    final boolean verbose = MongoConfigUtil.isJobVerbose(conf);
    /**
     * Run job in foreground aka wait for completion or background?
     */
    final boolean background = MongoConfigUtil.isJobBackground(conf);
    try {
        if (background) {
            LOG.info("Setting up and running MapReduce job in background.");
            job.submit();
            return 0;
        } else {
            LOG.info("Setting up and running MapReduce job in foreground, will wait for results. {Verbose? "
                    + verbose + "}");
            return job.waitForCompletion(true) ? 0 : 1;
        }
    } catch (final Exception e) {
        LOG.error("Exception while executing job... ", e);
        return 1;
    }
}
From source file:counting.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    final int NUMBER_OF_NODES = 31;
    final int MAX_NUMBER_OF_TASKS = 1000;
    final double REDUCER_CONSTANT = 0.95; // or 1.75

    if (otherArgs.length < 5) {
        System.err.println(
                "Usage: wordcount <in> [<in>...] <out> <ngram> <combiner:yes/no> <custom partitioner:yes/no>");
        System.exit(2);
    }

    Job job = Job.getInstance(conf, "Word count");

    // Setting map and reduce tasks
    //conf.setNumMapTasks(5); // Not possible with code in line?
    // Parentheses added so the cast applies to the whole product, not just REDUCER_CONSTANT.
    int NUMBER_OF_REDUCERS = (int) (REDUCER_CONSTANT * NUMBER_OF_NODES * MAX_NUMBER_OF_TASKS);
    //System.out.println("Number of Reducers: " + NUMBER_OF_REDUCERS);
    job.setNumReduceTasks(12); // Placeholder

    job.setJarByClass(WordCount.class);
    job.setMapperClass(nGramMapper.class);
    nGramMapper.setN(Integer.parseInt(otherArgs[otherArgs.length - 3])); // Set ngram length
    System.out.println("n = " + nGramMapper.getN());
    System.out.println("Combiner = " + otherArgs[otherArgs.length - 2]);
    System.out.println("Custom Partitioner = " + otherArgs[otherArgs.length - 1]);
    System.out.println("Number of reducers = " + NUMBER_OF_NODES);

    if (otherArgs[otherArgs.length - 2].equals("yes")) {
        job.setCombinerClass(IntSumReducer.class);
    }
    if (otherArgs[otherArgs.length - 1].equals("yes")) {
        job.setPartitionerClass(CustomPartitioner.class);
        //CustomPartitioner.setNumberOfReducers(NUMBER_OF_REDUCERS);
    }
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Input paths
    for (int i = 0; i < otherArgs.length - 4; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    // Output path
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 4]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:cp_a.CP_A.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(CP_A.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:cp_b.CP_B.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(CP_B.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setReducerClass(IntSumReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:cp_c.CP_C.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(CP_C.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setReducerClass(IntSumReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:de.gesundkrank.wikipedia.hadoop.util.RepackToMapFile.java
License:Open Source License
public int run(String basePath, String outputPath, boolean checkNew, boolean skipRedirect) throws Exception {
    Configuration configuration = getConf();
    configuration.setBoolean("skipRedirect", skipRedirect);

    LOGGER.info("Tool name: " + getClass().getSimpleName());

    Job job = Job.getInstance(configuration, getClass().getSimpleName());
    job.setJarByClass(getClass());

    job.setMapperClass(WikiMapper.class);
    job.setInputFormatClass(WikiInputFormat.class);

    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(WikiRevisionWritable.class);

    WikiDumpLoader wikiDumpLoader = new WikiDumpLoader(checkNew);
    wikiDumpLoader.addWikiDump(job, basePath);

    MapFileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.l3s.common.features.hadoop.TimeSeriesJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options opts = new Options();

    Option jnameOpt = OptionBuilder.withArgName("job-name").hasArg(true).withDescription("Timeseries analysis")
            .create(JOB_NAME);

    Option inputOpt = OptionBuilder.withArgName("input-path").hasArg(true)
            .withDescription("Timeseries file path (required)").create(INPUT_OPT);

    Option outputOpt = OptionBuilder.withArgName("output-path").hasArg(true)
            .withDescription("output file path (required)").create(OUTPUT_OPT);

    Option reduceOpt = OptionBuilder.withArgName("reduce-no").hasArg(true)
            .withDescription("number of reducer nodes").create(REDUCE_NO);

    Option rmOpt = OptionBuilder.withArgName("remove-out").hasArg(false)
            .withDescription("remove the output then create again before writing files onto it")
            .create(REMOVE_OUTPUT);

    Option cOpt = OptionBuilder.withArgName("compress-option").hasArg(true)
            .withDescription("compression option").create(COMPRESS_OPT);

    opts.addOption(jnameOpt);
    opts.addOption(inputOpt);
    opts.addOption(reduceOpt);
    opts.addOption(outputOpt);
    opts.addOption(rmOpt);
    opts.addOption(cOpt);

    CommandLine cl;
    CommandLineParser parser = new GnuParser();
    try {
        cl = parser.parse(opts, args);
    } catch (ParseException e) {
        System.err.println("Error parsing command line: " + e.getMessage());
        return -1;
    }

    if (!cl.hasOption(INPUT_OPT) || !cl.hasOption(OUTPUT_OPT)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(getClass().getName(), opts);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    int reduceNo = DEFAULT_REDUCER_NO;
    if (cl.hasOption(REDUCE_NO)) {
        try {
            reduceNo = Integer.parseInt(cl.getOptionValue(REDUCE_NO));
        } catch (NumberFormatException e) {
            System.err.println("Error parsing reducer number: " + e.getMessage());
        }
    }

    String jobName = "Distributed timeseries [R] correlation";
    if (cl.hasOption(JOB_NAME)) {
        jobName = cl.getOptionValue(JOB_NAME);
        jobName = jobName.replace('-', ' ');
    }

    if (cl.hasOption(REMOVE_OUTPUT)) {
    }

    String input = cl.getOptionValue(INPUT_OPT);
    String output = cl.getOptionValue(OUTPUT_OPT);

    Configuration conf = getConf();
    //DistributedCache.createSymlink(conf);
    //DistributedCache.addCacheFile(new URI("hdfs://master.hadoop:8020/user/nguyen/lib/"), conf);

    Job job = Job.getInstance(conf, jobName);
    job.setJarByClass(TimeSeriesJob.class);
    job.setMapperClass(TimeSeriesMapper.class);
    job.setReducerClass(TimeSeriesReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Timeseries.class);

    job.setNumReduceTasks(reduceNo);
    job.setInputFormatClass(WholeFileInputFormat.class);
    WholeFileInputFormat.setInputPaths(job, input);

    FileOutputFormat.setOutputPath(job, new Path(output));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.l3s.content.timex.extracting.ClueWeb09Timex.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("input").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("output").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("column").hasArg()
            .withDescription("column to store row data into (must exist)").create(COLUMN));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    cmdline = parser.parse(options, args);

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    if (!cmdline.hasOption(OUTPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    // String column = cmdline.getOptionValue(COLUMN);

    LOG.info("Tool name: " + ClueWeb09Timex.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);
    // LOG.info(" - column: " + column);

    Configuration conf = HBaseConfiguration.create();
    conf.set("hbase.zookeeper.quorum", "node05.ib,node03.ib,node04.ib");
    conf.set("hbase.zookeeper.property.clientPort", "2181");
    conf.set("hbase.master", "master.ib");
    // conf.set("conf.column", column);

    long milliSeconds = 10000 * 60 * 60; //x10 default
    conf.setLong("mapred.task.timeout", milliSeconds);

    Job job = Job.getInstance(conf, ClueWeb09Timex.class.getSimpleName()
            + " time-confident extraction + annotation + HBase import: " + input);

    //Configuration conf = new Configuration();
    //Job job = Job.getInstance(conf, "web pages count");
    job.setJarByClass(ClueWeb09Timex.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(ClueWeb09InputFormat.class);
    job.setOutputFormatClass(TableOutputFormat.class);
    job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, output);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    job.setMapperClass(TMapper.class);

    //job.setReducerClass(IntSumReducer.class);
    //job.setOutputKeyClass(Text.class);
    //job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(input));
    //FileOutputFormat.setOutputPath(job, new Path(output));

    job.waitForCompletion(true);
    return 0;
}
From source file:de.l3s.content.timex.extracting.ClueWeb09TimexWriteToHDFS.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("input").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("output").hasArg().withDescription("output path").create(OUTPUT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    cmdline = parser.parse(options, args);

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    if (!cmdline.hasOption(OUTPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);

    LOG.info("Tool name: " + ClueWeb09TimexWriteToHDFS.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);

    Configuration conf = new Configuration();
    long milliSeconds = 10000 * 60 * 60; //x10 default
    conf.setLong("mapred.task.timeout", milliSeconds);

    Job job = Job.getInstance(conf, "extract CW tempex and output to HDFS");
    job.setJarByClass(ClueWeb09TimexWriteToHDFS.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(ClueWeb09InputFormat.class);
    job.setMapperClass(TMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.waitForCompletion(true);
    return 0;
}