List of usage examples for org.apache.hadoop.mapreduce.Job#getInstance
@Deprecated public static Job getInstance(Cluster ignored) throws IOException
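Note: the overload shown above, Job.getInstance(Cluster ignored), is deprecated; the Cluster parameter is ignored (hence its name), and current code should call Job.getInstance(), Job.getInstance(Configuration), or Job.getInstance(Configuration, String jobName) instead, as all of the examples below do. The following is a minimal, self-contained sketch of the common pattern; the class name GetInstanceExample and the job name are illustrative placeholders, not taken from any example below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class GetInstanceExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Non-deprecated factory method; the Job copies the Configuration,
        // so later changes to conf do not affect the submitted job.
        Job job = Job.getInstance(conf, "identity pass-through");
        job.setJarByClass(GetInstanceExample.class);
        // No mapper or reducer is set, so Hadoop's identity Mapper/Reducer run;
        // with the default TextInputFormat the output pairs are (LongWritable, Text).
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}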
From source file:com.avira.couchdoop.demo.ImportDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 1) {
        System.err.println("Usage: <output_path>");
        return 1;
    }
    String output = args[0];

    Job job = Job.getInstance(getConf());
    job.setJarByClass(this.getClass());

    // User classpath takes precedence in favor of Hadoop classpath.
    // This is because the Couchbase client requires a newer version of
    // org.apache.httpcomponents:httpcore.
    job.setUserClassesTakesPrecedence(true);

    // Input
    job.setInputFormatClass(CouchbaseViewInputFormat.class);

    // Mapper
    job.setMapperClass(ImportMapper.class);

    // Reducer
    job.setNumReduceTasks(0);

    // Output
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(job, new Path(output));

    if (!job.waitForCompletion(true)) {
        return 2;
    }
    return 0;
}
From source file:com.avira.couchdoop.jobs.CouchbaseExporter.java
License:Apache License
public Job configureJob(Configuration conf, String input) throws IOException {
    conf.setInt("mapreduce.map.failures.maxpercent", 5);
    conf.setInt("mapred.max.map.failures.percent", 5);
    conf.setInt("mapred.max.tracker.failures", 20);

    Job job = Job.getInstance(conf);
    job.setJarByClass(CouchbaseExporter.class);

    // Input
    FileInputFormat.setInputPaths(job, input);

    // Mapper
    job.setMapperClass(CsvToCouchbaseMapper.class);
    job.setMapOutputKeyClass(String.class);
    job.setMapOutputValueClass(CouchbaseAction.class);

    // Reducer
    job.setNumReduceTasks(0);

    // Output
    job.setOutputFormatClass(CouchbaseOutputFormat.class);
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(CouchbaseAction.class);

    return job;
}
From source file:com.avira.couchdoop.jobs.CouchbaseViewImporter.java
License:Apache License
public Job configureJob(Configuration conf, String output) throws IOException {
    conf.setInt("mapreduce.map.failures.maxpercent", 5);
    conf.setInt("mapred.max.map.failures.percent", 5);
    conf.setInt("mapred.max.tracker.failures", 20);

    Job job = Job.getInstance(conf);
    job.setJarByClass(CouchbaseViewImporter.class);

    // Input
    job.setInputFormatClass(CouchbaseViewInputFormat.class);

    // Mapper
    job.setMapperClass(CouchbaseViewToFileMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // Reducer
    job.setNumReduceTasks(0);

    // Output
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(job, new Path(output));

    return job;
}
From source file:com.avira.couchdoop.jobs.CouchbaseViewToHBaseImporter.java
License:Apache License
public Job configureJob(Configuration conf, String outputTable) throws IOException {
    conf.setInt("mapreduce.map.failures.maxpercent", 5);
    conf.setInt("mapred.max.map.failures.percent", 5);
    conf.setInt("mapred.max.tracker.failures", 20);

    Job job = Job.getInstance(conf);
    job.setJarByClass(CouchbaseViewToHBaseImporter.class);

    // Input
    job.setInputFormatClass(CouchbaseViewInputFormat.class);

    // Mapper
    job.setMapperClass(CouchbaseViewToHBaseMapper.class);

    // Reducer
    job.setNumReduceTasks(0);

    // Output
    TableMapReduceUtil.initTableReducerJob(outputTable, IdentityTableReducer.class, job);

    return job;
}
From source file:com.bark.hadoop.lab3.PageRank.java
@Override
public int run(String args[]) {
    String tmp = "/tmp/" + new Date().getTime();
    // long timeStamp = new Date().getTime();
    try {
        /**
         * Job 1: Parse XML input and read title,links
         */
        Configuration conf = new Configuration();
        conf.set("xmlinput.start", "<page>");
        conf.set("xmlinput.end", "</page>");

        Job job = Job.getInstance(conf);
        job.setJarByClass(PageRank.class);
        // specify a mapper
        job.setMapperClass(RedLinkMapper.class);
        // specify a reducer
        job.setReducerClass(RedLinkReducer.class);
        // specify output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // specify input and output DIRECTORIES
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormatClass(XmlInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path((args[1] + tmp + "/job1")));
        job.setOutputFormatClass(TextOutputFormat.class);

        job.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job1.");
        return 2;
    }

    /**
     * Job 2: Adjacency outGraph
     */
    try {
        Configuration conf2 = new Configuration();

        Job job2 = Job.getInstance(conf2);
        job2.setJarByClass(PageRank.class);
        // specify a mapper
        job2.setMapperClass(AdjMapper.class);
        // specify a reducer
        job2.setReducerClass(AdjReducer.class);
        // specify output types
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        // specify input and output DIRECTORIES
        FileInputFormat.addInputPath(job2, new Path((args[1] + tmp + "/job1")));
        job2.setInputFormatClass(TextInputFormat.class);
        FileOutputFormat.setOutputPath(job2, new Path((args[1] + tmp + "/job2")));
        job2.setOutputFormatClass(TextOutputFormat.class);

        job2.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job2.");
        return 2;
    }

    /**
     * Job 3: PageCount
     */
    try {
        Configuration conf3 = new Configuration();
        /**
         * Change output separator to "=" instead of default \t for this job
         */
        conf3.set("mapreduce.output.textoutputformat.separator", "=");

        Job job3 = Job.getInstance(conf3);
        job3.setJarByClass(PageRank.class);
        // specify a mapper
        job3.setMapperClass(PageCountMapper.class);
        // specify a reducer
        job3.setReducerClass(PageCountReducer.class);
        // specify output types
        job3.setOutputKeyClass(Text.class);
        job3.setOutputValueClass(IntWritable.class);
        // specify input and output DIRECTORIES
        FileInputFormat.addInputPath(job3, new Path((args[1] + tmp + "/job2")));
        job3.setInputFormatClass(TextInputFormat.class);
        FileOutputFormat.setOutputPath(job3, new Path((args[1] + tmp + "/job3")));
        job3.setOutputFormatClass(TextOutputFormat.class);

        job3.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job3.");
        return 2;
    }

    /**
     * Job 4: PageRank
     */
    for (int i = 1; i < 9; i++) {
        try {
            Configuration conf4 = new Configuration();
            /**
             * Read number of nodes from the output of job 3 : pageCount
             */
            Path path = new Path((args[1] + tmp + "/job3"));
            FileSystem fs = path.getFileSystem(conf4);
            RemoteIterator<LocatedFileStatus> ri = fs.listFiles(path, true);
            int n = 0;
            Pattern pt = Pattern.compile("(\\d+)");
            while (ri.hasNext()) {
                LocatedFileStatus lfs = ri.next();
                if (lfs.isFile() && n == 0) {
                    FSDataInputStream inputStream = fs.open(lfs.getPath());
                    BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
                    String s = null;
                    while ((s = br.readLine()) != null) {
                        Matcher mt = pt.matcher(s);
                        if (mt.find()) {
                            n = new Integer(mt.group(1));
                            break;
                        }
                    }
                }
            }
            /**
             * Done reading number of nodes, make it available to MapReduce
             * job key: N
             */
            conf4.setInt("N", n);

            Job job4 = Job.getInstance(conf4);
            job4.setJarByClass(PageRank.class);
            // specify a mapper
            job4.setMapperClass(PageRankMapper.class);
            // specify a reducer
            job4.setReducerClass(PageRankReducer.class);
            // specify output types
            job4.setOutputKeyClass(Text.class);
            job4.setOutputValueClass(Text.class);
            // specify input and output DIRECTORIES
            if (i == 1) {
                FileInputFormat.addInputPath(job4, new Path((args[1] + tmp + "/job2")));
            } else {
                FileInputFormat.addInputPath(job4, new Path((args[1] + tmp + "/job4/" + (i - 1))));
            }
            job4.setInputFormatClass(TextInputFormat.class);
            FileOutputFormat.setOutputPath(job4, new Path((args[1] + tmp + "/job4/" + i)));
            job4.setOutputFormatClass(TextOutputFormat.class);

            job4.waitForCompletion(true);
        } catch (InterruptedException | ClassNotFoundException | IOException ex) {
            Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
            System.err.println("Error during mapreduce job4.");
            return 2;
        }
    }

    /**
     * Job 5: Sort iteration 1 and iteration 8
     */
    int returnCode = 0;
    for (int i = 0; i < 2; i++) {
        try {
            Configuration conf5 = new Configuration();
            /**
             * Read number of nodes from the output of job 3 : pageCount
             */
            Path path = new Path((args[1] + tmp + "/job3"));
            FileSystem fs = path.getFileSystem(conf5);
            RemoteIterator<LocatedFileStatus> ri = fs.listFiles(path, true);
            int n = 0;
            Pattern pt = Pattern.compile("(\\d+)");
            while (ri.hasNext()) {
                LocatedFileStatus lfs = ri.next();
                if (lfs.isFile() && n == 0) {
                    FSDataInputStream inputStream = fs.open(lfs.getPath());
                    BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
                    String s = null;
                    while ((s = br.readLine()) != null) {
                        Matcher mt = pt.matcher(s);
                        if (mt.find()) {
                            n = new Integer(mt.group(1));
                            break;
                        }
                    }
                }
            }
            /**
             * Done reading number of nodes, make it available to MapReduce
             * job key: N
             */
            conf5.setInt("N", n);

            Job job5 = Job.getInstance(conf5);
            /**
             * one reducer only
             */
            job5.setNumReduceTasks(1);
            job5.setSortComparatorClass(MyWritableComparator.class);
            job5.setJarByClass(PageRank.class);
            // specify a mapper
            job5.setMapperClass(SortMapper.class);
            job5.setMapOutputKeyClass(DoubleWritable.class);
            job5.setMapOutputValueClass(Text.class);
            // specify a reducer
            job5.setReducerClass(SortReducer.class);
            // specify output types
            job5.setOutputKeyClass(Text.class);
            job5.setOutputValueClass(DoubleWritable.class);
            // specify input and output DIRECTORIES
            int y = 7 * i + 1;
            FileInputFormat.addInputPath(job5, new Path((args[1] + tmp + "/job4/" + y)));
            job5.setInputFormatClass(TextInputFormat.class);
            FileOutputFormat.setOutputPath(job5, new Path((args[1] + tmp + "/job5/" + y)));
            job5.setOutputFormatClass(TextOutputFormat.class);

            returnCode = job5.waitForCompletion(true) ? 0 : 1;
        } catch (InterruptedException | ClassNotFoundException | IOException ex) {
            Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
            System.err.println("Error during mapreduce job5.");
            return 2;
        }
    }

    /**
     * Copy necessary output files to args[1]
     */

    /**
     * Rename and copy OutLinkGraph
     */
    try {
        Configuration conf = new Configuration();
        Path outLinkGraph = new Path((args[1] + tmp + "/job2/part-r-00000"));
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);
        Path output = new Path(args[1] + "/results/PageRank.outlink.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    /**
     * Rename and copy total number of pages
     */
    try {
        Configuration conf = new Configuration();
        Path outLinkGraph = new Path((args[1] + tmp + "/job3/part-r-00000"));
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);
        Path output = new Path(args[1] + "/results/PageRank.n.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    /**
     * Rename and copy iteration 1
     */
    try {
        Configuration conf = new Configuration();
        Path outLinkGraph = new Path((args[1] + tmp + "/job5/1/part-r-00000"));
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);
        Path output = new Path(args[1] + "/results/PageRank.iter1.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    /**
     * Rename and copy iteration 8
     */
    try {
        Configuration conf = new Configuration();
        Path outLinkGraph = new Path((args[1] + tmp + "/job5/8/part-r-00000"));
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);
        Path output = new Path(args[1] + "/results/PageRank.iter8.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    return returnCode;
}
From source file:com.ckelsel.hadoop.MaxTemperature.App.java
License:Open Source License
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage: MaxTemperature <input path> <output path>");
        System.exit(-1);
    }

    System.out.println(args[0]);
    System.out.println(args[1]);

    try {
        Configuration conf = new Configuration();
        conf.set("mapred.job.tracker", "localhost:9001");

        Job job = Job.getInstance(conf);
        job.setJarByClass(App.class);
        job.setJobName("Max temperature");

        FileInputFormat.addInputPath(job, new Path(args[0]));

        // delete output if exists
        Path outPath = new Path(args[1]);
        outPath.getFileSystem(conf).delete(outPath, true);
        FileOutputFormat.setOutputPath(job, outPath);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : -1);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (ClassNotFoundException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (InterruptedException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
From source file:com.cloudera.ByteCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(new Configuration());

    // Trim off the hadoop-specific args
    String[] remArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    // Pull in properties
    Options options = new Options();
    Option property = OptionBuilder.withArgName("property=value").hasArgs(2).withValueSeparator()
            .withDescription("use value for given property").create("D");
    options.addOption(property);
    Option skipChecksums = new Option("skipChecksums", "skip checksums");
    options.addOption(skipChecksums);
    Option profile = new Option("profile", "profile tasks");
    options.addOption(profile);

    CommandLineParser parser = new BasicParser();
    CommandLine line = parser.parse(options, remArgs);

    Properties properties = line.getOptionProperties("D");
    for (Entry<Object, Object> prop : properties.entrySet()) {
        conf.set(prop.getKey().toString(), prop.getValue().toString());
        System.out.println("Set config key " + prop.getKey() + " to " + prop.getValue());
    }
    if (line.hasOption("skipChecksums")) {
        conf.setBoolean("bytecount.skipChecksums", true);
        System.out.println("Skipping checksums");
    }
    if (line.hasOption("profile")) {
        conf.setBoolean("mapred.task.profile", true);
        conf.set("mapred.task.profile.params",
                "-agentlib:hprof=cpu=samples,depth=100,interval=1ms,lineno=y,thread=y,file=%s");
        conf.set(MRJobConfig.NUM_MAP_PROFILES, "0");
        conf.set("mapred.task.profile.maps", "1");
        System.out.println("Profiling map tasks");
    }

    // Get the positional arguments out
    remArgs = line.getArgs();
    if (remArgs.length != 2) {
        System.err.println("Usage: ByteCount <inputBase> <outputBase>");
        System.exit(1);
    }
    String inputBase = remArgs[0];
    String outputBase = remArgs[1];

    Job job = Job.getInstance(conf);
    job.setInputFormatClass(ByteBufferInputFormat.class);
    job.setMapOutputKeyClass(ByteWritable.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setMapperClass(ByteCountMapper.class);
    job.setReducerClass(ByteCountReducer.class);
    job.setCombinerClass(ByteCountReducer.class);
    job.setOutputKeyClass(ByteWritable.class);
    job.setOutputValueClass(LongWritable.class);
    FileInputFormat.addInputPath(job, new Path(inputBase));
    FileOutputFormat.setOutputPath(job, new Path(outputBase));
    job.setJarByClass(ByteCount.class);

    boolean success = job.waitForCompletion(true);

    Counters counters = job.getCounters();
    System.out.println("\tRead counters");
    printCounter(counters, READ_COUNTER.BYTES_READ);
    printCounter(counters, READ_COUNTER.LOCAL_BYTES_READ);
    printCounter(counters, READ_COUNTER.SCR_BYTES_READ);
    printCounter(counters, READ_COUNTER.ZCR_BYTES_READ);

    System.exit(success ? 0 : 1);
}
From source file:com.cloudera.castagna.logparser.mr.StatusCodesStats.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERWRITE_OUTPUT,
            Constants.OPTION_OVERWRITE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = Job.getInstance(configuration);
    job.setJobName(Constants.STATUS_CODES_STATS);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(StatusCodesStatsMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setCombinerClass(StatusCodesStatsCombiner.class);

    job.setReducerClass(StatusCodesStatsReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    Utils.setReducers(job, configuration, log);

    job.setOutputFormatClass(TextOutputFormat.class);

    if (log.isDebugEnabled())
        Utils.log(job, log);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.cloudera.castagna.logparser.mr.TranscodeLogs.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration configuration = getConf();

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERWRITE_OUTPUT,
            Constants.OPTION_OVERWRITE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = Job.getInstance(configuration);
    job.setJobName(Constants.STATUS_CODES_STATS);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(TranscodeLogsMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setNumReduceTasks(0);

    job.setOutputFormatClass(TextOutputFormat.class);

    if (log.isDebugEnabled())
        Utils.log(job, log);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.cloudera.oryx.computation.common.JobStep.java
License:Open Source License
/**
 * Creates a new {@link MRPipeline} instance that contains common configuration
 * settings.
 *
 * @return a new {@link MRPipeline} instance, suitably configured
 */
protected final MRPipeline createBasicPipeline(Class<?> jarClass) throws IOException {
    Configuration conf = OryxConfiguration.get(getConf());

    conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
    conf.setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    conf.setBoolean("mapred.output.compress", true);
    conf.set("mapred.output.compression.type", "BLOCK");
    conf.setClass("mapred.output.compression.codec", SnappyCodec.class, CompressionCodec.class);
    // Set old-style equivalents for Avro/Crunch's benefit
    conf.set("avro.output.codec", "snappy");

    conf.setBoolean(MRJobConfig.MAP_SPECULATIVE, true);
    conf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, true);
    conf.setBoolean(TTConfig.TT_OUTOFBAND_HEARBEAT, true);
    conf.setInt(MRJobConfig.JVM_NUMTASKS_TORUN, -1);

    //conf.setBoolean("crunch.disable.deep.copy", true);
    // Giving one mapper a lot of data can cause issues in some stages, so default to disable this
    conf.setBoolean("crunch.disable.combine.file", true);

    Config appConfig = ConfigUtils.getDefaultConfig();

    conf.set("crunch.tmp.dir", appConfig.getString("computation-layer.tmp-dir"));

    int mapMemoryMB = appConfig.getInt("computation-layer.mapper-memory-mb");
    log.info("Mapper memory: {}", mapMemoryMB);
    int mapHeapMB = (int) (mapMemoryMB / 1.3); // Matches Hadoop's default
    log.info("Mappers have {}MB heap and can access {}MB RAM", mapHeapMB, mapMemoryMB);
    if (conf.get(MRJobConfig.MAP_JAVA_OPTS) != null) {
        log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.MAP_JAVA_OPTS,
                conf.get(MRJobConfig.MAP_JAVA_OPTS));
    }
    conf.set(MRJobConfig.MAP_JAVA_OPTS,
            "-Xmx" + mapHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC");
    log.info("Set {} to '{}'", MRJobConfig.MAP_JAVA_OPTS, conf.get(MRJobConfig.MAP_JAVA_OPTS));
    // See comment below on CM
    conf.setInt("mapreduce.map.java.opts.max.heap", mapHeapMB);

    int reduceMemoryMB = appConfig.getInt("computation-layer.reducer-memory-mb");
    log.info("Reducer memory: {}", reduceMemoryMB);
    if (isHighMemoryStep()) {
        reduceMemoryMB *= appConfig.getInt("computation-layer.worker-high-memory-factor");
        log.info("Increasing {} to {} for high-memory step", MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB);
    }
    conf.setInt(MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB);
    int reduceHeapMB = (int) (reduceMemoryMB / 1.3); // Matches Hadoop's default
    log.info("Reducers have {}MB heap and can access {}MB RAM", reduceHeapMB, reduceMemoryMB);
    if (conf.get(MRJobConfig.REDUCE_JAVA_OPTS) != null) {
        log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.REDUCE_JAVA_OPTS,
                conf.get(MRJobConfig.REDUCE_JAVA_OPTS));
    }
    conf.set(MRJobConfig.REDUCE_JAVA_OPTS,
            "-Xmx" + reduceHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC");
    log.info("Set {} to '{}'", MRJobConfig.REDUCE_JAVA_OPTS, conf.get(MRJobConfig.REDUCE_JAVA_OPTS));
    // I see this in CM but not in Hadoop docs; probably won't hurt as it's supposed to result in
    // -Xmx appended to opts above, which is at worst redundant
    conf.setInt("mapreduce.reduce.java.opts.max.heap", reduceHeapMB);

    conf.setInt("yarn.scheduler.capacity.minimum-allocation-mb", 128);
    conf.setInt("yarn.app.mapreduce.am.resource.mb", 384);

    // Pass total config state
    conf.set(CONFIG_SERIALIZATION_KEY, ConfigUtils.getDefaultConfig().root().render());

    // Make sure to set any args to conf above this line!
    setConf(conf);

    Job job = Job.getInstance(conf);

    // Basic File IO settings
    FileInputFormat.setMaxInputSplitSize(job, 1L << 28); // ~268MB
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);

    log.info("Created pipeline configuration {}", job.getConfiguration());

    return new MRPipeline(jarClass, getCustomJobName(), job.getConfiguration());
}