List of usage examples for org.apache.hadoop.mapreduce.Job.setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
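Job.setPartitionerClass registers the Partitioner that decides which reduce task receives each map-output key; the IllegalStateException in the signature is thrown if the method is called after the job has been submitted. Before the project-specific examples below, here is a minimal, self-contained sketch of the pattern they all follow. The WordLengthPartitioner class and the word-count-style key/value types are illustrative assumptions, not taken from any of the source files listed on this page.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: route words to reducers by their length instead of their hash.
public class WordLengthPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numReduceTasks) {
        // the return value must always fall in [0, numReduceTasks)
        return key.getLength() % numReduceTasks;
    }
}

Registration is a single driver call made before submission, placed next to the map-output type declarations: job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setPartitionerClass(WordLengthPartitioner.class);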
From source file:SecondarySort.HashToMinWithSS.java
protected Job jobConfig() throws IOException {
    JobConf conf = new JobConf();
    Job job = new Job(conf, "iteration");
    job.setJarByClass(HashToMinWithSS.class);
    job.setReducerClass(ReduceSS.class);
    job.setPartitionerClass(LongPair.HPartitioner.class);
    job.setSortComparatorClass(LongPair.Comparator.class);
    job.setGroupingComparatorClass(LongPair.GroupComparator.class);
    job.setOutputKeyClass(LongPair.class);
    job.setOutputValueClass(Text.class);
    return job;
}
From source file:SecondarySort.inputMaker.java
License:Open Source License
protected Job jobConfig() throws IOException {
    JobConf conf = new JobConf();
    Job job = new Job(conf, "iteration");
    job.setJarByClass(inputMaker.class);
    job.setMapperClass(MapMSS.class);
    job.setReducerClass(ReduceSS.class);
    job.setPartitionerClass(LongPair.HPartitioner.class);
    job.setSortComparatorClass(LongPair.Comparator.class);
    job.setGroupingComparatorClass(LongPair.GroupComparator.class);
    job.setOutputKeyClass(LongPair.class);
    job.setOutputValueClass(Text.class);
    return job;
}
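Both SecondarySort examples pair the partitioner with a sort comparator and a grouping comparator, which is the standard secondary-sort recipe: partition on the natural key only, sort on the full composite key, group on the natural key. A hedged, self-contained sketch of that recipe follows; LongPairKey and its NaturalKeyPartitioner are illustrative stand-ins, since the actual layout of LongPair and LongPair.HPartitioner is not shown on this page.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical composite key: "first" is the natural key, "second" is the secondary sort key.
public class LongPairKey implements WritableComparable<LongPairKey> {
    private long first;
    private long second;

    public long getFirst() { return first; }

    public void set(long first, long second) { this.first = first; this.second = second; }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(first);
        out.writeLong(second);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        first = in.readLong();
        second = in.readLong();
    }

    @Override
    public int compareTo(LongPairKey o) {
        int c = Long.compare(first, o.first);
        return c != 0 ? c : Long.compare(second, o.second);
    }

    // Partition on the natural key only, so every record sharing "first" reaches the same
    // reducer; the sort comparator then orders by (first, second) and the grouping
    // comparator groups by "first" alone.
    public static class NaturalKeyPartitioner extends Partitioner<LongPairKey, Text> {
        @Override
        public int getPartition(LongPairKey key, Text value, int numReduceTasks) {
            return (int) ((key.getFirst() & Long.MAX_VALUE) % numReduceTasks);
        }
    }
}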
From source file:simsql.runtime.MRLoader.java
License:Apache License
public long run(String inputPath, String outputPath, short typeCode, Relation r, int sortAtt) {

    // make a directory for the relation
    Configuration conf = new Configuration();
    FileSystem dfs = null;

    try {
        dfs = FileSystem.get(conf);
    } catch (Exception e) {
        throw new RuntimeException("Cannot access HDFS!", e);
    }

    try {
        // if it exists, destroy it.
        Path path = new Path(outputPath);
        if (dfs.exists(path)) {
            dfs.delete(path, true);
        }
    } catch (Exception e) {
        throw new RuntimeException("Could not create the file to bulk load to!", e);
    }

    // find a file name
    String tempPath = null;
    if (inputPath.startsWith("hdfs:")) {
        tempPath = inputPath.replace("hdfs:", "");
    } else {
        tempPath = "/tempDataFile_" + r.getName();
        try {
            dfs.delete(new Path(tempPath), true);
        } catch (Exception e) {
            // ignore this.
        }

        // upload the text file
        try {
            dfs.copyFromLocalFile(false, true, new Path(inputPath), new Path(tempPath));
            dfs.deleteOnExit(new Path(tempPath));
        } catch (Exception e) {
            throw new RuntimeException("Failed to upload text file " + inputPath + " to HDFS!", e);
        }
    }

    // set up the new job's parameters.
    conf.setBoolean("mapred.compress.map.output", true);
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());
    conf.set("io.serializations",
            "simsql.runtime.RecordSerialization,simsql.runtime.RecordKeySerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt("simsql.loader.numAtts", r.getAttributes().size());
    conf.setInt("simsql.loader.typeCode", (int) typeCode);
    conf.setInt("simsql.loader.sortAtt", sortAtt);

    String[] myStrings = new String[r.getAttributes().size()];
    int j = 0;
    for (simsql.compiler.Attribute a : r.getAttributes()) {
        myStrings[j++] = a.getPhysicalRealization().getClass().getName();
    }
    conf.setStrings("simsql.loader.types", myStrings);

    // create a job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create bulk loading job!", e);
    }

    // set the split size (number of mappers)
    long fSize = 0;
    if (inputPath.startsWith("hdfs")) {
        fSize = RelOp.getPathsTotalSize(new String[] { tempPath });
    } else {
        fSize = new File(inputPath).length();
    }
    FileInputFormat.setMinInputSplitSize(job, fSize / (long) numTasks);
    FileInputFormat.setMaxInputSplitSize(job, fSize / (long) numTasks);

    // and the number of reducers
    job.setNumReduceTasks(numTasks);

    // the mapper/reducer/jar
    job.setMapperClass(MRLoaderMapper.class);
    job.setReducerClass(MRLoaderReducer.class);
    job.setJarByClass(MRLoader.class);

    // I/O settings.
    job.setOutputFormatClass(RecordOutputFormat.class);
    job.setMapOutputKeyClass(RecordKey.class);
    job.setMapOutputValueClass(RecordWrapper.class);
    job.setOutputKeyClass(Nothing.class);
    job.setOutputValueClass(Record.class);

    try {
        FileInputFormat.setInputPaths(job, new Path(tempPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
    } catch (Exception e) {
        throw new RuntimeException("Could not set job inputs/outputs", e);
    }

    job.setGroupingComparatorClass(RecordKeyGroupingComparator.class);
    job.setPartitionerClass(RecordPartitioner.class);
    job.setSortComparatorClass(RecordKeySortComparator.class);
    job.setJobName("MRLoader: " + inputPath + " ==> " + outputPath);

    // run it
    Counters counters;
    try {
        job.waitForCompletion(true);
        counters = job.getCounters();
    } catch (Exception e) {
        throw new RuntimeException("Could not set up bulk loader job!", e);
    }

    // now, delete all the empty part files
    try {
        // get a filesystem
        FileSystem ddfs = FileSystem.get(conf);
        Path outPath = new Path(outputPath);
        if (ddfs.exists(outPath) && ddfs.isDirectory(outPath)) {
            FileStatus fstatus[] = ddfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (ddfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    ddfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) {
        // this isn't disastrous
    }

    // get the counter for the output of the mapper.
    Counter bytesCounter = counters.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);
    return bytesCounter.getValue();
}
From source file:simsql.runtime.RelOp.java
License:Apache License
public boolean run(RuntimeParameter params, boolean verbose) {

    ExampleRuntimeParameter pp = (ExampleRuntimeParameter) params;

    // build the jar.
    String jarFile = buildJarFile(params);

    // Get the default configuration object
    Configuration conf = new Configuration();

    // set quiet mode on/off
    conf.setQuietMode(!verbose);

    /***
     conf.setBoolean("mapred.task.profile", true);
     conf.set("mapred.task.profile.params", "-agentlib:hprof=cpu=samples,"
             + "heap=sites,depth=8,force=n,thread=y,verbose=n,file=%s");
     ***/

    // tell it how to serialize and deserialize records and recordkeys
    conf.set("io.serializations", getSerializations());
    conf.setBoolean("mapred.compress.map.output", true);

    int ioSortMB = conf.getInt("io.sort.mb", 256);
    conf.set("mapred.map.child.java.opts",
            "-Xmx" + (getMemPerMapper(params) + ioSortMB) + "m -Xms" + (getMemPerMapper(params))
                    + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.set("mapred.reduce.child.java.opts",
            "-Xmx" + (getMemPerReducer(params) + ioSortMB) + "m -Xms" + (getMemPerMapper(params))
                    + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.setInt("simsql.input.numSplits", pp.getNumCPUs());
    conf.setInt("mapred.job.reuse.jvm.num.tasks", 1);
    // conf.setBoolean ("mapred.map.tasks.speculative.execution", false);
    // conf.setBoolean ("mapred.reduce.tasks.speculative.execution", false);

    // tell it to use the jar that we just created
    conf.set("mapred.jar", jarFile);

    // conf.set("tmpjars", "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-core.jar");

    conf.setBoolean("mapred.output.compress", true);
    conf.setStrings("mapred.output.compression.type", new String[] { "RECORD" });

    // use snappy for the intermediate stuff
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());

    // do some additional operator-specific configurations
    setConfigurations(conf, params);

    // collect statistics for final relations always
    conf.setBoolean("simsql.collectStats", isFinal || collectStats);

    // figure out what file to map
    String[] inDirs = myInputNetwork.getPipelinedInputFiles();
    inDirs = excludeAnyWhoWillNotBeMapped(inDirs);
    String inSingleString = inDirs[0];
    conf.set("simsql.fileToMap", inSingleString);
    for (int i = 1; i < inDirs.length; i++) {
        inSingleString += "," + inDirs[i];
    }

    // create and name the job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create a new job!", e);
    }
    job.setJobName(getJobName());

    // set the map-reduce input and output types
    job.setMapOutputKeyClass(getMapOutputKeyClass());
    job.setMapOutputValueClass(getMapOutputValueClass());
    job.setOutputKeyClass(getOutputKeyClass());
    job.setOutputValueClass(getOutputValueClass());

    int numReducers = getNumReducers(params);

    job.setMapperClass(getMapperClass());
    job.setReducerClass(getReducerClass());

    // set the number of reducers
    job.setNumReduceTasks(numReducers);

    // set the input and the output formats... these extend FileInputFormat and FileOutputFormat
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(getOutputFormatClass());

    // set the input and output paths
    try {
        System.out.println("input file: " + inSingleString);
        FileInputFormat.setInputPaths(job, inSingleString);
        FileInputFormat.setInputPathFilter(job, TableFileFilter.class);
        FileOutputFormat.setOutputPath(job, new Path(getOutput()));
    } catch (Exception e) {
        throw new RuntimeException("Unable to set up the input/output path for the job.", e);
    }

    // set the split size
    FileInputFormat.setMinInputSplitSize(job, getSplitSize(params));
    FileInputFormat.setMaxInputSplitSize(job, getSplitSize(params));

    // set the various sorting/grouping/mapping classes
    job.setGroupingComparatorClass(getGroupingComparatorClass());
    job.setPartitionerClass(getPartitionerClass());
    job.setSortComparatorClass(getSortComparatorClass());

    // and now, submit the job and wait for things to finish
    int exitCode;
    try {
        exitCode = job.waitForCompletion(verbose) ? 0 : 1;

        // get the output bytes counter.
        Counters c = job.getCounters();
        Counter mx = c.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);

        // and use them to set the size of the output relation.
        if (myDB != null) {
            myDB.setTableSize(myDB.getTableName(getOutput()), mx.getValue());
            myDB.setNumAtts(myDB.getTableName(getOutput()), getOutputAttNames().length);
        }
    } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException("Unable to run the job", e);
    }

    // now, delete all the empty part files
    try {
        // get a filesystem
        FileSystem dfs = FileSystem.get(conf);
        Path outPath = new Path(getOutput());
        if (dfs.exists(outPath) && dfs.isDirectory(outPath)) {
            FileStatus fstatus[] = dfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (dfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    dfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) {
        // this isn't disastrous
    }

    return (exitCode == 0);
}
From source file:sixdegrees.AdjacencyList.java
License:Apache License
public void execute(String args[]) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: adjacencylist <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "Adjacency List");
    job.setJarByClass(AdjacencyList.class);
    job.setMapperClass(AdjacencyMapper.class);
    job.setPartitionerClass(CustomPartitioner.class);
    job.setReducerClass(AdjacencyReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setNumReduceTasks(6);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1] + "0"));
    job.waitForCompletion(true);
}
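If CustomPartitioner only needs to spread IntWritable node ids evenly over the six reducers, a minimal version would look like the sketch below (illustrative, not the sixdegrees implementation). A plain hash-modulo rule is what the stock HashPartitioner already provides, so a custom class mainly pays off when the routing rule has to differ from a straight hash, for example to keep related vertices together.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical sketch: spread integer node ids evenly across the configured reducers.
public class NodeIdPartitioner extends Partitioner<IntWritable, IntWritable> {
    @Override
    public int getPartition(IntWritable key, IntWritable value, int numReduceTasks) {
        // mask the sign bit so negative ids still map into [0, numReduceTasks)
        return (key.get() & Integer.MAX_VALUE) % numReduceTasks;
    }
}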
From source file:sourcefiles.PartitionGraph.java
License:Apache License
/**
 * Runs this tool.
 */
@Override
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(RANGE, "use range partitioner"));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of partitions")
            .create(NUM_PARTITIONS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES)
            || !cmdline.hasOption(NUM_PARTITIONS)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inPath = cmdline.getOptionValue(INPUT);
    String outPath = cmdline.getOptionValue(OUTPUT);
    int nodeCount = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    int numParts = Integer.parseInt(cmdline.getOptionValue(NUM_PARTITIONS));
    boolean useRange = cmdline.hasOption(RANGE);

    LOG.info("Tool name: " + PartitionGraph.class.getSimpleName());
    LOG.info(" - input dir: " + inPath);
    LOG.info(" - output dir: " + outPath);
    LOG.info(" - num partitions: " + numParts);
    LOG.info(" - node cnt: " + nodeCount);
    LOG.info(" - use range partitioner: " + useRange);

    Configuration conf = getConf();
    conf.setInt("NodeCount", nodeCount);

    Job job = Job.getInstance(conf);
    job.setJobName(PartitionGraph.class.getSimpleName() + ":" + inPath);
    job.setJarByClass(PartitionGraph.class);

    job.setNumReduceTasks(numParts);

    FileInputFormat.setInputPaths(job, new Path(inPath));
    FileOutputFormat.setOutputPath(job, new Path(outPath));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PageRankNodeEnhanced.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PageRankNodeEnhanced.class);

    if (useRange) {
        job.setPartitionerClass(RangePartitioner.class);
    }

    FileSystem.get(conf).delete(new Path(outPath), true);

    job.waitForCompletion(true);

    return 0;
}
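The driver stores the node count in the Configuration ("NodeCount") before submitting, which suggests the range partitioner reads it back at task time and assigns contiguous node-id ranges to reducers. The sketch below shows one way such a Configurable partitioner could work, assuming the Text map-output keys hold integer node ids; the class name, fields, and formula are illustrative guesses, not the project's RangePartitioner.

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical range partitioner: divide [0, NodeCount) into numReduceTasks equal-width ranges.
public class NodeRangePartitioner extends Partitioner<Text, Writable> implements Configurable {
    private Configuration conf;
    private int nodeCount = 1;

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
        // the driver above calls conf.setInt("NodeCount", nodeCount) before submitting the job
        this.nodeCount = conf.getInt("NodeCount", 1);
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public int getPartition(Text key, Writable value, int numReduceTasks) {
        // assumes the key text is an integer node id
        int nodeId = Integer.parseInt(key.toString());
        int partition = (int) (((long) nodeId * numReduceTasks) / nodeCount);
        return Math.min(Math.max(partition, 0), numReduceTasks - 1);
    }
}

Because the class implements Configurable, the framework injects the job Configuration when it instantiates the partitioner, so no extra setup call is needed in the driver beyond setPartitionerClass.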
From source file:terasort.TeraSort.java
License:Apache License
public int run(String[] args) throws Exception {
    LOG.info("starting");
    Job job = Job.getInstance(getConf());
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    FileSystem fs = FileSystem.get(getConf());
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }
    boolean useSimplePartitioner = getUseSimplePartitioner(job);
    TeraInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TeraInputFormat.class);
    job.setOutputFormatClass(TeraOutputFormat.class);
    if (useSimplePartitioner) {
        job.setPartitionerClass(SimplePartitioner.class);
    } else {
        long start = System.currentTimeMillis();
        Path partitionFile = new Path(outputDir, TeraInputFormat.PARTITION_FILENAME);
        URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
        try {
            TeraInputFormat.writePartitionFile(job, partitionFile);
        } catch (Throwable e) {
            LOG.error(e.getMessage());
            return -1;
        }
        job.addCacheFile(partitionUri);
        long end = System.currentTimeMillis();
        System.out.println("Spent " + (end - start) + "ms computing partitions.");
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }
    job.getConfiguration().setInt("dfs.replication", getOutputReplication(job));
    TeraOutputFormat.setFinalSync(job, true);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    LOG.info("done");
    return ret;
}
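TeraSort samples its input through TeraInputFormat and ships the resulting partition file to tasks via the distributed cache. For other jobs, the stock library classes implement the same total-order idea: sample the input with InputSampler, write the cut points to a partition file, and register TotalOrderPartitioner. The following minimal, self-contained driver sketches that generic pattern; the identity map/reduce, the KeyValueTextInputFormat input, the "_partitions" path suffix, and the 0.01/10000/10 sampler settings are illustrative choices, not part of TeraSort.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

// Hypothetical driver: globally sort Text keys using the library's sampler and partitioner.
public class TotalOrderSortSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "total-order sort sketch");
        job.setJarByClass(TotalOrderSortSketch.class);

        // identity map and reduce: the global sort happens in the shuffle
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        job.setNumReduceTasks(4);

        // KeyValueTextInputFormat yields Text keys, which is what the sampler will record
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // sample the input, write the cut points, and hand them to TotalOrderPartitioner
        Path partitionFile = new Path(args[1] + "_partitions");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<Text, Text>(0.01, 10000, 10));
        job.setPartitionerClass(TotalOrderPartitioner.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Unlike the TeraSort code above, this sketch does not push the partition file into the distributed cache: when the path is set explicitly with setPartitionFile, the library partitioner opens it through the path's own filesystem, which is enough when the file lives on HDFS.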
From source file:uk.ac.cam.eng.extraction.hadoop.features.phrase.Source2TargetJob.java
License:Apache License
public static Job getJob(Configuration conf) throws IOException {
    conf.set("mapred.map.child.java.opts", "-Xmx200m");
    conf.set("mapred.reduce.child.java.opts", "-Xmx5128m");
    conf.setBoolean(MarginalReducer.SOURCE_TO_TARGET, true);
    Job job = new Job(conf);
    job.setJarByClass(Source2TargetJob.class);
    job.setJobName("Source2Taget");
    job.setSortComparatorClass(Source2TargetComparator.class);
    job.setPartitionerClass(Source2TargetPartitioner.class);
    job.setMapperClass(KeepProvenanceCountsOnlyMapper.class);
    job.setReducerClass(MarginalReducer.class);
    job.setMapOutputKeyClass(RuleWritable.class);
    job.setMapOutputValueClass(ProvenanceCountMap.class);
    job.setOutputKeyClass(RuleWritable.class);
    job.setOutputValueClass(FeatureMap.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    return job;
}
From source file:uk.ac.cam.eng.extraction.hadoop.features.phrase.Target2SourceJob.java
License:Apache License
public static Job getJob(Configuration conf) throws IOException {
    conf.set("mapred.map.child.java.opts", "-Xmx200m");
    conf.set("mapred.reduce.child.java.opts", "-Xmx5128m");
    conf.setBoolean(MarginalReducer.SOURCE_TO_TARGET, false);
    Job job = new Job(conf);
    job.setJarByClass(Target2SourceJob.class);
    job.setJobName("Target2Source");
    job.setSortComparatorClass(Target2SourceComparator.class);
    job.setPartitionerClass(Target2SourcePartitioner.class);
    job.setMapperClass(SwappingMapper.class);
    job.setReducerClass(MarginalReducer.class);
    job.setMapOutputKeyClass(RuleWritable.class);
    job.setMapOutputValueClass(ProvenanceCountMap.class);
    job.setOutputKeyClass(RuleWritable.class);
    job.setOutputValueClass(FeatureMap.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    return job;
}
From source file:uk.ac.cam.eng.extraction.hadoop.merge.MergeJob.java
License:Apache License
public static Job getJob(Configuration conf) throws IOException {
    conf.set("mapred.map.child.java.opts", "-Xmx200m");
    conf.set("mapred.reduce.child.java.opts", "-Xmx10240m");
    Job job = new Job(conf);
    job.setJarByClass(MergeJob.class);
    job.setJobName("Merge");
    job.setSortComparatorClass(MergeComparator.class);
    job.setPartitionerClass(MergePartitioner.class);
    job.setReducerClass(MergeReducer.class);
    job.setCombinerClass(MergeCombiner.class);
    job.setMapOutputKeyClass(RuleWritable.class);
    job.setMapOutputValueClass(AlignmentAndFeatureMap.class);
    job.setOutputKeyClass(RuleWritable.class);
    job.setOutputValueClass(AlignmentAndFeatureMap.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SimpleHFileOutputFormat.class);
    return job;
}
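A pattern worth noting across the three uk.ac.cam.eng jobs above: every job that installs a custom sort comparator also installs a matching partitioner. The pairing appears deliberate; the partitioner has to send all keys that the comparator and grouping logic treat as related to the same reduce task, otherwise MarginalReducer and MergeReducer would each see only a fragment of a group.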