List of usage examples for org.apache.hadoop.mapreduce Job setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException
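Before the examples, a minimal self-contained sketch of the call in context. The mapper, reducer, and class names below are illustrative placeholders, not taken from any source file on this page. The point it demonstrates: the map output key/value types default to the job's final output types, so setMapOutputValueClass is only required when the two differ, and the call throws IllegalStateException if invoked after the job has been submitted.

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetMapOutputValueClassDemo {
    // Map phase emits <Text, IntWritable>.
    static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, ONE);
            }
        }
    }

    // Reduce phase emits <Text, Text>: a *different* value type than the map phase.
    static class FormatReducer extends Reducer<Text, IntWritable, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new Text("count=" + sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "demo");
        job.setJarByClass(SetMapOutputValueClassDemo.class);
        job.setMapperClass(TokenMapper.class);
        job.setReducerClass(FormatReducer.class);
        // The map output value type (IntWritable) differs from the job's final
        // output value type (Text), so it must be declared explicitly; calling
        // this after submission would throw IllegalStateException.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}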
From source file:edu.ucla.sspace.hadoop.HadoopJob.java
License:Open Source License
/**
 * Executes the word co-occurrence counting job on the corpus files in the
 * input directory using the current Hadoop instance, returning an iterator
 * over all the occurrence frequencies found in the corpus.
 *
 * @param inputPaths the directories on the Hadoop distributed file system
 *        containing all the corpus files that will be processed
 *
 * @return an iterator over the unique {@link WordCooccurrence} counts found
 *         in the corpus. Note that if two words co-occur the same distance
 *         apart multiple times, only one {@code WordCooccurrence} is
 *         returned, where the number of co-occurrences is reflected by the
 *         {@link WordCooccurrence#getCount() getCount()} method.
 *
 * @throws Exception if Hadoop throws an {@code Exception} during its
 *         execution or if the resulting output files cannot be read.
 */
public HadoopJobResults run(Collection<String> inputPaths) throws Exception {
    // Create a mostly unique file name for the output directory.
    String outputDir = "output-" + System.currentTimeMillis();
    //conf.setBoolean("mapred.task.profile", true);
    Job job = new Job(conf, mapperClass.getName() + "-" + reducerClass.getName());
    job.setJarByClass(HadoopJob.class);
    job.setMapperClass(mapperClass);
    job.setReducerClass(reducerClass);
    job.setMapOutputKeyClass(mapperOutputKey);
    job.setMapOutputValueClass(mapperOutputValue);
    job.setOutputKeyClass(outputKey);
    job.setOutputValueClass(outputValue);

    // Add all the specified directories as input paths for the job
    for (String inputDir : inputPaths)
        FileInputFormat.addInputPath(job, new Path(inputDir));

    Path outputDirPath = new Path(outputDir);
    FileOutputFormat.setOutputPath(job, outputDirPath);
    job.waitForCompletion(true);

    // From the output directory, collect all the results files
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] outputFiles = fs.listStatus(outputDirPath, new OutputFilePathFilter());
    Collection<Path> paths = new LinkedList<Path>();
    for (FileStatus status : outputFiles) {
        paths.add(status.getPath());
    }
    return new HadoopJobResults(fs, paths);
}
From source file:edu.udel.mxv.Mxv.java
@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        System.err.println(USAGE);
        System.exit(1);
    }

    int n = Integer.parseInt(args[0]);
    String input_matrix = args[1];
    String input_vector = args[2];
    String output = args[3];

    Configuration conf = getConf();
    conf.set("vector.path", input_vector);
    conf.setInt("vector.n", n);

    Job job = new Job(conf);
    job.setJobName("mxv");
    job.setJarByClass(getClass());

    // mapper
    job.setMapperClass(MxvMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    // reducer
    job.setReducerClass(MxvRed.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    // job.setNumReduceTasks(num_red);

    FileInputFormat.addInputPath(job, new Path(input_matrix));
    FileOutputFormat.setOutputPath(job, new Path(output));

    return job.waitForCompletion(true) ? 0 : 1;
}
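Whatever MxvMap does internally, its generic signature is pinned down by the driver above: the default TextInputFormat supplies LongWritable/Text inputs, and the setMapOutputKeyClass/setMapOutputValueClass calls declare IntWritable/DoubleWritable outputs. A hedged skeleton follows; the "row,col,value" record format and the parsing logic are assumptions, and the vector loading is stubbed out:

import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MxvMapSketch extends Mapper<LongWritable, Text, IntWritable, DoubleWritable> {
    private double[] vector;

    @Override
    protected void setup(Context context) {
        // The real mapper would populate the dense vector from the "vector.path"
        // side file; here we only size it from "vector.n" to keep the sketch short.
        vector = new double[context.getConfiguration().getInt("vector.n", 0)];
    }

    @Override
    protected void map(LongWritable offset, Text line, Context context)
            throws IOException, InterruptedException {
        // Assumed record format: "row,col,value" for one nonzero matrix entry.
        String[] parts = line.toString().split(",");
        int row = Integer.parseInt(parts[0]);
        int col = Integer.parseInt(parts[1]);
        double a = Double.parseDouble(parts[2]);
        // Emit the partial product a * x[col], keyed by the output row;
        // a reducer like MxvRed would then sum the partials per row.
        context.write(new IntWritable(row), new DoubleWritable(a * vector[col]));
    }
}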
From source file:edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java
License:Apache License
private String phase1(String inputPath, int reduceNo, String lang)
        throws IOException, InterruptedException, ClassNotFoundException {
    String output = "tmp/wiki-link/phase1";

    Job job = Job.getInstance(getConf());
    job.setJobName("Build Wikipedia Weighted Link Graph. Phase 1");
    job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.setNumReduceTasks(reduceNo);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(output));

    if ("en".equals(lang)) {
        job.setInputFormatClass(WikipediaPageInputFormat.class);
    } else {
        throw new InterruptedException("Wikipedia dump with language " + lang + " is not supported ");
    }

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfStringInt.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfStringInt.class);

    job.setMapperClass(LinkEmitMapClass.class);
    job.setReducerClass(RedirectResolveReduceClass.class);

    job.waitForCompletion(true);

    return output;
}
From source file:edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java
License:Apache License
private String phase2(String inputPath, int reduceNo)
        throws IOException, InterruptedException, ClassNotFoundException {
    String output = "tmp/wiki-link/phase2";

    Job job = Job.getInstance(getConf());
    job.setJobName("Build Wikipedia Weighted Link Graph. Phase 2");
    job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.setNumReduceTasks(reduceNo);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfStringInt.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfStringInt.class);

    job.setReducerClass(DestinationIdResolveReduceClass.class);

    job.waitForCompletion(true);

    return output;
}
From source file:edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java
License:Apache License
private String phase3(String inputPath, int reduceNo)
        throws IOException, InterruptedException, ClassNotFoundException {
    String output = "trace/phase3";

    Job job = Job.getInstance(getConf());
    job.setJobName("Build Wikipedia Weighted Link Graph. Phase 3");
    job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.setNumReduceTasks(reduceNo);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfStringInt.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setReducerClass(SourceIdResolveReduceClass.class);

    job.waitForCompletion(true);

    return output;
}
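Each phase returns its output path, and phases 1 and 2 write sequence files that the next phase reads, so the three methods compose into a pipeline. A driver sketch (the surrounding run() method is assumed, not part of this listing):

// Phases chain by feeding each return value to the next call.
String p1 = phase1(rawInputPath, numReducers, "en"); // SequenceFile<Text, PairOfStringInt>
String p2 = phase2(p1, numReducers);                 // SequenceFile<Text, PairOfStringInt>
String p3 = phase3(p2, numReducers);                 // plain-text result under "trace/phase3"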
From source file:edu.umd.cloud9.collection.wikipedia.ExtractWikipediaAnchorTextWithWindow.java
License:Apache License
private void task1(String inputPath, String outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    LOG.info("Extracting anchor text (phase 1)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Job job = Job.getInstance(getConf());
    job.setJarByClass(ExtractWikipediaAnchorTextWithWindow.class);
    job.setJobName(
            String.format("ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath));

    // 10 reducers is reasonable.
    job.setNumReduceTasks(10);

    // increase heap
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");
    job.getConfiguration().set("mapreduce.map.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.reduce.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Xmx6144m");
    job.getConfiguration().set("mapreduce.reduce.java.opts", "-Xmx6144m");
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    // job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(PairOfStringInt.class);
    job.setMapOutputValueClass(PairOfStrings.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PairOfIntString.class);

    job.setMapperClass(MyMapper1.class);
    job.setReducerClass(MyReducer1.class);
    job.setPartitionerClass(MyPartitioner1.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);
}
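A custom partitioner is set here because the default hash of the composite PairOfStringInt key would consider both elements, scattering records that share the same string component across reducers. The actual MyPartitioner1 is not shown in this listing; a hedged sketch of the standard secondary-sort idiom it plausibly follows:

import org.apache.hadoop.mapreduce.Partitioner;
import edu.umd.cloud9.io.pair.PairOfStringInt;
import edu.umd.cloud9.io.pair.PairOfStrings;

public class LeftElementPartitioner extends Partitioner<PairOfStringInt, PairOfStrings> {
    @Override
    public int getPartition(PairOfStringInt key, PairOfStrings value, int numReduceTasks) {
        // Hash only the string half of the composite key, so all records that
        // share it meet in one reducer regardless of the int half.
        return (key.getLeftElement().hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}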
From source file:edu.umd.cloud9.collection.wikipedia.ExtractWikipediaAnchorTextWithWindow.java
License:Apache License
private void task2(String inputPath, String outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    LOG.info("Extracting anchor text (phase 2)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Job job = Job.getInstance(getConf());
    job.setJarByClass(ExtractWikipediaAnchorTextWithWindow.class);
    job.setJobName(
            String.format("ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

    // Gathers everything together for convenience; feasible for Wikipedia.
    job.setNumReduceTasks(1);

    // increase heap
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");
    job.getConfiguration().set("mapreduce.map.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.reduce.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Xmx6144m");
    job.getConfiguration().set("mapreduce.reduce.java.opts", "-Xmx6144m");
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(HMapSIW.class);

    job.setMapperClass(MyMapper2.class);
    job.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    // Clean up intermediate data.
    FileSystem.get(job.getConfiguration()).delete(new Path(inputPath), true);
}
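Because phase 2 runs a single reducer with MapFileOutputFormat, the result is one sorted MapFile that can be probed by document id without launching another job. A minimal lookup sketch; the HMapSIW package path and the key value are assumptions, and the output path is whatever task2 wrote to:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import edu.umd.cloud9.io.map.HMapSIW; // package path assumed; same HMapSIW as above

public class AnchorLookup {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // One reducer => one map-file part under the job's output directory.
        MapFile.Reader[] readers = MapFileOutputFormat.getReaders(new Path(args[0]), conf);
        HMapSIW anchors = new HMapSIW();
        // args[1] is a document id; get() returns null when the key is absent.
        if (readers[0].get(new IntWritable(Integer.parseInt(args[1])), anchors) != null) {
            System.out.println(anchors);
        }
        for (MapFile.Reader r : readers) {
            r.close();
        }
    }
}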
From source file:edu.umd.cloud9.example.bfs.EncodeBfsGraph.java
License:Apache License
@SuppressWarnings("static-access") @Override//www .j a v a 2s . co m public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION)); options.addOption( OptionBuilder.withArgName("nodeid").hasArg().withDescription("source node").create(SRC_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(SRC_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT_OPTION); String outputPath = cmdline.getOptionValue(OUTPUT_OPTION); int src = Integer.parseInt(cmdline.getOptionValue(SRC_OPTION)); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - inputDir: " + inputPath); LOG.info(" - outputDir: " + outputPath); LOG.info(" - src: " + src); Job job = Job.getInstance(getConf()); job.setJobName(String.format("EncodeBfsGraph[%s: %s, %s: %s, %s: %d]", INPUT_OPTION, inputPath, OUTPUT_OPTION, outputPath, SRC_OPTION, src)); job.setJarByClass(EncodeBfsGraph.class); job.setNumReduceTasks(0); job.getConfiguration().setInt(SRC_OPTION, src); job.getConfiguration().setInt("mapred.min.split.size", 1024 * 1024 * 1024); FileInputFormat.addInputPath(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(BfsNode.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(BfsNode.class); job.setMapperClass(MyMapper.class); // Delete the output directory if it exists already. FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true); job.waitForCompletion(true); return 0; }
From source file:edu.umd.cloud9.example.bfs.FindNodeAtDistance.java
License:Apache License
@SuppressWarnings("static-access") @Override//from w w w.j av a2s . co m public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION)); options.addOption( OptionBuilder.withArgName("num").hasArg().withDescription("distance").create(DISTANCE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(DISTANCE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT_OPTION); String outputPath = cmdline.getOptionValue(OUTPUT_OPTION); int distance = Integer.parseInt(cmdline.getOptionValue(DISTANCE_OPTION)); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - inputDir: " + inputPath); LOG.info(" - outputDir: " + outputPath); LOG.info(" - distance: " + distance); Job job = Job.getInstance(getConf()); job.setJobName(String.format("FindNodeAtDistance[%s: %s, %s: %s, %s: %d]", INPUT_OPTION, inputPath, OUTPUT_OPTION, outputPath, DISTANCE_OPTION, distance)); job.setJarByClass(FindNodeAtDistance.class); job.setNumReduceTasks(0); job.getConfiguration().setInt(DISTANCE_OPTION, distance); job.getConfiguration().setInt("mapred.min.split.size", 1024 * 1024 * 1024); FileInputFormat.addInputPath(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(BfsNode.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(BfsNode.class); job.setMapperClass(MyMapper.class); // Delete the output directory if it exists already. FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true); job.waitForCompletion(true); return 0; }
From source file:edu.umd.cloud9.example.bfs.FindReachableNodes.java
License:Apache License
@SuppressWarnings("static-access") @Override// ww w . j a v a 2 s. c om public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT_OPTION); String outputPath = cmdline.getOptionValue(OUTPUT_OPTION); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - inputDir: " + inputPath); LOG.info(" - outputDir: " + outputPath); Job job = Job.getInstance(getConf()); job.setJobName(String.format("FindReachableNodes:[%s: %s, %s: %s]", INPUT_OPTION, inputPath, OUTPUT_OPTION, outputPath)); job.setJarByClass(FindReachableNodes.class); job.setNumReduceTasks(0); job.getConfiguration().setInt("mapred.min.split.size", 1024 * 1024 * 1024); FileInputFormat.addInputPath(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(BfsNode.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(BfsNode.class); job.setMapperClass(MyMapper.class); // Delete the output directory if it exists already. FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true); job.waitForCompletion(true); return 0; }