List of usage examples for org.apache.hadoop.mapreduce.Job.getInstance

@Deprecated public static Job getInstance(Cluster ignored) throws IOException

Note: the Cluster-based overload shown above is deprecated; the examples below all use the non-deprecated overload Job.getInstance(Configuration conf), which creates a new Job from the given Configuration.
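For quick reference, here is a minimal driver sketch of the pattern the examples below share. This is not taken from any of the source files on this page: IdentityJobDriver is a placeholder class name, and the job uses the identity Mapper and Reducer base classes with the default TextInputFormat so that the sketch is self-contained and runnable.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IdentityJobDriver { // placeholder name, for illustration only
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Non-deprecated overload: creates a new Job from the given Configuration.
    Job job = Job.getInstance(conf);
    job.setJarByClass(IdentityJobDriver.class);
    job.setMapperClass(Mapper.class);   // identity mapper (base class)
    job.setReducerClass(Reducer.class); // identity reducer (base class)
    // Output types match what the identity mapper emits under TextInputFormat.
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    // Block until the job finishes; exit non-zero on failure.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}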
From source file: edu.gslis.ts.hadoop.ThriftDumper.java
License: Apache License

public int run(String[] args) throws Exception {
  String inputPath = args[0];
  String outputPath = args[1];
  Path topicsFile = new Path(args[2]);
  Path vocabFile = new Path(args[3]);

  Configuration config = getConf();
  Job job = Job.getInstance(config);
  job.setJarByClass(ThriftDumper.class);
  job.setInputFormatClass(ThriftFileInputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setReducerClass(ThriftDumperReducer.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.addInputPath(job, new Path(inputPath));
  FileInputFormat.setInputDirRecursive(job, true);
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.addCacheFile(topicsFile.toUri());
  job.addCacheFile(vocabFile.toUri());

  job.setMapperClass(ThriftDumperMapper.class);

  boolean b = job.waitForCompletion(true);
  if (!b) {
    throw new IOException("error with job");
  }
  return 0;
}
From source file: edu.gslis.ts.hadoop.ThriftRMScorerHbaseMR.java
License: Apache License

public int run(String[] args) throws Exception {
  String tableName = args[0];
  Path topicsFile = new Path(args[1]);
  Path vocabFile = new Path(args[2]);
  Path outputPath = new Path(args[3]);
  Path stoplist = new Path(args[4]);
  // String queryId = args[1];

  Configuration config = HBaseConfiguration.create(getConf());
  config.set("hbase.table.name", tableName);

  Job job = Job.getInstance(config);
  job.setJarByClass(ThriftRMScorerHbaseMR.class);

  Scan scan = new Scan();
  scan.setCaching(500);
  scan.setCacheBlocks(false);
  /*
  Filter prefixFilter = new PrefixFilter(Bytes.toBytes(queryId));
  scan.setFilter(prefixFilter);
  */

  TableMapReduceUtil.initTableMapperJob(tableName, scan, ThriftTableMapper.class,
      IntWritable.class, // mapper output key
      Text.class,        // mapper output value
      job);

  job.setReducerClass(ThriftTableReducer.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  job.addCacheFile(topicsFile.toUri());
  job.addCacheFile(vocabFile.toUri());
  job.addCacheFile(stoplist.toUri());

  FileOutputFormat.setOutputPath(job, outputPath);

  boolean b = job.waitForCompletion(true);
  if (!b) {
    throw new IOException("error with job!");
  }
  return 0;
}
From source file: edu.gslis.ts.hadoop.ThriftSentenceScorerHbase.java
License: Apache License

public int run(String[] args) throws Exception {
  String tableName = args[0];
  Path topicsFile = new Path(args[1]);
  Path vocabFile = new Path(args[2]);
  Path outputPath = new Path(args[3]);
  // String queryId = args[1];

  Configuration config = HBaseConfiguration.create(getConf());
  Job job = Job.getInstance(config);
  job.setJarByClass(ThriftSentenceScorerHbase.class);

  Scan scan = new Scan();
  scan.setCaching(500);
  scan.setCacheBlocks(false);
  /*
  Filter prefixFilter = new PrefixFilter(Bytes.toBytes(queryId));
  scan.setFilter(prefixFilter);
  */

  TableMapReduceUtil.initTableMapperJob(tableName, scan, ThriftTableMapper.class,
      Text.class, // mapper output key
      Text.class, // mapper output value
      job);

  job.addCacheFile(topicsFile.toUri());
  job.addCacheFile(vocabFile.toUri());

  FileOutputFormat.setOutputPath(job, outputPath);

  boolean b = job.waitForCompletion(true);
  if (!b) {
    throw new IOException("error with job!");
  }
  return 0;
}
From source file: edu.gslis.ts.hadoop.ThriftWordCount.java
License: Apache License

public int run(String[] args) throws Exception {
  String inputPath = args[0];
  Path outputPath = new Path(args[1]);

  Job job = Job.getInstance(getConf());
  job.setJarByClass(ThriftWordCount.class);
  job.setInputFormatClass(ThriftFileInputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setReducerClass(ThriftWordCountReducer.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.addInputPath(job, new Path(inputPath));
  FileInputFormat.setInputDirRecursive(job, true);
  FileOutputFormat.setOutputPath(job, outputPath);

  job.setMapperClass(ThriftWordCountMapper.class);

  boolean b = job.waitForCompletion(true);
  if (!b) {
    throw new IOException("error with job");
  }
  return 0;
}
From source file: edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java
License: Apache License

private String phase1(String inputPath, int reduceNo, String lang)
    throws IOException, InterruptedException, ClassNotFoundException {
  String output = "tmp/wiki-link/phase1";

  Job job = Job.getInstance(getConf());
  job.setJobName("Build Wikipedia Weighted Link Graph. Phase 1");
  job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
  job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
  job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
  job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
  job.setNumReduceTasks(reduceNo);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(output));

  if ("en".equals(lang)) {
    job.setInputFormatClass(WikipediaPageInputFormat.class);
  } else {
    throw new InterruptedException("Wikipedia dump with language " + lang + " is not supported");
  }

  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(PairOfStringInt.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(PairOfStringInt.class);

  job.setMapperClass(LinkEmitMapClass.class);
  job.setReducerClass(RedirectResolveReduceClass.class);

  job.waitForCompletion(true);
  return output;
}
From source file: edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java
License: Apache License

private String phase2(String inputPath, int reduceNo)
    throws IOException, InterruptedException, ClassNotFoundException {
  String output = "tmp/wiki-link/phase2";

  Job job = Job.getInstance(getConf());
  job.setJobName("Build Wikipedia Weighted Link Graph. Phase 2");
  job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
  job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
  job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
  job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
  job.setNumReduceTasks(reduceNo);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(output));

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(PairOfStringInt.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(PairOfStringInt.class);

  job.setReducerClass(DestinationIdResolveReduceClass.class);

  job.waitForCompletion(true);
  return output;
}
From source file: edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java
License: Apache License

private String phase3(String inputPath, int reduceNo)
    throws IOException, InterruptedException, ClassNotFoundException {
  String output = "trace/phase3";

  Job job = Job.getInstance(getConf());
  job.setJobName("Build Wikipedia Weighted Link Graph. Phase 3");
  job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
  job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
  job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
  job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
  job.setNumReduceTasks(reduceNo);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(output));

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(PairOfStringInt.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  job.setReducerClass(SourceIdResolveReduceClass.class);

  job.waitForCompletion(true);
  return output;
}
From source file: edu.umd.cloud9.collection.wikipedia.CountWikipediaPages.java
License: Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("XML dump file").create(INPUT_OPTION));
  options.addOption(OptionBuilder
      .withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr")
      .hasArg().withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  if (!cmdline.hasOption(INPUT_OPTION)) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  String language = "en"; // Assume 'en' by default.
  if (cmdline.hasOption(LANGUAGE_OPTION)) {
    language = cmdline.getOptionValue(LANGUAGE_OPTION);
    if (!(language.length() == 2 || language.length() == 6)) {
      System.err.println("Error: \"" + language + "\" unknown language!");
      return -1;
    }
  }

  String inputPath = cmdline.getOptionValue(INPUT_OPTION);

  LOG.info("Tool name: " + this.getClass().getName());
  LOG.info(" - XML dump file: " + inputPath);
  LOG.info(" - language: " + language);

  Job job = Job.getInstance(getConf());
  job.setJarByClass(CountWikipediaPages.class);
  job.setJobName(String.format("CountWikipediaPages[%s: %s, %s: %s]", INPUT_OPTION, inputPath,
      LANGUAGE_OPTION, language));

  job.setNumReduceTasks(0);

  FileInputFormat.setInputPaths(job, new Path(inputPath));

  if (language != null) {
    job.getConfiguration().set("wiki.language", language);
  }

  job.setInputFormatClass(WikipediaPageInputFormat.class);
  job.setOutputFormatClass(NullOutputFormat.class);

  job.setMapperClass(MyMapper.class);

  job.waitForCompletion(true);
  return 0;
}
From source file: edu.umd.cloud9.collection.wikipedia.DumpWikipediaToPlainText.java
License: Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("XML dump file").create(INPUT_OPTION));
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("output path").create(OUTPUT_OPTION));
  options.addOption(OptionBuilder
      .withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr")
      .hasArg().withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION));
  options.addOption(OptionBuilder.withArgName("TEXT|HTML|WIKI").hasArg()
      .withDescription("Output Content Type TEXT, HTML, WIKI").create(CONTENT_FORMAT_OPTION));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  String language = "en"; // Assume "en" by default.
  if (cmdline.hasOption(LANGUAGE_OPTION)) {
    language = cmdline.getOptionValue(LANGUAGE_OPTION);
    if (!(language.length() == 2 || language.length() == 6)) {
      System.err.println("Error: \"" + language + "\" unknown language!");
      return -1;
    }
  }

  String contentFormat = "TEXT"; // Assume "TEXT" by default.
  if (cmdline.hasOption(CONTENT_FORMAT_OPTION)) {
    contentFormat = cmdline.getOptionValue(CONTENT_FORMAT_OPTION);
    if (!contentFormat.equals("TEXT") && !contentFormat.equals("HTML") && !contentFormat.equals("WIKI")) {
      System.err.println("Error: \"" + contentFormat + "\" unknown content type!");
      return -1;
    }
  }

  String inputPath = cmdline.getOptionValue(INPUT_OPTION);
  String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);

  LOG.info("Tool name: " + this.getClass().getName());
  LOG.info(" - XML dump file: " + inputPath);
  LOG.info(" - output path : " + outputPath);
  LOG.info(" - language : " + language);
  LOG.info(" - content_type : " + contentFormat);

  Configuration conf = getConf();
  Job job = Job.getInstance(conf);
  job.setJarByClass(DumpWikipediaToPlainText.class);
  job.setJobName(String.format("DumpWikipediaToPlainText[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION,
      inputPath, OUTPUT_OPTION, outputPath, LANGUAGE_OPTION, language, CONTENT_FORMAT_OPTION, contentFormat));

  job.setNumReduceTasks(0);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  if (language != null) {
    job.getConfiguration().set("wiki.language", language);
  }
  if (contentFormat != null) {
    job.getConfiguration().set("wiki.content_format", contentFormat);
  }

  job.setInputFormatClass(WikipediaPageInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  job.setMapperClass(MyMapper.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  // Delete the output directory if it exists already.
  FileSystem.get(getConf()).delete(new Path(outputPath), true);

  job.waitForCompletion(true);
  return 0;
}
From source file: edu.umd.cloud9.collection.wikipedia.ExtractWikipediaAnchorTextWithWindow.java
License: Apache License

private void task1(String inputPath, String outputPath)
    throws IOException, ClassNotFoundException, InterruptedException {
  LOG.info("Extracting anchor text (phase 1)...");
  LOG.info(" - input: " + inputPath);
  LOG.info(" - output: " + outputPath);

  Job job = Job.getInstance(getConf());
  job.setJarByClass(ExtractWikipediaAnchorTextWithWindow.class);
  job.setJobName(
      String.format("ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath));

  // 10 reducers is reasonable.
  job.setNumReduceTasks(10);

  // Increase heap.
  job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");
  job.getConfiguration().set("mapreduce.map.memory.mb", "6144");
  job.getConfiguration().set("mapreduce.reduce.memory.mb", "6144");
  job.getConfiguration().set("mapreduce.map.java.opts", "-Xmx6144m");
  job.getConfiguration().set("mapreduce.reduce.java.opts", "-Xmx6144m");

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.setInputFormatClass(SequenceFileInputFormat.class);
  // job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  job.setMapOutputKeyClass(PairOfStringInt.class);
  job.setMapOutputValueClass(PairOfStrings.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(PairOfIntString.class);

  job.setMapperClass(MyMapper1.class);
  job.setReducerClass(MyReducer1.class);
  job.setPartitionerClass(MyPartitioner1.class);

  // Delete the output directory if it exists already.
  FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

  job.waitForCompletion(true);
}