List of usage examples for org.apache.hadoop.mapred FileInputFormat addInputPath
public static void addInputPath(JobConf conf, Path path)
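Before the project examples below, here is a minimal, self-contained sketch of the call for orientation (not taken from any of the source files that follow). addInputPath appends one more path to the job's input list, so calling it repeatedly reads several locations in a single job; input paths may also contain globs, which are expanded when splits are computed. All paths, the job name, and the driver class name here are hypothetical placeholders.

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.FileOutputFormat;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.TextInputFormat;
    import org.apache.hadoop.mapred.TextOutputFormat;
    import org.apache.hadoop.mapred.lib.IdentityMapper;
    import org.apache.hadoop.mapred.lib.IdentityReducer;

    public class AddInputPathExample {
        public static void main(String[] args) throws Exception {
            JobConf conf = new JobConf(AddInputPathExample.class);
            conf.setJobName("addInputPath example");

            // Each call appends another input path; the second uses a glob.
            // "/data/run1" and "/data/run2" are hypothetical paths.
            FileInputFormat.addInputPath(conf, new Path("/data/run1"));
            FileInputFormat.addInputPath(conf, new Path("/data/run2/part-*"));
            FileOutputFormat.setOutputPath(conf, new Path("/data/out"));

            conf.setInputFormat(TextInputFormat.class);
            conf.setOutputFormat(TextOutputFormat.class);
            conf.setOutputKeyClass(LongWritable.class);
            conf.setOutputValueClass(Text.class);
            conf.setMapperClass(IdentityMapper.class);
            conf.setReducerClass(IdentityReducer.class);

            JobClient.runJob(conf);
        }
    }

If you have a comma-separated list of paths, the related addInputPaths(JobConf, String) and setInputPaths(JobConf, Path...) methods on the same class cover that case in one call.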
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
    /**
     * Extracts redirects and the target for each.
     *
     * @param inputPath
     * @param outputPath
     * @throws IOException
     */
    private void task0(String inputPath, String outputPath) throws IOException {
        LOG.info("Extracting redirects (phase 0)...");
        LOG.info(" - input: " + inputPath);
        LOG.info(" - output: " + outputPath);

        JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
        conf.setJobName(
                String.format("ExtractWikipediaAnchorText:phase0[input: %s, output: %s]", inputPath, outputPath));

        conf.setNumReduceTasks(1);

        FileInputFormat.addInputPath(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));

        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setOutputFormat(SequenceFileOutputFormat.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(Text.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(MyMapper0.class);
        conf.setReducerClass(IdentityReducer.class);

        JobClient.runJob(conf);
    }
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
    /**
     * Maps from Wikipedia article to (srcID, (targetID, anchor)).
     *
     * @param inputPath
     * @param outputPath
     * @throws IOException
     */
    private void task1(String inputPath, String outputPath) throws IOException {
        LOG.info("Extracting anchor text (phase 1)...");
        LOG.info(" - input: " + inputPath);
        LOG.info(" - output: " + outputPath);

        JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
        conf.setJobName(
                String.format("ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath));

        // 10 reducers is reasonable.
        conf.setNumReduceTasks(10);

        FileInputFormat.addInputPath(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));

        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setOutputFormat(SequenceFileOutputFormat.class);

        conf.setMapOutputKeyClass(PairOfStringInt.class);
        conf.setMapOutputValueClass(PairOfStrings.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(PairOfStrings.class);

        conf.setMapperClass(MyMapper1.class);
        conf.setReducerClass(MyReducer1.class);
        conf.setPartitionerClass(MyPartitioner1.class);

        // Delete the output directory if it exists already.
        FileSystem.get(conf).delete(new Path(outputPath), true);

        JobClient.runJob(conf);
    }
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
    /**
     * Maps from (srcID, (targetID, anchor)) to (targetID, (anchor, count)).
     *
     * @param inputPath
     * @param outputPath
     * @param redirPath
     * @throws IOException
     */
    private void task2(String inputPath, String outputPath, String redirPath) throws IOException {
        LOG.info("Extracting anchor text (phase 2)...");
        LOG.info(" - input: " + inputPath);
        LOG.info(" - output: " + outputPath);

        JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
        conf.setJobName(
                String.format("ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

        // Gathers everything together for convenience; feasible for Wikipedia.
        conf.setNumReduceTasks(1);

        // Ship the redirect table to every task via the distributed cache.
        try {
            DistributedCache.addCacheFile(new URI(redirPath + "/part-00000" + "#" + "redirs.dat"), conf);
            DistributedCache.createSymlink(conf);
        } catch (URISyntaxException e) {
            e.printStackTrace();
        }

        FileInputFormat.addInputPath(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));

        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setOutputFormat(MapFileOutputFormat.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(Text.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(HMapSIW.class);

        conf.setMapperClass(MyMapper2.class);
        conf.setReducerClass(MyReducer2.class);

        // Delete the output directory if it exists already.
        FileSystem.get(conf).delete(new Path(outputPath), true);

        JobClient.runJob(conf);

        // Clean up intermediate data.
        FileSystem.get(conf).delete(new Path(inputPath), true);
    }
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
    /**
     * Extracts CF for each found anchor.
     *
     * @param inputPath
     * @param mapPath
     * @param outputPath
     * @throws IOException
     */
    private void task3(String inputPath, String mapPath, String outputPath) throws IOException {
        LOG.info("Extracting anchor text (phase 3)...");
        LOG.info(" - input: " + inputPath);
        LOG.info(" - output: " + outputPath);
        LOG.info(" - mapping: " + mapPath);

        JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
        conf.setJobName(
                String.format("ExtractWikipediaAnchorText:phase3[input: %s, output: %s]", inputPath, outputPath));

        conf.setNumReduceTasks(1);

        // Ship the anchor map to every task via the distributed cache.
        String location = "map.dat";
        try {
            DistributedCache.addCacheFile(new URI(mapPath + "/part-00000/data" + "#" + location), conf);
            DistributedCache.createSymlink(conf);
        } catch (URISyntaxException e) {
            e.printStackTrace();
        }

        FileInputFormat.addInputPath(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));

        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setOutputFormat(MapFileOutputFormat.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(IntWritable.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(MyMapper3.class);
        conf.setCombinerClass(MyReducer3.class);
        conf.setReducerClass(MyReducer3.class);

        JobClient.runJob(conf);
    }
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
    /**
     * Maps from (targetID, (anchor, count)) to (anchor, (targetID, count)).
     *
     * @param inputPath
     * @param outputPath
     * @throws IOException
     */
    private void task4(String inputPath, String outputPath) throws IOException {
        LOG.info("Extracting anchor text (phase 4)...");
        LOG.info(" - input: " + inputPath);
        LOG.info(" - output: " + outputPath);

        JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
        conf.setJobName(
                String.format("ExtractWikipediaAnchorText:phase4[input: %s, output: %s]", inputPath, outputPath));

        conf.setNumReduceTasks(1);

        // The glob pattern picks up the MapFile data file from every part
        // directory produced by the previous phase.
        FileInputFormat.addInputPath(conf, new Path(inputPath + "/part-*/data"));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));

        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setOutputFormat(MapFileOutputFormat.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(HMapSIW.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(HMapSIW.class);

        conf.setMapperClass(MyMapper4.class);
        conf.setReducerClass(MyReducer4.class);

        JobClient.runJob(conf);
    }
From source file:contrail.correct.InvokeFlash.java
License:Apache License
    public RunningJob runJob() throws Exception {
        JobConf conf = new JobConf(InvokeFlash.class);
        conf.setJobName("Flash invocation");

        String inputPath = (String) stage_options.get("inputpath");
        String outputPath = (String) stage_options.get("outputpath");
        String flashPath = (String) stage_options.get("flash_binary");
        if (flashPath.length() == 0) {
            throw new Exception("Flash binary location required");
        }
        DistributedCache.addCacheFile(new Path(flashPath).toUri(), conf);

        // Sets the parameters in JobConf.
        initializeJobConfiguration(conf);

        AvroJob.setMapperClass(conf, RunFlashMapper.class);
        FileInputFormat.addInputPath(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));

        // Input: mate pairs in, fastq records out.
        MatePair read = new MatePair();
        AvroJob.setInputSchema(conf, read.getSchema());
        AvroJob.setOutputSchema(conf, new fastqrecord().getSchema());

        // Map-only job.
        conf.setNumReduceTasks(0);

        // Delete the output directory if it exists already.
        Path out_path = new Path(outputPath);
        if (FileSystem.get(conf).exists(out_path)) {
            FileSystem.get(conf).delete(out_path, true);
        }

        long starttime = System.currentTimeMillis();
        RunningJob result = JobClient.runJob(conf);
        long endtime = System.currentTimeMillis();

        float diff = (float) (((float) (endtime - starttime)) / 1000.0);
        System.out.println("Runtime: " + diff + " s");
        return result;
    }
From source file:contrail.correct.InvokeQuakeForMatePairs.java
License:Apache License
    public RunningJob runJob() throws Exception {
        JobConf conf = new JobConf(InvokeQuakeForMatePairs.class);
        conf.setJobName("Quake Paired invocation");

        String inputPath = (String) stage_options.get("inputpath");
        String outputPath = (String) stage_options.get("outputpath");
        String quakePath = (String) stage_options.get("quake_binary");
        String bithashPath = (String) stage_options.get("bithashpath");

        // User wants to run Quake.
        if (quakePath.length() != 0) {
            DistributedCache.addCacheFile(new Path(quakePath).toUri(), conf);
        } else {
            throw new Exception("Please specify Quake binary path");
        }
        if (bithashPath.length() != 0) {
            DistributedCache.addCacheFile(new Path(bithashPath).toUri(), conf);
        } else {
            throw new Exception("Please specify bithash path");
        }

        // Sets the parameters in JobConf.
        initializeJobConfiguration(conf);

        AvroJob.setMapperClass(conf, RunQuakeMapper.class);
        FileInputFormat.addInputPath(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));

        // Input: mate pairs in, fastq records out.
        MatePair read = new MatePair();
        AvroJob.setInputSchema(conf, read.getSchema());
        AvroJob.setOutputSchema(conf, new fastqrecord().getSchema());

        // Map-only job.
        conf.setNumReduceTasks(0);

        // Delete the output directory if it exists already.
        Path out_path = new Path(outputPath);
        if (FileSystem.get(conf).exists(out_path)) {
            FileSystem.get(conf).delete(out_path, true);
        }

        long starttime = System.currentTimeMillis();
        RunningJob result = JobClient.runJob(conf);
        long endtime = System.currentTimeMillis();

        float diff = (float) (((float) (endtime - starttime)) / 1000.0);
        System.out.println("Runtime: " + diff + " s");
        return result;
    }
From source file:contrail.correct.InvokeQuakeForSingles.java
License:Apache License
    public RunningJob runJob() throws Exception {
        JobConf conf = new JobConf(InvokeQuakeForSingles.class);
        conf.setJobName("Quake Singles invocation");

        String inputPath = (String) stage_options.get("inputpath");
        String outputPath = (String) stage_options.get("outputpath");
        String quakePath = (String) stage_options.get("quake_binary");
        String bithashPath = (String) stage_options.get("bithashpath");

        // User wants to run Quake.
        if (quakePath.length() != 0) {
            DistributedCache.addCacheFile(new Path(quakePath).toUri(), conf);
        } else {
            throw new Exception("Please specify Quake path");
        }
        if (bithashPath.length() != 0) {
            DistributedCache.addCacheFile(new Path(bithashPath).toUri(), conf);
        } else {
            throw new Exception("Please specify bithash path");
        }

        // Sets the parameters in JobConf.
        initializeJobConfiguration(conf);

        AvroJob.setMapperClass(conf, RunQuakeMapper.class);
        FileInputFormat.addInputPath(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));

        // Input: fastq records in and out.
        fastqrecord read = new fastqrecord();
        AvroJob.setInputSchema(conf, read.getSchema());
        AvroJob.setOutputSchema(conf, new fastqrecord().getSchema());

        // Map-only job.
        conf.setNumReduceTasks(0);

        // Delete the output directory if it exists already.
        Path out_path = new Path(outputPath);
        if (FileSystem.get(conf).exists(out_path)) {
            FileSystem.get(conf).delete(out_path, true);
        }

        long starttime = System.currentTimeMillis();
        RunningJob result = JobClient.runJob(conf);
        long endtime = System.currentTimeMillis();

        float diff = (float) (((float) (endtime - starttime)) / 1000.0);
        System.out.println("Runtime: " + diff + " s");
        return result;
    }
From source file:contrail.correct.KmerCounter.java
License:Open Source License
    public RunningJob runJob() throws Exception {
        String inputPath = (String) stage_options.get("inputpath");
        String outputPath = (String) stage_options.get("outputpath");

        JobConf conf = new JobConf(KmerCounter.class);
        conf.setJobName("Kmer Counter");
        initializeJobConfiguration(conf);

        FileInputFormat.addInputPath(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));

        fastqrecord read = new fastqrecord();
        AvroJob.setInputSchema(conf, read.getSchema());
        AvroJob.setOutputSchema(conf, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema());

        AvroJob.setMapperClass(conf, KmerCounterMapper.class);
        AvroJob.setReducerClass(conf, KmerCounterReducer.class);

        // Delete the output directory if it exists already.
        Path out_path = new Path(outputPath);
        if (FileSystem.get(conf).exists(out_path)) {
            FileSystem.get(conf).delete(out_path, true);
        }

        long starttime = System.currentTimeMillis();
        RunningJob run = JobClient.runJob(conf);
        long endtime = System.currentTimeMillis();

        float diff = (float) (((float) (endtime - starttime)) / 1000.0);
        System.out.println("Runtime: " + diff + " s");
        return run;
    }
From source file:contrail.stages.GraphStats.java
License:Open Source License
    @Override
    public RunningJob runJob() throws Exception {
        String[] required_args = { "inputpath", "outputpath" };
        checkHasParametersOrDie(required_args);

        String inputPath = (String) stage_options.get("inputpath");
        String outputPath = (String) stage_options.get("outputpath");

        sLogger.info(" - input: " + inputPath);
        sLogger.info(" - output: " + outputPath);

        Configuration base_conf = getConf();
        JobConf conf = null;
        if (base_conf != null) {
            conf = new JobConf(getConf(), PairMergeAvro.class);
        } else {
            conf = new JobConf(PairMergeAvro.class);
        }
        conf.setJobName("GraphStats " + inputPath);
        initializeJobConfiguration(conf);

        FileInputFormat.addInputPath(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));

        Pair<Integer, GraphStatsData> mapOutput = new Pair<Integer, GraphStatsData>(0, new GraphStatsData());
        GraphNodeData nodeData = new GraphNodeData();
        AvroJob.setInputSchema(conf, nodeData.getSchema());
        AvroJob.setMapOutputSchema(conf, mapOutput.getSchema());
        AvroJob.setOutputSchema(conf, mapOutput.value().getSchema());

        AvroJob.setMapperClass(conf, GraphStatsMapper.class);
        AvroJob.setCombinerClass(conf, GraphStatsCombiner.class);
        AvroJob.setReducerClass(conf, GraphStatsReducer.class);

        // Use a single reducer task so that we accumulate all the stats in
        // one reducer.
        conf.setNumReduceTasks(1);

        if (stage_options.containsKey("writeconfig")) {
            writeJobConfig(conf);
        } else {
            // Delete the output directory if it exists already.
            Path out_path = new Path(outputPath);
            if (FileSystem.get(conf).exists(out_path)) {
                // TODO(jlewi): We should only delete an existing directory
                // if explicitly told to do so.
                sLogger.info("Deleting output path: " + out_path.toString() + " "
                        + "because it already exists.");
                FileSystem.get(conf).delete(out_path, true);
            }

            long starttime = System.currentTimeMillis();
            RunningJob job = JobClient.runJob(conf);
            long endtime = System.currentTimeMillis();

            float diff = (float) ((endtime - starttime) / 1000.0);
            System.out.println("Runtime: " + diff + " s");

            // Create iterators to read the output.
            Iterator<GraphStatsData> binsIterator = createOutputIterator();

            // Compute the N50 stats for each bin.
            ArrayList<GraphN50StatsData> N50Stats = computeN50Stats(binsIterator);

            // Write the N50 stats to a file.
            writeN50StatsToFile(N50Stats);

            Integer topn_contigs = (Integer) stage_options.get("topn_contigs");
            if (topn_contigs > 0) {
                // Get the lengths of the top n contigs.
                binsIterator = createOutputIterator();
                List<Integer> topN = topNContigs(binsIterator, topn_contigs);
                writeTopNContigs(topN);
            }
            return job;
        }
        return null;
    }