List of usage examples for org.apache.hadoop.mapred.FileInputFormat.addInputPath
public static void addInputPath(JobConf conf, Path path)
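addInputPath appends a path to the job's list of input paths, so it can be called repeatedly to pull several files or directories into a single job; FileInputFormat.setInputPaths, by contrast, replaces the whole list. The sketch below shows the typical call pattern in a minimal, self-contained driver; the class name, job name, and paths ("/data/first", "/data/second", "/data/out") are hypothetical placeholders, and no mapper or reducer is set so the identity classes are used.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class AddInputPathSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(AddInputPathSketch.class);
        conf.setJobName("add-input-path-sketch");
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // setInputPaths replaces any previously configured input paths ...
        FileInputFormat.setInputPaths(conf, new Path("/data/first"));
        // ... while addInputPath appends, so both directories are read by the job.
        FileInputFormat.addInputPath(conf, new Path("/data/second"));

        FileOutputFormat.setOutputPath(conf, new Path("/data/out"));

        // No mapper/reducer is configured, so the identity classes pass records through unchanged.
        JobClient.runJob(conf);
    }
}

As the examples below illustrate, the same pattern appears whether the input path comes from a command-line argument, a constant, or a recursive directory walk.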
From source file:WikipediaDocnoMappingBuilder.java
License:Apache License
@SuppressWarnings("static-access") @Override/*from w ww . j av a2 s. co m*/ public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file") .create(OUTPUT_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg() .withDescription("two-letter language code").create(LANGUAGE_OPTION)); options.addOption(KEEP_ALL_OPTION, false, "keep all pages"); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_FILE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (language.length() != 2) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } String inputPath = cmdline.getOptionValue(INPUT_OPTION); String outputFile = cmdline.getOptionValue(OUTPUT_FILE_OPTION); boolean keepAll = cmdline.hasOption(KEEP_ALL_OPTION); String tmpPath = "tmp-" + WikipediaDocnoMappingBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - input: " + inputPath); LOG.info(" - output file: " + outputFile); LOG.info(" - keep all pages: " + keepAll); LOG.info(" - language: " + language); // Job job = Job.getInstance(getConf()); JobConf conf = new JobConf(WikipediaDocnoMappingBuilder.class); conf.setJarByClass(WikipediaDocnoMappingBuilder.class); conf.setJobName(String.format("BuildWikipediaDocnoMapping[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, OUTPUT_FILE_OPTION, outputFile, LANGUAGE_OPTION, language)); conf.setBoolean(KEEP_ALL_OPTION, keepAll); // .getConfiguration().setBoolean(KEEP_ALL_OPTION, keepAll); if (language != null) { conf.set("wiki.language", language); } conf.setNumReduceTasks(1); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(tmpPath)); FileOutputFormat.setCompressOutput(conf, false); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(IntWritable.class); conf.setInputFormat(WikipediaPageInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. FileSystem.get(getConf()).delete(new Path(tmpPath), true); // job.waitForCompletion(true); RunningJob job = JobClient.runJob(conf); job.waitForCompletion(); // JobClient jobClient = new JobClient(conf); long cnt = keepAll ? job.getCounters().findCounter(PageTypes.TOTAL).getValue() : job.getCounters().findCounter(PageTypes.ARTICLE).getValue(); WikipediaDocnoMapping.writeDocnoMappingData(FileSystem.get(getConf()), tmpPath + "/part-00000", (int) cnt, outputFile); FileSystem.get(getConf()).delete(new Path(tmpPath), true); return 0; }
From source file:SleepJob.java
License:Apache License
public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), SleepJob.class);
    job.setNumMapTasks(numMapper);
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(SleepJob.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(SleepJob.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(SleepJob.class);
    job.setSpeculativeExecution(false);
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}
From source file:HoopRemoteTask.java
License:Open Source License
/**
 *
 */
public static void main(String args[]) throws Exception {
    // run the HoopLink constructor; we need this to have a global settings registry
    @SuppressWarnings("unused")
    HoopLink link = new HoopLink();

    dbg("main ()");

    showTimeStamp();

    /**
     * I've taken out the statistics portion since it relies on code that isn't distributed.
     * The next version will have this solved. I might try the solution in:
     * http://stackoverflow.com/questions/7443074/initialize-public-static-variable-in-hadoop-through-arguments
     * Although chances are I will switch to using Hoop to collect much better performance and distribution
     * statistics. See Hoop.java for more information.
     */
    HoopPerformanceMeasure metrics = new HoopPerformanceMeasure();
    metrics.setMarker("main");
    HoopLink.metrics.getDataSet().add(metrics);

    if (parseArgs(args) == false) {
        usage();
        return;
    }

    if (HoopLink.postonly == true) {
        postOnly();
        return;
    }

    if (HoopLink.task.equals("none") == true) {
        dbg("No task defined, please use the commandline option -task <task>");
        return;
    }

    dbg("Starting system ...");

    HoopRemoteTask driver = new HoopRemoteTask();

    if (HoopLink.useHadoop == false) {
        dbg("Starting built-in mapper ...");
        driver.indexDocuments();
    } else {
        dbg("Starting hadoop job ...");

        Configuration conf = new Configuration();

        // TRANSFER SETTINGS FROM HoopLink to Configuration!!!
        transferConf(conf);

        // Now we're feeling much better
        HoopRemoteTask.hdfs = FileSystem.get(conf);

        if (HoopLink.dbglocal == true) {
            dbg("Enabling local debugging ...");
            conf.set("mapred.job.tracker", "local");
        } else
            dbg("Disabling local debugging");

        JobConf job = new JobConf(conf, HoopRemoteTask.class);
        job.setJobName(driver.getClassName());

        driver.setJob(job);

        @SuppressWarnings("unused")
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        job.setJarByClass(HoopRemoteTask.class);

        if (HoopLink.task.equals("invert") == true) {
            dbg("Configuring job for invert task ...");

            job.setReducerClass(HoopInvertedListReducer.class);
            job.setMapperClass(HoopInvertedListMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
        }

        if (HoopLink.task.equals("wordcount") == true) {
            dbg("Configuring job for wordcount task ...");

            job.setReducerClass(HoopWordCountReducer.class);
            job.setMapperClass(HoopWordCountMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
        }

        dbg("Using input path: " + HoopLink.datapath);
        dbg("Using output path: " + HoopLink.outputpath);

        FileInputFormat.addInputPath(job, new Path(HoopLink.datapath));
        FileOutputFormat.setOutputPath(job, new Path(HoopLink.outputpath));

        job.setInputFormat(HoopWholeFileInputFormat.class);

        if ((HoopLink.shardcreate.equals("mos") == true) && (HoopLink.nrshards > 1)) {
            dbg("Setting output to sharded output streams class ...");
            job.setOutputFormat(HoopShardedOutputFormat.class);
        } else
            job.setOutputFormat(TextOutputFormat.class);

        /**
         * Temporarily commented out for testing purposes
         */
        //job.setPartitionerClass (HoopPartitioner.class);

        driver.register("Main");

        JobClient.runJob(job);

        postProcess(conf);
    }

    showTimeStamp();

    metrics.closeMarker();

    long timeTaken = metrics.getYValue();
    //long timeTaken=metrics.getMarkerRaw ();
    metrics.printMetrics(timeTaken);

    driver.unregister();

    /**
     * I've taken out the statistics portion since it relies on code that isn't distributed.
     * The next version will have this solved. I might try the solution in:
     * http://stackoverflow.com/questions/7443074/initialize-public-static-variable-in-hadoop-through-arguments
     * Although chances are I will switch to using Hoop to collect much better performance and distribution
     * statistics. See Hoop.java for more information.
     */
    //stats.calcStatistics();
    //dbg (stats.printStatistics());
}
From source file:NaivePageRank.java
License:Apache License
public static void main(String[] args) throws Exception {
    int iteration = -1;
    String inputPath = args[0];
    String outputPath = args[1];
    int specIteration = 0;
    if (args.length > 2) {
        specIteration = Integer.parseInt(args[2]);
    }
    int numNodes = 100000;
    if (args.length > 3) {
        numNodes = Integer.parseInt(args[3]);
    }
    int numReducers = 32;
    if (args.length > 4) {
        numReducers = Integer.parseInt(args[4]);
    }
    System.out.println("specified iteration: " + specIteration);
    long start = System.currentTimeMillis();

    /**
     * job to count out-going links for each url
     */
    JobConf conf = new JobConf(NaivePageRank.class);
    conf.setJobName("PageRank-Count");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(CountMapper.class);
    conf.setReducerClass(CountReducer.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/count"));
    conf.setNumReduceTasks(numReducers);
    JobClient.runJob(conf);

    /******************** Initial Rank Assignment Job ***********************/
    conf = new JobConf(NaivePageRank.class);
    conf.setJobName("PageRank-Initialize");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(InitialRankAssignmentMapper.class);
    conf.setReducerClass(InitialRankAssignmentReducer.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/i" + iteration));
    conf.setNumReduceTasks(numReducers);
    // conf.setIterative(false);
    JobClient.runJob(conf);
    iteration++;

    do {
        /****************** Join Job ********************************/
        conf = new JobConf(NaivePageRank.class);
        conf.setJobName("PageRank-Join");
        conf.setOutputKeyClass(Text.class);
        // conf.setOutputValueClass(Text.class);
        conf.setMapperClass(ComputeRankMap.class);
        conf.setReducerClass(ComputeRankReduce.class);
        conf.setMapOutputKeyClass(TextPair.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setPartitionerClass(FirstPartitioner.class);
        conf.setOutputKeyComparatorClass(KeyComparator.class);
        conf.setOutputValueGroupingComparator(GroupComparator.class);

        // relation table
        FileInputFormat.setInputPaths(conf, new Path(inputPath));
        // rank table
        FileInputFormat.addInputPath(conf, new Path(outputPath + "/i" + (iteration - 1)));
        // count table
        FileInputFormat.addInputPath(conf, new Path(outputPath + "/count"));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/i" + iteration));
        conf.setNumReduceTasks(numReducers);
        JobClient.runJob(conf);
        iteration++;

        /******************** Rank Aggregate Job ***********************/
        conf = new JobConf(NaivePageRank.class);
        conf.setJobName("PageRank-Aggregate");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapperClass(RankAggregateMapper.class);
        conf.setReducerClass(RankAggregateReducer.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(outputPath + "/i" + (iteration - 1)));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/i" + iteration));
        conf.setNumReduceTasks(numReducers);
        conf.setInt("haloop.num.nodes", numNodes);
        JobClient.runJob(conf);
        iteration++;
    } while (iteration < 2 * specIteration);

    long end = System.currentTimeMillis();
    System.out.println("running time " + (end - start) / 1000 + "s");
}
From source file:RepackWikipedia.java
License:Apache License
@SuppressWarnings("static-access") @Override//from ww w . j a v a 2s . com public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output location") .create(OUTPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("mapping file") .create(MAPPING_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("block|record|none").hasArg() .withDescription("compression type").create(COMPRESSION_TYPE_OPTION)); options.addOption(OptionBuilder.withArgName("en|sv|de").hasArg().withDescription("two-letter language code") .create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(MAPPING_FILE_OPTION) || !cmdline.hasOption(COMPRESSION_TYPE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT_OPTION); String outputPath = cmdline.getOptionValue(OUTPUT_OPTION); String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION); String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION); if (!"block".equals(compressionType) && !"record".equals(compressionType) && !"none".equals(compressionType)) { System.err.println("Error: \"" + compressionType + "\" unknown compression type!"); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (language.length() != 2) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } // this is the default block size int blocksize = 1000000; //Job job = Job.getInstance(getConf()); JobConf conf = new JobConf(RepackWikipedia.class); conf.setJarByClass(RepackWikipedia.class); conf.setJobName(String.format("RepackWikipedia[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, OUTPUT_OPTION, outputPath, COMPRESSION_TYPE_OPTION, compressionType, LANGUAGE_OPTION, language)); conf.set(DOCNO_MAPPING_FIELD, mappingFile); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - XML dump file: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - docno mapping data file: " + mappingFile); LOG.info(" - compression type: " + compressionType); LOG.info(" - language: " + language); if ("block".equals(compressionType)) { LOG.info(" - block size: " + blocksize); } conf.setNumReduceTasks(0); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); if ("none".equals(compressionType)) { FileOutputFormat.setCompressOutput(conf, false); } else { FileOutputFormat.setCompressOutput(conf, true); if ("record".equals(compressionType)) { SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD); } else { SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK); conf.setInt("io.seqfile.compress.blocksize", blocksize); } } if (language != null) { conf.set("wiki.language", language); } 
conf.setInputFormat(WikipediaPageInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(WikipediaPage.class); conf.setMapperClass(MyMapper.class); // Delete the output directory if it exists already. FileSystem.get(getConf()).delete(new Path(outputPath), true); //job.waitForCompletion(true); JobClient.runJob(conf); return 0; }
From source file:SleepJobWithArray.java
License:Apache License
public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), SleepJobWithArray.class);
    job.setNumMapTasks(numMapper);
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(SleepJobWithArray.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(SleepJobWithArray.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(SleepJobWithArray.class);
    job.setSpeculativeExecution(false);
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}
From source file:azkaban.jobtype.examples.java.WordCount.java
License:Apache License
public void run() throws Exception {
    logger.info(String.format("Starting %s", getClass().getSimpleName()));

    // hadoop conf should be on the classpath
    JobConf jobconf = getJobConf();
    jobconf.setJarByClass(WordCount.class);
    jobconf.setOutputKeyClass(Text.class);
    jobconf.setOutputValueClass(IntWritable.class);

    jobconf.setMapperClass(Map.class);
    jobconf.setReducerClass(Reduce.class);

    jobconf.setInputFormat(TextInputFormat.class);
    jobconf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.addInputPath(jobconf, new Path(inputPath));
    FileOutputFormat.setOutputPath(jobconf, new Path(outputPath));

    if (forceOutputOverrite) {
        FileSystem fs = FileOutputFormat.getOutputPath(jobconf).getFileSystem(jobconf);
        fs.delete(FileOutputFormat.getOutputPath(jobconf), true);
    }

    super.run();
}
From source file:azkaban.jobtype.javautils.HadoopUtils.java
License:Apache License
public static JobConf addAllSubPaths(JobConf conf, Path path) throws IOException {
    if (shouldPathBeIgnored(path)) {
        throw new IllegalArgumentException(String.format("Path[%s] should be ignored.", path));
    }

    final FileSystem fs = path.getFileSystem(conf);

    if (fs.exists(path)) {
        for (FileStatus status : fs.listStatus(path)) {
            if (!shouldPathBeIgnored(status.getPath())) {
                if (status.isDir()) {
                    // Recurse into subdirectories; only plain files are added as input paths.
                    addAllSubPaths(conf, status.getPath());
                } else {
                    FileInputFormat.addInputPath(conf, status.getPath());
                }
            }
        }
    }
    return conf;
}
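A hypothetical call site is sketched below (the job class and the "/data/events" directory are placeholders). Because the helper walks the directory tree and adds only plain files, it is a useful alternative to pointing addInputPath at a directory when the input tree is nested more than one level deep.

JobConf conf = new JobConf(MyJob.class);
HadoopUtils.addAllSubPaths(conf, new Path("/data/events"));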
From source file:babel.prep.corpus.CorpusGenerator.java
License:Apache License
/**
 * Configures a map-only dataset generation job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir, boolean xmlOut) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("create " + (xmlOut ? "xml formatted" : "") + " dataset from " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CorpusGenMapper.class);
    job.setOutputFormat(xmlOut ? MultipleXMLLangFileOutputFormat.class : MultipleLangFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, CORPUS_SUBDIR),
            "corpus." + (xmlOut ? PARAM_XML + "." : "") + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    return job;
}
From source file:babel.prep.datedcorpus.DatedCorpusGenerator.java
License:Apache License
/**
 * Configures a dated dataset generation job (map and reduce).
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("create dated dataset from " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(DatedCorpusGenMapper.class);
    job.setReducerClass(DatedCorpusGenReducer.class);

    job.setMapOutputValueClass(PageVersion.class);
    job.setOutputFormat(DatedLangFilesOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, CORPUS_SUBDIR), "datedcorpus." + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    return job;
}