List of usage examples for org.apache.hadoop.mapred FileInputFormat addInputPath
public static void addInputPath(JobConf conf, Path path)
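Before the source-file examples below, here is a minimal sketch of the typical calling pattern: addInputPath appends one path to the job's input list and can be called repeatedly to accumulate several inputs. The class name, the argument paths, and the identity mapper/reducer are illustrative placeholders, not taken from any of the listed sources.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class AddInputPathExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(AddInputPathExample.class);
        conf.setJobName("addInputPath-example");

        // addInputPath adds a path to the job's list of inputs;
        // calling it more than once accumulates multiple input paths.
        FileInputFormat.addInputPath(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        // Identity map/reduce: the job simply copies its input records.
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);

        JobClient.runJob(conf);
    }
}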
From source file:edu.umd.cloud9.collection.clue.ClueWarcForwardIndexBuilder.java
License:Apache License
/** Runs this tool. */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("(required) collection path (must be block-compressed SequenceFiles)")
        .create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("(required) output index path").create(INDEX_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    JobConf conf = new JobConf(getConf(), ClueWarcForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);

    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexFile = cmdline.getOptionValue(INDEX_OPTION);

    LOG.info("Tool name: " + ClueWarcForwardIndexBuilder.class.getSimpleName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

    Random random = new Random();
    Path outputPath = new Path(
        "tmp-" + ClueWarcForwardIndexBuilder.class.getSimpleName() + "-" + random.nextInt(10000));

    conf.setJobName(ClueWarcForwardIndexBuilder.class.getSimpleName() + ":" + collectionPath);
    conf.setNumMapTasks(100);
    conf.setNumReduceTasks(1);

    // Note, we have to add the files one by one; otherwise, SequenceFileInputFormat
    // thinks it's a MapFile.
    for (FileStatus status : fs.listStatus(new Path(collectionPath))) {
        FileInputFormat.addInputPath(conf, status.getPath());
    }
    FileOutputFormat.setOutputPath(conf, outputPath);
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);

    // delete the output directory if it exists already
    fs.delete(outputPath, true);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int blocks = (int) counters.findCounter(Blocks.Total).getCounter();

    LOG.info("number of blocks: " + blocks);
    LOG.info("Writing index file...");

    LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);

    out.writeUTF(ClueWarcForwardIndex.class.getCanonicalName());
    out.writeUTF(collectionPath);
    out.writeInt(blocks);

    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");
        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);

        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);

        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }
    reader.close();
    out.close();

    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }

    fs.delete(outputPath, true);
    return 0;
}
From source file:edu.umd.cloud9.collection.clue.CountClueWarcRecords.java
License:Apache License
/** Runs this tool. */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(new Option(ORIGINAL_OPTION, "use original ClueWeb09 distribution"));
    options.addOption(new Option(REPACKED_OPTION, "use repacked SequenceFiles"));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("path: base path for 'original', actual path for 'repacked'").create(PATH_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("DocnoMapping data path").create(MAPPING_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
        .withDescription("segment number (required if 'original')").create(SEGMENT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("output file to write the number of records").create(COUNT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    boolean repacked;
    if (cmdline.hasOption(REPACKED_OPTION)) {
        repacked = true;
    } else if (cmdline.hasOption(ORIGINAL_OPTION)) {
        repacked = false;
    } else {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Expecting either -original or -repacked");
        return -1;
    }

    if (!cmdline.hasOption(PATH_OPTION) || !cmdline.hasOption(MAPPING_OPTION)
        || (!repacked && !cmdline.hasOption(SEGMENT_OPTION))) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String path = cmdline.getOptionValue(PATH_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);

    int segment = 1;
    if (!repacked) {
        segment = Integer.parseInt(cmdline.getOptionValue(SEGMENT_OPTION));
    }

    LOG.info("Tool name: " + CountClueWarcRecords.class.getSimpleName());
    LOG.info(" - repacked: " + repacked);
    LOG.info(" - path: " + path);
    LOG.info(" - mapping file: " + mappingFile);
    if (!repacked) {
        LOG.info(" - segment number: " + segment);
    }

    FileSystem fs = FileSystem.get(getConf());

    int mapTasks = 10;

    JobConf conf = new JobConf(getConf(), CountClueWarcRecords.class);
    conf.setJobName(
        CountClueWarcRecords.class.getSimpleName() + (repacked ? ":" + path : ":segment" + segment));
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);

    if (repacked) {
        // Note, we have to add the files one by one; otherwise, SequenceFileInputFormat
        // thinks it's a MapFile.
        for (FileStatus status : fs.listStatus(new Path(path))) {
            FileInputFormat.addInputPath(conf, status.getPath());
        }
    } else {
        ClueCollectionPathConstants.addEnglishCollectionPart(conf, path, segment);
    }

    DistributedCache.addCacheFile(new URI(mappingFile), conf);

    if (repacked) {
        conf.setInputFormat(SequenceFileInputFormat.class);
    } else {
        conf.setInputFormat(ClueWarcInputFormat.class);
    }

    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);

    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Records.PAGES).getCounter();

    LOG.info("Read " + numDocs + " docs.");

    if (cmdline.hasOption(COUNT_OPTION)) {
        String f = cmdline.getOptionValue(COUNT_OPTION);
        FSDataOutputStream out = fs.create(new Path(f));
        out.write(new Integer(numDocs).toString().getBytes());
        out.close();
    }

    return 0;
}
From source file:edu.umd.cloud9.collection.trecweb.RepackGov2Documents.java
License:Apache License
/** Runs this tool. */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String basePath = args[0];
    String outputPath = args[1];
    String compressionType = args[2];

    if (!compressionType.equals("block") && !compressionType.equals("record")
        && !compressionType.equals("none")) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        System.exit(-1);
    }

    // this is the default block size
    int blocksize = 1000000;

    JobConf conf = new JobConf(RepackGov2Documents.class);
    conf.setJobName("RepackGov2Documents");

    sLogger.info("Tool name: RepackGov2Documents");
    sLogger.info(" - base path: " + basePath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - compression type: " + compressionType);

    if (compressionType.equals("block")) {
        sLogger.info(" - block size: " + blocksize);
    }

    int mapTasks = 10;

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(500);

    // 272
    for (int i = 0; i <= 272; i++) {
        String path = basePath + "/GX";
        String indexNum = Integer.toString(i);
        if (indexNum.length() == 1) {
            path += "00";
        }
        if (indexNum.length() == 2) {
            path += "0";
        }
        path += indexNum;
        FileInputFormat.addInputPath(conf, new Path(path));
    }

    SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

    if (compressionType.equals("none")) {
        SequenceFileOutputFormat.setCompressOutput(conf, false);
    } else {
        SequenceFileOutputFormat.setCompressOutput(conf, true);
        if (compressionType.equals("record")) {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
            conf.setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    conf.setInputFormat(TrecWebDocumentInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(TrecWebDocument.class);
    conf.setMapperClass(MyMapper.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
}
From source file:edu.umd.cloud9.collection.trecweb.RepackWt10gDocuments.java
License:Apache License
/** Runs this tool. */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String basePath = args[0];
    String outputPath = args[1];
    String compressionType = args[2];

    if (!compressionType.equals("block") && !compressionType.equals("record")
        && !compressionType.equals("none")) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        System.exit(-1);
    }

    // this is the default block size
    int blocksize = 1000000;

    JobConf conf = new JobConf(RepackWt10gDocuments.class);
    conf.setJobName("RepackWt10gDocuments");

    sLogger.info("Tool name: RepackWt10gDocuments");
    sLogger.info(" - base path: " + basePath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - compression type: " + compressionType);

    if (compressionType.equals("block")) {
        sLogger.info(" - block size: " + blocksize);
    }

    int mapTasks = 10;

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(50);

    for (int i = 1; i <= 104; i++) {
        String path = basePath + "/WTX";
        String indexNum = Integer.toString(i);
        if (indexNum.length() == 1) {
            path += "00";
        }
        if (indexNum.length() == 2) {
            path += "0";
        }
        path += indexNum;
        FileInputFormat.addInputPath(conf, new Path(path));
    }

    SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

    if (compressionType.equals("none")) {
        SequenceFileOutputFormat.setCompressOutput(conf, false);
    } else {
        SequenceFileOutputFormat.setCompressOutput(conf, true);
        if (compressionType.equals("record")) {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
            conf.setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    conf.setInputFormat(TrecWebDocumentInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(TrecWebDocument.class);
    conf.setMapperClass(MyMapper.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
}
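The two Repack* examples above build the zero-padded directory suffix ("GX000", "WTX001", ...) by hand. The same input paths can be added more compactly with String.format. A minimal sketch, assuming the Gov2 layout from the first example; the helper class and method names are made up for illustration.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class AddPaddedInputPaths {
    // Builds "GX000" .. "GX272" (prefix and bounds follow the Gov2 example) and
    // adds each directory as an input path of the job.
    static void addGov2InputPaths(JobConf conf, String basePath) {
        for (int i = 0; i <= 272; i++) {
            FileInputFormat.addInputPath(conf, new Path(String.format("%s/GX%03d", basePath, i)));
        }
    }
}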
From source file:edu.umd.cloud9.pagerank.FindMaxPageRankNodes.java
License:Apache License
/** Runs this tool. */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int n = Integer.parseInt(args[2]);

    sLogger.info("Tool name: FindMaxPageRankNodes");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);
    sLogger.info(" - n: " + n);

    JobConf conf = new JobConf(FindMaxPageRankNodes.class);
    conf.setJobName("FindMaxPageRankNodes");

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.setInt("n", n);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(FloatWritable.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(FloatWritable.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
}
From source file:edu.umn.cs.spatialHadoop.operations.DistributedJoin.java
License:Open Source License
private static long selfJoinLocal(Path in, Path out, OperationsParams params) throws IOException {
    if (isOneShotReadMode) {
        // Ensure all objects are read in one shot
        params.setInt(SpatialSite.MaxBytesInOneRead, -1);
        params.setInt(SpatialSite.MaxShapesInOneRead, -1);
    } else {
        params.setInt(SpatialSite.MaxBytesInOneRead, maxBytesInOneRead);
        params.setInt(SpatialSite.MaxShapesInOneRead, maxShapesInOneRead);
    }

    ShapeArrayInputFormat inputFormat = new ShapeArrayInputFormat();
    JobConf job = new JobConf(params);
    FileInputFormat.addInputPath(job, in);
    InputSplit[] splits = inputFormat.getSplits(job, 1);

    FileSystem outFs = out.getFileSystem(params);
    final PrintStream writer = new PrintStream(outFs.create(out));

    // Process all input files
    long resultSize = 0;
    for (InputSplit split : splits) {
        ShapeArrayRecordReader reader = new ShapeArrayRecordReader(job, (FileSplit) split);
        final Text temp = new Text();

        Rectangle key = reader.createKey();
        ArrayWritable value = reader.createValue();
        if (reader.next(key, value)) {
            Shape[] writables = (Shape[]) value.get();
            resultSize += SpatialAlgorithms.SelfJoin_planeSweep(writables, true,
                new OutputCollector<Shape, Shape>() {
                    @Override
                    public void collect(Shape r, Shape s) throws IOException {
                        writer.print(r.toText(temp));
                        writer.print(",");
                        writer.println(s.toText(temp));
                    }
                }, null);
            if (reader.next(key, value)) {
                throw new RuntimeException("Error! Not all values read in one shot.");
            }
        }
        reader.close();
    }
    writer.close();

    return resultSize;
}
From source file:edu.umn.cs.spatialHadoop.operations.Repartition.java
License:Open Source License
public static <S extends Shape> void repartitionLocal(Path inFile, Path outFile, CellInfo[] cells,
        OperationsParams params) throws IOException {
    String sindex = params.get("sindex");
    Shape shape = params.getShape("shape");

    JobConf job = new JobConf(params, Repartition.class);

    ShapeRecordWriter<Shape> writer;
    if (sindex.equals("grid") || sindex.equals("str") || sindex.equals("str+")) {
        writer = new GridRecordWriter<Shape>(outFile, job, null, cells);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        writer = new RTreeGridRecordWriter<Shape>(outFile, job, null, cells);
        writer.setStockObject(shape);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    // Read input file(s)
    FileInputFormat.addInputPath(job, inFile);
    ShapeInputFormat<S> inputFormat = new ShapeInputFormat<S>();
    InputSplit[] splits = inputFormat.getSplits(job, 1);

    for (InputSplit split : splits) {
        ShapeRecordReader<Shape> reader = new ShapeRecordReader<Shape>(params, (FileSplit) split);
        Rectangle c = reader.createKey();
        while (reader.next(c, shape)) {
            if (shape.getMBR() != null)
                writer.write(NullWritable.get(), shape);
        }
        reader.close();
    }
    writer.close(null);
}
From source file:edu.umn.cs.spatialHadoop.operations.Repartition.java
License:Open Source License
/**
 * @param inFile The input raw file that needs to be indexed.
 * @param outputPath The output path where the index will be written.
 * @param params The parameters and configuration of the underlying job
 * @throws IOException If an exception happens while preparing the job.
 * @throws InterruptedException If the underlying MapReduce job was interrupted.
 */
public static void repartition(Path inFile, Path outputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, FileMBR.class);
    FileInputFormat.addInputPath(job, inFile);
    ShapeInputFormat<Shape> inputFormat = new ShapeInputFormat<Shape>();

    boolean autoLocal = inputFormat.getSplits(job, 1).length <= 3;
    boolean isLocal = params.getBoolean("local", autoLocal);

    if (isLocal)
        repartitionLocal(inFile, outputPath, params);
    else
        repartitionMapReduce(inFile, outputPath, null, params);
}
From source file:fm.last.hadoop.programs.labs.trackstats.TrackStatisticsProgram.java
License:Apache License
/**
 * Create a JobConf for a Job that will calculate the number of unique listeners per track.
 *
 * @param inputDir The path to the folder containing the raw listening data files.
 * @return The unique listeners JobConf.
 */
private JobConf getUniqueListenersJobConf(Path inputDir) {
    log.info("Creating configuration for unique listeners Job");

    // output results to a temporary intermediate folder, this will get deleted by start() method
    Path uniqueListenersOutput = new Path("uniqueListeners");

    JobConf conf = new JobConf(TrackStatisticsProgram.class);
    conf.setOutputKeyClass(IntWritable.class); // track id
    conf.setOutputValueClass(IntWritable.class); // number of unique listeners
    conf.setInputFormat(TextInputFormat.class); // raw listening data
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapperClass(UniqueListenersMapper.class);
    conf.setCombinerClass(UniqueListenersCombiner.class);
    conf.setReducerClass(UniqueListenersReducer.class);

    FileInputFormat.addInputPath(conf, inputDir);
    FileOutputFormat.setOutputPath(conf, uniqueListenersOutput);
    conf.setJobName("uniqueListeners");
    return conf;
}
From source file:fm.last.hadoop.programs.labs.trackstats.TrackStatisticsProgram.java
License:Apache License
/**
 * Creates a JobConf for a Job that will sum up the TrackStatistics per track.
 *
 * @param inputDir The path to the folder containing the raw input data files.
 * @return The sum JobConf.
 */
private JobConf getSumJobConf(Path inputDir) {
    log.info("Creating configuration for sum job");

    // output results to a temporary intermediate folder, this will get deleted by start() method
    Path playsOutput = new Path("sum");

    JobConf conf = new JobConf(TrackStatisticsProgram.class);
    conf.setOutputKeyClass(IntWritable.class); // track id
    conf.setOutputValueClass(TrackStats.class); // statistics for a track
    conf.setInputFormat(TextInputFormat.class); // raw listening data
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapperClass(SumMapper.class);
    conf.setCombinerClass(SumReducer.class);
    conf.setReducerClass(SumReducer.class);

    FileInputFormat.addInputPath(conf, inputDir);
    FileOutputFormat.setOutputPath(conf, playsOutput);
    conf.setJobName("sum");
    return conf;
}