List of usage examples for org.apache.hadoop.fs FileSystem get
public static FileSystem get(Configuration conf) throws IOException
From source file:ReadSeqFile.java
License:Open Source License
public static void main(String[] args) throws IOException { String filename = "/tmp/output/part-00000"; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(filename), conf); IndexKey key = new IndexKey(); IndexValue value = new IndexValue(); for (int i = 0; i < 100; i++) { reader.next(key, value);/*from w w w .j ava2 s . c om*/ } }
From source file:ExtractTopPersonalizedPageRankNodes.java
License:Apache License
/** * Runs this tool./*from w ww. j a v a 2 s . c o m*/ */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("top n").create(TOP)); options.addOption(OptionBuilder.withArgName("src").hasArg().withDescription("source node").create(SRC)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(TOP)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = "abc";//cmdline.getOptionValue(OUTPUT); int n = Integer.parseInt(cmdline.getOptionValue(TOP)); //LOG.info("Tool name: " + ExtractTopPersonalizedPageRankNodes.class.getSimpleName()); //LOG.info(" - input: " + inputPath); //LOG.info(" - output: " + outputPath); //LOG.info(" - top: " + n); Configuration conf = getConf(); conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); conf.setInt(TOP_PG, n); Job job = Job.getInstance(conf); job.setJobName(ExtractTopPersonalizedPageRankNodes.class.getName() + ":" + inputPath); job.setJarByClass(ExtractTopPersonalizedPageRankNodes.class); job.setNumReduceTasks(1); FileInputFormat.addInputPath(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(PairOfIntFloat.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(FloatWritable.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(MyMapper.class); job.setPartitionerClass(MyPartitioner.class); job.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. FileSystem.get(conf).delete(new Path(outputPath), true); job.waitForCompletion(true); FileSystem fileSystem = FileSystem.get(conf); Path path = new Path(outputPath + "/part-r-00000"); ; //MapFile.Reader reader = new MapFile.Reader(new Path(outputPath+ "/part-r-00000"),conf); // InputStream fis=new FileInputStream(outputPath+"/part-r-00000"); BufferedReader br = new BufferedReader(new InputStreamReader(fileSystem.open(path))); String s; float key;//=new FloatWritable(); int value;//=new IntWritable(); while ((s = br.readLine()) != null) { String[] sources = s.split("\\s+"); key = Float.parseFloat(sources[0]); value = Integer.parseInt(sources[1]); if (key == 0.0f) { System.out.print("\n" + "Source: " + value + "\n"); } else { System.out.print(String.format("%.5f %d", key, value) + "\n"); } } //reader.close(); br.close(); //while(!SysOut.isEmpty()) //{ // System.out.print(SysOut.poll()); //} return 0; }
From source file:HDFSFileFinder.java
License:Apache License
private static void getBlockLocationsFromHdfs() { StringBuilder sb = new StringBuilder(); Configuration conf = new Configuration(); boolean first = true; // make connection to hdfs try {/*from ww w . jav a 2 s . c o m*/ if (verbose) { writer.println("DEBUG: Trying to connect to " + fsName); } FileSystem fs = FileSystem.get(conf); Path file = new Path(fileName); FileStatus fStatus = fs.getFileStatus(file); status = fStatus; bLocations = fs.getFileBlockLocations(status, 0, status.getLen()); //print out all block locations for (BlockLocation aLocation : bLocations) { String[] names = aLocation.getHosts(); for (String name : names) { InetAddress addr = InetAddress.getByName(name); String host = addr.getHostName(); int idx = host.indexOf('.'); String hostname; if (0 < idx) { hostname = host.substring(0, host.indexOf('.')); } else { hostname = host; } if (first) { sb.append(hostname); first = false; } else { sb.append(",").append(hostname); } } } sb.append(NEWLINE); } catch (IOException e) { writer.println("Error getting block location data from namenode"); e.printStackTrace(); } writer.print(sb.toString()); writer.flush(); }
From source file:BigramRelativeFrequencyJson.java
License:Apache License
/** * Runs this tool.//from w ww .j ava 2 s . c o m */ public int run(String[] args) throws Exception { if (args.length != 3) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; int reduceTasks = Integer.parseInt(args[2]); LOG.info("Tool name: " + BigramRelativeFrequencyJson.class.getSimpleName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - num reducers: " + reduceTasks); Job job = Job.getInstance(getConf()); job.setJobName(BigramRelativeFrequencyJson.class.getSimpleName()); job.setJarByClass(BigramRelativeFrequencyJson.class); job.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setMapOutputKeyClass(MyTuple.class); job.setMapOutputValueClass(FloatWritable.class); job.setOutputKeyClass(MyTuple.class); job.setOutputValueClass(FloatWritable.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyCombiner.class); job.setReducerClass(MyReducer.class); job.setPartitionerClass(MyPartitioner.class); // Delete the output directory if it exists already. Path outputDir = new Path(outputPath); FileSystem.get(getConf()).delete(outputDir, true); long startTime = System.currentTimeMillis(); job.waitForCompletion(true); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:PairsPMI_M.java
License:Apache License
/** * Runs this tool.// w w w . jav a 2 s .com */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers") .create(NUM_REDUCERS)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } // First MapReduce Job String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1; LOG.info("Tool name: " + PairsPMI_M.class.getSimpleName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - tmp path: " + outputPath + "/tmp"); LOG.info(" - num reducers: " + reduceTasks); Job job = Job.getInstance(getConf()); job.setJobName(PairsPMI_M.class.getSimpleName()); job.setJarByClass(PairsPMI_M.class); // Delete the tmp directory if it exists already Path tmpDir = new Path("tmp_wj"); FileSystem.get(getConf()).delete(tmpDir, true); job.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path("tmp_wj")); job.setMapOutputKeyClass(PairOfStrings.class); job.setMapOutputValueClass(FloatWritable.class); job.setOutputKeyClass(PairOfStrings.class); job.setOutputValueClass(FloatWritable.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); // job.setOutputFormatClass(TextOutputFormat.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyCombiner.class); job.setReducerClass(MyReducer.class); job.setPartitionerClass(MyPartitioner.class); long startTime = System.currentTimeMillis(); job.waitForCompletion(true); double time1 = (System.currentTimeMillis() - startTime) / 1000.0; System.out.println("Job Finished in " + time1 + " seconds"); numRecords = job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS") .getValue(); /* * Second MapReduce Job */ LOG.info("Tool name: " + PairsPMI_M.class.getSimpleName()); LOG.info("second stage of MapReduce"); LOG.info(" - input from tmp path: " + outputPath + "/tmp_wj"); LOG.info(" - output path: " + outputPath); LOG.info(" - num reducers: " + reduceTasks); // set the global variable Configuration conf = getConf(); conf.setLong("numRec", numRecords); job = Job.getInstance(getConf()); job.setJobName(PairsPMI_M.class.getSimpleName()); job.setJarByClass(PairsPMI_M.class); // Delete the output directory if it exists already Path outputDir = new Path(outputPath); FileSystem.get(getConf()).delete(outputDir, true); job.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(job, new Path("tmp_wj/part*")); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setMapOutputKeyClass(PairOfStrings.class); job.setMapOutputValueClass(FloatWritable.class); // job.setOutputKeyClass(PairOfStrings.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(FloatWritable.class); job.setInputFormatClass(SequenceFileInputFormat.class); // job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapperClass(MyMapperSecond.class); // job.setCombinerClass(MyCombiner.class); job.setReducerClass(MyReducerSecond.class); job.setPartitionerClass(MyPartitioner.class); startTime = System.currentTimeMillis(); job.waitForCompletion(true); double time2 = (System.currentTimeMillis() - startTime) / 1000.0; System.out.println("Second job finished in " + time2 + " seconds"); System.out.println("Total time: " + (time1 + time2) + " seconds"); return 0; }
From source file:RepackWikipedia.java
License:Apache License
@SuppressWarnings("static-access") @Override//from w w w . j a va 2 s.c om public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output location") .create(OUTPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("mapping file") .create(MAPPING_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("block|record|none").hasArg() .withDescription("compression type").create(COMPRESSION_TYPE_OPTION)); options.addOption(OptionBuilder.withArgName("en|sv|de").hasArg().withDescription("two-letter language code") .create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(MAPPING_FILE_OPTION) || !cmdline.hasOption(COMPRESSION_TYPE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT_OPTION); String outputPath = cmdline.getOptionValue(OUTPUT_OPTION); String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION); String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION); if (!"block".equals(compressionType) && !"record".equals(compressionType) && !"none".equals(compressionType)) { System.err.println("Error: \"" + compressionType + "\" unknown compression type!"); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (language.length() != 2) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } // this is the default block size int blocksize = 1000000; //Job job = Job.getInstance(getConf()); JobConf conf = new JobConf(RepackWikipedia.class); conf.setJarByClass(RepackWikipedia.class); conf.setJobName(String.format("RepackWikipedia[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, OUTPUT_OPTION, outputPath, COMPRESSION_TYPE_OPTION, compressionType, LANGUAGE_OPTION, language)); conf.set(DOCNO_MAPPING_FIELD, mappingFile); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - XML dump file: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - docno mapping data file: " + mappingFile); LOG.info(" - compression type: " + compressionType); LOG.info(" - language: " + language); if ("block".equals(compressionType)) { LOG.info(" - block size: " + blocksize); } conf.setNumReduceTasks(0); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); if ("none".equals(compressionType)) { FileOutputFormat.setCompressOutput(conf, false); } else { FileOutputFormat.setCompressOutput(conf, true); if ("record".equals(compressionType)) { SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD); } else { SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK); conf.setInt("io.seqfile.compress.blocksize", blocksize); } } if (language != null) { conf.set("wiki.language", language); } conf.setInputFormat(WikipediaPageInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(WikipediaPage.class); conf.setMapperClass(MyMapper.class); // Delete the output directory if it exists already. FileSystem.get(getConf()).delete(new Path(outputPath), true); //job.waitForCompletion(true); JobClient.runJob(conf); return 0; }
From source file:BP.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 10) { for (int i = 0; i < args.length; i++) { System.out.println("Args: " + i + " " + args[i]); }/*from w w w . j a va 2s .c om*/ System.out.println(args.length); return printUsage(); } edge_path = new Path(args[0]); prior_path = new Path(args[1]); output_path = new Path(args[2]); number_msg = Long.parseLong(args[3]); nreducer = Integer.parseInt(args[4]); nreducer = 1; max_iter = Integer.parseInt(args[5]); nstate = Integer.parseInt(args[7]); edge_potential_str = read_edge_potential(args[8]); int cur_iter = 1; if (args[9].startsWith("new") == false) { cur_iter = Integer.parseInt(args[9].substring(4)); } System.out.println("edge_path=" + edge_path.toString() + ", prior_path=" + prior_path.toString() + ", output_path=" + output_path.toString() + ", |E|=" + number_msg + ", nreducer=" + nreducer + ", maxiter=" + max_iter + ", nstate=" + nstate + ", edge_potential_str=" + edge_potential_str + ", cur_iter=" + cur_iter); fs = FileSystem.get(getConf()); // Run Stage1 and Stage2. if (cur_iter == 1) { System.out.println("BP: Initializing messages..."); JobClient.runJob(configInitMessage()); } double converge_threshold = number_msg * EPS * nstate; int i; for (i = cur_iter; i <= max_iter; i++) { System.out.println(" *** ITERATION " + (i) + "/" + max_iter + " ***"); JobClient.runJob(configUpdateMessage()); JobClient.runJob(configCheckErr()); JobClient.runJob(configSumErr()); String line = readLocaldirOneline(sum_error_path.toString()); fs.delete(check_error_path, true); fs.delete(sum_error_path, true); String[] parts = line.split("\t"); int n = Integer.parseInt(parts[0]); double sum = Double.parseDouble(parts[1]); System.out.println("Converged Msg: " + (number_msg - n)); System.out.println("Sum Error: " + sum); if (sum < converge_threshold) { break; } // rotate directory fs.delete(message_cur_path); fs.rename(message_next_path, message_cur_path); } System.out.println("CONVERGE_ITER " + i); System.out.println("BP: Computing beliefs..."); JobClient.runJob(configComputeBelief()); System.out.println("BP finished. The belief vector is in the HDFS " + args[2]); return 0; }
From source file:HoodieJavaStreamingApp.java
License:Apache License
/** * * @throws Exception/*from w w w. j av a2 s . c o m*/ */ public void run() throws Exception { // Spark session setup.. SparkSession spark = SparkSession.builder().appName("Hoodie Spark Streaming APP") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]") .getOrCreate(); JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext()); // folder path clean up and creation, preparing the environment FileSystem fs = FileSystem.get(jssc.hadoopConfiguration()); fs.delete(new Path(streamingSourcePath), true); fs.delete(new Path(streamingCheckpointingPath), true); fs.delete(new Path(tablePath), true); fs.mkdirs(new Path(streamingSourcePath)); // Generator of some records to be loaded in. HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); List<String> records1 = DataSourceTestUtils.convertToStringList(dataGen.generateInserts("001", 100)); Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2)); List<String> records2 = DataSourceTestUtils.convertToStringList(dataGen.generateUpdates("002", 100)); Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2)); // setup the input for streaming Dataset<Row> streamingInput = spark.readStream().schema(inputDF1.schema()).json(streamingSourcePath); // start streaming and showing ExecutorService executor = Executors.newFixedThreadPool(2); // thread for spark strucutured streaming Future<Void> streamFuture = executor.submit(new Callable<Void>() { public Void call() throws Exception { logger.info("===== Streaming Starting ====="); stream(streamingInput); logger.info("===== Streaming Ends ====="); return null; } }); // thread for adding data to the streaming source and showing results over time Future<Void> showFuture = executor.submit(new Callable<Void>() { public Void call() throws Exception { logger.info("===== Showing Starting ====="); show(spark, fs, inputDF1, inputDF2); logger.info("===== Showing Ends ====="); return null; } }); // let the threads run streamFuture.get(); showFuture.get(); executor.shutdown(); }
From source file:ComputeCooccurrenceMatrixStripes.java
License:Apache License
/** * Runs this tool./* ww w .jav a2 s .com*/ */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers") .create(NUM_REDUCERS)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1; int window = cmdline.hasOption(WINDOW) ? Integer.parseInt(cmdline.getOptionValue(WINDOW)) : 2; LOG.info("Tool: " + ComputeCooccurrenceMatrixStripes.class.getSimpleName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - window: " + window); LOG.info(" - number of reducers: " + reduceTasks); Job job = Job.getInstance(getConf()); job.setJobName(ComputeCooccurrenceMatrixStripes.class.getSimpleName()); job.setJarByClass(ComputeCooccurrenceMatrixStripes.class); // Delete the output directory if it exists already. Path outputDir = new Path(outputPath); FileSystem.get(getConf()).delete(outputDir, true); job.getConfiguration().setInt("window", window); job.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setMapOutputKeyClass(Text.class); job.setOutputValueClass(String2IntOpenHashMapWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(String2IntOpenHashMapWritable.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyReducer.class); job.setReducerClass(MyReducer.class); long startTime = System.currentTimeMillis(); job.waitForCompletion(true); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:BuildPageRankRecords.java
License:Apache License
/** * Runs this tool.//from www. ja v a 2 s .c om */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption( OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int n = Integer.parseInt(cmdline.getOptionValue(NUM_NODES)); LOG.info("Tool name: " + BuildPageRankRecords.class.getSimpleName()); LOG.info(" - inputDir: " + inputPath); LOG.info(" - outputDir: " + outputPath); LOG.info(" - numNodes: " + n); Configuration conf = getConf(); conf.setInt(NODE_CNT_FIELD, n); conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); Job job = Job.getInstance(conf); job.setJobName(BuildPageRankRecords.class.getSimpleName() + ":" + inputPath); job.setJarByClass(BuildPageRankRecords.class); job.setNumReduceTasks(0); FileInputFormat.addInputPath(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(PageRankNode.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(PageRankNode.class); job.setMapperClass(MyMapper.class); // Delete the output directory if it exists already. FileSystem.get(conf).delete(new Path(outputPath), true); job.waitForCompletion(true); return 0; }