Example usage for org.apache.hadoop.mapreduce Job getInstance

List of usage examples for org.apache.hadoop.mapreduce Job getInstance

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job getInstance.

Prototype

@Deprecated
public static Job getInstance(Cluster ignored) throws IOException 

Source Link

Document

Creates a new Job with no particular Cluster .

Usage

From source file:edu.umd.cloud9.collection.wikipedia.ExtractWikipediaAnchorTextWithWindow.java

License:Apache License

private void task2(String inputPath, String outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    LOG.info("Exracting anchor text (phase 2)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Job job = Job.getInstance(getConf());
    job.setJarByClass(ExtractWikipediaAnchorTextWithWindow.class);
    job.setJobName(//from w w  w .j  a v a2 s. c  o m
            String.format("ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

    // Gathers everything together for convenience; feasible for Wikipedia.
    job.setNumReduceTasks(1);

    // increase heap
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");

    job.getConfiguration().set("mapreduce.map.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.reduce.memory.mb", "6144");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Xmx6144m");
    job.getConfiguration().set("mapreduce.reduce.java.opts", "-Xmx6144m");
    job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(HMapSIW.class);

    job.setMapperClass(MyMapper2.class);
    job.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    // Clean up intermediate data.
    FileSystem.get(job.getConfiguration()).delete(new Path(inputPath), true);
}

From source file:edu.umd.cloud9.collection.wikipedia.RepackWikipedia.java

License:Apache License

@SuppressWarnings("static-access")
@Override/*  ww w  . j  a v  a  2s  . c om*/
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output location")
            .create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("mapping file")
            .create(MAPPING_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("block|record|none").hasArg()
            .withDescription("compression type").create(COMPRESSION_TYPE_OPTION));
    options.addOption(OptionBuilder
            .withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr")
            .hasArg().withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(MAPPING_FILE_OPTION) || !cmdline.hasOption(COMPRESSION_TYPE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION);
    String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION);

    if (!"block".equals(compressionType) && !"record".equals(compressionType)
            && !"none".equals(compressionType)) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (!(language.length() == 2 || language.length() == 6)) {
            // Added length check for 6 to include languages like zh_yue
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    // this is the default block size
    int blocksize = 1000000;

    Job job = Job.getInstance(getConf());
    job.setJarByClass(RepackWikipedia.class);
    job.setJobName(String.format("RepackWikipedia[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, COMPRESSION_TYPE_OPTION, compressionType, LANGUAGE_OPTION, language));

    job.getConfiguration().set(DOCNO_MAPPING_FIELD, mappingFile);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - XML dump file: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - docno mapping data file: " + mappingFile);
    LOG.info(" - compression type: " + compressionType);
    LOG.info(" - language: " + language);

    if ("block".equals(compressionType)) {
        LOG.info(" - block size: " + blocksize);
    }

    job.setNumReduceTasks(0);

    SequenceFileInputFormat.addInputPath(job, new Path(inputPath));
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));

    if ("none".equals(compressionType)) {
        SequenceFileOutputFormat.setCompressOutput(job, false);
    } else {
        SequenceFileOutputFormat.setCompressOutput(job, true);

        if ("record".equals(compressionType)) {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
            job.getConfiguration().setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }

    job.setInputFormatClass(WikipediaPageInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(WikipediaPageFactory.getWikipediaPageClass(language));

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file:edu.umd.cloud9.collection.wikipedia.WikipediaDocnoMappingBuilder.java

License:Apache License

@SuppressWarnings("static-access")
@Override//from   w w  w  .  ja v  a  2 s. com
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file")
            .create(OUTPUT_FILE_OPTION));
    options.addOption(OptionBuilder
            .withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr")
            .hasArg().withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION));
    options.addOption(KEEP_ALL_OPTION, false, "keep all pages");

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (!(language.length() == 2 || language.length() == 6)) {
            // Added length check for 6 to include languages like zh_yue
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputFile = cmdline.getOptionValue(OUTPUT_FILE_OPTION);
    boolean keepAll = cmdline.hasOption(KEEP_ALL_OPTION);

    String tmpPath = "tmp-" + WikipediaDocnoMappingBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output file: " + outputFile);
    LOG.info(" - keep all pages: " + keepAll);
    LOG.info(" - language: " + language);

    Job job = Job.getInstance(getConf());
    job.setJarByClass(WikipediaDocnoMappingBuilder.class);
    job.setJobName(String.format("BuildWikipediaDocnoMapping[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_FILE_OPTION, outputFile, LANGUAGE_OPTION, language));

    job.getConfiguration().setBoolean(KEEP_ALL_OPTION, keepAll);
    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(WikipediaPageInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(tmpPath), true);

    job.waitForCompletion(true);
    long cnt = keepAll ? job.getCounters().findCounter(PageTypes.TOTAL).getValue()
            : job.getCounters().findCounter(PageTypes.ARTICLE).getValue();

    WikipediaDocnoMapping.writeDocnoMappingData(FileSystem.get(getConf()), tmpPath + "/part-r-00000", (int) cnt,
            outputFile);

    FileSystem.get(getConf()).delete(new Path(tmpPath), true);

    return 0;
}

From source file:edu.umd.cloud9.example.bfs.EncodeBfsGraph.java

License:Apache License

@SuppressWarnings("static-access")
@Override/*  ww  w.j  a  va 2s  .  com*/
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("nodeid").hasArg().withDescription("source node").create(SRC_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(SRC_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    int src = Integer.parseInt(cmdline.getOptionValue(SRC_OPTION));

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - src: " + src);

    Job job = Job.getInstance(getConf());
    job.setJobName(String.format("EncodeBfsGraph[%s: %s, %s: %s, %s: %d]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, SRC_OPTION, src));
    job.setJarByClass(EncodeBfsGraph.class);

    job.setNumReduceTasks(0);

    job.getConfiguration().setInt(SRC_OPTION, src);
    job.getConfiguration().setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(BfsNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(BfsNode.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file:edu.umd.cloud9.example.bfs.FindNodeAtDistance.java

License:Apache License

@SuppressWarnings("static-access")
@Override//  w ww .j  av  a2s .  c  om
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("distance").create(DISTANCE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(DISTANCE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    int distance = Integer.parseInt(cmdline.getOptionValue(DISTANCE_OPTION));

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - distance: " + distance);

    Job job = Job.getInstance(getConf());
    job.setJobName(String.format("FindNodeAtDistance[%s: %s, %s: %s, %s: %d]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, DISTANCE_OPTION, distance));
    job.setJarByClass(FindNodeAtDistance.class);

    job.setNumReduceTasks(0);

    job.getConfiguration().setInt(DISTANCE_OPTION, distance);
    job.getConfiguration().setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(BfsNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(BfsNode.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file:edu.umd.cloud9.example.bfs.FindReachableNodes.java

License:Apache License

@SuppressWarnings("static-access")
@Override/* w  ww .  j  a v a  2s. c  om*/
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);

    Job job = Job.getInstance(getConf());
    job.setJobName(String.format("FindReachableNodes:[%s: %s, %s: %s]", INPUT_OPTION, inputPath, OUTPUT_OPTION,
            outputPath));
    job.setJarByClass(FindReachableNodes.class);

    job.setNumReduceTasks(0);

    job.getConfiguration().setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(BfsNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(BfsNode.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file:edu.umd.cloud9.example.bfs.IterateBfs.java

License:Apache License

@SuppressWarnings("static-access")
@Override/*from  w ww .ja  v a2 s .  c om*/
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of partitions")
            .create(NUM_PARTITIONS_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(NUM_PARTITIONS_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    int n = Integer.parseInt(cmdline.getOptionValue(NUM_PARTITIONS_OPTION));

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - numPartitions: " + n);

    getConf().set("mapred.child.java.opts", "-Xmx2048m");

    Job job = Job.getInstance(getConf());
    job.setJobName(String.format("IterateBfs[%s: %s, %s: %s, %s: %d]", INPUT_OPTION, inputPath, OUTPUT_OPTION,
            outputPath, NUM_PARTITIONS_OPTION, n));
    job.setJarByClass(EncodeBfsGraph.class);

    job.setNumReduceTasks(n);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(BfsNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(BfsNode.class);

    job.setMapperClass(MapClass.class);
    job.setReducerClass(ReduceClass.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file:edu.umd.cloud9.example.bigram.BigramCount.java

License:Apache License

/**
 * Runs this tool.//w w w  . j  av a2s .  co m
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool name: " + BigramCount.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - num reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(BigramCount.class.getSimpleName());
    job.setJarByClass(BigramCount.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:edu.umd.cloud9.example.bigram.BigramRelativeFrequency.java

License:Apache License

/**
 * Runs this tool.//from   w w  w .  ja v a 2  s . c  o m
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool name: " + BigramRelativeFrequency.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - num reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(BigramRelativeFrequency.class.getSimpleName());
    job.setJarByClass(BigramRelativeFrequency.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setMapOutputKeyClass(PairOfStrings.class);
    job.setMapOutputValueClass(FloatWritable.class);
    job.setOutputKeyClass(PairOfStrings.class);
    job.setOutputValueClass(FloatWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:edu.umd.cloud9.example.bigram.BigramRelativeFrequencyJson.java

License:Apache License

/**
 * Runs this tool.//from   w w  w  .  j  a va 2 s  .c  om
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool name: " + BigramRelativeFrequencyJson.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - num reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(BigramRelativeFrequencyJson.class.getSimpleName());
    job.setJarByClass(BigramRelativeFrequencyJson.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setMapOutputKeyClass(MyTuple.class);
    job.setMapOutputValueClass(FloatWritable.class);
    job.setOutputKeyClass(MyTuple.class);
    job.setOutputValueClass(FloatWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}