List of usage examples for org.apache.hadoop.mapreduce.Job.getInstance

@Deprecated public static Job getInstance(Cluster ignored) throws IOException

Note: the Cluster-based overload shown above is deprecated; the examples below all use the non-deprecated overload Job.getInstance(Configuration conf), which creates a new Job from the given Configuration.
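For quick reference, here is a minimal driver sketch of the pattern the examples below share. This is not taken from any of the source files on this page: IdentityJobDriver is a placeholder class name, and the job uses the identity Mapper and Reducer base classes with the default TextInputFormat so that the sketch is self-contained and runnable.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IdentityJobDriver { // placeholder name, for illustration only
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Non-deprecated overload: creates a new Job from the given Configuration.
    Job job = Job.getInstance(conf);
    job.setJarByClass(IdentityJobDriver.class);
    job.setMapperClass(Mapper.class);   // identity mapper (base class)
    job.setReducerClass(Reducer.class); // identity reducer (base class)
    // Output types match what the identity mapper emits under TextInputFormat.
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    // Block until the job finishes; exit non-zero on failure.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}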
From source file: edu.gslis.ts.hadoop.ThriftDumper.java
License: Apache License

public int run(String[] args) throws Exception {
  String inputPath = args[0];
  String outputPath = args[1];
  Path topicsFile = new Path(args[2]);
  Path vocabFile = new Path(args[3]);

  Configuration config = getConf();
  Job job = Job.getInstance(config);
  job.setJarByClass(ThriftDumper.class);
  job.setInputFormatClass(ThriftFileInputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setReducerClass(ThriftDumperReducer.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.addInputPath(job, new Path(inputPath));
  FileInputFormat.setInputDirRecursive(job, true);
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.addCacheFile(topicsFile.toUri());
  job.addCacheFile(vocabFile.toUri());

  job.setMapperClass(ThriftDumperMapper.class);

  boolean b = job.waitForCompletion(true);
  if (!b) {
    throw new IOException("error with job");
  }
  return 0;
}
From source file: edu.gslis.ts.hadoop.ThriftRMScorerHbaseMR.java
License: Apache License

public int run(String[] args) throws Exception {
  String tableName = args[0];
  Path topicsFile = new Path(args[1]);
  Path vocabFile = new Path(args[2]);
  Path outputPath = new Path(args[3]);
  Path stoplist = new Path(args[4]);
  // String queryId = args[1];

  Configuration config = HBaseConfiguration.create(getConf());
  config.set("hbase.table.name", tableName);

  Job job = Job.getInstance(config);
  job.setJarByClass(ThriftRMScorerHbaseMR.class);

  Scan scan = new Scan();
  scan.setCaching(500);
  scan.setCacheBlocks(false);
  /*
  Filter prefixFilter = new PrefixFilter(Bytes.toBytes(queryId));
  scan.setFilter(prefixFilter);
  */

  TableMapReduceUtil.initTableMapperJob(tableName, scan, ThriftTableMapper.class,
      IntWritable.class, // mapper output key
      Text.class,        // mapper output value
      job);

  job.setReducerClass(ThriftTableReducer.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  job.addCacheFile(topicsFile.toUri());
  job.addCacheFile(vocabFile.toUri());
  job.addCacheFile(stoplist.toUri());

  FileOutputFormat.setOutputPath(job, outputPath);

  boolean b = job.waitForCompletion(true);
  if (!b) {
    throw new IOException("error with job!");
  }
  return 0;
}
From source file: edu.gslis.ts.hadoop.ThriftSentenceScorerHbase.java
License: Apache License

public int run(String[] args) throws Exception {
  String tableName = args[0];
  Path topicsFile = new Path(args[1]);
  Path vocabFile = new Path(args[2]);
  Path outputPath = new Path(args[3]);
  // String queryId = args[1];

  Configuration config = HBaseConfiguration.create(getConf());
  Job job = Job.getInstance(config);
  job.setJarByClass(ThriftSentenceScorerHbase.class);

  Scan scan = new Scan();
  scan.setCaching(500);
  scan.setCacheBlocks(false);
  /*
  Filter prefixFilter = new PrefixFilter(Bytes.toBytes(queryId));
  scan.setFilter(prefixFilter);
  */

  TableMapReduceUtil.initTableMapperJob(tableName, scan, ThriftTableMapper.class,
      Text.class, // mapper output key
      Text.class, // mapper output value
      job);

  job.addCacheFile(topicsFile.toUri());
  job.addCacheFile(vocabFile.toUri());

  FileOutputFormat.setOutputPath(job, outputPath);

  boolean b = job.waitForCompletion(true);
  if (!b) {
    throw new IOException("error with job!");
  }
  return 0;
}
From source file: edu.gslis.ts.hadoop.ThriftWordCount.java
License: Apache License

public int run(String[] args) throws Exception {
  String inputPath = args[0];
  Path outputPath = new Path(args[1]);

  Job job = Job.getInstance(getConf());
  job.setJarByClass(ThriftWordCount.class);
  job.setInputFormatClass(ThriftFileInputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setReducerClass(ThriftWordCountReducer.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.addInputPath(job, new Path(inputPath));
  FileInputFormat.setInputDirRecursive(job, true);
  FileOutputFormat.setOutputPath(job, outputPath);

  job.setMapperClass(ThriftWordCountMapper.class);

  boolean b = job.waitForCompletion(true);
  if (!b) {
    throw new IOException("error with job");
  }
  return 0;
}
From source file: edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java
License: Apache License

private String phase1(String inputPath, int reduceNo, String lang)
    throws IOException, InterruptedException, ClassNotFoundException {
  String output = "tmp/wiki-link/phase1";

  Job job = Job.getInstance(getConf());
  job.setJobName("Build Wikipedia Weighted Link Graph. Phase 1");
  job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
  job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
  job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
  job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
  job.setNumReduceTasks(reduceNo);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(output));

  if ("en".equals(lang)) {
    job.setInputFormatClass(WikipediaPageInputFormat.class);
  } else {
    throw new InterruptedException("Wikipedia dump with language " + lang + " is not supported");
  }

  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(PairOfStringInt.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(PairOfStringInt.class);

  job.setMapperClass(LinkEmitMapClass.class);
  job.setReducerClass(RedirectResolveReduceClass.class);

  job.waitForCompletion(true);
  return output;
}
From source file: edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java
License: Apache License

private String phase2(String inputPath, int reduceNo)
    throws IOException, InterruptedException, ClassNotFoundException {
  String output = "tmp/wiki-link/phase2";

  Job job = Job.getInstance(getConf());
  job.setJobName("Build Wikipedia Weighted Link Graph. Phase 2");
  job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
  job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
  job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
  job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
  job.setNumReduceTasks(reduceNo);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(output));

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(PairOfStringInt.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(PairOfStringInt.class);

  job.setReducerClass(DestinationIdResolveReduceClass.class);

  job.waitForCompletion(true);
  return output;
}
From source file: edu.umd.cloud9.collection.wikipedia.BuildWikipediaWeightedLinkGraph.java
License: Apache License

private String phase3(String inputPath, int reduceNo)
    throws IOException, InterruptedException, ClassNotFoundException {
  String output = "trace/phase3";

  Job job = Job.getInstance(getConf());
  job.setJobName("Build Wikipedia Weighted Link Graph. Phase 3");
  job.setJarByClass(BuildWikipediaWeightedLinkGraph.class);
  job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
  job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
  job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
  job.setNumReduceTasks(reduceNo);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(output));

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(PairOfStringInt.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  job.setReducerClass(SourceIdResolveReduceClass.class);

  job.waitForCompletion(true);
  return output;
}
From source file: edu.umd.cloud9.collection.wikipedia.CountWikipediaPages.java
License: Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("XML dump file").create(INPUT_OPTION));
  options.addOption(OptionBuilder
      .withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr")
      .hasArg().withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  if (!cmdline.hasOption(INPUT_OPTION)) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  String language = "en"; // Assume 'en' by default.
  if (cmdline.hasOption(LANGUAGE_OPTION)) {
    language = cmdline.getOptionValue(LANGUAGE_OPTION);
    if (!(language.length() == 2 || language.length() == 6)) {
      System.err.println("Error: \"" + language + "\" unknown language!");
      return -1;
    }
  }

  String inputPath = cmdline.getOptionValue(INPUT_OPTION);

  LOG.info("Tool name: " + this.getClass().getName());
  LOG.info(" - XML dump file: " + inputPath);
  LOG.info(" - language: " + language);

  Job job = Job.getInstance(getConf());
  job.setJarByClass(CountWikipediaPages.class);
  job.setJobName(String.format("CountWikipediaPages[%s: %s, %s: %s]", INPUT_OPTION, inputPath,
      LANGUAGE_OPTION, language));

  job.setNumReduceTasks(0);

  FileInputFormat.setInputPaths(job, new Path(inputPath));

  if (language != null) {
    job.getConfiguration().set("wiki.language", language);
  }

  job.setInputFormatClass(WikipediaPageInputFormat.class);
  job.setOutputFormatClass(NullOutputFormat.class);

  job.setMapperClass(MyMapper.class);

  job.waitForCompletion(true);
  return 0;
}
From source file: edu.umd.cloud9.collection.wikipedia.DumpWikipediaToPlainText.java
License: Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("XML dump file").create(INPUT_OPTION));
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("output path").create(OUTPUT_OPTION));
  options.addOption(OptionBuilder
      .withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr")
      .hasArg().withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION));
  options.addOption(OptionBuilder.withArgName("TEXT|HTML|WIKI").hasArg()
      .withDescription("Output Content Type TEXT, HTML, WIKI").create(CONTENT_FORMAT_OPTION));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  String language = "en"; // Assume "en" by default.
  if (cmdline.hasOption(LANGUAGE_OPTION)) {
    language = cmdline.getOptionValue(LANGUAGE_OPTION);
    if (!(language.length() == 2 || language.length() == 6)) {
      System.err.println("Error: \"" + language + "\" unknown language!");
      return -1;
    }
  }

  String contentFormat = "TEXT"; // Assume "TEXT" by default.
  if (cmdline.hasOption(CONTENT_FORMAT_OPTION)) {
    contentFormat = cmdline.getOptionValue(CONTENT_FORMAT_OPTION);
    if (!contentFormat.equals("TEXT") && !contentFormat.equals("HTML") && !contentFormat.equals("WIKI")) {
      System.err.println("Error: \"" + contentFormat + "\" unknown content type!");
      return -1;
    }
  }

  String inputPath = cmdline.getOptionValue(INPUT_OPTION);
  String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);

  LOG.info("Tool name: " + this.getClass().getName());
  LOG.info(" - XML dump file: " + inputPath);
  LOG.info(" - output path : " + outputPath);
  LOG.info(" - language : " + language);
  LOG.info(" - content_type : " + contentFormat);

  Configuration conf = getConf();
  Job job = Job.getInstance(conf);
  job.setJarByClass(DumpWikipediaToPlainText.class);
  job.setJobName(String.format("DumpWikipediaToPlainText[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION,
      inputPath, OUTPUT_OPTION, outputPath, LANGUAGE_OPTION, language, CONTENT_FORMAT_OPTION, contentFormat));

  job.setNumReduceTasks(0);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  if (language != null) {
    job.getConfiguration().set("wiki.language", language);
  }
  if (contentFormat != null) {
    job.getConfiguration().set("wiki.content_format", contentFormat);
  }

  job.setInputFormatClass(WikipediaPageInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  job.setMapperClass(MyMapper.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  // Delete the output directory if it exists already.
  FileSystem.get(getConf()).delete(new Path(outputPath), true);

  job.waitForCompletion(true);
  return 0;
}
From source file: edu.umd.cloud9.collection.wikipedia.ExtractWikipediaAnchorTextWithWindow.java
License: Apache License

private void task1(String inputPath, String outputPath)
    throws IOException, ClassNotFoundException, InterruptedException {
  LOG.info("Extracting anchor text (phase 1)...");
  LOG.info(" - input: " + inputPath);
  LOG.info(" - output: " + outputPath);

  Job job = Job.getInstance(getConf());
  job.setJarByClass(ExtractWikipediaAnchorTextWithWindow.class);
  job.setJobName(
      String.format("ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath));

  // 10 reducers is reasonable.
  job.setNumReduceTasks(10);

  // Increase heap.
  job.getConfiguration().set("mapreduce.job.user.classpath.first", "true");
  job.getConfiguration().set("mapreduce.map.memory.mb", "6144");
  job.getConfiguration().set("mapreduce.reduce.memory.mb", "6144");
  job.getConfiguration().set("mapreduce.map.java.opts", "-Xmx6144m");
  job.getConfiguration().set("mapreduce.reduce.java.opts", "-Xmx6144m");

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.setInputFormatClass(SequenceFileInputFormat.class);
  // job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  job.setMapOutputKeyClass(PairOfStringInt.class);
  job.setMapOutputValueClass(PairOfStrings.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(PairOfIntString.class);

  job.setMapperClass(MyMapper1.class);
  job.setReducerClass(MyReducer1.class);
  job.setPartitionerClass(MyPartitioner1.class);

  // Delete the output directory if it exists already.
  FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

  job.waitForCompletion(true);
}