List of usage examples for org.apache.hadoop.mapreduce Job getInstance
@Deprecated public static Job getInstance(Cluster ignored, Configuration conf) throws IOException
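The overload above, which takes a Cluster argument, is deprecated; every example below uses the non-deprecated Job.getInstance(Configuration, String) form instead. As a minimal driver sketch (the class names ExampleDriver, ExampleMapper, ExampleReducer and the argument paths are placeholders, not taken from any of the source files listed below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver showing the typical Job.getInstance(conf, jobName) pattern.
public class ExampleDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Preferred overload: Configuration plus a human-readable job name.
        Job job = Job.getInstance(conf, "example job");
        job.setJarByClass(ExampleDriver.class);
        job.setMapperClass(ExampleMapper.class);     // placeholder mapper
        job.setReducerClass(ExampleReducer.class);   // placeholder reducer
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}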
From source file:com.yassergonzalez.pagerank.PageRank.java
License:Apache License
private void pageRankIteration(int iter, Configuration conf, Path outputDir) throws Exception {
    // This job performs an iteration of the power iteration method to
    // compute PageRank. The map task processes each block M_{i,j}, loads
    // the corresponding stripe j of the vector v_{k-1} and produces the
    // partial result of the stripe i of the vector v_k. The reduce task
    // sums all the partial results of v_k and adds the teleportation factor
    // (the combiner only sums all the partial results). See Section 5.2
    // (and 5.2.3 in particular) of Mining of Massive Datasets
    // (http://infolab.stanford.edu/~ullman/mmds.html) for details. The
    // output is written in a "vk" subdir of the output dir, where k is the
    // iteration number. MapFileOutputFormat is used to keep an array of the
    // stripes of v.

    Job job = Job.getInstance(conf, "PageRank:Iteration");
    job.setJarByClass(PageRank.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(PageRankIterationMapper.class);
    job.setMapOutputKeyClass(ShortWritable.class);
    job.setMapOutputValueClass(FloatArrayWritable.class);
    job.setCombinerClass(PageRankIterationCombiner.class);
    job.setReducerClass(PageRankIterationReducer.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(ShortWritable.class);
    job.setOutputValueClass(FloatArrayWritable.class);

    FileInputFormat.addInputPath(job, new Path(outputDir, "M"));
    FileOutputFormat.setOutputPath(job, new Path(outputDir, "v" + iter));

    job.waitForCompletion(true);
}
From source file:com.zjy.mongo.util.MongoTool.java
License:Apache License
private int runMapReduceJob(final Configuration conf) throws IOException {
    final Job job = Job.getInstance(conf, getJobName());
    /**
     * Any arguments specified with -D <property>=<value>
     * on the CLI will be picked up and set here
     * They override any XML level values
     * Note that -D<space> is important - no space will
     * not work as it gets picked up by Java itself
     */
    // TODO - Do we need to set job name somehow more specifically?
    //        This may or may not be correct/sane
    job.setJarByClass(getClass());
    final Class<? extends Mapper> mapper = MongoConfigUtil.getMapper(conf);

    if (LOG.isDebugEnabled()) {
        LOG.debug("Mapper Class: " + mapper);
        LOG.debug("Input URI: " + conf.get(MongoConfigUtil.INPUT_URI));
    }
    job.setMapperClass(mapper);
    Class<? extends Reducer> combiner = MongoConfigUtil.getCombiner(conf);
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }
    job.setReducerClass(MongoConfigUtil.getReducer(conf));

    job.setOutputFormatClass(MongoConfigUtil.getOutputFormat(conf));
    job.setOutputKeyClass(MongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MongoConfigUtil.getOutputValue(conf));
    job.setInputFormatClass(MongoConfigUtil.getInputFormat(conf));

    Class mapOutputKeyClass = MongoConfigUtil.getMapperOutputKey(conf);
    Class mapOutputValueClass = MongoConfigUtil.getMapperOutputValue(conf);

    if (mapOutputKeyClass != null) {
        job.setMapOutputKeyClass(mapOutputKeyClass);
    }
    if (mapOutputValueClass != null) {
        job.setMapOutputValueClass(mapOutputValueClass);
    }

    /**
     * Determines if the job will run verbosely e.g. print debug output
     * Only works with foreground jobs
     */
    final boolean verbose = MongoConfigUtil.isJobVerbose(conf);
    /**
     * Run job in foreground aka wait for completion or background?
     */
    final boolean background = MongoConfigUtil.isJobBackground(conf);
    try {
        if (background) {
            LOG.info("Setting up and running MapReduce job in background.");
            job.submit();
            return 0;
        } else {
            LOG.info("Setting up and running MapReduce job in foreground, will wait for results. {Verbose? "
                    + verbose + "}");
            return job.waitForCompletion(true) ? 0 : 1;
        }
    } catch (final Exception e) {
        LOG.error("Exception while executing job... ", e);
        return 1;
    }
}
From source file:counting.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    final int NUMBER_OF_NODES = 31;
    final int MAX_NUMBER_OF_TASKS = 1000;
    final double REDUCER_CONSTANT = 0.95; // or 1.75

    if (otherArgs.length < 5) {
        System.err.println(
                "Usage: wordcount <in> [<in>...] <out> <ngram> <combiner:yes/no> <custom partitioner:yes/no>");
        System.exit(2);
    }

    Job job = Job.getInstance(conf, "Word count");

    // Setting map and reduce tasks
    //conf.setNumMapTasks(5); // Not possible with code in line?
    // Parentheses added so the cast applies to the whole product, not just REDUCER_CONSTANT.
    int NUMBER_OF_REDUCERS = (int) (REDUCER_CONSTANT * NUMBER_OF_NODES * MAX_NUMBER_OF_TASKS);
    //System.out.println("Number of Reducers: " + NUMBER_OF_REDUCERS);
    job.setNumReduceTasks(12); // Placeholder

    job.setJarByClass(WordCount.class);
    job.setMapperClass(nGramMapper.class);
    nGramMapper.setN(Integer.parseInt(otherArgs[otherArgs.length - 3])); // Set ngram length
    System.out.println("n = " + nGramMapper.getN());
    System.out.println("Combiner = " + otherArgs[otherArgs.length - 2]);
    System.out.println("Custom Partitioner = " + otherArgs[otherArgs.length - 1]);
    System.out.println("Number of reducers = " + NUMBER_OF_NODES);

    if (otherArgs[otherArgs.length - 2].equals("yes")) {
        job.setCombinerClass(IntSumReducer.class);
    }
    if (otherArgs[otherArgs.length - 1].equals("yes")) {
        job.setPartitionerClass(CustomPartitioner.class);
        //CustomPartitioner.setNumberOfReducers(NUMBER_OF_REDUCERS);
    }
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Input paths
    for (int i = 0; i < otherArgs.length - 4; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    // Output path
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 4]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:cp_a.CP_A.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(CP_A.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:cp_b.CP_B.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(CP_B.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setReducerClass(IntSumReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:cp_c.CP_C.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(CP_C.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setReducerClass(IntSumReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:de.gesundkrank.wikipedia.hadoop.util.RepackToMapFile.java
License:Open Source License
public int run(String basePath, String outputPath, boolean checkNew, boolean skipRedirect) throws Exception {
    Configuration configuration = getConf();
    configuration.setBoolean("skipRedirect", skipRedirect);

    LOGGER.info("Tool name: " + getClass().getSimpleName());

    Job job = Job.getInstance(configuration, getClass().getSimpleName());
    job.setJarByClass(getClass());

    job.setMapperClass(WikiMapper.class);
    job.setInputFormatClass(WikiInputFormat.class);

    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(WikiRevisionWritable.class);

    WikiDumpLoader wikiDumpLoader = new WikiDumpLoader(checkNew);
    wikiDumpLoader.addWikiDump(job, basePath);

    MapFileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.l3s.common.features.hadoop.TimeSeriesJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options opts = new Options();

    Option jnameOpt = OptionBuilder.withArgName("job-name").hasArg(true).withDescription("Timeseries analysis")
            .create(JOB_NAME);

    Option inputOpt = OptionBuilder.withArgName("input-path").hasArg(true)
            .withDescription("Timeseries file path (required)").create(INPUT_OPT);

    Option outputOpt = OptionBuilder.withArgName("output-path").hasArg(true)
            .withDescription("output file path (required)").create(OUTPUT_OPT);

    Option reduceOpt = OptionBuilder.withArgName("reduce-no").hasArg(true)
            .withDescription("number of reducer nodes").create(REDUCE_NO);

    Option rmOpt = OptionBuilder.withArgName("remove-out").hasArg(false)
            .withDescription("remove the output then create again before writing files onto it")
            .create(REMOVE_OUTPUT);

    Option cOpt = OptionBuilder.withArgName("compress-option").hasArg(true)
            .withDescription("compression option").create(COMPRESS_OPT);

    opts.addOption(jnameOpt);
    opts.addOption(inputOpt);
    opts.addOption(reduceOpt);
    opts.addOption(outputOpt);
    opts.addOption(rmOpt);
    opts.addOption(cOpt);

    CommandLine cl;
    CommandLineParser parser = new GnuParser();
    try {
        cl = parser.parse(opts, args);
    } catch (ParseException e) {
        System.err.println("Error parsing command line: " + e.getMessage());
        return -1;
    }

    if (!cl.hasOption(INPUT_OPT) || !cl.hasOption(OUTPUT_OPT)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(getClass().getName(), opts);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    int reduceNo = DEFAULT_REDUCER_NO;
    if (cl.hasOption(REDUCE_NO)) {
        try {
            reduceNo = Integer.parseInt(cl.getOptionValue(REDUCE_NO));
        } catch (NumberFormatException e) {
            System.err.println("Error parsing reducer number: " + e.getMessage());
        }
    }

    String jobName = "Distributed timeseries [R] correlation";
    if (cl.hasOption(JOB_NAME)) {
        jobName = cl.getOptionValue(JOB_NAME);
        jobName = jobName.replace('-', ' ');
    }

    if (cl.hasOption(REMOVE_OUTPUT)) {
    }

    String input = cl.getOptionValue(INPUT_OPT);
    String output = cl.getOptionValue(OUTPUT_OPT);

    Configuration conf = getConf();
    //DistributedCache.createSymlink(conf);
    //DistributedCache.addCacheFile(new URI("hdfs://master.hadoop:8020/user/nguyen/lib/"), conf);

    Job job = Job.getInstance(conf, jobName);
    job.setJarByClass(TimeSeriesJob.class);
    job.setMapperClass(TimeSeriesMapper.class);
    job.setReducerClass(TimeSeriesReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Timeseries.class);

    job.setNumReduceTasks(reduceNo);
    job.setInputFormatClass(WholeFileInputFormat.class);
    WholeFileInputFormat.setInputPaths(job, input);

    FileOutputFormat.setOutputPath(job, new Path(output));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.l3s.content.timex.extracting.ClueWeb09Timex.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("input").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("output").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("column").hasArg()
            .withDescription("column to store row data into (must exist)").create(COLUMN));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    cmdline = parser.parse(options, args);

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    if (!cmdline.hasOption(OUTPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    // String column = cmdline.getOptionValue(COLUMN);

    LOG.info("Tool name: " + ClueWeb09Timex.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);
    // LOG.info(" - column: " + column);

    Configuration conf = HBaseConfiguration.create();
    conf.set("hbase.zookeeper.quorum", "node05.ib,node03.ib,node04.ib");
    conf.set("hbase.zookeeper.property.clientPort", "2181");
    conf.set("hbase.master", "master.ib");
    // conf.set("conf.column", column);

    long milliSeconds = 10000 * 60 * 60; //x10 default
    conf.setLong("mapred.task.timeout", milliSeconds);

    Job job = Job.getInstance(conf, ClueWeb09Timex.class.getSimpleName()
            + " time-confident extraction + annotation + HBase import: " + input);

    //Configuration conf = new Configuration();
    //Job job = Job.getInstance(conf, "web pages count");
    job.setJarByClass(ClueWeb09Timex.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(ClueWeb09InputFormat.class);
    job.setOutputFormatClass(TableOutputFormat.class);
    job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, output);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    job.setMapperClass(TMapper.class);

    //job.setReducerClass(IntSumReducer.class);
    //job.setOutputKeyClass(Text.class);
    //job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(input));
    //FileOutputFormat.setOutputPath(job, new Path(output));

    job.waitForCompletion(true);
    return 0;
}
From source file:de.l3s.content.timex.extracting.ClueWeb09TimexWriteToHDFS.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("input").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("output").hasArg().withDescription("output path").create(OUTPUT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    cmdline = parser.parse(options, args);

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    if (!cmdline.hasOption(OUTPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);

    LOG.info("Tool name: " + ClueWeb09TimexWriteToHDFS.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);

    Configuration conf = new Configuration();
    long milliSeconds = 10000 * 60 * 60; //x10 default
    conf.setLong("mapred.task.timeout", milliSeconds);

    Job job = Job.getInstance(conf, "extract CW tempex and output to HDFS");
    job.setJarByClass(ClueWeb09TimexWriteToHDFS.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(ClueWeb09InputFormat.class);
    job.setMapperClass(TMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.waitForCompletion(true);
    return 0;
}