List of usage examples for org.apache.hadoop.mapreduce.Job.getInstance
public static Job getInstance(Configuration conf) throws IOException
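All of the examples below obtain a Job through this static factory. As a quick orientation before the project code, here is a minimal, self-contained sketch (the class name GetInstanceSketch is made up for illustration) of the getInstance overloads those examples rely on:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class GetInstanceSketch {
    public static void main(String[] args) throws IOException {
        // No-argument form: creates a Job backed by a fresh Configuration.
        Job jobA = Job.getInstance();

        // Most common form, used by the examples below: pass an existing
        // Configuration. Job copies the Configuration it is given, so internal
        // modifications made through the Job do not leak back into conf.
        Configuration conf = new Configuration();
        Job jobB = Job.getInstance(conf);

        // Convenience form that also sets the job name.
        Job jobC = Job.getInstance(conf, "get-instance-sketch");
        System.out.println(jobC.getJobName()); // prints "get-instance-sketch"
    }
}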
From source file:code.DemoWordCount.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS)
            ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + DemoWordCount.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJobName(DemoWordCount.class.getSimpleName());
    job.setJarByClass(DemoWordCount.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:com.accumulobook.advanced.mapreduce.MapReduceFilesExample.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(this.getConf());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(WordCount.WordCountMapper.class);
    job.setCombinerClass(WordCount.WordCountCombiner.class);
    job.setReducerClass(WordCount.WordCountReducer.class);

    // clone the articles table
    ZooKeeperInstance inst = new ZooKeeperInstance(args[0], args[1]);
    Connector conn = inst.getConnector(args[2], new PasswordToken(args[3]));

    conn.tableOperations().clone(WikipediaConstants.ARTICLES_TABLE, WikipediaConstants.ARTICLES_TABLE_CLONE,
            true, Collections.EMPTY_MAP, Collections.EMPTY_SET);

    // take cloned table offline, waiting until the operation is complete
    boolean wait = true;
    conn.tableOperations().offline(WikipediaConstants.ARTICLES_TABLE_CLONE, wait);

    ClientConfiguration zkiConfig = new ClientConfiguration().withInstance(args[0]).withZkHosts(args[1]);

    // input
    job.setInputFormatClass(AccumuloInputFormat.class);
    AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE_CLONE);
    List<Pair<Text, Text>> columns = new ArrayList<>();
    columns.add(new Pair(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));
    AccumuloInputFormat.fetchColumns(job, columns);
    AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));

    // configure to use underlying RFiles
    AccumuloInputFormat.setOfflineTableScan(job, true);

    // output
    job.setOutputFormatClass(AccumuloOutputFormat.class);
    BatchWriterConfig bwConfig = new BatchWriterConfig();
    AccumuloOutputFormat.setBatchWriterOptions(job, bwConfig);
    AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
    AccumuloOutputFormat.setCreateTables(job, true);

    job.setJarByClass(WordCount.class);

    job.waitForCompletion(true);
    //job.submit();
    return 0;
}
From source file:com.accumulobook.advanced.mapreduce.WordCount.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(WordCountMapper.class);
    job.setCombinerClass(WordCountCombiner.class);
    job.setReducerClass(WordCountReducer.class);

    // input
    job.setInputFormatClass(AccumuloInputFormat.class);

    ClientConfiguration zkiConfig = new ClientConfiguration().withInstance(args[0]).withZkHosts(args[1]);

    AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE);
    List<Pair<Text, Text>> columns = new ArrayList<>();
    columns.add(new Pair(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));
    AccumuloInputFormat.fetchColumns(job, columns);
    AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));

    // output
    job.setOutputFormatClass(AccumuloOutputFormat.class);
    BatchWriterConfig config = new BatchWriterConfig();
    AccumuloOutputFormat.setBatchWriterOptions(job, config);
    AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
    AccumuloOutputFormat.setCreateTables(job, true);

    job.setJarByClass(WordCount.class);

    job.submit();
    return 0;
}
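Unlike most drivers on this page, the example above ends with job.submit(), which returns as soon as the job is handed off, rather than blocking in job.waitForCompletion(true). Here is a minimal, hypothetical sketch (the class name SubmitVsWait and the five-second poll interval are made up) of monitoring a job submitted the non-blocking way:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SubmitVsWait {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "submit-vs-wait");
        job.setJarByClass(SubmitVsWait.class);
        // No mapper or reducer is set, so Hadoop runs the identity versions;
        // that is enough to demonstrate the two submission styles.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Non-blocking: submit() returns once the job is handed to the cluster,
        // so the caller polls the job state itself.
        job.submit();
        while (!job.isComplete()) {
            Thread.sleep(5000); // re-check the job state every five seconds
        }
        System.out.println("job succeeded: " + job.isSuccessful());

        // The blocking alternative used by most examples on this page:
        // boolean ok = job.waitForCompletion(true);
    }
}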
From source file:com.architecting.ch07.MapReduceIndexerTool.java
License:Apache License
/** API for Java clients; visible for testing; may become a public API eventually */
int run(Options options) throws Exception {
    if (getConf().getBoolean("isMR1", false) && "local".equals(getConf().get("mapred.job.tracker"))) {
        throw new IllegalStateException(
                "Running with LocalJobRunner (i.e. all of Hadoop inside a single JVM) is not supported "
                        + "because LocalJobRunner does not (yet) implement the Hadoop Distributed Cache feature, "
                        + "which is required for passing files via --files and --libjars");
    }

    long programStartTime = System.nanoTime();
    getConf().setInt(SolrOutputFormat.SOLR_RECORD_WRITER_MAX_SEGMENTS, options.maxSegments);

    // switch off a false warning about allegedly not implementing Tool
    // also see http://hadoop.6.n7.nabble.com/GenericOptionsParser-warning-td8103.html
    // also see https://issues.apache.org/jira/browse/HADOOP-8183
    getConf().setBoolean("mapred.used.genericoptionsparser", true);

    if (options.log4jConfigFile != null) {
        Utils.setLogConfigFile(options.log4jConfigFile, getConf());
        addDistributedCacheFile(options.log4jConfigFile, getConf());
    }

    Configuration config = HBaseConfiguration.create();
    Job job = Job.getInstance(config);
    job.setJarByClass(getClass());

    // To be able to run this example from eclipse, we need to make sure
    // the built jar is distributed to the map-reduce tasks from the
    // local file system.
    job.addCacheArchive(new URI("file:///home/cloudera/ahae/target/ahae.jar"));

    FileSystem fs = options.outputDir.getFileSystem(job.getConfiguration());
    if (fs.exists(options.outputDir) && !delete(options.outputDir, true, fs)) {
        return -1;
    }
    Path outputResultsDir = new Path(options.outputDir, RESULTS_DIR);
    Path outputReduceDir = new Path(options.outputDir, "reducers");

    int reducers = 1;

    Scan scan = new Scan();
    scan.addFamily(CF);
    // tag::SETUP[]
    scan.setCaching(500); // <1>
    scan.setCacheBlocks(false); // <2>

    TableMapReduceUtil.initTableMapperJob( // <3>
            options.inputTable, // Input HBase table name
            scan, // Scan instance to control what to index
            HBaseAvroToSOLRMapper.class, // Mapper to parse cells content.
            Text.class, // Mapper output key
            SolrInputDocumentWritable.class, // Mapper output value
            job);

    FileOutputFormat.setOutputPath(job, outputReduceDir);

    job.setJobName(getClass().getName() + "/" + Utils.getShortClassName(HBaseAvroToSOLRMapper.class));
    job.setReducerClass(SolrReducer.class); // <4>
    job.setPartitionerClass(SolrCloudPartitioner.class); // <5>
    job.getConfiguration().set(SolrCloudPartitioner.ZKHOST, options.zkHost);
    job.getConfiguration().set(SolrCloudPartitioner.COLLECTION, options.collection);
    job.getConfiguration().setInt(SolrCloudPartitioner.SHARDS, options.shards);

    job.setOutputFormatClass(SolrOutputFormat.class);
    SolrOutputFormat.setupSolrHomeCache(options.solrHomeDir, job);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SolrInputDocumentWritable.class);
    job.setSpeculativeExecution(false);
    // end::SETUP[]

    job.setNumReduceTasks(reducers); // Set the number of reducers based on the number of shards we have.
    if (!waitForCompletion(job, true)) {
        return -1; // job failed
    }

    assert reducers == options.shards;

    // normalize output shard dir prefix, i.e.
    // rename part-r-00000 to part-00000 (stems from zero tree merge iterations)
    // rename part-m-00000 to part-00000 (stems from > 0 tree merge iterations)
    for (FileStatus stats : fs.listStatus(outputReduceDir)) {
        String dirPrefix = SolrOutputFormat.getOutputName(job);
        Path srcPath = stats.getPath();
        if (stats.isDirectory() && srcPath.getName().startsWith(dirPrefix)) {
            String dstName = dirPrefix + srcPath.getName().substring(dirPrefix.length() + "-m".length());
            Path dstPath = new Path(srcPath.getParent(), dstName);
            if (!rename(srcPath, dstPath, fs)) {
                return -1;
            }
        }
    }

    // publish results dir
    if (!rename(outputReduceDir, outputResultsDir, fs)) {
        return -1;
    }

    if (options.goLive && !new GoLive().goLive(options, listSortedOutputShardDirs(job, outputResultsDir, fs))) {
        return -1;
    }

    goodbye(job, programStartTime);
    return 0;
}
From source file:com.asakusafw.runtime.compatibility.hadoop2.JobCompatibilityHadoop2.java
License:Apache License
@Override
public Job newJob(Configuration conf) throws IOException {
    if (conf == null) {
        throw new IllegalArgumentException("conf must not be null"); //$NON-NLS-1$
    }
    return Job.getInstance(conf);
}
From source file:com.asakusafw.testdriver.file.FileDeployer.java
License:Apache License
/**
 * Opens output for the specified {@link OutputFormat}.
 * @param <V> value type
 * @param definition target model definition
 * @param destination output location
 * @param output the output format
 * @return the opened {@link ModelOutput}
 * @throws IOException if failed to open the target output
 * @throws IllegalArgumentException if some parameters were {@code null}
 */
public <V> ModelOutput<V> openOutput(DataModelDefinition<V> definition, final String destination,
        FileOutputFormat<? super NullWritable, ? super V> output) throws IOException {
    assert destination != null;
    assert output != null;
    LOG.debug("Opening {} using {}", destination, output.getClass().getName());
    Job job = Job.getInstance(configuration);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(definition.getModelClass());
    final File temporaryDir = File.createTempFile("asakusa", ".tempdir");
    if (temporaryDir.delete() == false || temporaryDir.mkdirs() == false) {
        throw new IOException("Failed to create temporary directory");
    }
    LOG.debug("Using staging deploy target: {}", temporaryDir);
    URI uri = temporaryDir.toURI();
    FileOutputFormat.setOutputPath(job, new Path(uri));
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
            new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0));
    FileOutputFormatDriver<V> result = new FileOutputFormatDriver<V>(context, output, NullWritable.get()) {
        @Override
        public void close() throws IOException {
            super.close();
            deploy(destination, temporaryDir);
        }
    };
    return result;
}
From source file:com.asakusafw.testdriver.file.FileExporterRetriever.java
License:Apache License
@Override
public <V> DataModelSource createSource(DataModelDefinition<V> definition, FileExporterDescription description,
        TestContext context) throws IOException {
    LOG.info("Creating data model source for: {}", description);
    VariableTable variables = createVariables(context);
    checkType(definition, description);
    Configuration conf = configurations.newInstance();
    Job job = Job.getInstance(conf);
    String resolved = variables.parse(description.getPathPrefix(), false);
    FileInputFormat.setInputPaths(job, new Path(resolved));
    TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(),
            new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0));
    FileInputFormat<?, V> format = getOpposite(conf, description.getOutputFormat());
    FileInputFormatDriver<V> result = new FileInputFormatDriver<>(definition, taskContext, format);
    return result;
}
From source file:com.asakusafw.thundergate.runtime.cache.mapreduce.CacheBuildClient.java
License:Apache License
private Job newJob() throws IOException {
    Job job = Job.getInstance(getConf());
    job.setJobName("TGC-CREATE-" + tableName);
    Configuration conf = job.getConfiguration();
    Invalidation.setupInvalidationTimestamp(conf, tableName);
    return job;
}
From source file:com.avira.couchdoop.demo.BenchmarkUpdater.java
License:Apache License
public Job configureJob(Configuration conf, String input) throws IOException {
    conf.setInt("mapreduce.map.failures.maxpercent", 5);
    conf.setInt("mapred.max.map.failures.percent", 5);
    conf.setInt("mapred.max.tracker.failures", 20);

    Job job = Job.getInstance(conf);
    job.setJarByClass(BenchmarkUpdater.class);

    // User classpath takes precedence over the Hadoop classpath,
    // because the Couchbase client requires a newer version of
    // org.apache.httpcomponents:httpcore.
    job.setUserClassesTakesPrecedence(true);

    // Input
    FileInputFormat.setInputPaths(job, input);

    // Mapper
    job.setMapperClass(BenchmarkUpdateMapper.class);
    job.setMapOutputKeyClass(String.class);
    job.setMapOutputValueClass(CouchbaseAction.class);

    // Reducer
    job.setNumReduceTasks(0);

    // Output
    job.setOutputFormatClass(CouchbaseOutputFormat.class);
    job.setMapOutputKeyClass(String.class);
    job.setMapOutputValueClass(CouchbaseAction.class);

    return job;
}
From source file:com.avira.couchdoop.demo.ExportDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 1) {
        System.err.println("Usage: <input_path>");
        return 1;
    }
    String input = args[0];

    Job job = Job.getInstance(getConf());
    job.setJarByClass(ExportDriver.class);

    // User classpath takes precedence over the Hadoop classpath,
    // because the Couchbase client requires a newer version of
    // org.apache.httpcomponents:httpcore.
    // job.setUserClassesTakesPrecedence(true);

    // Input
    FileInputFormat.setInputPaths(job, input);

    // Mapper
    job.setMapperClass(ExportMapper.class);

    // Reducer
    job.setNumReduceTasks(0);

    // Output
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(CouchbaseAction.class);
    job.setOutputFormatClass(CouchbaseOutputFormat.class);

    if (!job.waitForCompletion(true)) {
        return 2;
    }

    return 0;
}