List of usage examples for org.apache.hadoop.mapreduce.Job.getInstance
public static Job getInstance(Configuration conf) throws IOException
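All of the examples below obtain a Job through this static factory. As a quick orientation before the project code, here is a minimal, self-contained sketch (the class name GetInstanceSketch is made up for illustration) of the getInstance overloads those examples rely on:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class GetInstanceSketch {
    public static void main(String[] args) throws IOException {
        // No-argument form: creates a Job backed by a fresh Configuration.
        Job jobA = Job.getInstance();

        // Most common form, used by the examples below: pass an existing
        // Configuration. Job copies the Configuration it is given, so internal
        // modifications made through the Job do not leak back into conf.
        Configuration conf = new Configuration();
        Job jobB = Job.getInstance(conf);

        // Convenience form that also sets the job name.
        Job jobC = Job.getInstance(conf, "get-instance-sketch");
        System.out.println(jobC.getJobName()); // prints "get-instance-sketch"
    }
}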
From source file:code.DemoWordCount.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS)
            ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + DemoWordCount.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJobName(DemoWordCount.class.getSimpleName());
    job.setJarByClass(DemoWordCount.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:com.accumulobook.advanced.mapreduce.MapReduceFilesExample.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(this.getConf());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(WordCount.WordCountMapper.class);
    job.setCombinerClass(WordCount.WordCountCombiner.class);
    job.setReducerClass(WordCount.WordCountReducer.class);

    // clone the articles table
    ZooKeeperInstance inst = new ZooKeeperInstance(args[0], args[1]);
    Connector conn = inst.getConnector(args[2], new PasswordToken(args[3]));

    conn.tableOperations().clone(WikipediaConstants.ARTICLES_TABLE, WikipediaConstants.ARTICLES_TABLE_CLONE,
            true, Collections.EMPTY_MAP, Collections.EMPTY_SET);

    // take cloned table offline, waiting until the operation is complete
    boolean wait = true;
    conn.tableOperations().offline(WikipediaConstants.ARTICLES_TABLE_CLONE, wait);

    ClientConfiguration zkiConfig = new ClientConfiguration().withInstance(args[0]).withZkHosts(args[1]);

    // input
    job.setInputFormatClass(AccumuloInputFormat.class);
    AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE_CLONE);
    List<Pair<Text, Text>> columns = new ArrayList<>();
    columns.add(new Pair(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));
    AccumuloInputFormat.fetchColumns(job, columns);
    AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));

    // configure to use underlying RFiles
    AccumuloInputFormat.setOfflineTableScan(job, true);

    // output
    job.setOutputFormatClass(AccumuloOutputFormat.class);
    BatchWriterConfig bwConfig = new BatchWriterConfig();
    AccumuloOutputFormat.setBatchWriterOptions(job, bwConfig);
    AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
    AccumuloOutputFormat.setCreateTables(job, true);

    job.setJarByClass(WordCount.class);

    job.waitForCompletion(true);
    //job.submit();
    return 0;
}
From source file:com.accumulobook.advanced.mapreduce.WordCount.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(WordCountMapper.class);
    job.setCombinerClass(WordCountCombiner.class);
    job.setReducerClass(WordCountReducer.class);

    // input
    job.setInputFormatClass(AccumuloInputFormat.class);

    ClientConfiguration zkiConfig = new ClientConfiguration().withInstance(args[0]).withZkHosts(args[1]);

    AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE);
    List<Pair<Text, Text>> columns = new ArrayList<>();
    columns.add(new Pair(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));
    AccumuloInputFormat.fetchColumns(job, columns);
    AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));

    // output
    job.setOutputFormatClass(AccumuloOutputFormat.class);
    BatchWriterConfig config = new BatchWriterConfig();
    AccumuloOutputFormat.setBatchWriterOptions(job, config);
    AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
    AccumuloOutputFormat.setCreateTables(job, true);

    job.setJarByClass(WordCount.class);

    job.submit();
    return 0;
}
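Unlike most drivers on this page, the example above ends with job.submit(), which returns as soon as the job is handed off, rather than blocking in job.waitForCompletion(true). Here is a minimal, hypothetical sketch (the class name SubmitVsWait and the five-second poll interval are made up) of monitoring a job submitted the non-blocking way:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SubmitVsWait {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "submit-vs-wait");
        job.setJarByClass(SubmitVsWait.class);
        // No mapper or reducer is set, so Hadoop runs the identity versions;
        // that is enough to demonstrate the two submission styles.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Non-blocking: submit() returns once the job is handed to the cluster,
        // so the caller polls the job state itself.
        job.submit();
        while (!job.isComplete()) {
            Thread.sleep(5000); // re-check the job state every five seconds
        }
        System.out.println("job succeeded: " + job.isSuccessful());

        // The blocking alternative used by most examples on this page:
        // boolean ok = job.waitForCompletion(true);
    }
}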
From source file:com.architecting.ch07.MapReduceIndexerTool.java
License:Apache License
/** API for Java clients; visible for testing; may become a public API eventually */
int run(Options options) throws Exception {
    if (getConf().getBoolean("isMR1", false) && "local".equals(getConf().get("mapred.job.tracker"))) {
        throw new IllegalStateException(
                "Running with LocalJobRunner (i.e. all of Hadoop inside a single JVM) is not supported "
                        + "because LocalJobRunner does not (yet) implement the Hadoop Distributed Cache feature, "
                        + "which is required for passing files via --files and --libjars");
    }

    long programStartTime = System.nanoTime();
    getConf().setInt(SolrOutputFormat.SOLR_RECORD_WRITER_MAX_SEGMENTS, options.maxSegments);

    // switch off a false warning about allegedly not implementing Tool
    // also see http://hadoop.6.n7.nabble.com/GenericOptionsParser-warning-td8103.html
    // also see https://issues.apache.org/jira/browse/HADOOP-8183
    getConf().setBoolean("mapred.used.genericoptionsparser", true);

    if (options.log4jConfigFile != null) {
        Utils.setLogConfigFile(options.log4jConfigFile, getConf());
        addDistributedCacheFile(options.log4jConfigFile, getConf());
    }

    Configuration config = HBaseConfiguration.create();
    Job job = Job.getInstance(config);
    job.setJarByClass(getClass());

    // To be able to run this example from eclipse, we need to make sure
    // the built jar is distributed to the map-reduce tasks from the
    // local file system.
    job.addCacheArchive(new URI("file:///home/cloudera/ahae/target/ahae.jar"));

    FileSystem fs = options.outputDir.getFileSystem(job.getConfiguration());
    if (fs.exists(options.outputDir) && !delete(options.outputDir, true, fs)) {
        return -1;
    }
    Path outputResultsDir = new Path(options.outputDir, RESULTS_DIR);
    Path outputReduceDir = new Path(options.outputDir, "reducers");

    int reducers = 1;

    Scan scan = new Scan();
    scan.addFamily(CF);
    // tag::SETUP[]
    scan.setCaching(500); // <1>
    scan.setCacheBlocks(false); // <2>

    TableMapReduceUtil.initTableMapperJob( // <3>
            options.inputTable, // Input HBase table name
            scan, // Scan instance to control what to index
            HBaseAvroToSOLRMapper.class, // Mapper to parse cells content.
            Text.class, // Mapper output key
            SolrInputDocumentWritable.class, // Mapper output value
            job);

    FileOutputFormat.setOutputPath(job, outputReduceDir);

    job.setJobName(getClass().getName() + "/" + Utils.getShortClassName(HBaseAvroToSOLRMapper.class));
    job.setReducerClass(SolrReducer.class); // <4>
    job.setPartitionerClass(SolrCloudPartitioner.class); // <5>
    job.getConfiguration().set(SolrCloudPartitioner.ZKHOST, options.zkHost);
    job.getConfiguration().set(SolrCloudPartitioner.COLLECTION, options.collection);
    job.getConfiguration().setInt(SolrCloudPartitioner.SHARDS, options.shards);

    job.setOutputFormatClass(SolrOutputFormat.class);
    SolrOutputFormat.setupSolrHomeCache(options.solrHomeDir, job);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SolrInputDocumentWritable.class);
    job.setSpeculativeExecution(false);
    // end::SETUP[]

    job.setNumReduceTasks(reducers); // Set the number of reducers based on the number of shards we have.
    if (!waitForCompletion(job, true)) {
        return -1; // job failed
    }

    assert reducers == options.shards;

    // normalize output shard dir prefix, i.e.
    // rename part-r-00000 to part-00000 (stems from zero tree merge iterations)
    // rename part-m-00000 to part-00000 (stems from > 0 tree merge iterations)
    for (FileStatus stats : fs.listStatus(outputReduceDir)) {
        String dirPrefix = SolrOutputFormat.getOutputName(job);
        Path srcPath = stats.getPath();
        if (stats.isDirectory() && srcPath.getName().startsWith(dirPrefix)) {
            String dstName = dirPrefix + srcPath.getName().substring(dirPrefix.length() + "-m".length());
            Path dstPath = new Path(srcPath.getParent(), dstName);
            if (!rename(srcPath, dstPath, fs)) {
                return -1;
            }
        }
    }

    // publish results dir
    if (!rename(outputReduceDir, outputResultsDir, fs)) {
        return -1;
    }

    if (options.goLive && !new GoLive().goLive(options, listSortedOutputShardDirs(job, outputResultsDir, fs))) {
        return -1;
    }

    goodbye(job, programStartTime);
    return 0;
}
From source file:com.asakusafw.runtime.compatibility.hadoop2.JobCompatibilityHadoop2.java
License:Apache License
@Override
public Job newJob(Configuration conf) throws IOException {
    if (conf == null) {
        throw new IllegalArgumentException("conf must not be null"); //$NON-NLS-1$
    }
    return Job.getInstance(conf);
}
From source file:com.asakusafw.testdriver.file.FileDeployer.java
License:Apache License
/**
 * Opens output for the specified {@link OutputFormat}.
 * @param <V> value type
 * @param definition target model definition
 * @param destination output location
 * @param output the output format
 * @return the opened {@link ModelOutput}
 * @throws IOException if failed to open the target output
 * @throws IllegalArgumentException if some parameters were {@code null}
 */
public <V> ModelOutput<V> openOutput(DataModelDefinition<V> definition, final String destination,
        FileOutputFormat<? super NullWritable, ? super V> output) throws IOException {
    assert destination != null;
    assert output != null;
    LOG.debug("Opening {} using {}", destination, output.getClass().getName());
    Job job = Job.getInstance(configuration);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(definition.getModelClass());
    final File temporaryDir = File.createTempFile("asakusa", ".tempdir");
    if (temporaryDir.delete() == false || temporaryDir.mkdirs() == false) {
        throw new IOException("Failed to create temporary directory");
    }
    LOG.debug("Using staging deploy target: {}", temporaryDir);
    URI uri = temporaryDir.toURI();
    FileOutputFormat.setOutputPath(job, new Path(uri));
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
            new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0));
    FileOutputFormatDriver<V> result = new FileOutputFormatDriver<V>(context, output, NullWritable.get()) {
        @Override
        public void close() throws IOException {
            super.close();
            deploy(destination, temporaryDir);
        }
    };
    return result;
}
From source file:com.asakusafw.testdriver.file.FileExporterRetriever.java
License:Apache License
@Override
public <V> DataModelSource createSource(DataModelDefinition<V> definition, FileExporterDescription description,
        TestContext context) throws IOException {
    LOG.info("Creating data model source for: {}", description);
    VariableTable variables = createVariables(context);
    checkType(definition, description);
    Configuration conf = configurations.newInstance();
    Job job = Job.getInstance(conf);
    String resolved = variables.parse(description.getPathPrefix(), false);
    FileInputFormat.setInputPaths(job, new Path(resolved));
    TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(),
            new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0));
    FileInputFormat<?, V> format = getOpposite(conf, description.getOutputFormat());
    FileInputFormatDriver<V> result = new FileInputFormatDriver<>(definition, taskContext, format);
    return result;
}
From source file:com.asakusafw.thundergate.runtime.cache.mapreduce.CacheBuildClient.java
License:Apache License
private Job newJob() throws IOException {
    Job job = Job.getInstance(getConf());
    job.setJobName("TGC-CREATE-" + tableName);
    Configuration conf = job.getConfiguration();
    Invalidation.setupInvalidationTimestamp(conf, tableName);
    return job;
}
From source file:com.avira.couchdoop.demo.BenchmarkUpdater.java
License:Apache License
public Job configureJob(Configuration conf, String input) throws IOException {
    conf.setInt("mapreduce.map.failures.maxpercent", 5);
    conf.setInt("mapred.max.map.failures.percent", 5);
    conf.setInt("mapred.max.tracker.failures", 20);

    Job job = Job.getInstance(conf);
    job.setJarByClass(BenchmarkUpdater.class);

    // User classpath takes precedence over the Hadoop classpath,
    // because the Couchbase client requires a newer version of
    // org.apache.httpcomponents:httpcore.
    job.setUserClassesTakesPrecedence(true);

    // Input
    FileInputFormat.setInputPaths(job, input);

    // Mapper
    job.setMapperClass(BenchmarkUpdateMapper.class);
    job.setMapOutputKeyClass(String.class);
    job.setMapOutputValueClass(CouchbaseAction.class);

    // Reducer
    job.setNumReduceTasks(0);

    // Output
    job.setOutputFormatClass(CouchbaseOutputFormat.class);
    job.setMapOutputKeyClass(String.class);
    job.setMapOutputValueClass(CouchbaseAction.class);

    return job;
}
From source file:com.avira.couchdoop.demo.ExportDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 1) {
        System.err.println("Usage: <input_path>");
        return 1;
    }
    String input = args[0];

    Job job = Job.getInstance(getConf());
    job.setJarByClass(ExportDriver.class);

    // User classpath takes precedence over the Hadoop classpath,
    // because the Couchbase client requires a newer version of
    // org.apache.httpcomponents:httpcore.
    // job.setUserClassesTakesPrecedence(true);

    // Input
    FileInputFormat.setInputPaths(job, input);

    // Mapper
    job.setMapperClass(ExportMapper.class);

    // Reducer
    job.setNumReduceTasks(0);

    // Output
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(CouchbaseAction.class);
    job.setOutputFormatClass(CouchbaseOutputFormat.class);

    if (!job.waitForCompletion(true)) {
        return 2;
    }

    return 0;
}