List of usage examples for org.apache.hadoop.mapreduce Job getInstance
public static Job getInstance(Configuration conf, String jobName) throws IOException
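Before the project-specific examples, here is a minimal, self-contained sketch of the pattern they all share: build a Configuration, obtain a Job via getInstance(conf, jobName), configure it, and wait for completion. The class name MinimalJobExample, the map-only identity setup, and the argument handling are illustrative assumptions, not code taken from any of the projects listed below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MinimalJobExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Job.getInstance copies the Configuration, so set properties on conf before this call
        // (or use job.getConfiguration() afterwards).
        Job job = Job.getInstance(conf, "minimal getInstance example");
        job.setJarByClass(MinimalJobExample.class);
        // Map-only identity job: the default Mapper passes each record through unchanged.
        job.setNumReduceTasks(0);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}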
From source file:com.teradata.benchto.generator.HiveTypesGenerator.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            Option.builder("format").required().hasArg().desc("file format (orc, parquet or text)").build());
    options.addOption(Option.builder("type").required().hasArg().desc(
            "hive type to be generated (bigint, int, boolean, double, binary, date, timestamp, string, decimal or varchar)")
            .build());
    options.addOption(Option.builder("rows").required().hasArg().desc("total row count").build());
    options.addOption(Option.builder("mappers").required().hasArg().desc("total mappers count").build());
    options.addOption(Option.builder("path").hasArg()
            .desc("base path for generating files, default is: /benchmarks/benchto/types").build());
    options.addOption(Option.builder("regex").numberOfArgs(3)
            .desc("generate varchars from regex pattern, arguments are: pattern, min length, max length")
            .build());

    CommandLine line;
    String format;
    String hiveType;
    long numberOfRows;
    long numberOfFiles;
    String basePath;
    Optional<String> regexPattern = Optional.absent();
    Optional<Integer> regexMinLength = Optional.absent();
    Optional<Integer> regexMaxLength = Optional.absent();

    try {
        line = new DefaultParser().parse(options, args);
        format = line.getOptionValue("format");
        hiveType = line.getOptionValue("type");
        numberOfRows = parseLong(line.getOptionValue("rows"));
        numberOfFiles = parseLong(line.getOptionValue("mappers"));
        basePath = line.getOptionValue("path", "/benchmarks/benchto/types");
        if (line.hasOption("regex")) {
            String[] values = line.getOptionValues("regex");
            regexPattern = Optional.of(values[0]);
            regexMinLength = Optional.of(parseInt(values[1]));
            regexMaxLength = Optional.of(parseInt(values[2]));
        }
    } catch (Exception e) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("benchto-generator", options);
        throw e;
    }

    String jobName = format("GenerateData-%s-%s-%d", format, hiveType, numberOfRows);
    Path outputDir = new Path(format("%s/%s-%s/%d", basePath, format, hiveType, numberOfRows));
    Class<? extends OutputFormat> outputFormatClass = getOutputFormatClass(format);

    LOG.info("Generating " + numberOfRows + " " + hiveType + "s, directory: " + outputDir
            + ", number of files: " + numberOfFiles);

    Configuration configuration = new Configuration();
    configuration.set(FORMAT_PROPERTY_NAME, format);
    configuration.set(HIVE_TYPE_PROPERTY_NAME, hiveType);
    configuration.setLong(NUM_ROWS_PROPERTY_NAME, numberOfRows);
    configuration.setLong(NUM_MAPS, numberOfFiles);
    if (regexPattern.isPresent()) {
        configuration.set(REGEX_PATTERN, regexPattern.get());
        configuration.setInt(REGEX_MIN_LENGTH, regexMinLength.get());
        configuration.setInt(REGEX_MAX_LENGTH, regexMaxLength.get());
    }

    Job generatorJob = Job.getInstance(configuration, jobName);
    FileOutputFormat.setOutputPath(generatorJob, outputDir);
    ParquetOutputFormat.setWriteSupportClass(generatorJob, DataWritableWriteSupport.class);
    generatorJob.setJarByClass(HiveTypesGenerator.class);
    generatorJob.setMapperClass(HiveTypesMapper.class);
    generatorJob.setNumReduceTasks(0);
    generatorJob.setOutputKeyClass(NullWritable.class);
    generatorJob.setOutputValueClass(Writable.class);
    generatorJob.setInputFormatClass(CounterInputFormat.class);
    generatorJob.setOutputFormatClass(outputFormatClass);

    return generatorJob.waitForCompletion(true) ? 0 : 1;
}
From source file:com.toddbodnar.simpleHadoop.distributedHadoopDriver.java
/**
 * Runs a job
 *
 * @param theJob the MapReduceJob to be run
 * @param verbose if true, output progress information
 */
public static void run(MapReduceJob theJob, boolean verbose)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = GetConfiguration.get();

    Job job = Job.getInstance(conf, theJob.toString());
    job.setJarByClass(distributedHadoopDriver.class);
    job.setMapperClass(theJob.getMapper().getClass());
    job.setReducerClass(theJob.getReducer().getClass());
    job.setMapOutputKeyClass(theJob.getKeyType());
    job.setMapOutputValueClass(theJob.getValueType());

    theJob.writeConfig(job.getConfiguration());

    hdfsFile input = hdfsFile.transferToHDFS(theJob.getInput().getFile());
    if (!input.equals(theJob.getInput().getFile())) {
        garbage_collector.noteCreated(input);
    }

    if (theJob.getClass().equals(join.class)) {
        join jobLeftJoin = (join) theJob;

        hdfsFile input2 = hdfsFile.transferToHDFS(jobLeftJoin.getOtherInput().getFile());
        if (!input2.equals(jobLeftJoin.getOtherInput().getFile())) {
            garbage_collector.noteCreated(input2);
        }

        Mapper maps[] = jobLeftJoin.getMapperPairs();
        MultipleInputs.addInputPath(job, input.getPath(), TextInputFormat.class, maps[0].getClass());
        MultipleInputs.addInputPath(job, input2.getPath(), TextInputFormat.class, maps[1].getClass());
    } else {
        MultipleInputs.addInputPath(job, input.getPath(), TextInputFormat.class);
    }

    job.getConfiguration().set(TextOutputFormat.SEPERATOR, "");
    job.setOutputFormatClass(TextOutputFormat.class);

    //FileInputFormat.setInputPaths(job, new Path(theJob.getInput().getFile().getLocation()));
    Path out = new Path(settings.hdfs_prefix + "/TMP_TABLE_" + theJob.hashCode());
    FileOutputFormat.setOutputPath(job, out);

    boolean success = job.waitForCompletion(true);
    if (!success) {
        System.err.println("Error processing " + theJob);
        return;
    }

    FileSystem fs = FileSystem.get(GetConfiguration.get());
    fs.delete(new Path(out, "_SUCCESS"), false);

    table output = new table(new hdfsFile(out), theJob.getOutput().getColNames());
    output.setSeperator(theJob.getOutput().getSeperator());

    theJob.setOutput(output);
    garbage_collector.noteCreated(output.getFile());
}
From source file:com.toshiba.mwcloud.gs.hadoop.mapreduce.examples.GSWordCount.java
License:Apache License
/**
 * <div lang="en">
 * Run a MapReduce job of WordCount.
 * @param args command argument
 * @return 0 for normal termination of the job and 1 otherwise
 * @throws Exception processing failed.
 * </div>
 */
public int run(String[] args) throws Exception {
    GSConf gsConf = new GSConf();
    gsConf.parseArg(args);

    Configuration conf = getConf();
    gsConf.setup(conf);

    Job job = Job.getInstance(conf, APP_NAME);
    job.setJarByClass(GSWordCount.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(GSRowWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(GSRowInputFormat.class);
    job.setOutputFormatClass(GSRowOutputFormat.class);

    int res = job.waitForCompletion(true) ? 0 : 1;
    if (res == 0) {
        printResult(gsConf);
    }
    return res;
}
From source file:com.uc.qiujw.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: wordcount <in> <out> <map_sleep> <reduce_sleep>");
        System.exit(2);
    }

    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    String[] in = otherArgs[0].split(",");
    String out = otherArgs[1];
    for (String inStr : in) {
        FileInputFormat.addInputPath(job, new Path(inStr));
    }
    FileOutputFormat.setOutputPath(job, new Path(out));

    int mapSleep = 1, reduceSleep = 1;
    if (otherArgs.length > 2) {
        mapSleep = Integer.valueOf(otherArgs[2]);
    }
    if (otherArgs.length > 3) {
        reduceSleep = Integer.valueOf(otherArgs[3]);
    }
    // Set the sleep values on the job's own configuration: Job.getInstance copied conf,
    // so changes made to the original Configuration at this point would not reach the tasks.
    job.getConfiguration().set(mapSleepKey, Integer.toString(mapSleep));
    job.getConfiguration().set(reduceSleepKey, Integer.toString(reduceSleep));

    FileSystem fs = FileSystem.get(conf);
    Path outPath = new Path(out);
    if (fs.exists(outPath)) {
        fs.delete(outPath, true);
    }

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.wind.mapreduce.WordCount.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    JobConf jobConf = new JobConf(getConf(), WordCount.class);

    Job job = Job.getInstance(jobConf, "word count");
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.xiaomi.linden.hadoop.indexing.job.LindenJob.java
License:Apache License
@Override
public int run(String[] strings) throws Exception {
    Configuration conf = getConf();
    String dir = conf.get(LindenJobConfig.INPUT_DIR, null);
    logger.info("input dir:" + dir);
    Path inputPath = new Path(StringUtils.unEscapeString(dir));
    Path outputPath = new Path(conf.get(LindenJobConfig.OUTPUT_DIR));
    String indexPath = conf.get(LindenJobConfig.INDEX_PATH);

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    if (fs.exists(new Path(indexPath))) {
        fs.delete(new Path(indexPath), true);
    }

    int numShards = conf.getInt(LindenJobConfig.NUM_SHARDS, 1);
    Shard[] shards = createShards(indexPath, numShards);
    Shard.setIndexShards(conf, shards);

    //empty trash;
    (new Trash(conf)).expunge();

    Job job = Job.getInstance(conf, "linden-hadoop-indexing");
    job.setJarByClass(LindenJob.class);
    job.setMapperClass(LindenMapper.class);
    job.setCombinerClass(LindenCombiner.class);
    job.setReducerClass(LindenReducer.class);
    job.setMapOutputKeyClass(Shard.class);
    job.setMapOutputValueClass(IntermediateForm.class);
    job.setOutputKeyClass(Shard.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(IndexUpdateOutputFormat.class);
    job.setReduceSpeculativeExecution(false);
    job.setNumReduceTasks(numShards);

    String lindenSchemaFile = conf.get(LindenJobConfig.SCHEMA_FILE_URL);
    if (lindenSchemaFile == null) {
        throw new IOException("no schema file is found");
    }
    logger.info("Adding schema file: " + lindenSchemaFile);
    job.addCacheFile(new URI(lindenSchemaFile + "#lindenSchema"));

    String lindenPropertiesFile = conf.get(LindenJobConfig.LINDEN_PROPERTIES_FILE_URL);
    if (lindenPropertiesFile == null) {
        throw new IOException("no linden properties file is found");
    }
    logger.info("Adding linden properties file: " + lindenPropertiesFile);
    job.addCacheFile(new URI(lindenPropertiesFile + "#lindenProperties"));

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    Path[] inputs = FileInputFormat.getInputPaths(job);
    StringBuilder buffer = new StringBuilder(inputs[0].toString());
    for (int i = 1; i < inputs.length; i++) {
        buffer.append(",");
        buffer.append(inputs[i].toString());
    }
    logger.info("mapreduce.input.dir = " + buffer.toString());
    logger.info("mapreduce.output.dir = " + FileOutputFormat.getOutputPath(job).toString());
    logger.info("mapreduce.job.num.reduce.tasks = " + job.getNumReduceTasks());
    logger.info(shards.length + " shards = " + conf.get(LindenJobConfig.INDEX_SHARDS));
    logger.info("mapreduce.input.format.class = " + job.getInputFormatClass());
    logger.info("mapreduce.output.format.class = " + job.getOutputFormatClass());
    logger.info("mapreduce.cluster.temp.dir = " + conf.get(MRJobConfig.TEMP_DIR));

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed");
    }
    return 0;
}
From source file:com.xiaoxiaomo.mr.utils.kafka.HadoopJob.java
License:Apache License
public int run(String[] args) throws Exception {
    CommandLineParser parser = new PosixParser();
    Options options = buildOptions();
    CommandLine cmd = parser.parse(options, args);

    if (cmd.hasOption("h") || cmd.getArgs().length == 0) {
        printHelpAndExit(options);
    }

    String hdfsPath = cmd.getArgs()[0];
    Configuration conf = getConf();
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);

    if (cmd.hasOption("topics")) {
        LOG.info("Using topics: " + cmd.getOptionValue("topics"));
        KafkaInputFormat.configureKafkaTopics(conf, cmd.getOptionValue("topics"));
    } else {
        printHelpAndExit(options);
    }

    KafkaInputFormat.configureZkConnection(conf, cmd.getOptionValue("zk-connect", "localhost:2181"));
    if (cmd.hasOption("consumer-group")) {
        CheckpointManager.configureUseZooKeeper(conf, cmd.getOptionValue("consumer-group", "dev-hadoop-loader"));
    }

    if (cmd.getOptionValue("autooffset-reset") != null) {
        KafkaInputFormat.configureAutoOffsetReset(conf, cmd.getOptionValue("autooffset-reset"));
    }

    JobConf jobConf = new JobConf(conf);
    if (cmd.hasOption("remote")) {
        String ip = cmd.getOptionValue("remote");
        LOG.info("Default file system: hdfs://" + ip + ":8020/");
        jobConf.set("fs.defaultFS", "hdfs://" + ip + ":8020/");
        LOG.info("Remote jobtracker: " + ip + ":8021");
        jobConf.set("mapred.job.tracker", ip + ":8021");
    }

    Path jarTarget = new Path(
            getClass().getProtectionDomain().getCodeSource().getLocation() + "../kafka-hadoop-loader.jar");
    if (new File(jarTarget.toUri()).exists()) {
        // running from IDE / as maven
        jobConf.setJar(jarTarget.toUri().getPath());
        LOG.info("Using target jar: " + jarTarget.toString());
    } else {
        // running from jar remotely or locally
        jobConf.setJarByClass(getClass());
        LOG.info("Using parent jar: " + jobConf.getJar());
    }

    Job job = Job.getInstance(jobConf, "kafka.hadoop.loader");

    job.setInputFormatClass(KafkaInputFormat.class);
    job.setMapperClass(HadoopJobMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setNumReduceTasks(0);

    MultiOutputFormat.setOutputPath(job, new Path(hdfsPath));
    MultiOutputFormat.setCompressOutput(job, cmd.getOptionValue("compress-output", "on").equals("on"));

    LOG.info("Output hdfs location: {}", hdfsPath);
    LOG.info("Output hdfs compression: {}", MultiOutputFormat.getCompressOutput(job));

    return job.waitForCompletion(true) ? 0 : -1;
}
From source file:com.yahoo.druid.hadoop.DruidInputFormatTest.java
License:Apache License
@Test
public void testSampleMRJob() throws Exception {
    Job job = Job.getInstance(new Configuration(), "Druid-Loader-Sample-Test-Job");

    job.getConfiguration().set("mapreduce.job.acl-view-job", "*");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Duser.timezone=UTC");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_OVERLORD_HOSTPORT, "localhost:" + overlordTestPort);
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_SCHEMA,
            "{"
                    + "\"dataSource\":\"testDataSource\","
                    + "\"interval\":\"1970-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z\","
                    + "\"granularity\":\"NONE\","
                    + "\"dimensions\":[\"host\"],"
                    + "\"metrics\":[\"visited_sum\",\"unique_hosts\"]"
                    + "}");

    job.setMapperClass(SampleMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);
    job.setInputFormatClass(DruidInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    String outputPath = tempFolder.newFolder() + "/out";
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    Assert.assertTrue(job.waitForCompletion(true));

    //verify that the SampleMapper actually ran and verified the data
    Assert.assertTrue(FileUtils.readFileToString(new File(outputPath + "/part-m-00000")).startsWith("SUCCESS"));
}
From source file:com.yahoo.labs.yamall.hadoop.Test.java
License:Open Source License
/**
 * Run the map/reduce job
 */
public final int run(final String[] args) throws Exception {
    startLogger(Level.INFO);

    Configuration conf = getConf();
    conf.set("yamall.vw_model", args[2]);
    conf.setIfUnset("yamall.bit_precision", "18");
    conf.setIfUnset("yamall.parser", "vw");

    // Print to screen all the options
    TreeMap<String, String> map = new TreeMap<String, String>();
    for (Map.Entry<String, String> entry : conf) {
        map.put(entry.getKey(), entry.getValue());
    }
    for (Map.Entry<String, String> entry : map.entrySet()) {
        System.out.printf("%s=%s\n", entry.getKey(), entry.getValue());
    }

    Job job = Job.getInstance(conf, "Yamall Test on MapReduce");
    job.setNumReduceTasks(1);
    job.setJarByClass(Test.class);
    job.setMapperClass(TestMapper.class);
    job.setMapOutputKeyClass(DoubleWritable.class);
    job.setReducerClass(TestReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(CompositeDoubleTextWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    MultipleOutputs.addNamedOutput(job, "out", TextOutputFormat.class, NullWritable.class, Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.yahoo.labs.yamall.hadoop.Train.java
License:Open Source License
/**
 * Run the map/reduce job
 */
public final int run(final String[] args) throws Exception {
    startLogger(Level.INFO);

    Configuration conf = getConf();
    conf.set("yamall.output", args[1]);
    conf.setIfUnset("yamall.bit_precision", "18");
    conf.setIfUnset("yamall.parser", "vw");

    // Print to screen all the options
    TreeMap<String, String> map = new TreeMap<String, String>();
    for (Map.Entry<String, String> entry : conf) {
        map.put(entry.getKey(), entry.getValue());
    }
    for (Map.Entry<String, String> entry : map.entrySet()) {
        System.out.printf("%s=%s\n", entry.getKey(), entry.getValue());
    }

    Job job = Job.getInstance(conf, "Yamall Train on MapReduce");
    job.setNumReduceTasks(1); // important
    job.setJarByClass(Train.class);
    job.setMapperClass(TrainMapper.class);
    job.setMapOutputKeyClass(DoubleWritable.class);
    job.setMapOutputValueClass(InstanceOrHashMapWritable.class);
    job.setReducerClass(TrainReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}