List of usage examples for the org.apache.hadoop.mapreduce.Job constructor
Job(Configuration conf, String jobName) throws IOException
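All of the examples on this page build a job with new Job(Configuration, String). The sketch below is not taken from any of the projects listed here; it is a minimal, self-contained illustration of the same pattern, using command-line input/output paths and the default identity Mapper and Reducer. Note that in Hadoop 2.x this constructor is deprecated in favor of Job.getInstance(conf, jobName).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MinimalJobExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // The constructor used throughout the examples below; deprecated in
        // Hadoop 2.x in favor of Job.getInstance(conf, "minimal-example").
        Job job = new Job(conf, "minimal-example");
        job.setJarByClass(MinimalJobExample.class);

        // No mapper/reducer is set, so the identity Mapper and Reducer are used:
        // with the default TextInputFormat the job simply copies
        // (byte offset, line) pairs from input to output.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}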
From source file:com.main.MRSearchMain.java
public void searchHBase(int numOfDays) throws IOException, InterruptedException, ClassNotFoundException {
    long startTime;
    long endTime;
    String path = "/home/hadoop/app/hadoop-2.0.0-cdh4.3.0/etc/hadoop/";
    Configuration conf = HBaseConfiguration.create();
    // conf.set("hbase.zookeeper.quorum", "streamslab.localdomain");
    // conf.set("fs.default.name", "hdfs://streamslab.localdomain:8020");
    // conf.set("mapred.job.tracker", "hdfs://streamslab.localdomain:50300");
    // conf.set("fs.hdfs.impl",
    //         org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    // Make sure the local FileSystem implementation can be resolved.
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
    conf.addResource(new Path(path + "core-site.xml"));
    conf.addResource(new Path(path + "hdfs-site.xml"));
    conf.addResource(new Path(path + "mapred-site.xml"));

    // Pass the search criteria to the map tasks through the configuration.
    conf.set("search.license", "C87310");
    conf.set("search.color", "10");
    conf.set("search.direction", "2");

    Job job = new Job(conf, "MRSearchHBase");
    System.out.println("search.license: " + conf.get("search.license"));
    job.setNumReduceTasks(0);
    job.setJarByClass(MRSearchMain.class);

    Scan scan = new Scan();
    scan.addFamily(FAMILY_NAME);
    byte[] startRow = Bytes.toBytes("2011010100000");
    byte[] stopRow;
    switch (numOfDays) {
    case 1:
        stopRow = Bytes.toBytes("2011010200000");
        break;
    case 10:
        stopRow = Bytes.toBytes("2011011100000");
        break;
    case 30:
        stopRow = Bytes.toBytes("2011020100000");
        break;
    case 365:
        stopRow = Bytes.toBytes("2012010100000");
        break;
    default:
        stopRow = Bytes.toBytes("2011010101000");
    }
    // Limit the scan to the requested row-key range.
    scan.setStartRow(startRow);
    scan.setStopRow(stopRow);

    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, SearchMapper.class, ImmutableBytesWritable.class,
            Text.class, job);

    Path outPath = new Path("searchresult");
    LOG.info("outPath:" + outPath.toString());

    // Delete the output directory on HDFS if it already exists.
    FileSystem file = null;
    try {
        file = FileSystem.get(conf);
    } catch (IOException e) {
        e.printStackTrace();
    }
    // HDFS_File file = new HDFS_File();
    // file.DelFile(conf, outPath.getName(), true);
    if (file.exists(outPath)) {
        file.delete(outPath, true);
        LOG.info("=====delPath " + outPath.toString() + "=====");
    }
    FileOutputFormat.setOutputPath(job, outPath);

    startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    endTime = System.currentTimeMillis();
    LOG.info("Time used: " + (endTime - startTime));
    LOG.info("startRow:" + Text.decode(startRow));
    LOG.info("stopRow: " + Text.decode(stopRow));
}
From source file:com.mb.saas.bi.job.WordCountJob.java
License:Apache License
public static boolean runHadoopMapReduceJob() throws Exception {
    System.setProperty("HADOOP_USER_NAME", "hadoop");

    File jarFile = UploadResource.createTempJar("bin");
    ClassLoader classLoader = UploadResource.getClassLoader();
    Thread.currentThread().setContextClassLoader(classLoader);

    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://mbcluster/");
    conf.set("dfs.nameservices", "mbcluster");
    conf.set("dfs.ha.namenodes.mbcluster", "ns1,ns2");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns1", "master:4001");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns2", "backup:4001");
    conf.set("dfs.client.failover.proxy.provider.mbcluster",
            "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");

    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCountJob.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    if (jarFile != null)
        ((JobConf) job.getConfiguration()).setJar(jarFile.getAbsolutePath());

    boolean isMapReduceJarSetted = false;
    String hadoopMapReduceJar = "F:/henry_projects/mbHiveAnalyzer/t.jar";
    File file = new File(hadoopMapReduceJar);
    if (file.exists()) {
        ((JobConf) job.getConfiguration()).setJar(hadoopMapReduceJar);
        isMapReduceJarSetted = true;
    }
    if (!isMapReduceJarSetted && jarFile != null)
        ((JobConf) job.getConfiguration()).setJar(jarFile.getAbsolutePath());

    job.setNumReduceTasks(1);
    FileInputFormat.addInputPath(job, new Path("/input/wordcount.txt"));
    FileOutputFormat.setOutputPath(job, new Path("/output/001"));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
    return true;
}
From source file:com.mb.saas.bi.job.WordCountJob.java
License:Apache License
public static void main(String[] args) throws Exception {
    System.setProperty("HADOOP_USER_NAME", "hadoop");

    File jarFile = UploadResource.createTempJar("bin");
    System.setProperty("hadoop.home.dir", "F:/hadoop");
    ClassLoader classLoader = UploadResource.getClassLoader();
    Thread.currentThread().setContextClassLoader(classLoader);

    Configuration conf = new Configuration();
    // conf.set("fs.defaultFS", "hdfs://slave1:4001");
    // conf.set("mapreduce.framework.name", "yarn");
    // conf.set("yarn.resourcemanager.address", "master:8032");
    // conf.set("yarn.resourcemanager.scheduler.address", "master:8030");
    conf.set("fs.defaultFS", "hdfs://mbcluster/");
    conf.set("dfs.nameservices", "mbcluster");
    conf.set("dfs.ha.namenodes.mbcluster", "ns1,ns2");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns1", "master:4001");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns2", "backup:4001");
    conf.set("dfs.client.failover.proxy.provider.mbcluster",
            "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
    conf.set("mapred.remote.os", "Linux");
    System.out.println(conf.get("mapred.remote.os"));
    // conf.set("mapreduce.job.reduces", "2");
    // conf.set("mapreduce.tasktracker.map.tasks.maximum", "8");
    // conf.set("mapreduce.input.fileinputformat.split.maxsize", "123");

    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCountJob.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    if (jarFile != null)
        ((JobConf) job.getConfiguration()).setJar(jarFile.getAbsolutePath());

    // job.setMaxMapAttempts(2);
    job.setNumReduceTasks(1);

    FileInputFormat.addInputPath(job, new Path("/input/wordcount2.txt"));
    // FileInputFormat.addInputPath(job, new Path("/input/wordcount2.txt"));
    FileOutputFormat.setOutputPath(job, new Path("/output/001002"));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.metamx.druid.indexer.DeterminePartitionsJob.java
License:Open Source License
public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */
        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = new Job(new Configuration(), String.format(
                    "%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));

            injectSystemProperties(groupByJob);
            groupByJob.setInputFormatClass(TextInputFormat.class);
            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            groupByJob.setJarByClass(DeterminePartitionsJob.class);

            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                    groupByJob.getTrackingURL());

            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }

        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = new Job(new Configuration(), String.format(
                "%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));

        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

        injectSystemProperties(dimSelectionJob);

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            dimSelectionJob.setInputFormatClass(TextInputFormat.class);
            config.addInputPaths(dimSelectionJob);
        }

        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setJarByClass(DeterminePartitionsJob.class);

        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(),
                dimSelectionJob.getTrackingURL());

        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }

        /*
         * Load partitions determined by the previous job.
         */
        log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
            DateTime bucket = segmentGranularity.getStart();
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(new Bucket(0, bucket, 0));
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (fileSystem.exists(partitionInfoPath)) {
                List<ShardSpec> specs = config.jsonMapper.readValue(
                        Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                        new TypeReference<List<ShardSpec>>() {
                        });

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
                }

                shardSpecs.put(bucket, actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);

        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}
From source file:com.metamx.druid.indexer.HadoopDruidIndexerJob.java
License:Open Source License
private void ensurePaths() {
    // config.addInputPaths() can have side-effects ( boo! :( ), so this stuff needs to be done before anything else
    try {
        Job job = new Job(new Configuration(), String.format("%s-determine_partitions-%s",
                config.getDataSource(), config.getIntervals()));

        job.getConfiguration().set("io.sort.record.percent", "0.19");
        for (String propName : System.getProperties().stringPropertyNames()) {
            Configuration conf = job.getConfiguration();
            if (propName.startsWith("hadoop.")) {
                conf.set(propName.substring("hadoop.".length()), System.getProperty(propName));
            }
        }

        config.addInputPaths(job);
    } catch (IOException e) {
        throw Throwables.propagate(e);
    }
}
From source file:com.metamx.druid.indexer.IndexGeneratorJob.java
License:Open Source License
public boolean run() {
    try {
        Job job = new Job(new Configuration(), String.format("%s-index-generator-%s", config.getDataSource(),
                config.getIntervals()));

        job.getConfiguration().set("io.sort.record.percent", "0.23");
        for (String propName : System.getProperties().stringPropertyNames()) {
            Configuration conf = job.getConfiguration();
            if (propName.startsWith("hadoop.")) {
                conf.set(propName.substring("hadoop.".length()), System.getProperty(propName));
            }
        }

        job.setInputFormatClass(TextInputFormat.class);

        job.setMapperClass(IndexGeneratorMapper.class);
        job.setMapOutputValueClass(Text.class);

        SortableBytes.useSortableBytesAsMapOutputKey(job);

        job.setNumReduceTasks(Iterables.size(config.getAllBuckets()));
        job.setPartitionerClass(IndexGeneratorPartitioner.class);

        job.setReducerClass(IndexGeneratorReducer.class);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(IndexGeneratorOutputFormat.class);
        FileOutputFormat.setOutputPath(job, config.makeIntermediatePath());

        config.addInputPaths(job);
        config.intoConfiguration(job);

        job.setJarByClass(IndexGeneratorJob.class);

        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());

        boolean success = job.waitForCompletion(true);

        Counter invalidRowCount = job.getCounters()
                .findCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER);
        jobStats.setInvalidRowCount(invalidRowCount.getValue());

        return success;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:com.ML_Hadoop.K_meansClustering.K_meansClusteringMapReduce.java
public static void main(String[] args) throws Exception {
    int iteration = 0, num_of_iteration = 30;
    int feature_size = 2;
    FileSystem fs;
    int number_of_clusters = 2;

    do {
        Configuration conf = new Configuration();
        fs = FileSystem.get(conf);

        Job job = new Job(conf, "K_meansClusteringMapReduce");
        job.setJarByClass(K_meansClusteringMapReduce.class);
        conf = job.getConfiguration(); // This line is mandatory.

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(FloatArrayWritable.class);

        job.setMapperClass(K_meansClusteringMap.class);
        job.setReducerClass(K_meansClusteringReduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setNumReduceTasks(1); // set number of reducers to one.

        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path out = new Path(args[1]);
        if (fs.exists(out))
            fs.delete(out, true);

        FileOutputFormat.setOutputPath(job, out);
        number_of_clusters = Integer.parseInt(args[2]);
        num_of_iteration = Integer.parseInt(args[3]);
        feature_size = Integer.parseInt(args[4]);

        conf.setInt("number_of_clusters", number_of_clusters);
        conf.setInt("feature_size", feature_size);
        conf.setInt("current_iteration_num", iteration);

        try {
            job.waitForCompletion(true);
            iteration++;
        } catch (IOException e) {
            e.printStackTrace();
        }
    } while (iteration < num_of_iteration);
}
From source file:com.ML_Hadoop.MultipleLinearRegression.MultipleLinearRegressionMapReduce.java
public static void main(String[] args) throws Exception {
    String[] theta;
    int iteration = 0, num_of_iteration = 1;
    int feature_size = 0, input_data_size = 0;
    FileSystem fs;
    Float alpha = 0.1f;

    do {
        Configuration conf = new Configuration();
        fs = FileSystem.get(conf);

        Job job = new Job(conf, "LinearRegressionMapReduce");
        job.setJarByClass(MultipleLinearRegressionMapReduce.class);

        // the following two lines are needed for propagating "theta"
        conf = job.getConfiguration();

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(FloatWritable.class);

        job.setMapperClass(MultipleLinearRegressionMap.class);
        job.setReducerClass(MultipleLinearRegressionReduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setNumReduceTasks(1); // set mapred.reduce.tasks = 1 (only one reducer)

        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path out = new Path(args[1]);
        if (fs.exists(out))
            fs.delete(out, true);

        FileOutputFormat.setOutputPath(job, out);
        alpha = Float.parseFloat(args[2]);
        num_of_iteration = Integer.parseInt(args[3]);
        feature_size = Integer.parseInt(args[4]);
        input_data_size = Integer.parseInt(args[5]);

        conf.setFloat("alpha", alpha);
        conf.setInt("feature_size", feature_size);
        conf.setInt("input_data_size", input_data_size);
        conf.setInt("iteration", iteration);

        theta = new String[feature_size];

        if (iteration == 0) { // first iteration
            for (int i = 0; i < theta.length; i++)
                theta[i] = "0.0";
            conf.setStrings("theta", theta);
        } else {
            try {
                String uri = "/user/hduser/theta.txt";
                fs = FileSystem.get(conf);
                // FSDataInputStream in = fs.open(new Path(uri));
                BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(uri))));
                theta = br.readLine().split(",");
            } catch (Exception e) {
            }
            conf.setStrings("theta", theta);
        }

        for (int i = 0; i < theta.length; i++)
            System.out.println("In MapRedce main function: theta[ " + i + " ]" + theta[i]);

        try {
            job.waitForCompletion(true);
            iteration++;
        } catch (IOException e) {
            e.printStackTrace();
        }
    } while (iteration < num_of_iteration);
}
From source file:com.ML_Hadoop.NaiveBayesClassifier_Continuous_Features.NaiveBayesClassifierMapReduce_Continuous_Features.java
/**
 * @param args
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    int number_of_classes = 1;
    int number_of_features = 1;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    Job job = new Job(conf, "NaiveBayesClassifierMapReduce_Continuous_Features");
    job.setJarByClass(NaiveBayesClassifierMapReduce_Continuous_Features.class);
    conf = job.getConfiguration(); // This line is mandatory.

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(FloatArrayWritable.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(MapArrayWritable.class);

    job.setMapperClass(NaiveBayesClassifierMap_Continuous_Features.class);
    job.setReducerClass(NaiveBayesClassifierReduce_Continuous_Features.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setNumReduceTasks(1);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    Path out = new Path(args[1]);
    if (fs.exists(out))
        fs.delete(out, true);

    FileOutputFormat.setOutputPath(job, out);
    number_of_classes = Integer.parseInt(args[2]);
    number_of_features = Integer.parseInt(args[3]);
    conf.setInt("number_of_classes", number_of_classes);
    conf.setInt("number_of_features", number_of_features);

    try {
        job.waitForCompletion(true);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:com.mongodb.hadoop.examples.snmp.SnmpStatisticMongoTool.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    final com.mongodb.MongoURI outputUri = MongoConfigUtil.getOutputURI(conf);
    if (outputUri == null)
        throw new IllegalStateException("output uri is not set");
    if (MongoConfigUtil.getInputURI(conf) == null)
        throw new IllegalStateException("input uri is not set");

    final String outputCollectionName = outputUri.getCollection();

    final Job job = new Job(conf, "snmp analysis " + outputCollectionName);

    job.setJarByClass(SnmpStatisticMongoTool.class);
    job.setMapperClass(MapHostUploadEachAPEachDay.class);
    job.setReducerClass(ReduceHostUploadEachAPEachDay.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    boolean result = job.waitForCompletion(true);
    return (result ? 0 : 1);
}