List of usage examples for org.apache.hadoop.mapreduce.Job.submit()
public void submit() throws IOException, InterruptedException, ClassNotFoundException
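Job.submit() hands the job to the cluster and returns immediately, whereas waitForCompletion(true) blocks and reports progress until the job finishes. The sketch below is not taken from any example on this page; the class names and paths are placeholders. It shows the typical non-blocking pattern of submitting a job and then polling for completion:

public static void submitAndPoll(Configuration conf)
        throws IOException, InterruptedException, ClassNotFoundException {
    // Assumes the usual org.apache.hadoop.mapreduce, lib.input/lib.output and fs imports.
    Job job = Job.getInstance(conf, "example-job");                 // hypothetical job name
    job.setJarByClass(ExampleDriver.class);                         // hypothetical driver class
    job.setMapperClass(ExampleMapper.class);                        // hypothetical mapper
    FileInputFormat.addInputPath(job, new Path("/example/in"));     // placeholder path
    FileOutputFormat.setOutputPath(job, new Path("/example/out"));  // placeholder path

    job.submit();                   // returns as soon as the job is handed to the cluster
    while (!job.isComplete()) {     // poll instead of blocking in waitForCompletion(true)
        Thread.sleep(5000);
    }
    if (!job.isSuccessful()) {
        throw new IOException("Job " + job.getJobName() + " failed");
    }
}

Most of the examples below follow one of two shapes: submit() alone when the caller wants the Job handle back for later monitoring, or submit() followed by waitForCompletion(false) to block until completion without verbose per-task logging.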
From source file:org.apache.kylin.engine.mr.common.AbstractHadoopJob.java
License:Apache License
protected int waitForCompletion(Job job) throws IOException, InterruptedException, ClassNotFoundException {
    int retVal = 0;
    long start = System.nanoTime();
    if (isAsync) {
        job.submit();
    } else {
        job.waitForCompletion(true);
        retVal = job.isSuccessful() ? 0 : 1;
        logger.debug("Job '" + job.getJobName() + "' finished "
                + (job.isSuccessful() ? "successfully in " : "with failures. Time taken ")
                + formatTime((System.nanoTime() - start) / 1000000L));
    }
    return retVal;
}
From source file:org.apache.mahout.clustering.lda.cvb.CVB0Driver.java
License:Apache License
private Job writeTopicModel(Configuration conf, Path modelInput, Path output)
        throws IOException, InterruptedException, ClassNotFoundException {
    String jobName = String.format("Writing final topic/term distributions from %s to %s", modelInput, output);
    log.info("About to run: {}", jobName);

    Job job = prepareJob(modelInput, output, SequenceFileInputFormat.class,
            CVB0TopicTermVectorNormalizerMapper.class, IntWritable.class, VectorWritable.class,
            SequenceFileOutputFormat.class, jobName);
    job.submit();
    return job;
}
From source file:org.apache.mahout.clustering.lda.cvb.CVB0Driver.java
License:Apache License
private Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output);
    log.info("About to run: {}", jobName);

    Job job = prepareJob(corpus, output, SequenceFileInputFormat.class, CVB0DocInferenceMapper.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class, jobName);

    FileSystem fs = FileSystem.get(corpus.toUri(), conf);
    if (modelInput != null && fs.exists(modelInput)) {
        FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter());
        URI[] modelUris = new URI[statuses.length];
        for (int i = 0; i < statuses.length; i++) {
            modelUris[i] = statuses[i].getPath().toUri();
        }
        DistributedCache.setCacheFiles(modelUris, conf);
        setModelPaths(job, modelInput);
    }
    job.submit();
    return job;
}
From source file:org.apache.mahout.freqtermsets.PFPGrowth.java
License:Apache License
/**
 * Count the frequencies of various features in parallel using Map/Reduce.
 */
public static void startParallelCounting(Parameters params, Configuration conf)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(PFP_PARAMETERS, params.toString());

    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");
    // if (Boolean.parseBoolean(params.get(PFPGrowth.PSEUDO, "false"))) {
    //     conf.set("mapred.tasktracker.map.tasks.maximum", "3");
    //     conf.set("mapred.tasktracker.reduce.tasks.maximum", "3");
    //     conf.set("mapred.map.child.java.opts", "-Xmx777M");
    //     conf.set("mapred.reduce.child.java.opts", "-Xmx777M");
    //     conf.setInt("mapred.max.map.failures.percent", 0);
    // }
    conf.set("mapred.child.java.opts", "-XX:-UseGCOverheadLimit -XX:+HeapDumpOnOutOfMemoryError");

    // String input = params.get(INPUT);
    // Job job = new Job(conf, "Parallel Counting Driver running over input: " + input);

    long startTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_START));
    long endTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_END));
    long windowSize = Long.parseLong(params.get(PFPGrowth.PARAM_WINDOW_SIZE, Long.toString(endTime - startTime)));
    long stepSize = Long.parseLong(params.get(PFPGrowth.PARAM_STEP_SIZE, Long.toString(windowSize)));
    endTime = Math.min(endTime, startTime + windowSize);

    FileSystem fs = FileSystem.get(conf); // TODONE: do I need getLocal(conf)?

    Job[] jobArr = new Job[(int) Math.ceil(windowSize / stepSize)];
    for (int j = 0; startTime < endTime; startTime += stepSize, ++j) {
        long jobEnd = startTime + stepSize;
        Job job = new Job(conf, "Parallel counting running over inerval " + startTime + "-" + jobEnd);

        // Path outPath = new Path(params.get(OUTPUT), PARALLEL_COUNTING);
        Path outRoot = new Path(params.get(OUTROOT));
        Path stepOutput = new Path(outRoot, startTime + "");
        stepOutput = new Path(stepOutput, jobEnd + "");
        if (fs.exists(stepOutput)) {
            continue;
        }
        jobArr[j] = job;

        Path outPath = new Path(stepOutput, PARALLEL_COUNTING);
        FileOutputFormat.setOutputPath(job, outPath);
        // HadoopUtil.delete(conf, outPath);

        job.setJarByClass(PFPGrowth.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        PartitionByTimestamp.setInputPaths(job, params, conf);
        // FileInputFormat.addInputPath(job, new Path(input));
        // job.setInputFormatClass(HtmlTweetInputFormat.class);
        job.setInputFormatClass(CSVTweetInputFormat.class);
        job.setMapperClass(ParallelCountingMapper.class);
        job.setCombinerClass(ParallelCountingReducer.class);
        job.setReducerClass(ParallelCountingReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        job.submit();
        // boolean succeeded = job.waitForCompletion(true);
        // if (!succeeded) {
        //     throw new IllegalStateException("Job failed!");
        // }
    }

    boolean allCompleted;
    do {
        Thread.sleep(1000);
        allCompleted = true;
        for (int j = 0; j < jobArr.length; ++j) {
            if (jobArr[j] == null) {
                continue;
            }
            boolean complete = jobArr[j].isComplete();
            allCompleted &= complete;
            if (!complete) {
                String report = (j + " (" + jobArr[j].getJobName() + "): map "
                        + StringUtils.formatPercent(jobArr[j].mapProgress(), 0) + " reduce "
                        + StringUtils.formatPercent(jobArr[j].reduceProgress(), 0) + " - Tracking: "
                        + jobArr[j].getTrackingURL());
                LOG.info(report);
            }
        }
    } while (!allCompleted);

    boolean allSuccess = true;
    for (int j = 0; j < jobArr.length; ++j) {
        if (jobArr[j] == null) {
            continue;
        }
        boolean success = jobArr[j].isSuccessful();
        allSuccess &= success;
        if (!success) {
            String report = (j + " (" + jobArr[j].getJobName() + "): FAILED - Tracking: "
                    + jobArr[j].getTrackingURL());
            LOG.info(report);
        }
    }
    if (!allSuccess) {
        throw new IllegalStateException("Job failed!");
    }
}
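The example above shows why submit() matters when several MapReduce jobs should run concurrently: each job is submitted without blocking, and a single loop then polls isComplete() and reports progress until all of them finish. A condensed sketch of just that pattern, where jobs stands in for any collection of already-configured Job instances (it is a placeholder, not a variable from the example above):

for (Job job : jobs) {
    job.submit();                          // non-blocking; all jobs run concurrently
}
boolean allCompleted;
do {
    Thread.sleep(1000);                    // avoid hammering the JobTracker/ResourceManager
    allCompleted = true;
    for (Job job : jobs) {
        allCompleted &= job.isComplete();  // true once a job is in any terminal state
    }
} while (!allCompleted);
for (Job job : jobs) {
    if (!job.isSuccessful()) {
        throw new IllegalStateException("Job " + job.getJobName() + " failed");
    }
}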
From source file:org.apache.mahout.math.hadoop.MatrixColumnMeansJob.java
License:Apache License
/**
 * Job for calculating the column-wise mean of a DistributedRowMatrix.
 *
 * @param initialConf
 * @param inputPath
 *          path to DistributedRowMatrix input
 * @param outputVectorTmpPath
 *          path for temporary files created during the job
 * @param vectorClass
 *          String of desired class for returned vector e.g. DenseVector,
 *          RandomAccessSparseVector (may be null for {@link DenseVector})
 * @return Vector containing column-wise mean of DistributedRowMatrix
 */
public static Vector run(Configuration initialConf, Path inputPath, Path outputVectorTmpPath, String vectorClass)
        throws IOException {
    try {
        initialConf.set(VECTOR_CLASS, vectorClass == null ? DenseVector.class.getName() : vectorClass);

        Job job = new Job(initialConf, "MatrixColumnMeansJob");
        job.setJarByClass(MatrixColumnMeansJob.class);

        FileOutputFormat.setOutputPath(job, outputVectorTmpPath);
        outputVectorTmpPath.getFileSystem(job.getConfiguration()).delete(outputVectorTmpPath, true);
        job.setNumReduceTasks(1);
        FileOutputFormat.setOutputPath(job, outputVectorTmpPath);
        FileInputFormat.addInputPath(job, inputPath);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileOutputFormat.setOutputPath(job, outputVectorTmpPath);

        job.setMapperClass(MatrixColumnMeansMapper.class);
        job.setReducerClass(MatrixColumnMeansReducer.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(VectorWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(VectorWritable.class);
        job.submit();
        job.waitForCompletion(true);

        Path tmpFile = new Path(outputVectorTmpPath, "part-r-00000");
        SequenceFileValueIterator<VectorWritable> iterator =
                new SequenceFileValueIterator<VectorWritable>(tmpFile, true, initialConf);
        try {
            if (iterator.hasNext()) {
                return iterator.next().get();
            } else {
                return (Vector) Class.forName(vectorClass).getConstructor(int.class).newInstance(0);
            }
        } finally {
            Closeables.close(iterator, true);
        }
    } catch (IOException ioe) {
        throw ioe;
    } catch (Throwable thr) {
        throw new IOException(thr);
    }
}
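Given the signature above, a caller would invoke the job roughly as follows; the HDFS paths here are placeholders, not part of the original example. Note that although the method calls submit() first, the immediately following waitForCompletion(true) still blocks until the job finishes, so run() returns the computed means synchronously.

// Hypothetical invocation of MatrixColumnMeansJob.run(); paths are placeholders.
Configuration conf = new Configuration();
Path matrixPath = new Path("/data/distributed-row-matrix");   // DistributedRowMatrix input
Path tmpPath = new Path("/tmp/column-means");                 // temporary output location
Vector columnMeans =
        MatrixColumnMeansJob.run(conf, matrixPath, tmpPath, DenseVector.class.getName());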
From source file:org.apache.mahout.math.hadoop.stochasticsvd.ABtDenseOutJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputAPaths, Path inputBtGlob, Path xiPath, Path sqPath,
        Path sbPath, Path outputPath, int aBlockRows, int minSplitSize, int k, int p, int outerProdBlockHeight,
        int numReduceTasks, boolean broadcastBInput)
        throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    Job job = new Job(oldApiJob);
    job.setJobName("ABt-job");
    job.setJarByClass(ABtDenseOutJob.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputAPaths);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(SplitPartitionedWritable.class);
    job.setMapOutputValueClass(DenseBlockWritable.class);

    job.setOutputKeyClass(SplitPartitionedWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(ABtMapper.class);
    job.setReducerClass(QRReducer.class);

    job.getConfiguration().setInt(QJob.PROP_AROWBLOCK_SIZE, aBlockRows);
    job.getConfiguration().setInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, outerProdBlockHeight);
    job.getConfiguration().setInt(QRFirstStep.PROP_K, k);
    job.getConfiguration().setInt(QRFirstStep.PROP_P, p);
    job.getConfiguration().set(PROP_BT_PATH, inputBtGlob.toString());

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
        job.getConfiguration().set(PROP_SB_PATH, sbPath.toString());
        job.getConfiguration().set(PROP_SQ_PATH, sqPath.toString());
    }

    job.setNumReduceTasks(numReduceTasks);

    // broadcast Bt files if required.
    if (broadcastBInput) {
        job.getConfiguration().set(PROP_BT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputBtGlob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(inputBtGlob);
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * new api is not enabled yet in our dependencies at this time, still
                 * using deprecated one
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), job.getConfiguration());
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("ABt job unsuccessful.");
    }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.ABtJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputAPaths, Path inputBtGlob, Path outputPath,
        int aBlockRows, int minSplitSize, int k, int p, int outerProdBlockHeight, int numReduceTasks,
        boolean broadcastBInput) throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    // MultipleOutputs
    //     .addNamedOutput(oldApiJob,
    //                     QJob.OUTPUT_QHAT,
    //                     org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
    //                     SplitPartitionedWritable.class,
    //                     DenseBlockWritable.class);
    //
    // MultipleOutputs
    //     .addNamedOutput(oldApiJob,
    //                     QJob.OUTPUT_RHAT,
    //                     org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
    //                     SplitPartitionedWritable.class,
    //                     VectorWritable.class);

    Job job = new Job(oldApiJob);
    job.setJobName("ABt-job");
    job.setJarByClass(ABtJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputAPaths);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(SplitPartitionedWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(SplitPartitionedWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(ABtMapper.class);
    job.setCombinerClass(BtJob.OuterProductCombiner.class);
    job.setReducerClass(QRReducer.class);

    job.getConfiguration().setInt(QJob.PROP_AROWBLOCK_SIZE, aBlockRows);
    job.getConfiguration().setInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, outerProdBlockHeight);
    job.getConfiguration().setInt(QRFirstStep.PROP_K, k);
    job.getConfiguration().setInt(QRFirstStep.PROP_P, p);
    job.getConfiguration().set(PROP_BT_PATH, inputBtGlob.toString());

    // number of reduce tasks doesn't matter. we don't actually
    // send anything to reducers.
    job.setNumReduceTasks(numReduceTasks);

    // broadcast Bt files if required.
    if (broadcastBInput) {
        job.getConfiguration().set(PROP_BT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputBtGlob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(inputBtGlob);
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * new api is not enabled yet in our dependencies at this time, still
                 * using deprecated one
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), conf);
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("ABt job unsuccessful.");
    }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.BBtJob.java
License:Apache License
public static void run(Configuration conf, Path btPath, Path outputPath, int numReduceTasks)
        throws IOException, ClassNotFoundException, InterruptedException {

    Job job = new Job(conf);
    job.setJobName("BBt-job");
    job.setJarByClass(BBtJob.class);

    // input
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, btPath);

    // map
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setMapperClass(BBtMapper.class);
    job.setReducerClass(BBtReducer.class);

    // combiner and reducer
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    // output
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BBT);

    // run
    job.submit();
    job.waitForCompletion(false);
    if (!job.isSuccessful()) {
        throw new IOException("BBt job failed.");
    }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.BtJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputPathA, Path inputPathQJob, Path xiPath, Path outputPath,
        int minSplitSize, int k, int p, int btBlockHeight, int numReduceTasks, boolean broadcast,
        Class<? extends Writable> labelClass, boolean outputBBtProducts)
        throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_Q,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, labelClass, VectorWritable.class);

    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_BBT,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
        /*
         * MAHOUT-1067: if we are asked to output BBt products then named vector
         * names should be propagated to Q too so that UJob could pick them up
         * from there.
         */
        oldApiJob.setBoolean(PROP_NV, true);
    }
    if (xiPath != null) {
        // compute PCA-related stuff as well
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SQ,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SB,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
    }

    /*
     * HACK: we use old api multiple outputs since they are not available in the
     * new api of either 0.20.2 or 0.20.203 but wrap it into a new api job so we
     * can use new api interfaces.
     */
    Job job = new Job(oldApiJob);
    job.setJobName("Bt-job");
    job.setJarByClass(BtJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathA);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    // WARN: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BT);

    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    job.getConfiguration().setInt(QJob.PROP_K, k);
    job.getConfiguration().setInt(QJob.PROP_P, p);
    job.getConfiguration().set(PROP_QJOB_PATH, inputPathQJob.toString());
    job.getConfiguration().setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.getConfiguration().setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, btBlockHeight);

    job.setNumReduceTasks(numReduceTasks);

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
    }

    /*
     * we can broadcast Rhat files since all of them are required by each job,
     * but not Q files which correspond to splits of A (so each split of A will
     * require only a particular Q file, each time a different one).
     */
    if (broadcast) {
        job.getConfiguration().set(PROP_RHAT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputPathQJob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(new Path(inputPathQJob, QJob.OUTPUT_RHAT + "-*"));
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * new api is not enabled yet in our dependencies at this time, still
                 * using deprecated one
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), job.getConfiguration());
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("Bt job unsuccessful.");
    }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.QJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputPaths, Path sbPath, Path outputPath, int aBlockRows,
        int minSplitSize, int k, int p, long seed, int numReduceTasks)
        throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);
    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_QHAT,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, SplitPartitionedWritable.class,
            DenseBlockWritable.class);
    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_RHAT,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, SplitPartitionedWritable.class,
            VectorWritable.class);

    Job job = new Job(oldApiJob);
    job.setJobName("Q-job");
    job.setJarByClass(QJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputPaths);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(SplitPartitionedWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    job.setOutputKeyClass(SplitPartitionedWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(QMapper.class);

    job.getConfiguration().setInt(PROP_AROWBLOCK_SIZE, aBlockRows);
    job.getConfiguration().setLong(PROP_OMEGA_SEED, seed);
    job.getConfiguration().setInt(PROP_K, k);
    job.getConfiguration().setInt(PROP_P, p);
    if (sbPath != null) {
        job.getConfiguration().set(PROP_SB_PATH, sbPath.toString());
    }

    /*
     * number of reduce tasks doesn't matter. we don't actually send anything to
     * reducers.
     */
    job.setNumReduceTasks(0 /* numReduceTasks */);

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("Q job unsuccessful.");
    }
}