List of usage examples for org.apache.hadoop.mapreduce.Job#setNumReduceTasks
public void setNumReduceTasks(int tasks) throws IllegalStateException
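Before the per-project examples below, a minimal self-contained sketch of the call in isolation (not taken from any of the listed projects; the class name is illustrative, it uses Hadoop's built-in identity Mapper and Reducer, and input/output paths come from the command line):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NumReduceTasksExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setNumReduceTasks-example");
        job.setJarByClass(NumReduceTasksExample.class);

        // Identity mapper and reducer: records pass through unchanged.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);

        // With the default TextInputFormat, keys are byte offsets and values are lines.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Must be called before the job is submitted (while it is still in DEFINE state);
        // afterwards it throws IllegalStateException. Passing 0 makes the job map-only,
        // writing mapper output directly to the output path.
        job.setNumReduceTasks(4);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The examples that follow show the same call in real projects, either requesting a computed number of reducers or passing 0 to skip the reduce phase entirely.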
From source file:com.twitter.algebra.nmf.SampleRowsJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, float sampleRate)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    conf.setFloat(SAMPLERATE, sampleRate);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "samplerows");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(SampleRowsJob.class);
    job.setJobName(SampleRowsJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}
From source file:com.twitter.algebra.nmf.XtXJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, int numCols, String xmPath, Path matrixOutputPath)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    conf.setInt(MATRIXCOLS, numCols);
    // conf.set(XMPATH, xmPath);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, new Path[] { matrixInputPath }, "xtx");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJobName("XtXJob-" + matrixOutputPath.getName());
    job.setJarByClass(XtXJob.class);

    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "xtx");
    job.setNumReduceTasks(numReducers);
    // ensures total order (when used with {@link MatrixOutputFormat})
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numCols);

    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    job.waitForCompletion(true);
}
From source file:com.twitter.algebra.TransposeJob.java
License:Apache License
/**
 * Perform transpose of A, where A refers to the path that contains a matrix
 * in {@link SequenceFileInputFormat}.
 *
 * @param conf
 *          the initial configuration
 * @param matrixInputPath
 *          the path to the input files that we process
 * @param matrixOutputPath
 *          the path of the resulting transpose matrix
 * @param numInputRows
 *          rows
 * @param numInputCols
 *          cols
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int numInputRows,
        int numInputCols) throws IOException, InterruptedException, ClassNotFoundException {
    conf.setInt(NUM_ORIG_ROWS_KEY, numInputRows);
    conf.setInt(RowPartitioner.TOTAL_KEYS, numInputCols);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "transpose");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(TransposeJob.class);
    job.setJobName(TransposeJob.class.getSimpleName());

    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(TransposeMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "transpose");
    job.setNumReduceTasks(numReducers);
    // job.setPartitionerClass(RowPartitioner.IntRowPartitioner.class);
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numInputCols);

    job.setCombinerClass(MergeVectorsCombiner.class);
    job.setReducerClass(MergeVectorsReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}
From source file:com.twitter.elephanttwin.indexing.AbstractBlockIndexingJob.java
License:Open Source License
/**
 * Sets up various job properties required for the indexing job.
 * If your implementation needs to mess with the conf, you can do so by overriding
 * this method (remember to call super.setupJob()!) or in setMapper().
 *
 * @param conf
 * @return the configured Job
 * @throws IOException
 */
protected Job setupJob(Configuration conf) throws IOException {
    Job job = new Job(new Configuration(conf));
    job.setJarByClass(getClass());
    job.setInputFormatClass(BlockIndexedFileInputFormat.class);
    job.setReducerClass(MapFileIndexingReducer.class);
    job.setMapOutputKeyClass(TextLongPairWritable.class);
    job.setMapOutputValueClass(LongPairWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ListLongPair.class);
    job.setPartitionerClass(TextLongPairWritable.Parititioner.class);
    job.setSortComparatorClass(TextLongPairWritable.PairComparator.class);
    job.setGroupingComparatorClass(TextLongPairWritable.KeyOnlyComparator.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setNumReduceTasks(getNumPartitions());
    BlockIndexedFileInputFormat.setIndexOptions(job, getInputFormat(), getValueClass(), getIndex(),
            getColumnName());
    return job;
}
From source file:com.twitter.elephanttwin.lucene.indexing.AbstractLuceneIndexingJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    LOG = Logger.getLogger(this.getClass());
    params = newIndexConfig();

    LOG.info("Starting up indexer...");
    LOG.info(" - input: " + Joiner.on(" ").join(IndexConfig.input.get()));
    LOG.info(" - index: " + IndexConfig.index);
    LOG.info(" - number of shards: " + IndexConfig.numPartitions.get());

    Configuration conf = getConf();
    conf.set(AbstractLuceneIndexingReducer.HDFS_INDEX_LOCATION, IndexConfig.index.get());
    conf.set(AbstractLuceneIndexingReducer.ANALYZER, IndexConfig.analyzer.get());
    conf.set(AbstractLuceneIndexingReducer.SIMILARITY, IndexConfig.similarity.get());
    conf.setInt(AbstractSamplingIndexingMapper.SAMPLE_PERCENTAGE, IndexConfig.samplePercentage.get());
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    Job job = new Job(conf, getJobName(params));
    // Job's constructor copies conf, we need a reference to the one job
    // is actually using
    conf = job.getConfiguration();

    job.setJarByClass(this.getClass());
    job.setNumReduceTasks(IndexConfig.numPartitions.get());

    for (String s : IndexConfig.input.get()) {
        Path spath = new Path(s);
        FileSystem fs = spath.getFileSystem(getConf());
        List<FileStatus> stats = Lists.newArrayList();
        addInputPathRecursively(stats, fs, spath, HdfsUtils.HIDDEN_FILE_FILTER);
        for (FileStatus foundStat : stats) {
            FileInputFormat.addInputPath(job, foundStat.getPath());
        }
    }

    FileOutputFormat.setOutputPath(job, new Path(IndexConfig.index.get()));
    setupJob(job);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(IndexConfig.index.get());
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    LOG.info("Job " + getJobName(params) + " started.");
    // TODO Jimmy has a parameter that controls whether we wait in Thud but not in ES.
    // when would we not want to wait?
    job.waitForCompletion(true);
    LOG.info("Job " + getJobName(params) + " Finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    if (job.isSuccessful()) {
        writeIndexDescriptors(getIndexDescriptor());
    }
    return job.isSuccessful() ? 0 : 1;
}
From source file:com.twitter.elephanttwin.retrieval.ScanUsingIndexJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    params = new IndexConfig();

    LOG.info(" - input: " + Joiner.on(" ").join(params.getInput()));
    LOG.info(" - output: " + IndexConfig.output.get());

    Configuration conf = getConf();
    Path outputDir = new Path(params.getOutput());
    FileSystem fs = outputDir.getFileSystem(conf);
    fs.delete(outputDir, true);

    int totalInputFiles = 0;
    List<FileStatus> stats = Lists.newArrayList();
    for (String s : params.getInput()) {
        Path spath = new Path(IndexConfig.index.get() + s);
        HdfsUtils.addInputPathRecursively(stats, fs, spath, hiddenDirectoryFilter, indexDataFilter);
    }
    totalInputFiles = stats.size();
    LOG.info(totalInputFiles + " total index files to be scanned");

    conf.set(IndexScanMapper.searchColumnName, params.getColumnName());

    Job job = new Job(new Configuration(conf));
    job.setJarByClass(getClass());
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(params.getOutput()));

    for (FileStatus file : stats)
        FileInputFormat.addInputPath(job, file.getPath());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setNumReduceTasks(1);
    job.setMapperClass(IndexScanMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);
    job.setJobName("ScanUsingIndexJob:" + IndexConfig.input.get());

    BlockIndexedFileInputFormat.setSearchOptions(job, params.getinputFormat(), params.getValueClass(),
            params.getIndex(), (String) null);
    job.waitForCompletion(true);
    return 0;
}
From source file:com.twitter.hraven.etl.JobFileProcessor.java
License:Apache License
/**
 * @param conf
 *          to use to create and run the job
 * @param scan
 *          to be used to scan the raw table.
 * @param totalJobCount
 *          the total number of jobs that need to be run in this batch. Used
 *          in job name.
 * @return The job to be submitted to the cluster.
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private Job getProcessingJob(Configuration conf, Scan scan, int totalJobCount) throws IOException {
    Configuration confClone = new Configuration(conf);

    // Turn off speculative execution.
    // Note: must be BEFORE the job construction with the new mapreduce API.
    confClone.setBoolean("mapred.map.tasks.speculative.execution", false);

    // Set up job
    Job job = new Job(confClone, getJobName(totalJobCount));

    // This is a map-only class, skip reduce step
    job.setNumReduceTasks(0);
    job.setJarByClass(JobFileProcessor.class);
    job.setOutputFormatClass(MultiTableOutputFormat.class);

    TableMapReduceUtil.initTableMapperJob(Constants.HISTORY_RAW_TABLE, scan, JobFileTableMapper.class,
            JobFileTableMapper.getOutputKeyClass(), JobFileTableMapper.getOutputValueClass(), job);
    return job;
}
From source file:com.twitter.hraven.etl.JobFileRawLoader.java
License:Apache License
/**
 * @param myHBaseConf
 *          to use to create and run the job. Should be an HBase
 *          configuration.
 * @param input
 *          path to the processFile
 * @param totalJobCount
 *          the total number of jobs that need to be run in this batch. Used
 *          in job name.
 * @return whether all job confs were loaded properly.
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private boolean runRawLoaderJob(Configuration myHBaseConf, String input, int totalJobCount)
        throws IOException, InterruptedException, ClassNotFoundException {
    boolean success;

    // Turn off speculative execution.
    // Note: must be BEFORE the job construction with the new mapreduce API.
    myHBaseConf.setBoolean("mapred.map.tasks.speculative.execution", false);

    // Set up job
    Job job = new Job(myHBaseConf, getJobName(totalJobCount));
    job.setJarByClass(JobFileRawLoader.class);

    Path inputPath = new Path(input);

    if (hdfs.exists(inputPath)) {
        // Set input
        job.setInputFormatClass(SequenceFileInputFormat.class);
        SequenceFileInputFormat.setInputPaths(job, inputPath);

        job.setMapperClass(JobFileRawLoaderMapper.class);

        // Set the output format to push data into HBase.
        job.setOutputFormatClass(TableOutputFormat.class);
        TableMapReduceUtil.initTableReducerJob(Constants.HISTORY_RAW_TABLE, null, job);

        job.setOutputKeyClass(JobFileRawLoaderMapper.getOutputKeyClass());
        job.setOutputValueClass(JobFileRawLoaderMapper.getOutputValueClass());

        // This is a map-only class, skip reduce step
        job.setNumReduceTasks(0);

        // Run the job
        success = job.waitForCompletion(true);

        if (success) {
            success = hdfs.delete(inputPath, false);
        }
    } else {
        System.err.println("Unable to find processFile: " + inputPath);
        success = false;
    }
    return success;
}
From source file:com.twitter.scalding.parquet.scrooge.TestCorruptScroogeRecords.java
License:Apache License
@Override
public void setupJob(Job job, Path path) throws Exception {
    job.setInputFormatClass(ParquetScroogeInputFormat.class);
    ParquetScroogeInputFormat.setInputPaths(job, path);
    ParquetScroogeInputFormat.setThriftClass(job.getConfiguration(), StructWithUnionV2.class);
    ThriftReadSupport.setRecordConverterClass(job.getConfiguration(), ScroogeRecordConverter.class);

    job.setMapperClass(ReadMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(NullOutputFormat.class);
}
From source file:com.wipro.ats.bdre.datagen.mr.Driver.java
License:Apache License
/**
 * @param args the cli arguments
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = getConf();
    GetGeneralConfig generalConfig = new GetGeneralConfig();
    GeneralConfig gc = generalConfig.byConigGroupAndKey("imconfig", "common.default-fs-name");
    conf.set("fs.defaultFS", gc.getDefaultVal());

    String processId = args[0];
    Path outputDir = new Path(ResolvePath.replaceVars(args[1]));

    Properties dataProps = Config.getDataProperties(processId);
    Properties tableProps = Config.getTableProperties(processId);

    TableUtil tableUtil = new TableUtil();
    Table table = tableUtil.formTableFromConfig(processId);
    FileSystem fs = FileSystem.get(conf);
    LOGGER.info("Default FS =" + conf.get("fs.defaultFS"));

    // set in the conf for mappers to use
    conf.set(Config.SEPARATOR_KEY, tableProps.getProperty("separator"));
    conf.set(Config.PID_KEY, processId);
    conf.setLong(Config.NUM_ROWS_KEY, Long.parseLong(dataProps.getProperty("numRows")));
    conf.setInt(Config.NUM_SPLITS_KEY, Integer.parseInt(dataProps.getProperty("numSplits")));

    Job job = Job.getInstance(conf);
    Path mrOutputPath = new Path(outputDir.toString() + "/MROUT/" + table.getTableName());
    FileOutputFormat.setOutputPath(job, mrOutputPath);

    job.setJobName("Datagen-" + table.getTableName());
    job.setJarByClass(Driver.class);
    job.setMapperClass(RecordGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RangeInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.waitForCompletion(true);

    // merge and create a single file
    Path srcDir = mrOutputPath;
    Path destFile = new Path(outputDir.toString() + "/" + table.getTableName());
    FileUtil.copyMerge(fs, srcDir, fs, destFile, true, conf, "");

    // Return file info oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash("0");
    registerFileInfo.setFileSize(0L);
    registerFileInfo.setPath(destFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));

    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);
    return 0;
}