List of usage examples for org.apache.hadoop.conf Configuration setInt
public void setInt(String name, int value)
Sets the value of the name property to an int.
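Before the project-specific examples, here is a minimal, self-contained sketch of the call paired with Configuration#getInt for reading the value back. The property name example.max.retries is illustrative only and does not come from the examples below.

import org.apache.hadoop.conf.Configuration;

public class SetIntExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Store an int under a property name (the key here is illustrative).
        conf.setInt("example.max.retries", 5);
        // Read it back; the second argument is the default returned when the key is absent.
        int retries = conf.getInt("example.max.retries", 1);
        System.out.println("retries = " + retries);
    }
}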
From source file:com.twitter.algebra.nmf.ErrDMJ.java
License:Apache License
public Job run(Configuration conf, Path xPath, Path matrixAInputPath, Path ytPath, Path outPath, int aRows,
        int ytRows, int ytCols) throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.set(MAPDIRMATRIXX, xPath.toString());
    conf.set(MAPDIRMATRIXYT, ytPath.toString());
    conf.setInt(YTROWS, ytRows);
    conf.setInt(YTCOLS, ytCols);
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixAInputPath, "err");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ErrDMJ.class);
    job.setJobName(ErrDMJ.class.getSimpleName() + "-" + outPath.getName());

    matrixAInputPath = fs.makeQualified(matrixAInputPath);
    MultipleInputs.addInputPath(job, matrixAInputPath, SequenceFileInputFormat.class);

    outPath = fs.makeQualified(outPath);
    FileOutputFormat.setOutputPath(job, outPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = 1;
    job.setNumReduceTasks(numReducers);
    job.setCombinerClass(SumVectorsReducer.class);
    job.setReducerClass(SumVectorsReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
    return job;
}
From source file:com.twitter.algebra.nmf.SampleColsJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, int cols, Path matrixOutputPath, float sampleRate)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.setFloat(SAMPLERATE, sampleRate);
    conf.setInt(COLS, cols);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "samplecol");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(SampleColsJob.class);
    job.setJobName(SampleColsJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);

    job.setNumReduceTasks(0);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}
From source file:com.twitter.algebra.nmf.XtXJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, int numCols, String xmPath, Path matrixOutputPath)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    conf.setInt(MATRIXCOLS, numCols);
    // conf.set(XMPATH, xmPath);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, new Path[] { matrixInputPath }, "xtx");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJobName("XtXJob-" + matrixOutputPath.getName());
    job.setJarByClass(XtXJob.class);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);
    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "xtx");
    job.setNumReduceTasks(numReducers);
    // ensures total order (when used with {@link MatrixOutputFormat})
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numCols);

    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    job.waitForCompletion(true);
}
From source file:com.twitter.algebra.TransposeJob.java
License:Apache License
/**
 * Perform transpose of A, where A refers to the path that contains a matrix in
 * {@link SequenceFileInputFormat}.
 *
 * @param conf
 *          the initial configuration
 * @param matrixInputPath
 *          the path to the input files that we process
 * @param matrixOutputPath
 *          the path of the resulting transpose matrix
 * @param numInputRows
 *          rows
 * @param numInputCols
 *          cols
 * @return the running job
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int numInputRows,
        int numInputCols) throws IOException, InterruptedException, ClassNotFoundException {
    conf.setInt(NUM_ORIG_ROWS_KEY, numInputRows);
    conf.setInt(RowPartitioner.TOTAL_KEYS, numInputCols);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "transpose");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(TransposeJob.class);
    job.setJobName(TransposeJob.class.getSimpleName());

    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(TransposeMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "transpose");
    job.setNumReduceTasks(numReducers);
    // job.setPartitionerClass(RowPartitioner.IntRowPartitioner.class);
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numInputCols);

    job.setCombinerClass(MergeVectorsCombiner.class);
    job.setReducerClass(MergeVectorsReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}
From source file:com.twitter.elephanttwin.lucene.indexing.AbstractLuceneIndexingJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    LOG = Logger.getLogger(this.getClass());
    params = newIndexConfig();

    LOG.info("Starting up indexer...");
    LOG.info(" - input: " + Joiner.on(" ").join(IndexConfig.input.get()));
    LOG.info(" - index: " + IndexConfig.index);
    LOG.info(" - number of shards: " + IndexConfig.numPartitions.get());

    Configuration conf = getConf();
    conf.set(AbstractLuceneIndexingReducer.HDFS_INDEX_LOCATION, IndexConfig.index.get());
    conf.set(AbstractLuceneIndexingReducer.ANALYZER, IndexConfig.analyzer.get());
    conf.set(AbstractLuceneIndexingReducer.SIMILARITY, IndexConfig.similarity.get());
    conf.setInt(AbstractSamplingIndexingMapper.SAMPLE_PERCENTAGE, IndexConfig.samplePercentage.get());

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    Job job = new Job(conf, getJobName(params));
    // Job's constructor copies conf, we need a reference to the one job is actually using
    conf = job.getConfiguration();

    job.setJarByClass(this.getClass());
    job.setNumReduceTasks(IndexConfig.numPartitions.get());

    for (String s : IndexConfig.input.get()) {
        Path spath = new Path(s);
        FileSystem fs = spath.getFileSystem(getConf());
        List<FileStatus> stats = Lists.newArrayList();
        addInputPathRecursively(stats, fs, spath, HdfsUtils.HIDDEN_FILE_FILTER);
        for (FileStatus foundStat : stats) {
            FileInputFormat.addInputPath(job, foundStat.getPath());
        }
    }

    FileOutputFormat.setOutputPath(job, new Path(IndexConfig.index.get()));

    setupJob(job);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(IndexConfig.index.get());
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    LOG.info("Job " + getJobName(params) + " started.");
    // TODO Jimmy has a parameter that controls whether we wait in Thud but not in ES.
    // when would we not want to wait?
    job.waitForCompletion(true);
    LOG.info("Job " + getJobName(params) + " Finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    if (job.isSuccessful()) {
        writeIndexDescriptors(getIndexDescriptor());
    }
    return job.isSuccessful() ? 0 : 1;
}
From source file:com.twitter.hraven.etl.JobFileProcessor.java
License:Apache License
public int run(String[] args) throws Exception {

    Configuration hbaseConf = HBaseConfiguration.create(getConf());

    // Grab input args and allow for -Dxyz style arguments
    String[] otherArgs = new GenericOptionsParser(hbaseConf, args).getRemainingArgs();

    // Grab the arguments we're looking for.
    CommandLine commandLine = parseArgs(otherArgs);

    // Grab the cluster argument
    String cluster = commandLine.getOptionValue("c");
    LOG.info("cluster=" + cluster);

    // Number of parallel threads to use
    int threadCount = 1;
    if (commandLine.hasOption("t")) {
        try {
            threadCount = Integer.parseInt(commandLine.getOptionValue("t"));
        } catch (NumberFormatException nfe) {
            throw new IllegalArgumentException(
                    "Provided thread-count argument (-t) is not a number: " + commandLine.getOptionValue("t"),
                    nfe);
        }
        if (threadCount < 1) {
            throw new IllegalArgumentException(
                    "Cannot run fewer than 1 thread. Provided thread-count argument (-t): " + threadCount);
        }
    }
    LOG.info("threadCount=" + threadCount);

    boolean reprocess = commandLine.hasOption("r");
    LOG.info("reprocess=" + reprocess);

    // Grab the batch-size argument
    int batchSize;
    if (commandLine.hasOption("b")) {
        try {
            batchSize = Integer.parseInt(commandLine.getOptionValue("b"));
        } catch (NumberFormatException nfe) {
            throw new IllegalArgumentException(
                    "batch size option -b is not a valid number: " + commandLine.getOptionValue("b"), nfe);
        }
        // Additional check
        if (batchSize < 1) {
            throw new IllegalArgumentException(
                    "Cannot process files in batches smaller than 1. Specified batch size option -b is: "
                            + commandLine.getOptionValue("b"));
        }
    } else {
        batchSize = DEFAULT_BATCH_SIZE;
    }

    // Grab the costfile argument
    String costFilePath = commandLine.getOptionValue("zf");
    LOG.info("cost properties file on hdfs=" + costFilePath);
    if (costFilePath == null)
        costFilePath = Constants.COST_PROPERTIES_HDFS_DIR;
    Path hdfsPath = new Path(costFilePath + Constants.COST_PROPERTIES_FILENAME);
    // add to distributed cache
    DistributedCache.addCacheFile(hdfsPath.toUri(), hbaseConf);

    // Grab the machine type argument
    String machineType = commandLine.getOptionValue("m");
    // set it as part of conf so that the hRaven job can access it in the mapper
    hbaseConf.set(Constants.HRAVEN_MACHINE_TYPE, machineType);

    // check if re-aggregate option is forced on
    // if yes, we need to aggregate for this job inspite of
    // job having aggregation done status in raw table
    boolean reAggregateFlagValue = false;
    if (commandLine.hasOption("ra")) {
        String reaggregateFlag = commandLine.getOptionValue("ra");
        // set it as part of conf so that the
        // hRaven jobProcessor can access it in the mapper
        if (StringUtils.isNotBlank(reaggregateFlag)) {
            LOG.info(" reaggregateFlag is: " + reaggregateFlag);
            if (StringUtils.equalsIgnoreCase(reaggregateFlag, Boolean.TRUE.toString())) {
                reAggregateFlagValue = true;
            }
        }
    }
    LOG.info(AggregationConstants.RE_AGGREGATION_FLAG_NAME + "=" + reAggregateFlagValue);
    hbaseConf.setBoolean(AggregationConstants.RE_AGGREGATION_FLAG_NAME, reAggregateFlagValue);

    // set aggregation to off by default
    boolean aggFlagValue = false;
    if (commandLine.hasOption("a")) {
        String aggregateFlag = commandLine.getOptionValue("a");
        // set it as part of conf so that the
        // hRaven jobProcessor can access it in the mapper
        if (StringUtils.isNotBlank(aggregateFlag)) {
            LOG.info(" aggregateFlag is: " + aggregateFlag);
            if (StringUtils.equalsIgnoreCase(aggregateFlag, Boolean.TRUE.toString())) {
                aggFlagValue = true;
            }
        }
    }
    if (reprocess) {
        // turn off aggregation if reprocessing is true
        // we don't want to inadvertently aggregate again while re-processing
        // re-aggregation needs to be a conscious setting
        aggFlagValue = false;
    }
    LOG.info(AggregationConstants.AGGREGATION_FLAG_NAME + "=" + aggFlagValue);
    hbaseConf.setBoolean(AggregationConstants.AGGREGATION_FLAG_NAME, aggFlagValue);

    String processFileSubstring = null;
    if (commandLine.hasOption("p")) {
        processFileSubstring = commandLine.getOptionValue("p");
    }
    LOG.info("processFileSubstring=" + processFileSubstring);

    // hbase.client.keyvalue.maxsize somehow defaults to 10 MB and we have
    // history files exceeding that. Disable limit.
    hbaseConf.setInt("hbase.client.keyvalue.maxsize", 0);

    // Shove this into the jobConf so that we can get it out on the task side.
    hbaseConf.setStrings(Constants.CLUSTER_JOB_CONF_KEY, cluster);

    boolean success = false;
    if (reprocess) {
        success = reProcessRecords(hbaseConf, cluster, batchSize, threadCount);
    } else {
        success = processRecords(hbaseConf, cluster, batchSize, threadCount, processFileSubstring);
    }

    // Return the status
    return success ? 0 : 1;
}
From source file:com.twitter.hraven.etl.JobFileRawLoader.java
License:Apache License
public int run(String[] args)
        throws ParseException, IOException, ClassNotFoundException, InterruptedException {

    Configuration myHBaseConf = HBaseConfiguration.create(getConf());
    hdfs = FileSystem.get(myHBaseConf);

    // Grab input args and allow for -Dxyz style arguments
    String[] otherArgs = new GenericOptionsParser(myHBaseConf, args).getRemainingArgs();

    // Grab the arguments we're looking for.
    CommandLine commandLine = parseArgs(otherArgs);

    String input = null;
    boolean inputSpecified = commandLine.hasOption("i");
    if (inputSpecified) {
        // Grab the input path argument
        input = commandLine.getOptionValue("i");
        LOG.info("input=" + input);
    } else {
        LOG.info("Processing input from HBase ProcessRecords");
    }

    // Grab the cluster argument
    String cluster = commandLine.getOptionValue("c");
    LOG.info("cluster=" + cluster);

    String processFileSubstring = null;
    if (commandLine.hasOption("p")) {
        processFileSubstring = commandLine.getOptionValue("p");
    }
    LOG.info("processFileSubstring=" + processFileSubstring);

    boolean forceReprocess = commandLine.hasOption("f");
    LOG.info("forceReprocess: " + forceReprocess);

    // hbase.client.keyvalue.maxsize somehow defaults to 10 MB and we have
    // history files exceeding that. Disable limit.
    myHBaseConf.setInt("hbase.client.keyvalue.maxsize", 0);

    // Shove this into the jobConf so that we can get it out on the task side.
    myHBaseConf.setStrings(Constants.CLUSTER_JOB_CONF_KEY, cluster);

    boolean success = processRecordsFromHBase(myHBaseConf, cluster, processFileSubstring, forceReprocess);

    // Return the status
    return success ? 0 : 1;
}
From source file:com.twitter.hraven.hadoopJobMonitor.AppStatusCheckerTest.java
License:Apache License
public boolean testTask(TaskType taskType, String confParamName, long durationMin, final int MAX_RUN,
        float progress, boolean enforce, boolean dryRun, TIPStatus status, boolean wellBahaved, boolean killed)
        throws Exception {
    setTaskAttemptXML(durationMin * MIN, progress);

    TaskReport taskReport = mock(TaskReport.class);
    when(taskReport.getCurrentStatus()).thenReturn(status);
    Collection<TaskAttemptID> attempts = new ArrayList<TaskAttemptID>();
    attempts.add(taskAttemptId);
    when(taskReport.getRunningTaskAttemptIds()).thenReturn(attempts);
    when(taskReport.getTaskID()).thenReturn(org.apache.hadoop.mapred.TaskID.downgrade(taskId));
    when(taskReport.getProgress()).thenReturn(progress);

    vConf.setBoolean(HadoopJobMonitorConfiguration.DRY_RUN, dryRun);
    Configuration remoteAppConf = new Configuration();
    remoteAppConf.setInt(confParamName, MAX_RUN);
    remoteAppConf.setBoolean(HadoopJobMonitorConfiguration.enforced(confParamName), enforce);
    when(taskReport.getStartTime()).thenReturn(now - durationMin * MIN);
    AppConfiguraiton appConf = new AppConfiguraiton(remoteAppConf, vConf);
    AppConfCache.getInstance().put(appId, appConf);
    appStatusChecker.init();
    appStatusChecker.loadClientService();

    boolean res = appStatusChecker.checkTask(taskType, taskReport, now);

    if (wellBahaved)
        assertEquals("Well-bahved task does not pass the check", wellBahaved, res);
    else
        assertEquals("Not Well-bahved task passes the check", wellBahaved, res);

    if (killed) {
        killCounter++;
        verify(clientService, times(killCounter)).killTask(any(TaskAttemptID.class), Mockito.anyBoolean());
    } else
        verify(clientService, times(killCounter)).killTask(any(TaskAttemptID.class), Mockito.anyBoolean());

    return res;
}
From source file:com.twitter.hraven.hadoopJobMonitor.AppStatusCheckerTest.java
License:Apache License
@Test
public void testUnsetEnforce() throws IOException, ConfigurationAccessException {
    Configuration remoteAppConf = new Configuration();
    remoteAppConf.setInt(HadoopJobMonitorConfiguration.JOB_MAX_LEN_MIN, 10);
    // remoteAppConf.setBoolean(HadoopJobMonitorConfiguration.enforced(HadoopJobMonitorConfiguration.JOB_MAX_LEN_MIN), true);
    when(appReport.getStartTime()).thenReturn(now - 15 * MIN);
    AppConfiguraiton appConf = new AppConfiguraiton(remoteAppConf, vConf);
    AppConfCache.getInstance().put(appId, appConf);
    appStatusChecker.init();
    boolean res = appStatusChecker.checkApp();
    Assert.assertTrue("fails job duration check even though enforce is not set", res);
}
From source file:com.twitter.hraven.hadoopJobMonitor.AppStatusCheckerTest.java
License:Apache License
@Test
public void testLongJobDryRun() throws IOException, ConfigurationAccessException, YarnException {
    Configuration remoteAppConf = new Configuration();
    remoteAppConf.setInt(HadoopJobMonitorConfiguration.JOB_MAX_LEN_MIN, 10);
    remoteAppConf.setBoolean(
            HadoopJobMonitorConfiguration.enforced(HadoopJobMonitorConfiguration.JOB_MAX_LEN_MIN), true);
    when(appReport.getStartTime()).thenReturn(now - 15 * MIN);
    AppConfiguraiton appConf = new AppConfiguraiton(remoteAppConf, vConf);
    AppConfCache.getInstance().put(appId, appConf);
    appStatusChecker.init();
    boolean res = appStatusChecker.checkApp();
    Assert.assertFalse("does not fail job duration check even though enforce is set", res);
    verify(rm, times(0)).killApplication(appId);
}