Usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
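Before the project examples, here is a minimal, self-contained sketch of a driver that calls setInputPaths. It is not taken from any of the projects below; the class name and the /data/* paths are illustrative placeholders. It configures a map-only identity job over two input directories, which is enough to show where setInputPaths fits relative to the other JobConf calls.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

public class SetInputPathsExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputPathsExample.class);
        conf.setJobName("set-input-paths-example");

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(IdentityMapper.class);
        conf.setNumReduceTasks(0); // map-only copy of the inputs

        // setInputPaths replaces any previously configured input paths;
        // use FileInputFormat.addInputPath to append instead.
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));
        FileOutputFormat.setOutputPath(conf, new Path("/data/out"));

        JobClient.runJob(conf);
    }
}

Note that setInputPaths accepts a varargs list of Path objects (there is also an overload taking a comma-separated String of paths), and the call overwrites whatever inputs were set before it.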
From source file: org.apache.mahout.df.mapred.partial.PartialBuilder.java
License: Apache License
@Override
protected void configureJob(JobConf job, int nbTrees, boolean oobEstimate) throws IOException {
    FileInputFormat.setInputPaths(job, getDataPath());
    FileOutputFormat.setOutputPath(job, getOutputPath(job));

    job.setOutputKeyClass(TreeID.class);
    job.setOutputValueClass(MapredOutput.class);

    job.setMapperClass(Step1Mapper.class);
    job.setNumReduceTasks(0); // no reducers

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // if we are in 'local' mode, correct the number of maps
    // or the mappers won't be able to compute the right indexes
    String tracker = job.get("mapred.job.tracker", "local");
    if ("local".equals(tracker)) {
        log.warn("Hadoop running in 'local' mode, only one map task will be launched");
        job.setNumMapTasks(1);
    }
}
From source file: org.apache.mahout.df.mapred.partial.Step0Job.java
License: Apache License
/**
 * Computes the partitions' first ids in Hadoop's order
 *
 * @param conf configuration
 * @return first ids for all the partitions
 * @throws IOException
 */
public Step0Output[] run(Configuration conf) throws IOException {
    JobConf job = new JobConf(conf, Step0Job.class);

    // check the output
    if (outputPath.getFileSystem(job).exists(outputPath)) {
        throw new IOException("Output path already exists : " + outputPath);
    }

    // put the dataset into the DistributedCache
    // use setCacheFiles() to overwrite the first-step cache files
    URI[] files = { datasetPath.toUri() };
    DistributedCache.setCacheFiles(files, job);

    FileInputFormat.setInputPaths(job, dataPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Step0Output.class);

    job.setMapperClass(Step0Mapper.class);
    job.setNumReduceTasks(0); // no reducers

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // run the job
    JobClient.runJob(job);

    return parseOutput(job);
}
From source file: org.apache.mahout.df.mapred.partial.Step0JobTest.java
License: Apache License
public void testStep0Mapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, numAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, numInstances);
    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    JobConf job = new JobConf();
    job.setNumMapTasks(numMaps);

    FileInputFormat.setInputPaths(job, dataPath);

    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, numMaps);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    Step0OutputCollector collector = new Step0OutputCollector(numMaps);
    Reporter reporter = Reporter.NULL;

    for (int p = 0; p < numMaps; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Step0Mapper mapper = new Step0Mapper();
        mapper.configure(p);

        Long firstKey = null;
        int size = 0;

        while (reader.next(key, value)) {
            if (firstKey == null) {
                firstKey = key.get();
            }
            mapper.map(key, value, collector, reporter);
            size++;
        }
        mapper.close();

        // validate the mapper's output
        assertEquals(p, collector.keys[p]);
        assertEquals(firstKey.longValue(), collector.values[p].getFirstId());
        assertEquals(size, collector.values[p].getSize());
    }
}
From source file: org.apache.mahout.df.mapred.partial.Step0JobTest.java
License: Apache License
public void testProcessOutput() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, numAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, numInstances);

    // each instance label is its index in the dataset
    int labelId = Utils.findLabel(descriptor);
    for (int index = 0; index < numInstances; index++) {
        source[index][labelId] = index;
    }

    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    // prepare a data converter
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    DataConverter converter = new DataConverter(dataset);

    JobConf job = new JobConf();
    job.setNumMapTasks(numMaps);

    FileInputFormat.setInputPaths(job, dataPath);

    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, numMaps);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    Reporter reporter = Reporter.NULL;

    int[] keys = new int[numMaps];
    Step0Output[] values = new Step0Output[numMaps];
    int[] expectedIds = new int[numMaps];

    for (int p = 0; p < numMaps; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Long firstKey = null;
        int size = 0;

        while (reader.next(key, value)) {
            if (firstKey == null) {
                firstKey = key.get();
                expectedIds[p] = converter.convert(0, value.toString()).label;
            }
            size++;
        }

        keys[p] = p;
        values[p] = new Step0Output(firstKey, size);
    }

    Step0Output[] partitions = Step0Job.processOutput(keys, values);

    int[] actualIds = Step0Output.extractFirstIds(partitions);

    assertTrue("Expected: " + Arrays.toString(expectedIds) + " But was: " + Arrays.toString(actualIds),
            Arrays.equals(expectedIds, actualIds));
}
From source file: org.apache.mahout.df.mapred.partial.Step2Job.java
License: Apache License
/**
 * Run the second step.
 *
 * @param conf configuration
 * @param keys keys returned by the first step
 * @param trees trees returned by the first step
 * @param callback
 * @throws IOException
 */
public void run(Configuration conf, TreeID[] keys, Node[] trees, PredictionCallback callback) throws IOException {
    if (callback == null) {
        // no need to launch the job
        return;
    }

    int numTrees = keys.length;

    JobConf job = new JobConf(conf, Step2Job.class);

    // check the output
    if (outputPath.getFileSystem(job).exists(outputPath)) {
        throw new IOException("Output path already exists : " + outputPath);
    }

    int[] sizes = Step0Output.extractSizes(partitions);

    InterResults.store(forestPath.getFileSystem(job), forestPath, keys, trees, sizes);

    // needed by the mapper
    Builder.setNbTrees(job, numTrees);

    // put the dataset and the forest into the DistributedCache
    // use setCacheFiles() to overwrite the first-step cache files
    URI[] files = { datasetPath.toUri(), forestPath.toUri() };
    DistributedCache.setCacheFiles(files, job);

    FileInputFormat.setInputPaths(job, dataPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setOutputKeyClass(TreeID.class);
    job.setOutputValueClass(MapredOutput.class);

    job.setMapperClass(Step2Mapper.class);
    job.setNumReduceTasks(0); // no reducers

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // run the job
    JobClient.runJob(job);

    parseOutput(job, callback);
}
From source file: org.apache.oozie.action.hadoop.LauncherMainTester.java
License: Apache License
private static void executeJavaMapReduce(String[] args) throws IOException, InterruptedException {
    JobConf jConf = createSleepMapperReducerJobConf();
    final Path input = new Path(args[1]);
    FileInputFormat.setInputPaths(jConf, input);
    FileOutputFormat.setOutputPath(jConf, new Path(args[2]));
    writeToFile(input, jConf, "dummy\n", "data.txt");
    JobClient jc = new JobClient(jConf);
    System.out.println("Submitting MR job");
    RunningJob job = jc.submitJob(jConf);
    System.out.println("Submitted job " + job.getID().toString());
    writeToFile(input, jConf, job.getID().toString(), JOB_ID_FILE_NAME);
    job.waitForCompletion();
    jc.monitorAndPrintJob(jConf, job);
    if (job.getJobState() != JobStatus.SUCCEEDED) {
        System.err.println(job.getJobState() + " job state instead of " + JobStatus.SUCCEEDED);
        System.exit(-1);
    }
}
From source file: org.apache.oozie.example.SampleOozieActionConfigurator.java
License: Apache License
@Override
public void configure(JobConf actionConf) throws OozieActionConfiguratorException {
    if (actionConf.getUser() == null) {
        throw new OozieActionConfiguratorException("No user set");
    }
    if (actionConf.get("examples.root") == null) {
        throw new OozieActionConfiguratorException("examples.root not set");
    }
    if (actionConf.get("output.dir.name") == null) {
        throw new OozieActionConfiguratorException("output.dir.name not set");
    }

    actionConf.setMapperClass(SampleMapper.class);
    actionConf.setReducerClass(SampleReducer.class);
    actionConf.setNumMapTasks(1);
    FileInputFormat.setInputPaths(actionConf,
            new Path("/user/" + actionConf.getUser() + "/" + actionConf.get("examples.root") + "/input-data/text"));
    FileOutputFormat.setOutputPath(actionConf, new Path("/user/" + actionConf.getUser() + "/"
            + actionConf.get("examples.root") + "/output-data/" + actionConf.get("output.dir.name")));
}
From source file: org.apache.pig.test.utils.datagen.HadoopRunner.java
License: Apache License
public void generate() throws IOException {
    // Configuration processed by ToolRunner
    // Create a JobConf using the processed conf
    JobConf job;
    if (conf != null) { // TODO: conf could be null, check when and why
        job = new JobConf(conf);
    } else {
        job = new JobConf(new Configuration());
    }
    fs = FileSystem.get(job);

    tmpHome = createTempDir(null);

    String config = genMapFiles().toUri().getRawPath();
    // set config properties into job conf
    job.set(COLUMN_CONF_FILE_PATH, config);
    job.set(COLUMN_OUTPUT_SEPARATOR, String.valueOf((int) dgConf.getSeparator()));

    job.setJobName("data-gen");
    job.setNumMapTasks(dgConf.getNumMappers());
    job.setNumReduceTasks(0);
    job.setMapperClass(DataGenMapper.class);
    job.setJarByClass(DataGenMapper.class);

    // if inFile is specified, use it as input
    if (dgConf.getInFile() != null) {
        FileInputFormat.setInputPaths(job, dgConf.getInFile());
        job.set(HAS_USER_INPUT, "true");
    } else {
        job.set(HAS_USER_INPUT, "false");
        Path input = genInputFiles();
        FileInputFormat.setInputPaths(job, input);
    }
    FileOutputFormat.setOutputPath(job, new Path(dgConf.getOutputFile()));

    // Submit the job, then poll for progress until the job is complete
    System.out.println("Submit hadoop job...");
    RunningJob j = JobClient.runJob(job);
    if (!j.isSuccessful()) {
        throw new IOException("Job failed");
    }

    if (fs.exists(tmpHome)) {
        fs.delete(tmpHome, true);
    }
}
From source file: org.apache.sysml.runtime.controlprogram.parfor.DataPartitionerRemoteMR.java
License: Apache License
@Override
protected void partitionMatrix(MatrixObject in, String fnameNew, InputInfo ii, OutputInfo oi,
        long rlen, long clen, int brlen, int bclen)
    throws DMLRuntimeException
{
    String jobname = "ParFor-DPMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job = new JobConf(DataPartitionerRemoteMR.class);
    if (_pfid >= 0) //use in parfor
        job.setJobName(jobname + _pfid);
    else //use for partition instruction
        job.setJobName("Partition-MR");

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try
    {
        //force writing to disk (typically not required since partitioning only applied if dataset exceeds CP size)
        in.exportData(); //written to disk iff dirty

        Path path = new Path(in.getFileName());

        /////
        //configure the MR job
        MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, ii, oi, _format, _n, fnameNew, _keepIndexes);

        //set mappers, reducers, combiners
        job.setMapperClass(DataPartitionerRemoteMapper.class);
        job.setReducerClass(DataPartitionerRemoteReducer.class);

        if (oi == OutputInfo.TextCellOutputInfo) {
            //binary cell intermediates for reduced IO
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableCell.class);
        }
        else if (oi == OutputInfo.BinaryCellOutputInfo) {
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableCell.class);
        }
        else if (oi == OutputInfo.BinaryBlockOutputInfo) {
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableBlock.class);

            //check Alignment
            if ((_format == PDataPartitionFormat.ROW_BLOCK_WISE_N && rlen > _n && _n % brlen != 0)
                || (_format == PDataPartitionFormat.COLUMN_BLOCK_WISE_N && clen > _n && _n % bclen != 0)) {
                throw new DMLRuntimeException("Data partitioning format " + _format + " requires aligned blocks.");
            }
        }

        //set input format
        job.setInputFormat(ii.inputFormatClass);

        //set the input path and output path
        FileInputFormat.setInputPaths(job, path);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        //FileOutputFormat.setOutputPath(job, pathNew);
        job.setOutputFormat(NullOutputFormat.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers
        //job.setNumMapTasks( _numMappers ); //use default num mappers
        long reducerGroups = -1;
        switch (_format) {
            case ROW_WISE:            reducerGroups = rlen; break;
            case COLUMN_WISE:         reducerGroups = clen; break;
            case ROW_BLOCK_WISE:      reducerGroups = (rlen / brlen) + ((rlen % brlen == 0) ? 0 : 1); break;
            case COLUMN_BLOCK_WISE:   reducerGroups = (clen / bclen) + ((clen % bclen == 0) ? 0 : 1); break;
            case ROW_BLOCK_WISE_N:    reducerGroups = (rlen / _n) + ((rlen % _n == 0) ? 0 : 1); break;
            case COLUMN_BLOCK_WISE_N: reducerGroups = (clen / _n) + ((clen % _n == 0) ? 0 : 1); break;
            default: //do nothing
        }
        job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups));

        //disable automatic tasks timeouts and speculative task exec
        job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (_jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower)
        //job.set(MRConfigurationNames.MR_MAP_OUTPUT_COMPRESS, "true");
        //job.set(MRConfigurationNames.MR_MAP_OUTPUT_COMPRESS_CODEC, "org.apache.hadoop.io.compress.GzipCodec");

        //set the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, _replication);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set up custom map/reduce configurations
        MRJobConfiguration.setupCustomMRConfigurations(job, config);

        //set the max number of retries per map task
        //disabled job-level configuration to respect cluster configuration
        //note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, _max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        //execute the MR job
        JobClient.runJob(job);

        //maintain dml script counters
        Statistics.incrementNoOfExecutedMRJobs();
    }
    catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }

    if (DMLScript.STATISTICS && _pfid >= 0) {
        long t1 = System.nanoTime(); //only for parfor
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
}
From source file: org.apache.sysml.runtime.controlprogram.parfor.RemoteDPParForMR.java
License: Apache License
public static RemoteParForJobReturn runJob(long pfid, String itervar, String matrixvar, String program,
        String resultFile, MatrixObject input, PartitionFormat dpf, OutputInfo oi, boolean tSparseCol, //config params
        boolean enableCPCaching, int numReducers, int replication) //opt params
    throws DMLRuntimeException
{
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-DPEMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job = new JobConf(RemoteDPParForMR.class);
    job.setJobName(jobname + pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try
    {
        /////
        //configure the MR job

        //set arbitrary CP program blocks that will perform in the reducers
        MRJobConfiguration.setProgramBlocks(job, program);

        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

        //setup input matrix
        Path path = new Path(input.getFileName());
        long rlen = input.getNumRows();
        long clen = input.getNumColumns();
        int brlen = (int) input.getNumRowsPerBlock();
        int bclen = (int) input.getNumColumnsPerBlock();
        MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, InputInfo.BinaryBlockInputInfo,
                oi, dpf._dpf, dpf._N, input.getFileName(), itervar, matrixvar, tSparseCol);
        job.setInputFormat(InputInfo.BinaryBlockInputInfo.inputFormatClass);
        FileInputFormat.setInputPaths(job, path);

        //set mapper and reducers classes
        job.setMapperClass(DataPartitionerRemoteMapper.class);
        job.setReducerClass(RemoteDPParWorkerReducer.class);

        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));

        //set the output key, value schema

        //parfor partitioning outputs (intermediates)
        job.setMapOutputKeyClass(LongWritable.class);
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            job.setMapOutputValueClass(PairWritableBlock.class);
        else if (oi == OutputInfo.BinaryCellOutputInfo)
            job.setMapOutputValueClass(PairWritableCell.class);
        else
            throw new DMLRuntimeException("Unsupported intermediate output info: " + oi);

        //parfor exec output
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers
        job.setNumReduceTasks(numReducers);

        //disable automatic tasks timeouts and speculative task exec
        job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set up custom map/reduce configurations
        MRJobConfiguration.setupCustomMRConfigurations(job, config);

        //disable JVM reuse
        job.setNumTasksToExecutePerJvm(1); //-1 for unlimited

        //set the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

        //set the max number of retries per map task
        //note: currently disabled to use cluster config
        //job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        //execute the MR job
        RunningJob runjob = JobClient.runJob(job);

        //process different counters
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics.incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics.incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics.incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics.incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics.incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics.incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }

        //read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);

        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    }
    catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
    finally {
        //remove created files
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        }
        catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
}