List of usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
From source file:org.apache.sysml.runtime.controlprogram.parfor.RemoteParForMR.java
License:Apache License
public static RemoteParForJobReturn runJob(long pfid, String program, String taskFile, String resultFile, MatrixObject colocatedDPMatrixObj, //inputs boolean enableCPCaching, int numMappers, int replication, int max_retry, long minMem, boolean jvmReuse) //opt params throws DMLRuntimeException { RemoteParForJobReturn ret = null;/*from w w w .j av a 2s . co m*/ String jobname = "ParFor-EMR"; long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; JobConf job; job = new JobConf(RemoteParForMR.class); job.setJobName(jobname + pfid); //maintain dml script counters Statistics.incrementNoOfCompiledMRJobs(); try { ///// //configure the MR job //set arbitrary CP program blocks that will perform in the mapper MRJobConfiguration.setProgramBlocks(job, program); //enable/disable caching MRJobConfiguration.setParforCachingConfig(job, enableCPCaching); //set mappers, reducers, combiners job.setMapperClass(RemoteParWorkerMapper.class); //map-only //set input format (one split per row, NLineInputFormat default N=1) if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) { job.setInputFormat(RemoteParForColocatedNLineInputFormat.class); MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat()); MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics(); MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock()); MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock()); MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName()); } else //default case { job.setInputFormat(NLineInputFormat.class); } //set the input path and output path FileInputFormat.setInputPaths(job, new Path(taskFile)); //set output format job.setOutputFormat(SequenceFileOutputFormat.class); //set output path MapReduceTool.deleteFileIfExistOnHDFS(resultFile); FileOutputFormat.setOutputPath(job, new Path(resultFile)); //set the output key, value schema 
job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(Text.class); ////// //set optimization parameters //set the number of mappers and reducers job.setNumMapTasks(numMappers); //numMappers job.setNumReduceTasks(0); //job.setInt("mapred.map.tasks.maximum", 1); //system property //job.setInt("mapred.tasktracker.tasks.maximum",1); //system property //job.setInt("mapred.jobtracker.maxtasks.per.job",1); //system property //set jvm memory size (if require) String memKey = MRConfigurationNames.MR_CHILD_JAVA_OPTS; if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) { InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem); LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M."); } //disable automatic tasks timeouts and speculative task exec job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0); job.setMapSpeculativeExecution(false); //set up map/reduce memory configurations (if in AM context) DMLConfig config = ConfigurationManager.getDMLConfig(); DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config); //set up custom map/reduce configurations MRJobConfiguration.setupCustomMRConfigurations(job, config); //enables the reuse of JVMs (multiple tasks per MR task) if (jvmReuse) job.setNumTasksToExecutePerJvm(-1); //unlimited //set sort io buffer (reduce unnecessary large io buffer, guaranteed memory consumption) job.setInt(MRConfigurationNames.MR_TASK_IO_SORT_MB, 8); //8MB //set the replication factor for the results job.setInt(MRConfigurationNames.DFS_REPLICATION, replication); //set the max number of retries per map task // disabled job-level configuration to respect cluster configuration // note: this refers to hadoop2, hence it never had effect on mr1 //job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, max_retry); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); ///// // execute the MR job 
RunningJob runjob = JobClient.runJob(job); // Process different counters Statistics.incrementNoOfExecutedMRJobs(); Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME); int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString()); int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString()); if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) { Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString())); Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString())); Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString())); Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString()); CacheStatistics .incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString())); CacheStatistics.incrementFSBuffHits( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString())); CacheStatistics .incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString())); CacheStatistics.incrementHDFSHits( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString())); CacheStatistics.incrementFSBuffWrites( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString())); CacheStatistics.incrementFSWrites( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString())); CacheStatistics.incrementHDFSWrites( (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString())); CacheStatistics .incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString())); CacheStatistics .incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString())); CacheStatistics .incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString())); CacheStatistics .incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString())); } // read 
all files of result variables and prepare for return LocalVariableMap[] results = readResultFile(job, resultFile); ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results); } catch (Exception ex) { throw new DMLRuntimeException(ex); } finally { // remove created files try { MapReduceTool.deleteFileIfExistOnHDFS(new Path(taskFile), job); MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job); } catch (IOException ex) { throw new DMLRuntimeException(ex); } } if (DMLScript.STATISTICS) { long t1 = System.nanoTime(); Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0); } return ret; }
From source file:org.apache.sysml.runtime.controlprogram.parfor.ResultMergeRemoteMR.java
License:Apache License
@SuppressWarnings({ "unused", "deprecation" }) protected void executeMerge(String fname, String fnameNew, String[] srcFnames, InputInfo ii, OutputInfo oi, long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException { String jobname = "ParFor-RMMR"; long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; JobConf job = new JobConf(ResultMergeRemoteMR.class); job.setJobName(jobname + _pfid);/*www .j a v a 2s. co m*/ //maintain dml script counters Statistics.incrementNoOfCompiledMRJobs(); //warning for textcell/binarycell without compare boolean withCompare = (fname != null); if ((oi == OutputInfo.TextCellOutputInfo || oi == OutputInfo.BinaryCellOutputInfo) && !withCompare && ResultMergeLocalFile.ALLOW_COPY_CELLFILES) LOG.warn("Result merge for " + OutputInfo.outputInfoToString(oi) + " without compare can be realized more efficiently with LOCAL_FILE than REMOTE_MR."); try { Path pathCompare = null; Path pathNew = new Path(fnameNew); ///// //configure the MR job if (withCompare) { FileSystem fs = IOUtilFunctions.getFileSystem(pathNew, job); pathCompare = new Path(fname).makeQualified(fs); MRJobConfiguration.setResultMergeInfo(job, pathCompare.toString(), ii, LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, brlen, bclen); } else MRJobConfiguration.setResultMergeInfo(job, "null", ii, LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, bclen, bclen); //set mappers, reducers, combiners job.setMapperClass(ResultMergeRemoteMapper.class); job.setReducerClass(ResultMergeRemoteReducer.class); if (oi == OutputInfo.TextCellOutputInfo) { job.setMapOutputKeyClass(MatrixIndexes.class); job.setMapOutputValueClass(TaggedMatrixCell.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); } else if (oi == OutputInfo.BinaryCellOutputInfo) { job.setMapOutputKeyClass(MatrixIndexes.class); job.setMapOutputValueClass(TaggedMatrixCell.class); job.setOutputKeyClass(MatrixIndexes.class); 
job.setOutputValueClass(MatrixCell.class); } else if (oi == OutputInfo.BinaryBlockOutputInfo) { //setup partitioning, grouping, sorting for composite key (old API) job.setPartitionerClass(ResultMergeRemotePartitioning.class); //partitioning job.setOutputValueGroupingComparator(ResultMergeRemoteGrouping.class); //grouping job.setOutputKeyComparatorClass(ResultMergeRemoteSorting.class); //sorting job.setMapOutputKeyClass(ResultMergeTaggedMatrixIndexes.class); job.setMapOutputValueClass(TaggedMatrixBlock.class); job.setOutputKeyClass(MatrixIndexes.class); job.setOutputValueClass(MatrixBlock.class); } //set input format job.setInputFormat(ii.inputFormatClass); //set the input path Path[] paths = null; if (withCompare) { paths = new Path[srcFnames.length + 1]; paths[0] = pathCompare; for (int i = 1; i < paths.length; i++) paths[i] = new Path(srcFnames[i - 1]); } else { paths = new Path[srcFnames.length]; for (int i = 0; i < paths.length; i++) paths[i] = new Path(srcFnames[i]); } FileInputFormat.setInputPaths(job, paths); //set output format job.setOutputFormat(oi.outputFormatClass); //set output path MapReduceTool.deleteFileIfExistOnHDFS(fnameNew); FileOutputFormat.setOutputPath(job, pathNew); ////// //set optimization parameters //set the number of mappers and reducers //job.setNumMapTasks( _numMappers ); //use default num mappers long reducerGroups = _numReducers; if (oi == OutputInfo.BinaryBlockOutputInfo) reducerGroups = Math.max(rlen / brlen, 1) * Math.max(clen / bclen, 1); else //textcell/binarycell reducerGroups = Math.max((rlen * clen) / StagingFileUtils.CELL_BUFFER_SIZE, 1); job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups)); //disable automatic tasks timeouts and speculative task exec job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0); job.setMapSpeculativeExecution(false); //set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) 
MRJobConfiguration.addBinaryBlockSerializationFramework(job); //set up custom map/reduce configurations DMLConfig config = ConfigurationManager.getDMLConfig(); MRJobConfiguration.setupCustomMRConfigurations(job, config); //enables the reuse of JVMs (multiple tasks per MR task) if (_jvmReuse) job.setNumTasksToExecutePerJvm(-1); //unlimited //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower) //job.set(MRConfigurationNames.MR_MAP_OUTPUT_COMPRESS, "true"); //job.set(MRConfigurationNames.MR_MAP_OUTPUT_COMPRESS_CODEC, "org.apache.hadoop.io.compress.GzipCodec"); //set the replication factor for the results job.setInt(MRConfigurationNames.DFS_REPLICATION, _replication); //set the max number of retries per map task // disabled job-level configuration to respect cluster configuration // note: this refers to hadoop2, hence it never had effect on mr1 //job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, _max_retry); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); ///// // execute the MR job JobClient.runJob(job); //maintain dml script counters Statistics.incrementNoOfExecutedMRJobs(); } catch (Exception ex) { throw new DMLRuntimeException(ex); } if (DMLScript.STATISTICS) { long t1 = System.nanoTime(); Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0); } }
From source file:org.apache.sysml.runtime.controlprogram.parfor.ResultMergeRemoteSpark.java
License:Apache License
@SuppressWarnings("unchecked") protected RDDObject executeMerge(MatrixObject compare, MatrixObject[] inputs, long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException { String jobname = "ParFor-RMSP"; long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; SparkExecutionContext sec = (SparkExecutionContext) _ec; boolean withCompare = (compare != null); RDDObject ret = null;//www. jav a 2s. c om //determine degree of parallelism int numRed = (int) determineNumReducers(rlen, clen, brlen, bclen, _numReducers); //sanity check for empty src files if (inputs == null || inputs.length == 0) throw new DMLRuntimeException("Execute merge should never be called with no inputs."); try { //note: initial implementation via union over all result rdds discarded due to //stack overflow errors with many parfor tasks, and thus many rdds //Step 1: construct input rdd from all result files of parfor workers //a) construct job conf with all files InputInfo ii = InputInfo.BinaryBlockInputInfo; JobConf job = new JobConf(ResultMergeRemoteMR.class); job.setJobName(jobname); job.setInputFormat(ii.inputFormatClass); Path[] paths = new Path[inputs.length]; for (int i = 0; i < paths.length; i++) { //ensure input exists on hdfs (e.g., if in-memory or RDD) inputs[i].exportData(); paths[i] = new Path(inputs[i].getFileName()); //update rdd handle to allow lazy evaluation by guarding //against cleanup of temporary result files setRDDHandleForMerge(inputs[i], sec); } FileInputFormat.setInputPaths(job, paths); //b) create rdd from input files w/ deep copy of keys and blocks JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = sec.getSparkContext() .hadoopRDD(job, ii.inputFormatClass, ii.inputKeyClass, ii.inputValueClass) .mapPartitionsToPair(new CopyBlockPairFunction(true), true); //Step 2a: merge with compare JavaPairRDD<MatrixIndexes, MatrixBlock> out = null; if (withCompare) { JavaPairRDD<MatrixIndexes, MatrixBlock> compareRdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sec 
.getRDDHandleForMatrixObject(compare, InputInfo.BinaryBlockInputInfo); //merge values which differ from compare values ResultMergeRemoteSparkWCompare cfun = new ResultMergeRemoteSparkWCompare(); out = rdd.groupByKey(numRed) //group all result blocks per key .join(compareRdd) //join compare block and result blocks .mapToPair(cfun); //merge result blocks w/ compare } //Step 2b: merge without compare else { //direct merge in any order (disjointness guaranteed) out = RDDAggregateUtils.mergeByKey(rdd, false); } //Step 3: create output rdd handle w/ lineage ret = new RDDObject(out); for (int i = 0; i < paths.length; i++) ret.addLineageChild(inputs[i].getRDDHandle()); if (withCompare) ret.addLineageChild(compare.getRDDHandle()); } catch (Exception ex) { throw new DMLRuntimeException(ex); } //maintain statistics Statistics.incrementNoOfCompiledSPInst(); Statistics.incrementNoOfExecutedSPInst(); if (DMLScript.STATISTICS) { Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0); } return ret; }
From source file:org.apache.sysml.runtime.matrix.CleanupMR.java
License:Apache License
public static boolean runJob(DMLConfig conf) throws Exception { boolean ret = false; try {//from w ww . ja v a 2s . c o m JobConf job; job = new JobConf(CleanupMR.class); job.setJobName("Cleanup-MR"); //set up SystemML local tmp dir String dir = conf.getTextValue(DMLConfig.LOCAL_TMP_DIR); MRJobConfiguration.setSystemMLLocalTmpDir(job, dir); //set mappers, reducers int numNodes = InfrastructureAnalyzer.getRemoteParallelNodes(); job.setMapperClass(CleanupMapper.class); //map-only job.setNumMapTasks(numNodes); //numMappers job.setNumReduceTasks(0); //set input/output format, input path String inFileName = conf.getTextValue(DMLConfig.SCRATCH_SPACE) + "/cleanup_tasks"; job.setInputFormat(NLineInputFormat.class); job.setOutputFormat(NullOutputFormat.class); Path path = new Path(inFileName); FileInputFormat.setInputPaths(job, path); writeCleanupTasksToFile(path, numNodes); //disable automatic tasks timeouts and speculative task exec job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0); job.setMapSpeculativeExecution(false); ///// // execute the MR job RunningJob runjob = JobClient.runJob(job); ret = runjob.isSuccessful(); } catch (Exception ex) { //don't raise an exception, just gracefully an error message. LOG.error("Failed to run cleanup MR job. ", ex); } return ret; }
From source file:org.apache.sysml.runtime.matrix.data.hadoopfix.DelegatingInputFormat.java
License:Apache License
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException { JobConf confCopy = new JobConf(conf); List<InputSplit> splits = new ArrayList<>(); Map<Path, InputFormat> formatMap = MultipleInputs.getInputFormatMap(conf); Map<Path, Class<? extends Mapper>> mapperMap = MultipleInputs.getMapperTypeMap(conf); Map<Class<? extends InputFormat>, List<Path>> formatPaths = new HashMap<>(); // First, build a map of InputFormats to Paths for (Entry<Path, InputFormat> entry : formatMap.entrySet()) { if (!formatPaths.containsKey(entry.getValue().getClass())) { formatPaths.put(entry.getValue().getClass(), new LinkedList<Path>()); }/* ww w . ja va 2 s. c o m*/ formatPaths.get(entry.getValue().getClass()).add(entry.getKey()); } for (Entry<Class<? extends InputFormat>, List<Path>> formatEntry : formatPaths.entrySet()) { Class<? extends InputFormat> formatClass = formatEntry.getKey(); InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf); List<Path> paths = formatEntry.getValue(); Map<Class<? extends Mapper>, List<Path>> mapperPaths = new HashMap<>(); // Now, for each set of paths that have a common InputFormat, build // a map of Mappers to the paths they're used for for (Path path : paths) { Class<? extends Mapper> mapperClass = mapperMap.get(path); if (!mapperPaths.containsKey(mapperClass)) { mapperPaths.put(mapperClass, new LinkedList<Path>()); } mapperPaths.get(mapperClass).add(path); } // Now each set of paths that has a common InputFormat and Mapper can // be added to the same job, and split together. for (Entry<Class<? extends Mapper>, List<Path>> mapEntry : mapperPaths.entrySet()) { paths = mapEntry.getValue(); Class<? extends Mapper> mapperClass = mapEntry.getKey(); if (mapperClass == null) { mapperClass = conf.getMapperClass(); } FileInputFormat.setInputPaths(confCopy, paths.toArray(new Path[paths.size()])); // Get splits for each input path and tag with InputFormat // and Mapper types by wrapping in a TaggedInputSplit. 
InputSplit[] pathSplits = format.getSplits(confCopy, numSplits); for (InputSplit pathSplit : pathSplits) { splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass)); } } } return splits.toArray(new InputSplit[splits.size()]); }
From source file:org.apache.sysml.runtime.matrix.SortMR.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public static JobReturn runJob(MRJobInstruction inst, String input, InputInfo inputInfo, long rlen, long clen, int brlen, int bclen, String combineInst, String sortInst, int numReducers, int replication, String output, OutputInfo outputInfo, boolean valueIsWeight) throws Exception { boolean sortIndexes = getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes; String tmpOutput = sortIndexes ? MRJobConfiguration.constructTempOutputFilename() : output; JobConf job = new JobConf(SortMR.class); job.setJobName("SortMR"); //setup partition file String pfname = MRJobConfiguration.setUpSortPartitionFilename(job); Path partitionFile = new Path(pfname); URI partitionUri = new URI(partitionFile.toString()); //setup input/output paths Path inputDir = new Path(input); inputDir = inputDir.makeQualified(inputDir.getFileSystem(job)); FileInputFormat.setInputPaths(job, inputDir); Path outpath = new Path(tmpOutput); FileOutputFormat.setOutputPath(job, outpath); MapReduceTool.deleteFileIfExistOnHDFS(outpath, job); //set number of reducers (1 if local mode) if (!InfrastructureAnalyzer.isLocalMode(job)) { MRJobConfiguration.setNumReducers(job, numReducers, numReducers); //ensure partition size <= 10M records to avoid scalability bottlenecks //on cp-side qpick instructions for quantile/iqm/median (~128MB) if (!(getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes)) job.setNumReduceTasks((int) Math.max(job.getNumReduceTasks(), rlen / 10000000)); } else //in case of local mode job.setNumReduceTasks(1);/* w ww . j a v a 2 s .c o m*/ //setup input/output format job.setInputFormat(SamplingSortMRInputFormat.class); SamplingSortMRInputFormat.setTargetKeyValueClasses(job, (Class<? 
extends WritableComparable>) outputInfo.outputKeyClass, outputInfo.outputValueClass); //setup instructions and meta information if (combineInst != null && !combineInst.trim().isEmpty()) job.set(COMBINE_INSTRUCTION, combineInst); job.set(SORT_INSTRUCTION, sortInst); job.setBoolean(VALUE_IS_WEIGHT, valueIsWeight); boolean desc = getSortInstructionDescending(sortInst); job.setBoolean(SORT_DECREASING, desc); MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen); MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); int partitionWith0 = SamplingSortMRInputFormat.writePartitionFile(job, partitionFile); //setup mapper/reducer/partitioner/output classes if (getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes) { MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); job.setOutputFormat(OutputInfo.BinaryBlockOutputInfo.outputFormatClass); job.setMapperClass(IndexSortMapper.class); job.setReducerClass(IndexSortReducer.class); job.setMapOutputKeyClass(!desc ? 
IndexSortComparable.class : IndexSortComparableDesc.class); job.setMapOutputValueClass(LongWritable.class); job.setOutputKeyClass(MatrixIndexes.class); job.setOutputValueClass(MatrixBlock.class); } else { //default case: SORT w/wo weights MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); job.setOutputFormat(CompactOutputFormat.class); job.setMapperClass(ValueSortMapper.class); job.setReducerClass(ValueSortReducer.class); job.setOutputKeyClass(outputInfo.outputKeyClass); //double job.setOutputValueClass(outputInfo.outputValueClass); //int } job.setPartitionerClass(TotalOrderPartitioner.class); //setup distributed cache DistributedCache.addCacheFile(partitionUri, job); DistributedCache.createSymlink(job); //setup replication factor job.setInt(MRConfigurationNames.DFS_REPLICATION, replication); //set up custom map/reduce configurations DMLConfig config = ConfigurationManager.getDMLConfig(); MRJobConfiguration.setupCustomMRConfigurations(job, config); MatrixCharacteristics[] s = new MatrixCharacteristics[1]; s[0] = new MatrixCharacteristics(rlen, clen, brlen, bclen); // Print the complete instruction if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(s); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); //run mr job RunningJob runjob = JobClient.runJob(job); Group group = runjob.getCounters().getGroup(NUM_VALUES_PREFIX); numReducers = job.getNumReduceTasks(); //process final meta data long[] counts = new long[numReducers]; long total = 0; for (int i = 0; i < numReducers; i++) { counts[i] = group.getCounter(Integer.toString(i)); total += counts[i]; } //add missing 0s back to the results long missing0s = 0; if (total < rlen * clen) { if (partitionWith0 < 0) throw new RuntimeException("no partition contains 0, which is wrong!"); missing0s = rlen * clen - total; counts[partitionWith0] += missing0s; } else partitionWith0 = -1; if (sortIndexes) { //run builtin job for shifting partially sorted blocks 
according to global offsets //we do this in this custom form since it would not fit into the current structure //of systemml to output two intermediates (partially sorted data, offsets) out of a //single SortKeys lop boolean success = runjob.isSuccessful(); if (success) { success = runStitchupJob(tmpOutput, rlen, clen, brlen, bclen, counts, numReducers, replication, output); } MapReduceTool.deleteFileIfExistOnHDFS(tmpOutput); MapReduceTool.deleteFileIfExistOnHDFS(pfname); return new JobReturn(s[0], OutputInfo.BinaryBlockOutputInfo, success); } else { MapReduceTool.deleteFileIfExistOnHDFS(pfname); return new JobReturn(s[0], counts, partitionWith0, missing0s, runjob.isSuccessful()); } }
From source file:org.apache.sysml.runtime.matrix.SortMR.java
License:Apache License
/**
 * Runs the "SortIndexesMR" job over binary-block input and output, passing the exclusive
 * prefix sums of {@code counts} to the job via {@code SORT_INDEXES_OFFSETS}.
 *
 * @param input       input path
 * @param rlen        number of rows
 * @param clen        number of columns
 * @param brlen       number of rows per block
 * @param bclen       number of columns per block
 * @param counts      per-partition record counts
 * @param numReducers requested number of reducers (forced to 1 in local mode)
 * @param replication value set for {@code MRConfigurationNames.DFS_REPLICATION}
 * @param output      output path; an existing file at that path is deleted first
 * @return whether the job reported success
 * @throws Exception on any failure during job configuration or execution
 */
private static boolean runStitchupJob(String input, long rlen, long clen, int brlen, int bclen, long[] counts,
        int numReducers, int replication, String output) throws Exception {
    JobConf job = new JobConf(SortMR.class);
    job.setJobName("SortIndexesMR");

    //setup input/output paths
    Path src = new Path(input);
    Path dest = new Path(output);
    FileInputFormat.setInputPaths(job, src);
    FileOutputFormat.setOutputPath(job, dest);
    MapReduceTool.deleteFileIfExistOnHDFS(dest, job);

    //set number of reducers (1 if local mode)
    if (!InfrastructureAnalyzer.isLocalMode(job))
        MRJobConfiguration.setNumReducers(job, numReducers, numReducers);
    else
        job.setNumReduceTasks(1);

    //setup input/output format
    InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
    OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
    job.setInputFormat(iinfo.inputFormatClass);
    job.setOutputFormat(oinfo.outputFormatClass);
    CompactInputFormat.setKeyValueClasses(job, MatrixIndexes.class, MatrixBlock.class);

    //setup mapper/reducer/output classes
    MRJobConfiguration.setInputInfo(job, (byte) 0, InputInfo.BinaryBlockInputInfo, brlen, bclen,
            ConvertTarget.BLOCK);
    job.setMapperClass(IndexSortStitchupMapper.class);
    job.setReducerClass(IndexSortStitchupReducer.class);
    job.setOutputKeyClass(oinfo.outputKeyClass);
    job.setOutputValueClass(oinfo.outputValueClass);
    MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen);
    MRJobConfiguration.setMatricesDimensions(job, new byte[] { 0 }, new long[] { rlen }, new long[] { clen });

    //compute shifted prefix sum of offsets and put into configuration:
    //offsets[i] = counts[0] + ... + counts[i-1], offsets[0] = 0
    long[] offsets = new long[counts.length];
    for (int i = 1; i < offsets.length; i++)
        offsets[i] = offsets[i - 1] + counts[i - 1];
    job.set(SORT_INDEXES_OFFSETS, Arrays.toString(offsets));

    //setup replication factor
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    //run mr job
    return JobClient.runJob(job).isSuccessful();
}
From source file:org.apache.tez.mapreduce.examples.MapredWordCount.java
License:Apache License
/** * The main driver for word count map/reduce program. * Invoke this method to submit the map/reduce job. * @throws IOException When there is communication problems with the * job tracker.//from www . ja va2 s . c o m */ public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), MapredWordCount.class); conf.setJobName("wordcount"); LOG.info("Running WordCount job using mapred apis"); // the keys are words (strings) conf.setOutputKeyClass(Text.class); // the values are counts (ints) conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(MapClass.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-m".equals(args[i])) { conf.setNumMapTasks(Integer.parseInt(args[++i])); } else if ("-r".equals(args[i])) { conf.setNumReduceTasks(Integer.parseInt(args[++i])); } else { other_args.add(args[i]); } } catch (NumberFormatException except) { LOG.error("Integer expected instead of " + args[i]); return printUsage(); } catch (ArrayIndexOutOfBoundsException except) { LOG.error("Required parameter missing from " + args[i - 1]); return printUsage(); } } // Make sure there are exactly 2 parameters left. if (other_args.size() != 2) { LOG.error("Wrong number of parameters: " + other_args.size() + " instead of 2."); return printUsage(); } FileInputFormat.setInputPaths(conf, other_args.get(0)); FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1))); JobClient.runJob(conf); return 0; }
From source file:org.apache.tez.mapreduce.input.TestMultiMRInput.java
License:Apache License
@Test(timeout = 5000) public void testSingleSplit() throws Exception { Path workDir = new Path(TEST_ROOT_DIR, "testSingleSplit"); JobConf jobConf = new JobConf(defaultConf); jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class); FileInputFormat.setInputPaths(jobConf, workDir); MRInputUserPayloadProto.Builder builder = MRInputUserPayloadProto.newBuilder(); builder.setGroupingEnabled(false);//from w w w . j a v a 2 s . com builder.setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf)); byte[] payload = builder.build().toByteArray(); InputContext inputContext = createTezInputContext(payload); MultiMRInput input = new MultiMRInput(inputContext, 1); input.initialize(); List<Event> eventList = new ArrayList<Event>(); String file1 = "file1"; LinkedHashMap<LongWritable, Text> data1 = createInputData(localFs, workDir, jobConf, file1, 0, 10); SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>(); InputSplit[] splits = format.getSplits(jobConf, 1); assertEquals(1, splits.length); MRSplitProto splitProto = MRInputHelpers.createSplitProto(splits[0]); InputDataInformationEvent event = InputDataInformationEvent.createWithSerializedPayload(0, splitProto.toByteString().asReadOnlyByteBuffer()); eventList.clear(); eventList.add(event); input.handleEvents(eventList); int readerCount = 0; for (KeyValueReader reader : input.getKeyValueReaders()) { readerCount++; while (reader.next()) { if (data1.size() == 0) { fail("Found more records than expected"); } Object key = reader.getCurrentKey(); Object val = reader.getCurrentValue(); assertEquals(val, data1.remove(key)); } } assertEquals(1, readerCount); }
From source file:org.apache.tez.mapreduce.input.TestMultiMRInput.java
License:Apache License
/**
 * Feeds two sequence-file splits (one per generated file) to a {@code MultiMRInput} and
 * checks that exactly two readers are exposed and that every record they return matches
 * the generated input data.
 */
@Test(timeout = 5000)
public void testMultipleSplits() throws Exception {
    Path workDir = new Path(TEST_ROOT_DIR, "testMultipleSplits");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);

    // grouping disabled; the job configuration travels inside the input payload
    MRInputUserPayloadProto.Builder builder = MRInputUserPayloadProto.newBuilder();
    builder.setGroupingEnabled(false);
    builder.setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf));
    byte[] payload = builder.build().toByteArray();

    InputContext inputContext = createTezInputContext(payload);

    MultiMRInput input = new MultiMRInput(inputContext, 2);
    input.initialize();
    List<Event> eventList = new ArrayList<Event>();

    // expected records: union of both generated files
    LinkedHashMap<LongWritable, Text> data = new LinkedHashMap<LongWritable, Text>();

    String file1 = "file1";
    LinkedHashMap<LongWritable, Text> data1 = createInputData(localFs, workDir, jobConf, file1, 0, 10);

    String file2 = "file2";
    LinkedHashMap<LongWritable, Text> data2 = createInputData(localFs, workDir, jobConf, file2, 10, 20);

    data.putAll(data1);
    data.putAll(data2);

    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 2);
    assertEquals(2, splits.length);

    // hand both splits to the input as serialized data-information events
    MRSplitProto splitProto1 = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event1 = InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto1.toByteString().asReadOnlyByteBuffer());

    MRSplitProto splitProto2 = MRInputHelpers.createSplitProto(splits[1]);
    InputDataInformationEvent event2 = InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto2.toByteString().asReadOnlyByteBuffer());

    eventList.clear();
    eventList.add(event1);
    eventList.add(event2);
    input.handleEvents(eventList);

    int readerCount = 0;
    for (KeyValueReader reader : input.getKeyValueReaders()) {
        readerCount++;
        while (reader.next()) {
            if (data.size() == 0) {
                fail("Found more records than expected");
            }
            Object key = reader.getCurrentKey();
            Object val = reader.getCurrentValue();
            // expected value (generated data) first, actual value (read back) second,
            // so that failure messages are labelled correctly
            assertEquals(data.remove(key), val);
        }
    }
    assertEquals(2, readerCount);
}