Usage examples for org.apache.hadoop.mapred.FileInputFormat.getInputPaths
public static Path[] getInputPaths(JobConf conf)
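Before the project examples below, here is a minimal sketch of the usual set/get round trip on a JobConf. The class name and the input directories are hypothetical placeholders; only the JobConf, setInputPaths, and getInputPaths calls are the actual org.apache.hadoop.mapred API.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class GetInputPathsSketch {
  public static void main(String[] args) {
    // Hypothetical configuration; the paths are placeholders.
    JobConf conf = new JobConf(GetInputPathsSketch.class);
    FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));

    // getInputPaths returns whatever was registered on the JobConf
    // (stored as a comma-separated list in the classic "mapred.input.dir" property).
    Path[] inputPaths = FileInputFormat.getInputPaths(conf);
    for (Path p : inputPaths) {
      System.out.println("Input path: " + p);
    }
  }
}

As the examples below show, table-backed input formats (Hypertable, Kiji, Oracle NoSQL, Solr) typically call getInputPaths only to obtain a dummy path for their custom splits, while file-based jobs iterate over the returned array directly.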
From source file:org.hypertable.hadoop.hive.HiveHypertableInputFormat.java
License:Open Source License
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
  String namespace = Utilities.getNamespace(jobConf.get(Properties.HYPERTABLE_TABLE_NAME));
  String tableName = Utilities.getTableName(jobConf.get(Properties.HYPERTABLE_TABLE_NAME));
  String columnsMappingSpec = jobConf.get(Properties.HYPERTABLE_COLUMNS_MAPPING);
  if (columnsMappingSpec == null) {
    throw new IOException("hypertable.columns.mapping required for Hypertable Table.");
  }

  ColumnMappings columnMappings = null;
  try {
    columnMappings = ColumnMappings.parseColumnsMapping(columnsMappingSpec);
  } catch (SerDeException e) {
    throw new IOException(e);
  }

  int iKey = columnMappings.getKeyIndex();
  ColumnMapping keyMapping = columnMappings.getKeyMapping();

  RowInputFormat rif = new RowInputFormat();
  rif.set_namespace(namespace);
  rif.set_table_name(tableName);

  ScanSpec scanSpec = new ScanSpec();
  boolean readAllColumns = ColumnProjectionUtils.isReadAllColumns(jobConf);
  scanSpec.setKeys_only(true);

  // The list of families that have been added to the scan
  List<String> addedFamilies = new ArrayList<String>();
  if (!readAllColumns) {
    for (ColumnMapping colMap : columnMappings) {
      if (colMap.isRowKey) {
        continue;
      }
      if (colMap.qualifierName == null) {
        scanSpec.addToColumns(colMap.familyName);
        addedFamilies.add(colMap.familyName);
      } else {
        if (!addedFamilies.contains(colMap.familyName)) {
          String column = colMap.familyName + ":" + colMap.qualifierName;
          scanSpec.addToColumns(column);
        }
      }
      scanSpec.setKeys_only(false);
    }
  }
  scanSpec.setVersions(1);
  rif.set_scan_spec(scanSpec);

  Path[] tablePaths = FileInputFormat.getInputPaths(jobConf);

  int num_splits = 0;
  InputSplit[] splits = rif.getSplits(jobConf, num_splits);
  InputSplit[] results = new InputSplit[splits.length];
  for (int ii = 0; ii < splits.length; ii++) {
    results[ii] = new HiveHypertableSplit((TableSplit) splits[ii], tablePaths[0]);
  }
  return results;
}
From source file:org.kiji.hive.KijiTableInputFormat.java
License:Apache License
/**
 * Returns an array of input splits to be used as input to map tasks.
 *
 * @param job The job configuration.
 * @param numTasks A hint from the MR framework for the number of mappers.
 * @return The specifications of each split.
 * @throws IOException If there is an error.
 */
@Override
public InputSplit[] getSplits(JobConf job, int numTasks) throws IOException {
  // TODO: Use the numTasks hint effectively. We just ignore it right now.
  final KijiURI kijiURI = getKijiURI(job);
  final InputSplit[] splits;

  Kiji kiji = null;
  KijiTable kijiTable = null;
  try {
    kiji = Kiji.Factory.open(kijiURI);
    kijiTable = kiji.openTable(kijiURI.getTable());

    // Get the start keys for each region in the table.
    List<KijiRegion> kijiRegions = kijiTable.getRegions();
    splits = new InputSplit[kijiRegions.size()];
    for (int i = 0; i < kijiRegions.size(); i++) {
      KijiRegion kijiRegion = kijiRegions.get(i);
      byte[] regionStartKey = kijiRegion.getStartKey();
      byte[] regionEndKey = kijiRegion.getEndKey();

      Collection<String> regionLocations = kijiRegion.getLocations();
      String regionHost = null;
      if (!regionLocations.isEmpty()) {
        // TODO: Allow the usage of regions that aren't the first.
        String regionLocation = regionLocations.iterator().next();
        regionHost = regionLocation.substring(0, regionLocation.indexOf(":"));
      } else {
        LOG.warn("No locations found for region: {}", kijiRegion.toString());
      }

      final Path dummyPath = FileInputFormat.getInputPaths(job)[0];
      splits[i] = new KijiTableInputSplit(kijiURI, regionStartKey, regionEndKey, regionHost, dummyPath);
    }
  } catch (IOException e) {
    LOG.warn("Unable to get region information. Returning an empty list of splits.");
    LOG.warn(StringUtils.stringifyException(e));
    return new InputSplit[0];
  } finally {
    ResourceUtils.releaseOrLog(kijiTable);
    ResourceUtils.releaseOrLog(kiji);
  }
  return splits;
}
From source file:org.macau.util.FuzzyJoinDriver.java
License:Apache License
/**
 * Run the job and print basic information about it:
 * the start time, the finish time, and the running time
 * (finished_Time - start_Time).
 *
 * @param job
 * @throws IOException
 */
public static void run(JobConf job) throws IOException {
  job.setJarByClass(FuzzyJoinDriver.class);
  //
  // print info
  //
  String ret = "FuzzyJoinDriver(" + job.getJobName() + ")\n" + " Input Path: {";
  Path inputs[] = FileInputFormat.getInputPaths(job);
  for (int ctr = 0; ctr < inputs.length; ctr++) {
    if (ctr > 0) {
      ret += "\n ";
    }
    ret += inputs[ctr].toString();
  }
  ret += "}\n";
  ret += " Output Path: " + FileOutputFormat.getOutputPath(job) + "\n" + " Map Jobs: "
      + job.getNumMapTasks() + "\n" + " Reduce Jobs: " + job.getNumReduceTasks() + "\n"
      + " Properties: {";
  String[][] properties = new String[][] {
      new String[] { FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY, FuzzyJoinConfig.SIMILARITY_NAME_VALUE },
      new String[] { FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
          "" + FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE },
      new String[] { FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE },
      new String[] { TOKENS_PACKAGE_PROPERTY, TOKENS_PACKAGE_VALUE },
      new String[] { TOKENS_LENGTHSTATS_PROPERTY, "" + TOKENS_LENGTHSTATS_VALUE },
      new String[] { RIDPAIRS_GROUP_CLASS_PROPERTY, RIDPAIRS_GROUP_CLASS_VALUE },
      new String[] { RIDPAIRS_GROUP_FACTOR_PROPERTY, "" + RIDPAIRS_GROUP_FACTOR_VALUE },
      new String[] { FuzzyJoinConfig.DATA_TOKENS_PROPERTY, "" },
      new String[] { DATA_JOININDEX_PROPERTY, "" }, };
  for (int crt = 0; crt < properties.length; crt++) {
    if (crt > 0) {
      ret += "\n ";
    }
    ret += properties[crt][0] + "=" + job.get(properties[crt][0], properties[crt][1]);
  }
  ret += "}";
  System.out.println(ret);
  //
  // run job
  //
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(job);
  Date end_time = new Date();
  System.out.println("Job ended: " + end_time);
  System.out.println(
      "The job took " + (end_time.getTime() - startTime.getTime()) / (float) 1000.0 + " seconds.");
}
From source file:org.pentaho.hadoop.shim.common.ConfigurationProxyTest.java
License:Apache License
@Test
public void testSetInputPaths() throws Exception {
  configurationProxy.setInputPaths(null);
  Path[] inputPaths = FileInputFormat.getInputPaths(configurationProxy);
  assertEquals(0, inputPaths.length);

  PathProxy path1 = new PathProxy("file://path1");
  PathProxy path2 = new PathProxy("file://path2");
  configurationProxy.setInputPaths(path1, path2);
  inputPaths = FileInputFormat.getInputPaths(configurationProxy);
  assertEquals(2, inputPaths.length);
  assertArrayEquals(new Path[] { path1, path2 }, inputPaths);
}
From source file:org.terrier.structures.indexing.singlepass.hadoop.MultiFileCollectionInputFormat.java
License:Mozilla Public License
@SuppressWarnings("unchecked") @Override//from w w w.j a v a2s.c om /** * Splits the input collection into * sets of files where each Map task * gets about the same number of files */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { Path[] paths = FileInputFormat.getInputPaths(job); // HADOOP-1818: Manage splits only if there are paths if (paths.length == 0) { return new InputSplit[0]; } if (numSplits > paths.length) { numSplits = paths.length; } else if (numSplits < 1) { numSplits = 1; } logger.info("Allocating " + paths.length + " files across " + numSplits + " map tasks"); List<PositionAwareSplit<CombineFileSplit>> splits = new ArrayList<PositionAwareSplit<CombineFileSplit>>( numSplits); final int numPaths = paths.length; long[] lengths = new long[numPaths]; TObjectLongHashMap<String>[] locations = (TObjectLongHashMap<String>[]) Array .newInstance(TObjectLongHashMap.class, numPaths); final FileSystem fs = FileSystem.get(job); for (int i = 0; i < paths.length; i++) { final FileStatus fss = fs.getFileStatus(paths[i]); lengths[i] = fss.getLen(); final TObjectLongHashMap<String> location2size = locations[i] = new TObjectLongHashMap<String>(); final long normalblocksize = fss.getBlockSize(); for (long offset = 0; offset < lengths[i]; offset += normalblocksize) { final long blocksize = Math.min(offset + normalblocksize, lengths[i]); final BlockLocation[] blockLocations = fs.getFileBlockLocations(fss, offset, blocksize); for (BlockLocation bl : blockLocations) { for (String host : bl.getHosts()) { location2size.adjustOrPutValue(host, blocksize, blocksize); } } } } //we need to over-estimate using ceil, to ensure that the last split is not /too/ big final int numberOfFilesPerSplit = (int) Math.ceil((double) paths.length / (double) numSplits); int pathsUsed = 0; int splitnum = 0; CombineFileSplit mfs; // for each split except the last one (which may be smaller than numberOfFilesPerSplit) while (pathsUsed < numPaths) { /* caclulate split size for this task - usually numberOfFilesPerSplit, but * less than this for the last split */ final int splitSizeForThisSplit = numberOfFilesPerSplit + pathsUsed > numPaths ? numPaths - pathsUsed : numberOfFilesPerSplit; //arrays of information for split Path[] splitPaths = new Path[splitSizeForThisSplit]; long[] splitLengths = new long[splitSizeForThisSplit]; long[] splitStarts = new long[splitSizeForThisSplit]; final TObjectLongHashMap<String> allLocationsForSplit = new TObjectLongHashMap<String>(); String[] splitLocations = null; //final recommended locations for this split. 
for (int i = 0; i < splitSizeForThisSplit; i++) { locations[pathsUsed + i].forEachEntry(new TObjectLongProcedure<String>() { public boolean execute(String a, long b) { allLocationsForSplit.adjustOrPutValue(a, b, b); return true; } }); if (allLocationsForSplit.size() <= 3) { splitLocations = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]); } else { String[] hosts = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]); Arrays.sort(hosts, new Comparator<String>() { public int compare(String o1, String o2) { long diffamount = allLocationsForSplit.get(o1) - allLocationsForSplit.get(o2); if (diffamount > 0) { return -1; } else if (diffamount < 0) { return 1; } return 0; } }); splitLocations = new String[3]; System.arraycopy(hosts, 0, splitLocations, 0, 3); } } //copy information for this split System.arraycopy(lengths, pathsUsed, splitLengths, 0, splitSizeForThisSplit); System.arraycopy(paths, pathsUsed, splitPaths, 0, splitSizeForThisSplit); //count the number of paths consumed pathsUsed += splitSizeForThisSplit; //make the actual split object //logger.info("New split of size " + splitSizeForThisSplit); mfs = new CombineFileSplit(job, splitPaths, splitStarts, splitLengths, splitLocations); splits.add(new PositionAwareSplit<CombineFileSplit>(mfs, splitnum)); splitnum++; } if (!(pathsUsed == paths.length)) { throw new IOException("Number of used paths does not equal total available paths!"); } return splits.toArray(new PositionAwareSplit[splits.size()]); }
From source file:org.vilcek.hive.kv.KVHiveInputFormat.java
License:Apache License
@Override
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
  String kvHostPort = conf.get(ConfigProperties.KV_HOST_PORT);
  Pattern pattern = Pattern.compile(",");
  kvHelperHosts = pattern.split(kvHostPort);
  kvStoreName = conf.get(ConfigProperties.KV_NAME);
  Topology topology = null;
  try {
    topology = TopologyLocator.get(kvHelperHosts, 0);
  } catch (KVStoreException KVSE) {
    KVSE.printStackTrace();
    return null;
  }
  RegistryUtils regUtils = new RegistryUtils(topology);
  PartitionMap partitionMap = topology.getPartitionMap();
  int nParts = partitionMap.getNPartitions();
  List<InputSplit> ret = new ArrayList<InputSplit>(nParts);
  Map<Object, RepNodeStatus> statuses = new HashMap<Object, RepNodeStatus>();
  Path[] tablePaths = FileInputFormat.getInputPaths(conf);
  for (int i = 1; i <= nParts; i++) {
    PartitionId partId = new PartitionId(i);
    RepGroupId repGroupId = topology.getRepGroupId(partId);
    RepGroup repGroup = topology.get(repGroupId);
    Collection<RepNode> repNodes = repGroup.getRepNodes();
    List<String> repNodeNames = new ArrayList<String>();
    List<String> repNodeNamesAndPorts = new ArrayList<String>();
    for (RepNode rn : repNodes) {
      RepNodeStatus rnStatus = null;
      try {
        if (statuses.containsKey(rn.getResourceId())) {
          rnStatus = statuses.get(rn.getResourceId());
        } else {
          RepNodeAdminAPI rna = regUtils.getRepNodeAdmin(rn.getResourceId());
          rnStatus = rna.ping();
          statuses.put(rn.getResourceId(), rnStatus);
        }
      } catch (RemoteException re) {
        System.err.println("Ping failed for " + rn.getResourceId() + ": " + re.getMessage());
        re.printStackTrace();
        statuses.put(rn.getResourceId(), null);
      } catch (NotBoundException e) {
        System.err.println(
            "No RMI service for RN: " + rn.getResourceId() + " message: " + e.getMessage());
      }
      if (rnStatus == null) {
        continue;
      }
      /*
       * com.sleepycat.je.rep.ReplicatedEnvironment.State state = rnStatus.getReplicationState();
       * if (!state.isActive() ||
       *     (consistency == Consistency.ABSOLUTE && !state.isMaster())) {
       *   continue;
       * }
       */
      StorageNodeId snid = rn.getStorageNodeId();
      StorageNode sn = topology.get(snid);
      repNodeNames.add(sn.getHostname());
      repNodeNamesAndPorts.add(sn.getHostname() + ":" + sn.getRegistryPort());
    }
    Key parentKey = null;
    String parentKeyValue = conf.get("oracle.kv.parentKey");
    if (parentKeyValue != null && parentKeyValue.length() > 0) {
      parentKey = Key.fromString(parentKeyValue);
    }
    KeyRange subRange = null;
    String subRangeValue = conf.get("oracle.kv.subRange");
    if (subRangeValue != null && subRangeValue.length() > 0) {
      subRange = KeyRange.fromString(subRangeValue);
    }
    int batchSize = conf.getInt("oracle.kv.batchSize", 0);
    ret.add(new KVHiveInputSplit(tablePaths[0])
        .setKVHelperHosts(repNodeNamesAndPorts.toArray(new String[0])).setKVStoreName(kvStoreName)
        .setKVPart(i).setLocations(repNodeNames.toArray(new String[0])).setDirection(direction)
        .setBatchSize(batchSize).setParentKey(parentKey).setSubRange(subRange).setDepth(depth)
        .setConsistency(consistency).setTimeout(timeout).setTimeoutUnit(timeoutUnit));
  }
  return ret.toArray(new InputSplit[ret.size()]);
}
From source file:org.vroyer.hive.solr.SolrInputFormat.java
License:Open Source License
@Override
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
  log.debug("conf=" + conf);
  SolrTable table = new SolrTable(conf);
  long total = table.count();
  int _numSplits = (numSplits < 1 || total <= numSplits) ? 1 : numSplits;
  final long splitSize = total / _numSplits;
  SolrSplit[] splits = new SolrSplit[_numSplits];
  final Path[] tablePaths = FileInputFormat.getInputPaths(conf);
  for (int i = 0; i < _numSplits; i++) {
    if ((i + 1) == _numSplits) {
      splits[i] = new SolrSplit(i * splitSize, total, tablePaths[0]);
      splits[i].setLastSplit();
    } else {
      splits[i] = new SolrSplit(i * splitSize, (i + 1) * splitSize, tablePaths[0]);
    }
  }
  log.debug("splits=" + Arrays.toString(splits));
  return splits;
}
From source file:org.warcbase.index.IndexerRunner.java
License:Apache License
@SuppressWarnings("static-access") public int run(String[] args) throws IOException, ParseException { LOG.info("Initializing indexer..."); Options options = new Options(); options.addOption(/*from ww w .j a va2s . c o m*/ OptionBuilder.withArgName("file").hasArg().withDescription("input file list").create(INPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("HDFS index output path") .create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of shards") .create(SHARDS_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("config file (optional)") .create(CONFIG_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION) || !cmdline.hasOption(SHARDS_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String configPath = null; if (cmdline.hasOption(CONFIG_OPTION)) { configPath = cmdline.getOptionValue(CONFIG_OPTION); } String inputPath = cmdline.getOptionValue(INPUT_OPTION); String outputPath = cmdline.getOptionValue(INDEX_OPTION); int shards = Integer.parseInt(cmdline.getOptionValue(SHARDS_OPTION)); JobConf conf = new JobConf(getConf(), IndexerRunner.class); if (configPath == null) { LOG.info("Config not specified, using default src/main/solr/WARCIndexer.conf"); configPath = "src/main/solr/WARCIndexer.conf"; } File configFile = new File(configPath); if (!configFile.exists()) { LOG.error("Error: config does not exist!"); System.exit(-1); } Config config = ConfigFactory.parseFile(configFile); conf.set(CONFIG_PROPERTIES, config.withOnlyPath("warc").root().render(ConfigRenderOptions.concise())); FileSystem fs = FileSystem.get(conf); LOG.info("HDFS index output path: " + outputPath); conf.set(IndexerReducer.HDFS_OUTPUT_PATH, outputPath); if (fs.exists(new Path(outputPath))) { LOG.error("Error: path exists already!"); System.exit(-1); } LOG.info("Number of shards: " + shards); conf.setInt(IndexerMapper.NUM_SHARDS, shards); // Add input paths: LOG.info("Reading input files..."); String line = null; BufferedReader br = new BufferedReader(new FileReader(inputPath)); while ((line = br.readLine()) != null) { FileInputFormat.addInputPath(conf, new Path(line)); } br.close(); LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files."); conf.setJobName(IndexerRunner.class.getSimpleName() + ": " + inputPath); conf.setInputFormat(ArchiveFileInputFormat.class); conf.setMapperClass(IndexerMapper.class); conf.setReducerClass(IndexerReducer.class); conf.setOutputFormat(NullOutputFormat.class); // Ensure the JARs we provide take precedence over ones from Hadoop: conf.setBoolean("mapreduce.job.user.classpath.first", true); // Also set reduce speculative execution off, avoiding duplicate submissions to Solr. 
conf.setBoolean("mapreduce.reduce.speculative", false); // Note that we need this to ensure FileSystem.get is thread-safe: // @see https://issues.apache.org/jira/browse/HDFS-925 // @see https://mail-archives.apache.org/mod_mbox/hadoop-user/201208.mbox/%3CCA+4kjVt-QE2L83p85uELjWXiog25bYTKOZXdc1Ahun+oBSJYpQ@mail.gmail.com%3E conf.setBoolean("fs.hdfs.impl.disable.cache", true); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(WritableSolrRecord.class); conf.setNumReduceTasks(shards); // number of reducers = number of shards cacheSolrHome(conf, solrHomeZipName); JobClient.runJob(conf); return 0; }
From source file:tachyon.client.keyvalue.hadoop.KeyValueInputFormat.java
License:Apache License
/**
 * Returns each partition as a {@link KeyValueInputSplit}.
 *
 * @param conf MapReduce job configuration
 * @param numSplits number of splits, ignored because it is determined by the number of partitions
 * @return list of {@link InputSplit}s, each split is a partition
 * @throws IOException if information about the partition cannot be retrieved
 */
@Override
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
  // The paths are the MapReduce program's inputs specified in
  // {@code mapreduce.input.fileinputformat.inputdir}; each path should be a key-value store.
  Path[] paths = FileInputFormat.getInputPaths(conf);
  List<InputSplit> splits = Lists.newArrayList();
  try {
    for (Path path : paths) {
      List<PartitionInfo> partitionInfos = mKeyValueMasterClient
          .getPartitionInfo(new TachyonURI(path.toString()));
      for (PartitionInfo partitionInfo : partitionInfos) {
        splits.add(new KeyValueInputSplit(partitionInfo));
      }
    }
  } catch (TachyonException te) {
    throw new IOException(te);
  }
  InputSplit[] ret = new InputSplit[splits.size()];
  return splits.toArray(ret);
}
From source file:ucsc.hadoop.mapreduce.apache.Sort.java
License:Apache License
/**
 * The main driver for the sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
  JobConf jobConf = new JobConf(getConf(), Sort.class);
  jobConf.setJobName("sorter");

  jobConf.setMapperClass(IdentityMapper.class);
  jobConf.setReducerClass(IdentityReducer.class);

  JobClient client = new JobClient(jobConf);
  ClusterStatus cluster = client.getClusterStatus();
  int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
  String sort_reduces = jobConf.get("test.sort.reduces_per_host");
  if (sort_reduces != null) {
    num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
  }
  Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
  Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
  Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
  Class<? extends Writable> outputValueClass = BytesWritable.class;
  List<String> otherArgs = new ArrayList<String>();
  InputSampler.Sampler<K, V> sampler = null;
  for (int i = 0; i < args.length; ++i) {
    try {
      if ("-m".equals(args[i])) {
        jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
      } else if ("-r".equals(args[i])) {
        num_reduces = Integer.parseInt(args[++i]);
      } else if ("-inFormat".equals(args[i])) {
        inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
      } else if ("-outFormat".equals(args[i])) {
        outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
      } else if ("-outKey".equals(args[i])) {
        outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
      } else if ("-outValue".equals(args[i])) {
        outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
      } else if ("-totalOrder".equals(args[i])) {
        double pcnt = Double.parseDouble(args[++i]);
        int numSamples = Integer.parseInt(args[++i]);
        int maxSplits = Integer.parseInt(args[++i]);
        if (0 >= maxSplits)
          maxSplits = Integer.MAX_VALUE;
        sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
      } else {
        otherArgs.add(args[i]);
      }
    } catch (NumberFormatException except) {
      System.out.println("ERROR: Integer expected instead of " + args[i]);
      return printUsage();
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
      return printUsage(); // exits
    }
  }

  // Set user-supplied (possibly default) job configs
  jobConf.setNumReduceTasks(num_reduces);
  jobConf.setInputFormat(inputFormatClass);
  jobConf.setOutputFormat(outputFormatClass);
  jobConf.setOutputKeyClass(outputKeyClass);
  jobConf.setOutputValueClass(outputValueClass);

  // Make sure there are exactly 2 parameters left.
  if (otherArgs.size() != 2) {
    System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
    return printUsage();
  }
  FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
  FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

  if (sampler != null) {
    System.out.println("Sampling input to effect total-order sort...");
    jobConf.setPartitionerClass(TotalOrderPartitioner.class);
    Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
    Path partitionFile = new Path(inputDir, "_sortPartitioning");
    TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
    InputSampler.<K, V>writePartitionFile(jobConf, sampler);
    URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
    DistributedCache.addCacheFile(partitionUri, jobConf);
    DistributedCache.createSymlink(jobConf);
  }

  System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
      + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
      + " with " + num_reduces + " reduces.");
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  jobResult = JobClient.runJob(jobConf);
  Date end_time = new Date();
  System.out.println("Job ended: " + end_time);
  System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
  return 0;
}