Example usage for org.apache.hadoop.mapred FileInputFormat getInputPaths

Introduction

On this page you can find example usage of org.apache.hadoop.mapred FileInputFormat getInputPaths.

Prototype

public static Path[] getInputPaths(JobConf conf) 

Document

Get the list of input Paths for the map-reduce job.
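
A minimal, self-contained sketch of typical usage follows (the class name and input directories are placeholders chosen for illustration, not taken from the examples on this page):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class GetInputPathsSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf(GetInputPathsSketch.class);

        // Register two (hypothetical) input directories on the job configuration.
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));

        // getInputPaths returns every Path previously registered on this JobConf.
        Path[] inputs = FileInputFormat.getInputPaths(conf);
        for (Path p : inputs) {
            System.out.println("Input path: " + p);
        }
    }
}

As the examples below show, custom input formats frequently call getInputPaths inside getSplits to recover the table or store paths that were configured on the JobConf.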

Usage

From source file:org.hypertable.hadoop.hive.HiveHypertableInputFormat.java

License:Open Source License

@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {

    String namespace = Utilities.getNamespace(jobConf.get(Properties.HYPERTABLE_TABLE_NAME));
    String tableName = Utilities.getTableName(jobConf.get(Properties.HYPERTABLE_TABLE_NAME));
    String columnsMappingSpec = jobConf.get(Properties.HYPERTABLE_COLUMNS_MAPPING);

    if (columnsMappingSpec == null) {
        throw new IOException("hypertable.columns.mapping required for Hypertable Table.");
    }

    ColumnMappings columnMappings = null;
    try {
        columnMappings = ColumnMappings.parseColumnsMapping(columnsMappingSpec);
    } catch (SerDeException e) {
        throw new IOException(e);
    }

    int iKey = columnMappings.getKeyIndex();
    ColumnMapping keyMapping = columnMappings.getKeyMapping();

    RowInputFormat rif = new RowInputFormat();
    rif.set_namespace(namespace);
    rif.set_table_name(tableName);

    ScanSpec scanSpec = new ScanSpec();

    boolean readAllColumns = ColumnProjectionUtils.isReadAllColumns(jobConf);
    scanSpec.setKeys_only(true);

    // The list of families that have been added to the scan
    List<String> addedFamilies = new ArrayList<String>();

    if (!readAllColumns) {
        for (ColumnMapping colMap : columnMappings) {
            if (colMap.isRowKey) {
                continue;
            }
            if (colMap.qualifierName == null) {
                scanSpec.addToColumns(colMap.familyName);
                addedFamilies.add(colMap.familyName);
            } else {
                if (!addedFamilies.contains(colMap.familyName)) {
                    String column = colMap.familyName + ":" + colMap.qualifierName;
                    scanSpec.addToColumns(column);
                }
            }
            scanSpec.setKeys_only(false);
        }
    }

    scanSpec.setVersions(1);

    rif.set_scan_spec(scanSpec);

    Path[] tablePaths = FileInputFormat.getInputPaths(jobConf);

    int num_splits = 0;
    InputSplit[] splits = rif.getSplits(jobConf, num_splits);
    InputSplit[] results = new InputSplit[splits.length];
    for (int ii = 0; ii < splits.length; ii++) {
        results[ii] = new HiveHypertableSplit((TableSplit) splits[ii], tablePaths[0]);
    }
    return results;
}

From source file:org.kiji.hive.KijiTableInputFormat.java

License:Apache License

/**
 * Returns an array of input splits to be used as input to map tasks.
 *
 * @param job The job configuration.
 * @param numTasks A hint from the MR framework for the number of mappers.
 * @return The specifications of each split.
 * @throws IOException If there is an error.
 */
@Override
public InputSplit[] getSplits(JobConf job, int numTasks) throws IOException {
    // TODO: Use the numTasks hint effectively. We just ignore it right now.

    final KijiURI kijiURI = getKijiURI(job);
    final InputSplit[] splits;

    Kiji kiji = null;
    KijiTable kijiTable = null;
    try {
        kiji = Kiji.Factory.open(kijiURI);
        kijiTable = kiji.openTable(kijiURI.getTable());

        // Get the start keys for each region in the table.
        List<KijiRegion> kijiRegions = kijiTable.getRegions();
        splits = new InputSplit[kijiRegions.size()];
        for (int i = 0; i < kijiRegions.size(); i++) {
            KijiRegion kijiRegion = kijiRegions.get(i);
            byte[] regionStartKey = kijiRegion.getStartKey();
            byte[] regionEndKey = kijiRegion.getEndKey();

            Collection<String> regionLocations = kijiRegion.getLocations();
            String regionHost = null;
            if (!regionLocations.isEmpty()) {
                // TODO: Allow the usage of regions that aren't the first.
                String regionLocation = regionLocations.iterator().next();
                regionHost = regionLocation.substring(0, regionLocation.indexOf(":"));
            } else {
                LOG.warn("No locations found for region: {}", kijiRegion.toString());
            }
            final Path dummyPath = FileInputFormat.getInputPaths(job)[0];
            splits[i] = new KijiTableInputSplit(kijiURI, regionStartKey, regionEndKey, regionHost, dummyPath);
        }
    } catch (IOException e) {
        LOG.warn("Unable to get region information.  Returning an empty list of splits.");
        LOG.warn(StringUtils.stringifyException(e));
        return new InputSplit[0];
    } finally {
        ResourceUtils.releaseOrLog(kijiTable);
        ResourceUtils.releaseOrLog(kiji);
    }
    return splits;
}

From source file:org.macau.util.FuzzyJoinDriver.java

License:Apache License

/**
 * Runs the job and prints basic information about it:
 * the start time, the finish time, and the running time
 * (finish time minus start time).
 *
 * @param job the job to run
 * @throws IOException
 */
public static void run(JobConf job) throws IOException {
    job.setJarByClass(FuzzyJoinDriver.class);
    //
    // print info
    //
    String ret = "FuzzyJoinDriver(" + job.getJobName() + ")\n" + "  Input Path:  {";
    Path inputs[] = FileInputFormat.getInputPaths(job);
    for (int ctr = 0; ctr < inputs.length; ctr++) {
        if (ctr > 0) {
            ret += "\n                ";
        }
        ret += inputs[ctr].toString();
    }
    ret += "}\n";
    ret += "  Output Path: " + FileOutputFormat.getOutputPath(job) + "\n" + "  Map Jobs:    "
            + job.getNumMapTasks() + "\n" + "  Reduce Jobs: " + job.getNumReduceTasks() + "\n"
            + "  Properties:  {";
    String[][] properties = new String[][] {
            new String[] { FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY, FuzzyJoinConfig.SIMILARITY_NAME_VALUE },
            new String[] { FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
                    "" + FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE },
            new String[] { FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE },
            new String[] { TOKENS_PACKAGE_PROPERTY, TOKENS_PACKAGE_VALUE },
            new String[] { TOKENS_LENGTHSTATS_PROPERTY, "" + TOKENS_LENGTHSTATS_VALUE },
            new String[] { RIDPAIRS_GROUP_CLASS_PROPERTY, RIDPAIRS_GROUP_CLASS_VALUE },
            new String[] { RIDPAIRS_GROUP_FACTOR_PROPERTY, "" + RIDPAIRS_GROUP_FACTOR_VALUE },
            new String[] { FuzzyJoinConfig.DATA_TOKENS_PROPERTY, "" },
            new String[] { DATA_JOININDEX_PROPERTY, "" }, };
    for (int crt = 0; crt < properties.length; crt++) {
        if (crt > 0) {
            ret += "\n                ";
        }
        ret += properties[crt][0] + "=" + job.get(properties[crt][0], properties[crt][1]);
    }
    ret += "}";
    System.out.println(ret);
    //
    // run job
    //
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    JobClient.runJob(job);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println(
            "The job took " + (end_time.getTime() - startTime.getTime()) / (float) 1000.0 + " seconds.");
}

From source file:org.pentaho.hadoop.shim.common.ConfigurationProxyTest.java

License:Apache License

@Test
public void testSetInputPaths() throws Exception {
    configurationProxy.setInputPaths(null);
    Path[] inputPaths = FileInputFormat.getInputPaths(configurationProxy);
    assertEquals(0, inputPaths.length);

    PathProxy path1 = new PathProxy("file://path1");
    PathProxy path2 = new PathProxy("file://path2");
    configurationProxy.setInputPaths(path1, path2);

    inputPaths = FileInputFormat.getInputPaths(configurationProxy);
    assertEquals(2, inputPaths.length);
    assertArrayEquals(new Path[] { path1, path2 }, inputPaths);
}

From source file:org.terrier.structures.indexing.singlepass.hadoop.MultiFileCollectionInputFormat.java

License:Mozilla Public License

/**
 * Splits the input collection into sets of files where each map task
 * gets about the same number of files.
 */
@SuppressWarnings("unchecked")
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    Path[] paths = FileInputFormat.getInputPaths(job);
    // HADOOP-1818: Manage splits only if there are paths
    if (paths.length == 0) {
        return new InputSplit[0];
    }

    if (numSplits > paths.length) {
        numSplits = paths.length;
    } else if (numSplits < 1) {
        numSplits = 1;
    }
    logger.info("Allocating " + paths.length + " files across " + numSplits + " map tasks");
    List<PositionAwareSplit<CombineFileSplit>> splits = new ArrayList<PositionAwareSplit<CombineFileSplit>>(
            numSplits);
    final int numPaths = paths.length;
    long[] lengths = new long[numPaths];
    TObjectLongHashMap<String>[] locations = (TObjectLongHashMap<String>[]) Array
            .newInstance(TObjectLongHashMap.class, numPaths);
    final FileSystem fs = FileSystem.get(job);
    for (int i = 0; i < paths.length; i++) {
        final FileStatus fss = fs.getFileStatus(paths[i]);
        lengths[i] = fss.getLen();
        final TObjectLongHashMap<String> location2size = locations[i] = new TObjectLongHashMap<String>();
        final long normalblocksize = fss.getBlockSize();
        for (long offset = 0; offset < lengths[i]; offset += normalblocksize) {
            final long blocksize = Math.min(offset + normalblocksize, lengths[i]);
            final BlockLocation[] blockLocations = fs.getFileBlockLocations(fss, offset, blocksize);
            for (BlockLocation bl : blockLocations) {
                for (String host : bl.getHosts()) {
                    location2size.adjustOrPutValue(host, blocksize, blocksize);
                }
            }
        }
    }

    //we need to over-estimate using ceil, to ensure that the last split is not /too/ big
    final int numberOfFilesPerSplit = (int) Math.ceil((double) paths.length / (double) numSplits);

    int pathsUsed = 0;
    int splitnum = 0;
    CombineFileSplit mfs;
    // for each split except the last one (which may be smaller than numberOfFilesPerSplit)
    while (pathsUsed < numPaths) {
        /* calculate split size for this task - usually numberOfFilesPerSplit, but
         * less than this for the last split */
        final int splitSizeForThisSplit = numberOfFilesPerSplit + pathsUsed > numPaths ? numPaths - pathsUsed
                : numberOfFilesPerSplit;
        //arrays of information for split
        Path[] splitPaths = new Path[splitSizeForThisSplit];
        long[] splitLengths = new long[splitSizeForThisSplit];
        long[] splitStarts = new long[splitSizeForThisSplit];
        final TObjectLongHashMap<String> allLocationsForSplit = new TObjectLongHashMap<String>();
        String[] splitLocations = null; //final recommended locations for this split.
        for (int i = 0; i < splitSizeForThisSplit; i++) {
            locations[pathsUsed + i].forEachEntry(new TObjectLongProcedure<String>() {
                public boolean execute(String a, long b) {
                    allLocationsForSplit.adjustOrPutValue(a, b, b);
                    return true;
                }
            });
            if (allLocationsForSplit.size() <= 3) {
                splitLocations = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
            } else {
                String[] hosts = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
                Arrays.sort(hosts, new Comparator<String>() {
                    public int compare(String o1, String o2) {
                        long diffamount = allLocationsForSplit.get(o1) - allLocationsForSplit.get(o2);
                        if (diffamount > 0) {
                            return -1;
                        } else if (diffamount < 0) {
                            return 1;
                        }
                        return 0;
                    }
                });
                splitLocations = new String[3];
                System.arraycopy(hosts, 0, splitLocations, 0, 3);
            }
        }

        //copy information for this split
        System.arraycopy(lengths, pathsUsed, splitLengths, 0, splitSizeForThisSplit);
        System.arraycopy(paths, pathsUsed, splitPaths, 0, splitSizeForThisSplit);
        //count the number of paths consumed
        pathsUsed += splitSizeForThisSplit;

        //make the actual split object
        //logger.info("New split of size " + splitSizeForThisSplit);
        mfs = new CombineFileSplit(job, splitPaths, splitStarts, splitLengths, splitLocations);
        splits.add(new PositionAwareSplit<CombineFileSplit>(mfs, splitnum));
        splitnum++;
    }

    if (!(pathsUsed == paths.length)) {
        throw new IOException("Number of used paths does not equal total available paths!");
    }
    return splits.toArray(new PositionAwareSplit[splits.size()]);
}

From source file:org.vilcek.hive.kv.KVHiveInputFormat.java

License:Apache License

@Override
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
    String kvHostPort = conf.get(ConfigProperties.KV_HOST_PORT);
    Pattern pattern = Pattern.compile(",");
    kvHelperHosts = pattern.split(kvHostPort);
    kvStoreName = conf.get(ConfigProperties.KV_NAME);

    Topology topology = null;
    try {
        topology = TopologyLocator.get(kvHelperHosts, 0);
    } catch (KVStoreException KVSE) {
        KVSE.printStackTrace();
        return null;
    }
    RegistryUtils regUtils = new RegistryUtils(topology);
    PartitionMap partitionMap = topology.getPartitionMap();
    int nParts = partitionMap.getNPartitions();
    List<InputSplit> ret = new ArrayList<InputSplit>(nParts);

    Map<Object, RepNodeStatus> statuses = new HashMap<Object, RepNodeStatus>();
    Path[] tablePaths = FileInputFormat.getInputPaths(conf);
    for (int i = 1; i <= nParts; i++) {
        PartitionId partId = new PartitionId(i);
        RepGroupId repGroupId = topology.getRepGroupId(partId);
        RepGroup repGroup = topology.get(repGroupId);
        Collection<RepNode> repNodes = repGroup.getRepNodes();
        List<String> repNodeNames = new ArrayList<String>();
        List<String> repNodeNamesAndPorts = new ArrayList<String>();
        for (RepNode rn : repNodes) {
            RepNodeStatus rnStatus = null;
            try {
                if (statuses.containsKey(rn.getResourceId())) {
                    rnStatus = statuses.get(rn.getResourceId());
                } else {
                    RepNodeAdminAPI rna = regUtils.getRepNodeAdmin(rn.getResourceId());
                    rnStatus = rna.ping();
                    statuses.put(rn.getResourceId(), rnStatus);
                }
            } catch (RemoteException re) {
                System.err.println("Ping failed for " + rn.getResourceId() + ": " + re.getMessage());
                re.printStackTrace();
                statuses.put(rn.getResourceId(), null);
            } catch (NotBoundException e) {
                System.err.println(
                        "No RMI service for RN: " + rn.getResourceId() + " message: " + e.getMessage());
            }

            if (rnStatus == null) {
                continue;
            }

            /*
             * com.sleepycat.je.rep.ReplicatedEnvironment.State state = rnStatus.getReplicationState(); if (!state.isActive() ||
             * (consistency == Consistency.ABSOLUTE && !state.isMaster())) { continue; }
             */

            StorageNodeId snid = rn.getStorageNodeId();
            StorageNode sn = topology.get(snid);

            repNodeNames.add(sn.getHostname());
            repNodeNamesAndPorts.add(sn.getHostname() + ":" + sn.getRegistryPort());
        }

        Key parentKey = null;
        String parentKeyValue = conf.get("oracle.kv.parentKey");
        if (parentKeyValue != null && parentKeyValue.length() > 0) {
            parentKey = Key.fromString(parentKeyValue);
        }
        KeyRange subRange = null;
        String subRangeValue = conf.get("oracle.kv.subRange");
        if (subRangeValue != null && subRangeValue.length() > 0) {
            subRange = KeyRange.fromString(subRangeValue);
        }

        int batchSize = conf.getInt("oracle.kv.batchSize", 0);

        ret.add(new KVHiveInputSplit(tablePaths[0])
                .setKVHelperHosts(repNodeNamesAndPorts.toArray(new String[0])).setKVStoreName(kvStoreName)
                .setKVPart(i).setLocations(repNodeNames.toArray(new String[0])).setDirection(direction)
                .setBatchSize(batchSize).setParentKey(parentKey).setSubRange(subRange).setDepth(depth)
                .setConsistency(consistency).setTimeout(timeout).setTimeoutUnit(timeoutUnit));

    }

    return ret.toArray(new InputSplit[ret.size()]);
}

From source file:org.vroyer.hive.solr.SolrInputFormat.java

License:Open Source License

@Override
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
    log.debug("conf=" + conf);

    SolrTable table = new SolrTable(conf);
    long total = table.count();
    int _numSplits = (numSplits < 1 || total <= numSplits) ? 1 : numSplits;
    final long splitSize = total / _numSplits;
    SolrSplit[] splits = new SolrSplit[_numSplits];
    final Path[] tablePaths = FileInputFormat.getInputPaths(conf);
    for (int i = 0; i < _numSplits; i++) {
        if ((i + 1) == _numSplits) {
            splits[i] = new SolrSplit(i * splitSize, total, tablePaths[0]);
            splits[i].setLastSplit();
        } else {
            splits[i] = new SolrSplit(i * splitSize, (i + 1) * splitSize, tablePaths[0]);
        }
    }
    log.debug("splits=" + Arrays.toString(splits));
    return splits;
}

From source file:org.warcbase.index.IndexerRunner.java

License:Apache License

@SuppressWarnings("static-access")
public int run(String[] args) throws IOException, ParseException {
    LOG.info("Initializing indexer...");

    Options options = new Options();

    options.addOption(
            OptionBuilder.withArgName("file").hasArg().withDescription("input file list").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("HDFS index output path")
            .create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of shards")
            .create(SHARDS_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("config file (optional)")
            .create(CONFIG_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)
            || !cmdline.hasOption(SHARDS_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String configPath = null;
    if (cmdline.hasOption(CONFIG_OPTION)) {
        configPath = cmdline.getOptionValue(CONFIG_OPTION);
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(INDEX_OPTION);
    int shards = Integer.parseInt(cmdline.getOptionValue(SHARDS_OPTION));

    JobConf conf = new JobConf(getConf(), IndexerRunner.class);

    if (configPath == null) {
        LOG.info("Config not specified, using default src/main/solr/WARCIndexer.conf");
        configPath = "src/main/solr/WARCIndexer.conf";
    }
    File configFile = new File(configPath);
    if (!configFile.exists()) {
        LOG.error("Error: config does not exist!");
        System.exit(-1);
    }
    Config config = ConfigFactory.parseFile(configFile);
    conf.set(CONFIG_PROPERTIES, config.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));

    FileSystem fs = FileSystem.get(conf);

    LOG.info("HDFS index output path: " + outputPath);
    conf.set(IndexerReducer.HDFS_OUTPUT_PATH, outputPath);
    if (fs.exists(new Path(outputPath))) {
        LOG.error("Error: path exists already!");
        System.exit(-1);
    }

    LOG.info("Number of shards: " + shards);
    conf.setInt(IndexerMapper.NUM_SHARDS, shards);

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    conf.setJobName(IndexerRunner.class.getSimpleName() + ": " + inputPath);
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(IndexerMapper.class);
    conf.setReducerClass(IndexerReducer.class);
    conf.setOutputFormat(NullOutputFormat.class);

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.job.user.classpath.first", true);
    // Also set reduce speculative execution off, avoiding duplicate submissions to Solr.
    conf.setBoolean("mapreduce.reduce.speculative", false);

    // Note that we need this to ensure FileSystem.get is thread-safe:
    // @see https://issues.apache.org/jira/browse/HDFS-925
    // @see https://mail-archives.apache.org/mod_mbox/hadoop-user/201208.mbox/%3CCA+4kjVt-QE2L83p85uELjWXiog25bYTKOZXdc1Ahun+oBSJYpQ@mail.gmail.com%3E
    conf.setBoolean("fs.hdfs.impl.disable.cache", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WritableSolrRecord.class);
    conf.setNumReduceTasks(shards); // number of reducers = number of shards

    cacheSolrHome(conf, solrHomeZipName);

    JobClient.runJob(conf);

    return 0;
}

From source file:tachyon.client.keyvalue.hadoop.KeyValueInputFormat.java

License:Apache License

/**
 * Returns each partition as a {@link KeyValueInputSplit}.
 *
 * @param conf MapReduce job configuration
 * @param numSplits number of splits, ignored because it is determined by number of partitions
 * @return list of {@link InputSplit}s, each split is a partition
 * @throws IOException if information about the partition cannot be retrieved
 */
@Override
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
    // The paths are MapReduce program's inputs specified in
    // {@code mapreduce.input.fileinputformat.inputdir}, each path should be a key-value store.
    Path[] paths = FileInputFormat.getInputPaths(conf);
    List<InputSplit> splits = Lists.newArrayList();
    try {
        for (Path path : paths) {
            List<PartitionInfo> partitionInfos = mKeyValueMasterClient
                    .getPartitionInfo(new TachyonURI(path.toString()));
            for (PartitionInfo partitionInfo : partitionInfos) {
                splits.add(new KeyValueInputSplit(partitionInfo));
            }
        }
    } catch (TachyonException te) {
        throw new IOException(te);
    }
    InputSplit[] ret = new InputSplit[splits.size()];
    return splits.toArray(ret);
}

From source file:ucsc.hadoop.mapreduce.apache.Sort.java

License:Apache License

/**
 * The main driver for the sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {

    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;

    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;

    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}