Example usage for org.apache.hadoop.mapred FileInputFormat getInputPaths

Introduction

On this page you can find example usage of org.apache.hadoop.mapred FileInputFormat getInputPaths.

Prototype

public static Path[] getInputPaths(JobConf conf) 

Document

Get the list of input Paths for the map-reduce job.
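
A minimal, self-contained sketch of typical usage follows (the class name and input directories are placeholders chosen for illustration, not taken from the examples on this page):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class GetInputPathsSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf(GetInputPathsSketch.class);

        // Register two (hypothetical) input directories on the job configuration.
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));

        // getInputPaths returns every Path previously registered on this JobConf.
        Path[] inputs = FileInputFormat.getInputPaths(conf);
        for (Path p : inputs) {
            System.out.println("Input path: " + p);
        }
    }
}

As the examples below show, custom input formats frequently call getInputPaths inside getSplits to recover the table or store paths that were configured on the JobConf.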

Usage

From source file:org.hypertable.hadoop.hive.HiveHypertableInputFormat.java

License:Open Source License

@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {

    String namespace = Utilities.getNamespace(jobConf.get(Properties.HYPERTABLE_TABLE_NAME));
    String tableName = Utilities.getTableName(jobConf.get(Properties.HYPERTABLE_TABLE_NAME));
    String columnsMappingSpec = jobConf.get(Properties.HYPERTABLE_COLUMNS_MAPPING);

    if (columnsMappingSpec == null) {
        throw new IOException("hypertable.columns.mapping required for Hypertable Table.");
    }

    ColumnMappings columnMappings = null;
    try {
        columnMappings = ColumnMappings.parseColumnsMapping(columnsMappingSpec);
    } catch (SerDeException e) {
        throw new IOException(e);
    }

    int iKey = columnMappings.getKeyIndex();
    ColumnMapping keyMapping = columnMappings.getKeyMapping();

    RowInputFormat rif = new RowInputFormat();
    rif.set_namespace(namespace);
    rif.set_table_name(tableName);

    ScanSpec scanSpec = new ScanSpec();

    boolean readAllColumns = ColumnProjectionUtils.isReadAllColumns(jobConf);
    scanSpec.setKeys_only(true);

    // The list of families that have been added to the scan
    List<String> addedFamilies = new ArrayList<String>();

    if (!readAllColumns) {
        for (ColumnMapping colMap : columnMappings) {
            if (colMap.isRowKey) {
                continue;
            }
            if (colMap.qualifierName == null) {
                scanSpec.addToColumns(colMap.familyName);
                addedFamilies.add(colMap.familyName);
            } else {
                if (!addedFamilies.contains(colMap.familyName)) {
                    String column = colMap.familyName + ":" + colMap.qualifierName;
                    scanSpec.addToColumns(column);
                }
            }
            scanSpec.setKeys_only(false);
        }
    }

    scanSpec.setVersions(1);

    rif.set_scan_spec(scanSpec);

    Path[] tablePaths = FileInputFormat.getInputPaths(jobConf);

    int num_splits = 0;
    InputSplit[] splits = rif.getSplits(jobConf, num_splits);
    InputSplit[] results = new InputSplit[splits.length];
    for (int ii = 0; ii < splits.length; ii++) {
        results[ii] = new HiveHypertableSplit((TableSplit) splits[ii], tablePaths[0]);
    }
    return results;
}

From source file:org.kiji.hive.KijiTableInputFormat.java

License:Apache License

/**
 * Returns an array of input splits to be used as input to map tasks.
 *
 * @param job The job configuration.
 * @param numTasks A hint from the MR framework for the number of mappers.
 * @return The specifications of each split.
 * @throws IOException If there is an error.
 */
@Override
public InputSplit[] getSplits(JobConf job, int numTasks) throws IOException {
    // TODO: Use the numTasks hint effectively. We just ignore it right now.

    final KijiURI kijiURI = getKijiURI(job);
    final InputSplit[] splits;

    Kiji kiji = null;
    KijiTable kijiTable = null;
    try {
        kiji = Kiji.Factory.open(kijiURI);
        kijiTable = kiji.openTable(kijiURI.getTable());

        // Get the start keys for each region in the table.
        List<KijiRegion> kijiRegions = kijiTable.getRegions();
        splits = new InputSplit[kijiRegions.size()];
        for (int i = 0; i < kijiRegions.size(); i++) {
            KijiRegion kijiRegion = kijiRegions.get(i);
            byte[] regionStartKey = kijiRegion.getStartKey();
            byte[] regionEndKey = kijiRegion.getEndKey();

            Collection<String> regionLocations = kijiRegion.getLocations();
            String regionHost = null;
            if (!regionLocations.isEmpty()) {
                // TODO: Allow the usage of regions that aren't the first.
                String regionLocation = regionLocations.iterator().next();
                regionHost = regionLocation.substring(0, regionLocation.indexOf(":"));
            } else {
                LOG.warn("No locations found for region: {}", kijiRegion.toString());
            }
            final Path dummyPath = FileInputFormat.getInputPaths(job)[0];
            splits[i] = new KijiTableInputSplit(kijiURI, regionStartKey, regionEndKey, regionHost, dummyPath);
        }
    } catch (IOException e) {
        LOG.warn("Unable to get region information.  Returning an empty list of splits.");
        LOG.warn(StringUtils.stringifyException(e));
        return new InputSplit[0];
    } finally {
        ResourceUtils.releaseOrLog(kijiTable);
        ResourceUtils.releaseOrLog(kiji);
    }
    return splits;
}

From source file:org.macau.util.FuzzyJoinDriver.java

License:Apache License

/**
 * Runs the job and prints basic information about it:
 * the start time, the finish time, and the running time
 * (finish time minus start time).
 *
 * @param job the job to run
 * @throws IOException
 */
public static void run(JobConf job) throws IOException {
    job.setJarByClass(FuzzyJoinDriver.class);
    //
    // print info
    //
    String ret = "FuzzyJoinDriver(" + job.getJobName() + ")\n" + "  Input Path:  {";
    Path inputs[] = FileInputFormat.getInputPaths(job);
    for (int ctr = 0; ctr < inputs.length; ctr++) {
        if (ctr > 0) {
            ret += "\n                ";
        }
        ret += inputs[ctr].toString();
    }
    ret += "}\n";
    ret += "  Output Path: " + FileOutputFormat.getOutputPath(job) + "\n" + "  Map Jobs:    "
            + job.getNumMapTasks() + "\n" + "  Reduce Jobs: " + job.getNumReduceTasks() + "\n"
            + "  Properties:  {";
    String[][] properties = new String[][] {
            new String[] { FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY, FuzzyJoinConfig.SIMILARITY_NAME_VALUE },
            new String[] { FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
                    "" + FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE },
            new String[] { FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE },
            new String[] { TOKENS_PACKAGE_PROPERTY, TOKENS_PACKAGE_VALUE },
            new String[] { TOKENS_LENGTHSTATS_PROPERTY, "" + TOKENS_LENGTHSTATS_VALUE },
            new String[] { RIDPAIRS_GROUP_CLASS_PROPERTY, RIDPAIRS_GROUP_CLASS_VALUE },
            new String[] { RIDPAIRS_GROUP_FACTOR_PROPERTY, "" + RIDPAIRS_GROUP_FACTOR_VALUE },
            new String[] { FuzzyJoinConfig.DATA_TOKENS_PROPERTY, "" },
            new String[] { DATA_JOININDEX_PROPERTY, "" }, };
    for (int crt = 0; crt < properties.length; crt++) {
        if (crt > 0) {
            ret += "\n                ";
        }
        ret += properties[crt][0] + "=" + job.get(properties[crt][0], properties[crt][1]);
    }
    ret += "}";
    System.out.println(ret);
    //
    // run job
    //
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    JobClient.runJob(job);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println(
            "The job took " + (end_time.getTime() - startTime.getTime()) / (float) 1000.0 + " seconds.");
}

From source file:org.pentaho.hadoop.shim.common.ConfigurationProxyTest.java

License:Apache License

@Test
public void testSetInputPaths() throws Exception {
    configurationProxy.setInputPaths(null);
    Path[] inputPaths = FileInputFormat.getInputPaths(configurationProxy);
    assertEquals(0, inputPaths.length);

    PathProxy path1 = new PathProxy("file://path1");
    PathProxy path2 = new PathProxy("file://path2");
    configurationProxy.setInputPaths(path1, path2);

    inputPaths = FileInputFormat.getInputPaths(configurationProxy);
    assertEquals(2, inputPaths.length);
    assertArrayEquals(new Path[] { path1, path2 }, inputPaths);
}

From source file:org.terrier.structures.indexing.singlepass.hadoop.MultiFileCollectionInputFormat.java

License:Mozilla Public License

/**
 * Splits the input collection into sets of files where each map task
 * gets about the same number of files.
 */
@SuppressWarnings("unchecked")
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    Path[] paths = FileInputFormat.getInputPaths(job);
    // HADOOP-1818: Manage splits only if there are paths
    if (paths.length == 0) {
        return new InputSplit[0];
    }

    if (numSplits > paths.length) {
        numSplits = paths.length;
    } else if (numSplits < 1) {
        numSplits = 1;
    }
    logger.info("Allocating " + paths.length + " files across " + numSplits + " map tasks");
    List<PositionAwareSplit<CombineFileSplit>> splits = new ArrayList<PositionAwareSplit<CombineFileSplit>>(
            numSplits);
    final int numPaths = paths.length;
    long[] lengths = new long[numPaths];
    TObjectLongHashMap<String>[] locations = (TObjectLongHashMap<String>[]) Array
            .newInstance(TObjectLongHashMap.class, numPaths);
    final FileSystem fs = FileSystem.get(job);
    for (int i = 0; i < paths.length; i++) {
        final FileStatus fss = fs.getFileStatus(paths[i]);
        lengths[i] = fss.getLen();
        final TObjectLongHashMap<String> location2size = locations[i] = new TObjectLongHashMap<String>();
        final long normalblocksize = fss.getBlockSize();
        for (long offset = 0; offset < lengths[i]; offset += normalblocksize) {
            final long blocksize = Math.min(offset + normalblocksize, lengths[i]);
            final BlockLocation[] blockLocations = fs.getFileBlockLocations(fss, offset, blocksize);
            for (BlockLocation bl : blockLocations) {
                for (String host : bl.getHosts()) {
                    location2size.adjustOrPutValue(host, blocksize, blocksize);
                }
            }
        }
    }

    //we need to over-estimate using ceil, to ensure that the last split is not /too/ big
    final int numberOfFilesPerSplit = (int) Math.ceil((double) paths.length / (double) numSplits);

    int pathsUsed = 0;
    int splitnum = 0;
    CombineFileSplit mfs;
    // for each split except the last one (which may be smaller than numberOfFilesPerSplit)
    while (pathsUsed < numPaths) {
        /* calculate split size for this task - usually numberOfFilesPerSplit, but
         * less than this for the last split */
        final int splitSizeForThisSplit = numberOfFilesPerSplit + pathsUsed > numPaths ? numPaths - pathsUsed
                : numberOfFilesPerSplit;
        //arrays of information for split
        Path[] splitPaths = new Path[splitSizeForThisSplit];
        long[] splitLengths = new long[splitSizeForThisSplit];
        long[] splitStarts = new long[splitSizeForThisSplit];
        final TObjectLongHashMap<String> allLocationsForSplit = new TObjectLongHashMap<String>();
        String[] splitLocations = null; //final recommended locations for this split.
        for (int i = 0; i < splitSizeForThisSplit; i++) {
            locations[pathsUsed + i].forEachEntry(new TObjectLongProcedure<String>() {
                public boolean execute(String a, long b) {
                    allLocationsForSplit.adjustOrPutValue(a, b, b);
                    return true;
                }
            });
            if (allLocationsForSplit.size() <= 3) {
                splitLocations = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
            } else {
                String[] hosts = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
                Arrays.sort(hosts, new Comparator<String>() {
                    public int compare(String o1, String o2) {
                        long diffamount = allLocationsForSplit.get(o1) - allLocationsForSplit.get(o2);
                        if (diffamount > 0) {
                            return -1;
                        } else if (diffamount < 0) {
                            return 1;
                        }
                        return 0;
                    }
                });
                splitLocations = new String[3];
                System.arraycopy(hosts, 0, splitLocations, 0, 3);
            }
        }

        //copy information for this split
        System.arraycopy(lengths, pathsUsed, splitLengths, 0, splitSizeForThisSplit);
        System.arraycopy(paths, pathsUsed, splitPaths, 0, splitSizeForThisSplit);
        //count the number of paths consumed
        pathsUsed += splitSizeForThisSplit;

        //make the actual split object
        //logger.info("New split of size " + splitSizeForThisSplit);
        mfs = new CombineFileSplit(job, splitPaths, splitStarts, splitLengths, splitLocations);
        splits.add(new PositionAwareSplit<CombineFileSplit>(mfs, splitnum));
        splitnum++;
    }

    if (!(pathsUsed == paths.length)) {
        throw new IOException("Number of used paths does not equal total available paths!");
    }
    return splits.toArray(new PositionAwareSplit[splits.size()]);
}

From source file:org.vilcek.hive.kv.KVHiveInputFormat.java

License:Apache License

@Override
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
    String kvHostPort = conf.get(ConfigProperties.KV_HOST_PORT);
    Pattern pattern = Pattern.compile(",");
    kvHelperHosts = pattern.split(kvHostPort);
    kvStoreName = conf.get(ConfigProperties.KV_NAME);

    Topology topology = null;
    try {
        topology = TopologyLocator.get(kvHelperHosts, 0);
    } catch (KVStoreException KVSE) {
        KVSE.printStackTrace();
        return null;
    }
    RegistryUtils regUtils = new RegistryUtils(topology);
    PartitionMap partitionMap = topology.getPartitionMap();
    int nParts = partitionMap.getNPartitions();
    List<InputSplit> ret = new ArrayList<InputSplit>(nParts);

    Map<Object, RepNodeStatus> statuses = new HashMap<Object, RepNodeStatus>();
    Path[] tablePaths = FileInputFormat.getInputPaths(conf);
    for (int i = 1; i <= nParts; i++) {
        PartitionId partId = new PartitionId(i);
        RepGroupId repGroupId = topology.getRepGroupId(partId);
        RepGroup repGroup = topology.get(repGroupId);
        Collection<RepNode> repNodes = repGroup.getRepNodes();
        List<String> repNodeNames = new ArrayList<String>();
        List<String> repNodeNamesAndPorts = new ArrayList<String>();
        for (RepNode rn : repNodes) {
            RepNodeStatus rnStatus = null;
            try {
                if (statuses.containsKey(rn.getResourceId())) {
                    rnStatus = statuses.get(rn.getResourceId());
                } else {
                    RepNodeAdminAPI rna = regUtils.getRepNodeAdmin(rn.getResourceId());
                    rnStatus = rna.ping();
                    statuses.put(rn.getResourceId(), rnStatus);
                }
            } catch (RemoteException re) {
                System.err.println("Ping failed for " + rn.getResourceId() + ": " + re.getMessage());
                re.printStackTrace();
                statuses.put(rn.getResourceId(), null);
            } catch (NotBoundException e) {
                System.err.println(
                        "No RMI service for RN: " + rn.getResourceId() + " message: " + e.getMessage());
            }

            if (rnStatus == null) {
                continue;
            }

            /*
             * com.sleepycat.je.rep.ReplicatedEnvironment.State state = rnStatus.getReplicationState(); if (!state.isActive() ||
             * (consistency == Consistency.ABSOLUTE && !state.isMaster())) { continue; }
             */

            StorageNodeId snid = rn.getStorageNodeId();
            StorageNode sn = topology.get(snid);

            repNodeNames.add(sn.getHostname());
            repNodeNamesAndPorts.add(sn.getHostname() + ":" + sn.getRegistryPort());
        }

        Key parentKey = null;
        String parentKeyValue = conf.get("oracle.kv.parentKey");
        if (parentKeyValue != null && parentKeyValue.length() > 0) {
            parentKey = Key.fromString(parentKeyValue);
        }
        KeyRange subRange = null;
        String subRangeValue = conf.get("oracle.kv.subRange");
        if (subRangeValue != null && subRangeValue.length() > 0) {
            subRange = KeyRange.fromString(subRangeValue);
        }

        int batchSize = conf.getInt("oracle.kv.batchSize", 0);

        ret.add(new KVHiveInputSplit(tablePaths[0])
                .setKVHelperHosts(repNodeNamesAndPorts.toArray(new String[0])).setKVStoreName(kvStoreName)
                .setKVPart(i).setLocations(repNodeNames.toArray(new String[0])).setDirection(direction)
                .setBatchSize(batchSize).setParentKey(parentKey).setSubRange(subRange).setDepth(depth)
                .setConsistency(consistency).setTimeout(timeout).setTimeoutUnit(timeoutUnit));

    }

    return ret.toArray(new InputSplit[ret.size()]);
}

From source file:org.vroyer.hive.solr.SolrInputFormat.java

License:Open Source License

@Override
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
    log.debug("conf=" + conf);

    SolrTable table = new SolrTable(conf);
    long total = table.count();
    int _numSplits = (numSplits < 1 || total <= numSplits) ? 1 : numSplits;
    final long splitSize = total / _numSplits;
    SolrSplit[] splits = new SolrSplit[_numSplits];
    final Path[] tablePaths = FileInputFormat.getInputPaths(conf);
    for (int i = 0; i < _numSplits; i++) {
        if ((i + 1) == _numSplits) {
            splits[i] = new SolrSplit(i * splitSize, total, tablePaths[0]);
            splits[i].setLastSplit();
        } else {
            splits[i] = new SolrSplit(i * splitSize, (i + 1) * splitSize, tablePaths[0]);
        }
    }
    log.debug("splits=" + Arrays.toString(splits));
    return splits;
}

From source file:org.warcbase.index.IndexerRunner.java

License:Apache License

@SuppressWarnings("static-access")
public int run(String[] args) throws IOException, ParseException {
    LOG.info("Initializing indexer...");

    Options options = new Options();

    options.addOption(
            OptionBuilder.withArgName("file").hasArg().withDescription("input file list").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("HDFS index output path")
            .create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of shards")
            .create(SHARDS_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("config file (optional)")
            .create(CONFIG_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)
            || !cmdline.hasOption(SHARDS_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String configPath = null;
    if (cmdline.hasOption(CONFIG_OPTION)) {
        configPath = cmdline.getOptionValue(CONFIG_OPTION);
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(INDEX_OPTION);
    int shards = Integer.parseInt(cmdline.getOptionValue(SHARDS_OPTION));

    JobConf conf = new JobConf(getConf(), IndexerRunner.class);

    if (configPath == null) {
        LOG.info("Config not specified, using default src/main/solr/WARCIndexer.conf");
        configPath = "src/main/solr/WARCIndexer.conf";
    }
    File configFile = new File(configPath);
    if (!configFile.exists()) {
        LOG.error("Error: config does not exist!");
        System.exit(-1);
    }
    Config config = ConfigFactory.parseFile(configFile);
    conf.set(CONFIG_PROPERTIES, config.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));

    FileSystem fs = FileSystem.get(conf);

    LOG.info("HDFS index output path: " + outputPath);
    conf.set(IndexerReducer.HDFS_OUTPUT_PATH, outputPath);
    if (fs.exists(new Path(outputPath))) {
        LOG.error("Error: path exists already!");
        System.exit(-1);
    }

    LOG.info("Number of shards: " + shards);
    conf.setInt(IndexerMapper.NUM_SHARDS, shards);

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    conf.setJobName(IndexerRunner.class.getSimpleName() + ": " + inputPath);
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(IndexerMapper.class);
    conf.setReducerClass(IndexerReducer.class);
    conf.setOutputFormat(NullOutputFormat.class);

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.job.user.classpath.first", true);
    // Also set reduce speculative execution off, avoiding duplicate submissions to Solr.
    conf.setBoolean("mapreduce.reduce.speculative", false);

    // Note that we need this to ensure FileSystem.get is thread-safe:
    // @see https://issues.apache.org/jira/browse/HDFS-925
    // @see https://mail-archives.apache.org/mod_mbox/hadoop-user/201208.mbox/%3CCA+4kjVt-QE2L83p85uELjWXiog25bYTKOZXdc1Ahun+oBSJYpQ@mail.gmail.com%3E
    conf.setBoolean("fs.hdfs.impl.disable.cache", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WritableSolrRecord.class);
    conf.setNumReduceTasks(shards); // number of reducers = number of shards

    cacheSolrHome(conf, solrHomeZipName);

    JobClient.runJob(conf);

    return 0;
}

From source file:tachyon.client.keyvalue.hadoop.KeyValueInputFormat.java

License:Apache License

/**
 * Returns each partition as a {@link KeyValueInputSplit}.
 *
 * @param conf MapReduce job configuration
 * @param numSplits number of splits, ignored because it is determined by number of partitions
 * @return list of {@link InputSplit}s, each split is a partition
 * @throws IOException if information about the partition cannot be retrieved
 */
@Override
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
    // The paths are MapReduce program's inputs specified in
    // {@code mapreduce.input.fileinputformat.inputdir}, each path should be a key-value store.
    Path[] paths = FileInputFormat.getInputPaths(conf);
    List<InputSplit> splits = Lists.newArrayList();
    try {
        for (Path path : paths) {
            List<PartitionInfo> partitionInfos = mKeyValueMasterClient
                    .getPartitionInfo(new TachyonURI(path.toString()));
            for (PartitionInfo partitionInfo : partitionInfos) {
                splits.add(new KeyValueInputSplit(partitionInfo));
            }
        }
    } catch (TachyonException te) {
        throw new IOException(te);
    }
    InputSplit[] ret = new InputSplit[splits.size()];
    return splits.toArray(ret);
}

From source file:ucsc.hadoop.mapreduce.apache.Sort.java

License:Apache License

/**
 * The main driver for the sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {

    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;

    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;

    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}