public static void setInputPaths(JobConf conf, Path... inputPaths) 

Set the array of Paths as the list of inputs for the map-reduce job.


From source file:org.apache.whirr.service.yarn.integration.AbstractHadoopServiceTest.java

License:Apache License

/**
 * Writes a one-line input file, points a job's input and output paths at
 * "input" and "output", then checks the first two lines of the first output
 * partition.
 *
 * NOTE(review): the statements that configure and run the job are not visible
 * in this excerpt, and the method's closing brace is missing.
 */
public void test() throws Exception {
    Configuration conf = controller.getConfiguration();
    JobConf job = new JobConf(conf, AbstractHadoopServiceTest.class);

    FileSystem fs = FileSystem.get(conf);

    // Create the job input: a single line holding the tokens "b" and "a".
    OutputStream os = fs.create(new Path("input"));
    Writer wr = new OutputStreamWriter(os);
    wr.write("b a\n");
    wr.close();

    // Input and output locations are relative paths on the file system obtained above.
    FileInputFormat.setInputPaths(job, new Path("input"));
    FileOutputFormat.setOutputPath(job, new Path("output"));


    // Read back the first output partition; the expected lines are
    // tab-separated "token<TAB>count" pairs, "a" before "b".
    // NOTE(review): neither `in` nor `reader` is closed in the visible code.
    FSDataInputStream in = fs.open(new Path("output/part-00000"));
    BufferedReader reader = new BufferedReader(new InputStreamReader(in));
    assertEquals("a\t1", reader.readLine());
    assertEquals("b\t1", reader.readLine());


From source file:org.archive.wayback.hadoop.CDXSort.java

License:Apache License

/**
 * The main driver for the sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *             When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    // Overview: parses command-line flags, sets the job's input and output paths
    // (plus optional gzip output compression), runs the job and prints how long
    // it took. Returns 0 after the job has run, or the result of printUsage()
    // when argument validation fails.
    // NOTE(review): this excerpt is truncated; several closing braces and some
    // statements are missing, so the control flow below is incomplete as shown.

    // Option flags; each one is switched on by the matching argument below.
    boolean compressOutput = false;
    boolean dereferenceInputs = false;
    // Only consulted by the commented-out mapper selection further down.
    boolean canonicalize = false;
    boolean funkyInput = false;

    JobConf jobConf = new JobConf(getConf(), CDXSort.class);


    JobClient client = new JobClient(jobConf);
    // Only referenced by the commented-out status message near the end.
    ClusterStatus cluster = client.getClusterStatus();

    // Positional (non-flag) arguments; expected to end up as: split input output.
    List<String> otherArgs = new ArrayList<String>();

    for (int i = 0; i < args.length; ++i) {
        try {
            // NOTE(review): no handling for "-m" is visible in this excerpt.
            if ("-m".equals(args[i])) {
            } else if ("--compress-output".equals(args[i])) {
                compressOutput = true;
            } else if ("--funky-input".equals(args[i])) {
                funkyInput = true;
            } else if ("--dereference-inputs".equals(args[i])) {
                dereferenceInputs = true;
            } else if ("--canonicalize".equals(args[i])) {
                canonicalize = true;
            } else {
            // NOTE(review): the body of this else branch is missing from the excerpt;
            // presumably it adds args[i] to otherArgs. Confirm against the full source.
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits

    // Make sure there are exactly 3 parameters left: split input output
    if (otherArgs.size() != 3) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 3.");
        return printUsage();

    String splitPath = otherArgs.get(0);
    String inputPath = otherArgs.get(1);
    String outputPath = otherArgs.get(2);

    // load the split file, find and set the number of reduces
    // NOTE(review): in the visible code the split file is only opened. The code
    // that loads boundaries and configures the job is commented out below, and
    // fis/isr/bis are never closed.
    AlphaPartitioner partitioner = new AlphaPartitioner();
    File localSplitFile = new File(splitPath);
    FileInputStream fis = new FileInputStream(localSplitFile);
    InputStreamReader isr = new InputStreamReader(fis, ByteOp.UTF8);
    BufferedReader bis = new BufferedReader(isr);
    //      try {
    //         partitioner.loadBoundaries(bis);
    //      } catch (IOException except) {
    //         System.err.println("ERROR: Problem loading file " + splitPath);
    //         return printUsage(); // exits
    //      }
    //      jobConf.setNumReduceTasks(partitioner.getNumPartitions());
    //      // copy the split file into the FS, add to the DistributedCache:
    ////      AlphaPartitioner.setPartitionFile(jobConf, localSplitFile);
    //      AlphaPartitioner.setSplitCache(jobConf, localSplitFile);
    //      System.err.println("uploaded split file to FS and DistributedCache");
    //      // Set job configs:
    //      jobConf.setInputFormat(TextInputFormat.class);
    //      jobConf.setOutputFormat(TextOutputFormat.class);
    //      if (canonicalize) {
    //         jobConf.setMapperClass(CDXCanonicalizerMapClass.class);
    //      } else {
    //         jobConf.setMapperClass(CDXMapClass.class);
    //      }
    //      jobConf.setOutputKeyClass(Text.class);
    //      jobConf.setOutputValueClass(Text.class);
    //      jobConf.set("mapred.textoutputformat.separator", " ");
    //      jobConf.setPartitionerClass(AlphaPartitioner.class);

    // Number of input paths registered; only read by the commented-out status message.
    int inputCount = 0;
    // Set job input:
    if (dereferenceInputs) {

        // SO SLOW... can't add one at a time...
        //         FileReader is2 = new FileReader(new File(inputPath));
        //         BufferedReader bis2 = new BufferedReader(is2);
        //         while (true) {
        //            String line = bis2.readLine();
        //            if (line == null) {
        //               break;
        //            }
        //            FileInputFormat.addInputPath(jobConf, new Path(line));
        //            inputCount++;
        //            System.err.println("Added path(" + inputCount + "): " + line);
        //         }

        // PASS 2:
        //         FileReader is2 = new FileReader(new File(inputPath));
        //         BufferedReader bis2 = new BufferedReader(is2);
        //         ArrayList<String> list = new ArrayList<String>();
        //         while (true) {
        //            String line = bis2.readLine();
        //            if (line == null) {
        //               break;
        //            }
        //            list.add(line);
        //            inputCount++;
        //         }
        //         Path arr[] = new Path[list.size()];
        //         for(int i=0; i < list.size(); i++) {
        //            arr[i] = new Path(list.get(i));
        //         }
        //         FileInputFormat.setInputPaths(jobConf, arr);

        // PASS 3:
        // NOTE(review): the funkyInput branch has no visible body in this excerpt.
        if (funkyInput) {
        } else {
        FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
        inputCount = 1;

    } else {
        // Without --dereference-inputs the input argument is used directly as the single input path.
        FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
        inputCount = 1;

    // Set job output:
    FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));

    if (compressOutput) {
        FileOutputFormat.setCompressOutput(jobConf, true);
        FileOutputFormat.setOutputCompressorClass(jobConf, GzipCodec.class);

    //      System.out.println("Running on " + cluster.getTaskTrackers()
    //            + " nodes, processing " + inputCount + " files/directories"
    //            + " into " + outputPath + " with "
    //            + partitioner.getNumPartitions() + " reduces.");
    // Run the job and report elapsed wall-clock time in whole seconds.
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    // NOTE(review): jobResult is not declared anywhere in this excerpt; presumably a field. Confirm.
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;

From source file:org.asayler.WikiTitleCount.java

License:Apache License

/**
 * The main driver for the wikititlecount map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    // Overview: derives default map/reduce task counts from the cluster status,
    // validates that two positional arguments remain, and uses them as the job's
    // input and output paths.
    // NOTE(review): this excerpt is truncated. The argument-parsing body, the
    // mapper/reducer setup, the job submission and several closing braces are
    // not visible.
    JobConf conf = new JobConf(getConf(), WikiTitleCount.class);
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();

    // Initial values; both are overwritten with cluster-derived defaults below.
    int num_maps = 1;
    int num_reducers = 1;





    /** Set Default Mappers */
    num_maps = (int) (cluster.getMaxMapTasks());

    /** Set Default Reducers: 90% of the cluster's maximum reduce tasks, truncated to an int */
    num_reducers = (int) (cluster.getMaxReduceTasks() * 0.9);

    // Positional arguments; expected to end up as: input output.
    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            // NOTE(review): the parsing logic inside this try block is missing from the
            // excerpt; nothing visible populates other_args.
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    // The input is passed as a String here, while the output is wrapped in a Path.
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    /* Set Mappers and Reducer */

    return 0;

From source file:org.asayler.WikiTitleSort.java

License:Apache License

/**
 * The main driver for the WikiTitleSort map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    // Overview: derives the default map task count from the cluster status,
    // validates that two positional arguments remain, and uses them as the job's
    // input and output paths.
    // NOTE(review): this excerpt is truncated. The argument-parsing body, the
    // mapper/reducer setup, the job submission and several closing braces are
    // not visible.
    JobConf conf = new JobConf(getConf(), WikiTitleSort.class);
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();

    int num_maps = 1;
    // Fixed at one reducer (declared final, never reassigned).
    final int num_reducers = 1;





    /** Set Default Mappers */
    num_maps = (int) (cluster.getMaxMapTasks());

    // Positional arguments; expected to end up as: input output.
    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            // NOTE(review): the parsing logic inside this try block is missing from the
            // excerpt; nothing visible populates other_args.
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    // The input is passed as a String here, while the output is wrapped in a Path.
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    /* Set Mappers and Reducer */

    return 0;

From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java

License:Apache License

/**
 * Repeats NUM_ITERATIONS times: creates a uniquely named directory under /tmp,
 * builds NUM_TEST_FILES test files in it, asks ARCFileInputFormat for the
 * splits of that directory and validates each split against the generated
 * records.
 *
 * NOTE(review): the closing braces of the loops and of the method are missing
 * from this excerpt.
 */
public void TestArcInputFormat() throws IOException, InterruptedException {
    for (int i = 0; i < NUM_ITERATIONS; ++i) {
        JobConf job = new JobConf();
        FileSystem fs = LocalFileSystem.get(job);
        // createTempFile is used only for its unique file name; the directory itself
        // is created under /tmp on `fs`. The temp file is not deleted in the visible code.
        Path path = new Path("/tmp/" + File.createTempFile("ARCInputFormat", "test").getName());
        fs.mkdirs(path);

        List<Pair<Path, List<TestRecord>>> fileList = buildTestFiles(path, fs, NUM_TEST_FILES);

        FileInputFormat.setInputPaths(job, path);

        ARCFileInputFormat inputFormat = new ARCFileInputFormat();

        // 0 is passed as the requested number of splits.
        InputSplit splits[] = inputFormat.getSplits(job, 0);

        for (InputSplit split : splits) {
            // A null reporter is passed to getRecordReader.
            RecordReader<Text, BytesWritable> reader = inputFormat.getRecordReader(split, job, null);
            validateSplit(fs, split, fileList, reader);

        // The list must be empty once every split has been validated.
        // Presumably validateSplit removes the entries it has matched; confirm.
        Assert.assertTrue(fileList.size() == 0);

        // Recursively remove the test directory.
        fs.delete(path, true);


From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java

License:Apache License

/**
 * Repeats NUM_ITERATIONS times: creates a uniquely named directory under /tmp,
 * builds NUM_TEST_FILES test files in it, asks ARCFileItemInputFormat for the
 * splits of that directory and validates each split against the generated
 * records.
 *
 * NOTE(review): the closing braces of the loops and of the method are missing
 * from this excerpt.
 */
public void TestArcItemInputFormat() throws IOException, InterruptedException {
    for (int i = 0; i < NUM_ITERATIONS; ++i) {
        JobConf job = new JobConf();
        FileSystem fs = LocalFileSystem.get(job);
        // createTempFile is used only for its unique file name; the directory itself
        // is created under /tmp on `fs`. The temp file is not deleted in the visible code.
        Path path = new Path("/tmp/" + File.createTempFile("ARCInputFormat", "test").getName());
        fs.mkdirs(path);

        List<Pair<Path, List<TestRecord>>> fileList = buildTestFiles(path, fs, NUM_TEST_FILES);

        FileInputFormat.setInputPaths(job, path);

        ARCFileItemInputFormat inputFormat = new ARCFileItemInputFormat();

        // 0 is passed as the requested number of splits.
        InputSplit splits[] = inputFormat.getSplits(job, 0);

        for (InputSplit split : splits) {
            // A null reporter is passed to getRecordReader.
            RecordReader<Text, ArcFileItem> reader = inputFormat.getRecordReader(split, job, null);
            validateArcFileItemSplit(fs, split, fileList, reader);

        // The list must be empty once every split has been validated.
        // Presumably validateArcFileItemSplit removes the entries it has matched; confirm.
        Assert.assertTrue(fileList.size() == 0);

        // Recursively remove the test directory.
        fs.delete(path, true);


From source file:org.datavec.hadoop.records.reader.TestBasicHDFS_Integration.java

License:Apache License

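/**
 * Generate splits for this run.
 *
 * @param input_path the path the input files are read from
 * @param job the job configuration on which the input path is set
 * @return the input splits computed for the job
 */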
private InputSplit[] generateDebugSplits(Path input_path, JobConf job) {
    // Overview: registers input_path as the job's input and asks TextInputFormat
    // for its splits, requesting a single split.
    // NOTE(review): the closing braces of the catch block and of the method are
    // missing from this excerpt.

    // Queried only for the informational message below.
    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, input_path);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();

    int numSplits = 1;

    // Stays null if getSplits throws; the caller then receives null.
    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        // NOTE(review): no handling is visible here, so the exception appears to be
        // swallowed silently. Confirm against the full source.

    return splits;


From source file:org.deeplearning4j.iterativereduce.irunit.IRUnitDriver.java

License:Apache License

/**
 * Generate splits for this run.
 *
 * @param inputPath the path the input files are read from
 * @param job the job configuration on which the input path is set
 * @return array of {@link InputSplit}
 */
private InputSplit[] generateDebugSplits(Path inputPath, JobConf job) {
    // Overview: registers inputPath as the job's input and asks TextInputFormat
    // for its splits, requesting a single split.
    // NOTE(review): the closing braces of the catch block and of the method are
    // missing from this excerpt.

    // Queried only for the informational log message below.
    long block_size = localFs.getDefaultBlockSize(inputPath);

    log.info("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, inputPath);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();

    int numSplits = 1;

    // Stays null if getSplits throws; the caller then receives null.
    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // The failure is logged and not rethrown.
        // NOTE(review): the message mentions "properties" although the failing call computes splits.
        log.error("Error loading properties ", e);


    return splits;


From source file:org.deeplearning4j.iterativereduce.runtime.irunit.IRUnitDriver.java

License:Apache License

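/**
 * Generate splits for this run.
 *
 * @param input_path the path the input files are read from
 * @param job the job configuration on which the input path is set
 * @return the input splits computed for the job
 */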
private InputSplit[] generateDebugSplits(Path input_path, JobConf job) {
    // Overview: registers input_path as the job's input and asks TextInputFormat
    // for its splits, requesting a single split.
    // NOTE(review): the closing braces of the catch block and of the method are
    // missing from this excerpt.

    // Queried only for the informational log message below.
    long block_size = localFs.getDefaultBlockSize();

    log.info("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, input_path);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();

    int numSplits = 1;

    // Stays null if getSplits throws; the caller then receives null.
    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // The failure is logged and not rethrown.
        log.error("Error with splits", e);

    return splits;


From source file:org.deeplearning4j.iterativereduce.runtime.yarn.appmaster.ApplicationMaster.java

License:Apache License

/**
 * Returns the set of worker configuration tuples, computing it on first use
 * and caching it in {@code confTuples}.
 *
 * The input path is read from the APP_INPUT_PATH property, the configured
 * input format is asked for its splits, and the splits are then iterated.
 *
 * NOTE(review): this excerpt is truncated. Nothing visible adds tuples to the
 * result set or increments the worker id, and several closing braces are
 * missing.
 *
 * @return the cached or newly built set of configuration tuples
 * @throws IOException declared by the method; presumably raised by the
 *                     file-system and split calls below (confirm)
 */
private Set<ConfigurationTuple> getConfigurationTuples() throws IOException {
    // Reuse the previously computed set if there is one.
    if (confTuples != null)
        return confTuples;
    Path inputPath = new Path(props.getProperty(ConfigFields.APP_INPUT_PATH));
    FileSystem fs = FileSystem.get(conf);
    // `f` is only referenced by the commented-out line below.
    FileStatus f = fs.getFileStatus(inputPath);
    //BlockLocation[] bl = fs.getFileBlockLocations(p, 0, f.getLen());
    Set<ConfigurationTuple> configTuples = new HashSet<>();
    int workerId = 0;

    // A fresh default Configuration is used here, not the `conf` field used for the file system above.
    JobConf job = new JobConf(new Configuration());

    job.setInputFormat((Class<? extends InputFormat>) this.inputFormatClass); //TextInputFormat.class);

    FileInputFormat.setInputPaths(job, inputPath);

    InputSplit[] splits = job.getInputFormat().getSplits(job, job.getNumMapTasks());

    for (InputSplit split : splits) {

        // NOTE(review): nothing visible populates this object before it is logged below.
        FileSplit convertedToMetronomeSplit = new FileSplit();

        // Assumes every split is a mapred FileSplit; the cast fails otherwise.
        org.apache.hadoop.mapred.FileSplit hadoopFileSplit = (org.apache.hadoop.mapred.FileSplit) split;

        // NOTE(review): if getLength() is already the byte count of the split,
        // subtracting getStart() looks unintended. Confirm the intended check.
        if (hadoopFileSplit.getLength() - hadoopFileSplit.getStart() > 0) {

            StartupConfiguration config = StartupConfiguration.newBuilder().setBatchSize(batchSize)
                    .build();

            String wid = "worker-" + workerId;
            // Uses the split's first reported location; assumes at least one location exists (confirm).
            ConfigurationTuple tuple = new ConfigurationTuple(split.getLocations()[0], wid, config);


            LOG.info("IR_AM_worker: " + wid + " added split: " + convertedToMetronomeSplit.toString());

        } else {
            LOG.info("IR_AM: Culled out 0 length Split: " + convertedToMetronomeSplit.toString());


    LOG.info("Total Splits/Workers: " + configTuples.size());

    // Cache the result for later calls.
    confTuples = configTuples;
    return configTuples;