Example usage for org.apache.hadoop.mapred.FileInputFormat.setInputPaths

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.FileInputFormat.setInputPaths.

Prototype

public static void setInputPaths(JobConf conf, Path... inputPaths) 

Document

Set the array of Paths as the list of inputs for the map-reduce job.
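
Before the collected examples below, here is a minimal, self-contained sketch of the call in context. It is not taken from any of the source files on this page; the class name, job name, and input/output paths are placeholders. Besides the varargs Path overload documented above, FileInputFormat also provides a String overload that accepts comma-separated path names and addInputPath for appending rather than replacing.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetInputPathsSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputPathsSketch.class);
        conf.setJobName("set-input-paths-sketch");

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // Replace any previously configured inputs with these two (placeholder) paths.
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));

        // Alternative: the String overload takes comma-separated path names.
        // FileInputFormat.setInputPaths(conf, "/data/in1,/data/in2");

        // addInputPath appends a path instead of replacing the current list.
        // FileInputFormat.addInputPath(conf, new Path("/data/in3"));

        FileOutputFormat.setOutputPath(conf, new Path("/data/out"));

        // With no mapper/reducer set, the old mapred API defaults to the identity classes.
        JobClient.runJob(conf);
    }
}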

Usage

From source file: org.apache.whirr.service.yarn.integration.AbstractHadoopServiceTest.java

License: Apache License

@Test
public void test() throws Exception {
    Configuration conf = controller.getConfiguration();
    JobConf job = new JobConf(conf, AbstractHadoopServiceTest.class);

    FileSystem fs = FileSystem.get(conf);

    OutputStream os = fs.create(new Path("input"));
    Writer wr = new OutputStreamWriter(os);
    wr.write("b a\n");
    wr.close();

    job.setMapperClass(TokenCountMapper.class);
    job.setReducerClass(LongSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileInputFormat.setInputPaths(job, new Path("input"));
    FileOutputFormat.setOutputPath(job, new Path("output"));

    JobClient.runJob(job);

    FSDataInputStream in = fs.open(new Path("output/part-00000"));
    BufferedReader reader = new BufferedReader(new InputStreamReader(in));
    assertEquals("a\t1", reader.readLine());
    assertEquals("b\t1", reader.readLine());
    assertNull(reader.readLine());
    reader.close();

}

From source file: org.archive.wayback.hadoop.CDXSort.java

License: Apache License

/**
 * The main driver for the sort program. Invoke this method to submit the
 * map/reduce job.
 * 
 * @throws IOException
 *             When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {

    boolean compressOutput = false;
    boolean dereferenceInputs = false;
    boolean canonicalize = false;
    boolean funkyInput = false;

    JobConf jobConf = new JobConf(getConf(), CDXSort.class);
    jobConf.setJobName("cdxsort");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    List<String> otherArgs = new ArrayList<String>();

    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("--compress-output".equals(args[i])) {
                compressOutput = true;
            } else if ("--funky-input".equals(args[i])) {
                funkyInput = true;
            } else if ("--dereference-inputs".equals(args[i])) {
                dereferenceInputs = true;
            } else if ("--canonicalize".equals(args[i])) {
                canonicalize = true;
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Make sure there are exactly 3 parameters left: split input output
    if (otherArgs.size() != 3) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 3.");
        return printUsage();
    }

    String splitPath = otherArgs.get(0);
    String inputPath = otherArgs.get(1);
    String outputPath = otherArgs.get(2);

    // load the split file, find and set the number of reduces
    AlphaPartitioner partitioner = new AlphaPartitioner();
    File localSplitFile = new File(splitPath);
    FileInputStream fis = new FileInputStream(localSplitFile);
    InputStreamReader isr = new InputStreamReader(fis, ByteOp.UTF8);
    BufferedReader bis = new BufferedReader(isr);
    //      try {
    //         partitioner.loadBoundaries(bis);
    //      } catch (IOException except) {
    //         System.err.println("ERROR: Problem loading file " + splitPath);
    //         return printUsage(); // exits
    //      }
    //      jobConf.setNumReduceTasks(partitioner.getNumPartitions());
    //
    //      // copy the split file into the FS, add to the DistributedCache:
    ////      AlphaPartitioner.setPartitionFile(jobConf, localSplitFile);
    //      AlphaPartitioner.setSplitCache(jobConf, localSplitFile);
    //      System.err.println("uploaded split file to FS and DistributedCache");
    //
    //      // Set job configs:
    //      jobConf.setInputFormat(TextInputFormat.class);
    //
    //      jobConf.setOutputFormat(TextOutputFormat.class);
    //      if (canonicalize) {
    //         jobConf.setMapperClass(CDXCanonicalizerMapClass.class);
    //      } else {
    //         jobConf.setMapperClass(CDXMapClass.class);
    //      }
    //      jobConf.setOutputKeyClass(Text.class);
    //      jobConf.setOutputValueClass(Text.class);
    //      jobConf.set("mapred.textoutputformat.separator", " ");
    //      jobConf.setPartitionerClass(AlphaPartitioner.class);

    int inputCount = 0;
    // Set job input:
    if (dereferenceInputs) {

        // SO SLOW... can't add one at a time...
        //         FileReader is2 = new FileReader(new File(inputPath));
        //         BufferedReader bis2 = new BufferedReader(is2);
        //         while (true) {
        //            String line = bis2.readLine();
        //            if (line == null) {
        //               break;
        //            }
        //            FileInputFormat.addInputPath(jobConf, new Path(line));
        //            inputCount++;
        //            System.err.println("Added path(" + inputCount + "): " + line);
        //         }

        // PASS 2:
        //         FileReader is2 = new FileReader(new File(inputPath));
        //         BufferedReader bis2 = new BufferedReader(is2);
        //         ArrayList<String> list = new ArrayList<String>();
        //         
        //         while (true) {
        //            String line = bis2.readLine();
        //            if (line == null) {
        //               break;
        //            }
        //            list.add(line);
        //            inputCount++;
        //         }
        //         Path arr[] = new Path[list.size()];
        //         for(int i=0; i < list.size(); i++) {
        //            arr[i] = new Path(list.get(i));
        //         }
        //         FileInputFormat.setInputPaths(jobConf, arr);

        // PASS 3:
        if (funkyInput) {
            jobConf.setMapperClass(FunkyDeReffingCDXCanonicalizerMapClass.class);
        } else {
            jobConf.setMapperClass(DeReffingCDXCanonicalizerMapClass.class);
        }
        FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
        inputCount = 1;

    } else {
        FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
        inputCount = 1;
    }

    // Set job output:
    FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));

    if (compressOutput) {
        FileOutputFormat.setCompressOutput(jobConf, true);
        FileOutputFormat.setOutputCompressorClass(jobConf, GzipCodec.class);
    }

    //      System.out.println("Running on " + cluster.getTaskTrackers()
    //            + " nodes, processing " + inputCount + " files/directories"
    //            + " into " + outputPath + " with "
    //            + partitioner.getNumPartitions() + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}

From source file: org.asayler.WikiTitleCount.java

License: Apache License

/**
 * The main driver for the wikititlecount map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WikiTitleCount.class);
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();

    int num_maps = 1;
    int num_reducers = 1;

    conf.setJobName("wikititlecount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    /** Set Default Mappers */
    num_maps = (int) (cluster.getMaxMapTasks());

    /** Set Default Reducers */
    num_reducers = (int) (cluster.getMaxReduceTasks() * 0.9);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            other_args.add(args[i]);
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
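    // Note: passing a String here resolves to the setInputPaths(JobConf, String) overload,
    // which accepts comma-separated path names.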
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    /* Set Mappers and Reducer */
    conf.setNumMapTasks(num_maps);
    conf.setNumReduceTasks(num_reducers);

    JobClient.runJob(conf);
    return 0;
}

From source file: org.asayler.WikiTitleSort.java

License: Apache License

/**
 * The main driver for the wikititlesort map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WikiTitleSort.class);
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();

    int num_maps = 1;
    final int num_reducers = 1;

    conf.setJobName("wikititlesort");

    conf.setMapperClass(MapClass.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    /** Set Default Mappers */
    num_maps = (int) (cluster.getMaxMapTasks());

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            other_args.add(args[i]);
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
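    // Note: passing a String here resolves to the setInputPaths(JobConf, String) overload,
    // which accepts comma-separated path names.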
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    /* Set Mappers and Reducer */
    conf.setNumMapTasks(num_maps);
    conf.setNumReduceTasks(num_reducers);

    JobClient.runJob(conf);
    return 0;
}

From source file: org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java

License: Apache License

@Test
public void TestArcInputFormat() throws IOException, InterruptedException {
    for (int i = 0; i < NUM_ITERATIONS; ++i) {
        JobConf job = new JobConf();
        FileSystem fs = LocalFileSystem.get(job);
        Path path = new Path("/tmp/" + File.createTempFile("ARCInputFormat", "test").getName());
        fs.mkdirs(path);

        List<Pair<Path, List<TestRecord>>> fileList = buildTestFiles(path, fs, NUM_TEST_FILES);

        FileInputFormat.setInputPaths(job, path);

        ARCFileInputFormat inputFormat = new ARCFileInputFormat();

        InputSplit splits[] = inputFormat.getSplits(job, 0);

        for (InputSplit split : splits) {
            RecordReader<Text, BytesWritable> reader = inputFormat.getRecordReader(split, job, null);
            validateSplit(fs, split, fileList, reader);
        }

        Assert.assertTrue(fileList.size() == 0);

        fs.delete(path, true);
    }

}

From source file: org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java

License: Apache License

@Test
public void TestArcItemInputFormat() throws IOException, InterruptedException {
    for (int i = 0; i < NUM_ITERATIONS; ++i) {
        JobConf job = new JobConf();
        FileSystem fs = LocalFileSystem.get(job);
        Path path = new Path("/tmp/" + File.createTempFile("ARCInputFormat", "test").getName());
        fs.mkdirs(path);

        List<Pair<Path, List<TestRecord>>> fileList = buildTestFiles(path, fs, NUM_TEST_FILES);

        FileInputFormat.setInputPaths(job, path);

        ARCFileItemInputFormat inputFormat = new ARCFileItemInputFormat();

        InputSplit splits[] = inputFormat.getSplits(job, 0);

        for (InputSplit split : splits) {
            RecordReader<Text, ArcFileItem> reader = inputFormat.getRecordReader(split, job, null);
            validateArcFileItemSplit(fs, split, fileList, reader);
        }

        Assert.assertTrue(fileList.size() == 0);

        fs.delete(path, true);
    }

}

From source file: org.datavec.hadoop.records.reader.TestBasicHDFS_Integration.java

License: Apache License

/**
 * Generate splits for this run.
 * 
 * @param input_path
 * @param job
 * @return the generated input splits
 */
private InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, input_path);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // Splitting failed; print the error and fall through with null splits
        e.printStackTrace();
    }

    return splits;

}

From source file: org.deeplearning4j.iterativereduce.irunit.IRUnitDriver.java

License: Apache License

/**
 * Generate splits for this run.
 *
 * @param inputPath
 * @param job
 * @return array of {@link InputSplit}
 */
private InputSplit[] generateDebugSplits(Path inputPath, JobConf job) {

    long block_size = localFs.getDefaultBlockSize(inputPath);

    log.info("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, inputPath);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        log.error("Error loading properties ", e);

    }

    return splits;

}

From source file: org.deeplearning4j.iterativereduce.runtime.irunit.IRUnitDriver.java

License: Apache License

/**
 * Generate splits for this run.
 *
 * @param input_path
 * @param job
 * @return the generated input splits
 */
private InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();

    log.info("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, input_path);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        log.error("Error with splits", e);
    }

    return splits;

}

From source file: org.deeplearning4j.iterativereduce.runtime.yarn.appmaster.ApplicationMaster.java

License: Apache License

private Set<ConfigurationTuple> getConfigurationTuples() throws IOException {
    if (confTuples != null)
        return confTuples;
    Path inputPath = new Path(props.getProperty(ConfigFields.APP_INPUT_PATH));
    FileSystem fs = FileSystem.get(conf);
    FileStatus f = fs.getFileStatus(inputPath);
    //BlockLocation[] bl = fs.getFileBlockLocations(p, 0, f.getLen());
    Set<ConfigurationTuple> configTuples = new HashSet<>();
    int workerId = 0;

    JobConf job = new JobConf(new Configuration());

    job.setInputFormat((Class<? extends InputFormat>) this.inputFormatClass); // e.g. TextInputFormat.class

    FileInputFormat.setInputPaths(job, inputPath);

    InputSplit[] splits = job.getInputFormat().getSplits(job, job.getNumMapTasks());

    for (InputSplit split : splits) {

        FileSplit convertedToMetronomeSplit = new FileSplit();

        org.apache.hadoop.mapred.FileSplit hadoopFileSplit = (org.apache.hadoop.mapred.FileSplit) split;

        if (hadoopFileSplit.getLength() - hadoopFileSplit.getStart() > 0) {
            convertedToMetronomeSplit.setLength(hadoopFileSplit.getLength());
            convertedToMetronomeSplit.setOffset(hadoopFileSplit.getStart());
            convertedToMetronomeSplit.setPath(hadoopFileSplit.getPath().toString());

            StartupConfiguration config = StartupConfiguration.newBuilder().setBatchSize(batchSize)
                    .setIterations(iterationCount).setOther(appConfig).setSplit(convertedToMetronomeSplit)
                    .build();

            String wid = "worker-" + workerId;
            ConfigurationTuple tuple = new ConfigurationTuple(split.getLocations()[0], wid, config);

            configTuples.add(tuple);
            workerId++;

            LOG.info("IR_AM_worker: " + wid + " added split: " + convertedToMetronomeSplit.toString());

        } else {
            LOG.info("IR_AM: Culled out 0 length Split: " + convertedToMetronomeSplit.toString());
        }

    }

    LOG.info("Total Splits/Workers: " + configTuples.size());

    confTuples = configTuples;
    return configTuples;
}