This page collects example usages of org.apache.hadoop.mapred.FileInputFormat.setInputPaths.


public static void setInputPaths(JobConf conf, Path... inputPaths) 

Sets the given array of Paths as the list of inputs for the map-reduce job.


From source file:com.trace.hadoop.examples.Grep.java

License:Apache License

/**
 * Configures two chained jobs: a "grep" job that reads {@code args[0]} and writes to a randomly named
 * temporary directory, and a "sort" job that reads that directory and writes to {@code args[1]}.
 *
 * <p>NOTE(review): this excerpt is truncated. No job submission is visible, the
 * {@code setOutputKeyComparatorClass} statement has no argument list, and the closing braces of the
 * {@code finally} block and of the method are missing. Consult the full source before reusing it.
 *
 * @param args {@code <inDir> <outDir> <regex> [<group>]}
 * @return {@code -1} when fewer than three arguments are supplied; {@code 0} otherwise
 * @throws Exception propagated from the calls made in the body
 */
public int run(String[] args) throws Exception {
    // Fewer than three arguments: print the usage line and report failure.
    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        return -1;
    }

    // Intermediate output location; the random suffix makes the name different on each run.
    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf grepJob = new JobConf(getConf(), Grep.class);

    try {


        // First job: input is the directory given as args[0].
        FileInputFormat.setInputPaths(grepJob, args[0]);

        // The regex, and the optional fourth argument (group), are passed through the job configuration.
        grepJob.set("mapred.mapper.regex", args[2]);
        if (args.length == 4)
            grepJob.set("mapred.mapper.regex.group", args[3]);


        // The first job writes into the temporary directory.
        FileOutputFormat.setOutputPath(grepJob, tempDir);


        JobConf sortJob = new JobConf(getConf(), Grep.class);

        // Second job: consumes the temporary directory written to above.
        FileInputFormat.setInputPaths(sortJob, tempDir);


        sortJob.setNumReduceTasks(1); // write a single file
        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        // NOTE(review): the argument list of the following call is missing from this excerpt.
        sortJob.setOutputKeyComparatorClass // sort by decreasing freq

    } finally {
        // Always remove the temporary directory; the second argument presumably requests a
        // recursive delete (confirm against the FileSystem API).
        FileSystem.get(grepJob).delete(tempDir, true);
    // NOTE(review): the brace closing the finally block is missing here, as is the one closing the method.
    return 0;

From source file:com.twitter.maple.jdbc.JDBCTap.java

License:Open Source License

public void sourceConfInit(FlowProcess<JobConf> process, JobConf conf) {
    // a hack for MultiInputFormat to see that there is a child format
    FileInputFormat.setInputPaths(conf, getPath());

    if (username == null)
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl);
    else
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl, username, password);

    super.sourceConfInit(process, conf);

From source file:com.twitter.meatlocker.jdbc.JDBCTap.java

License:Open Source License

public void sourceConfInit(HadoopFlowProcess process, JobConf conf) {
    // a hack for MultiInputFormat to see that there is a child format
    FileInputFormat.setInputPaths(conf, getPath());

    if (username == null)
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl);
    else
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl, username, password);

    super.sourceConfInit(process, conf);

From source file:com.uber.hoodie.hadoop.HoodieInputFormatTest.java

License:Apache License

public void testInputFormatLoad() throws IOException {
    // initial commit
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    InputSplit[] inputSplits = inputFormat.getSplits(jobConf, 10);
    assertEquals(10, inputSplits.length);

    FileStatus[] files = inputFormat.listStatus(jobConf);
    assertEquals(10, files.length);

From source file:com.uber.hoodie.hadoop.HoodieInputFormatTest.java

License:Apache License

public void testInputFormatUpdates() throws IOException {
    // initial commit
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    FileStatus[] files = inputFormat.listStatus(jobConf);
    assertEquals(10, files.length);

    // update files
    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", true);
    // Before the commit
    files = inputFormat.listStatus(jobConf);
    assertEquals(10, files.length);
    ensureFilesInCommit("Commit 200 has not been committed. We should not see files from this commit", files,
            "200", 0);
    InputFormatTestUtil.commit(basePath, "200");
    files = inputFormat.listStatus(jobConf);
    assertEquals(10, files.length);
    ensureFilesInCommit("5 files have been updated to commit 200. We should see 5 files from commit 200 and 5 "
            + "files from 100 commit", files, "200", 5);
    ensureFilesInCommit("5 files have been updated to commit 200. We should see 5 files from commit 100 and 5 "
            + "files from 200 commit", files, "100", 5);

From source file:com.uber.hoodie.hadoop.HoodieInputFormatTest.java

License:Apache License

public void testIncrementalSimple() throws IOException {
    // initial commit
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    InputFormatTestUtil.setupIncremental(jobConf, "100", 1);

    FileStatus[] files = inputFormat.listStatus(jobConf);
    assertEquals(//  ww w  .  jav  a  2 s  .  c o m
            "We should exclude commit 100 when returning incremental pull with start commit time as " + "100",
            0, files.length);

From source file:com.uber.hoodie.hadoop.HoodieInputFormatTest.java

License:Apache License

public void testIncrementalWithMultipleCommits() throws IOException {
    // initial commit
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
    // update files
    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", false);
    InputFormatTestUtil.commit(basePath, "200");

    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 4, "300", false);
    InputFormatTestUtil.commit(basePath, "300");

    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 3, "400", false);
    InputFormatTestUtil.commit(basePath, "400");

    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 2, "500", false);
    InputFormatTestUtil.commit(basePath, "500");

    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 1, "600", false);
    InputFormatTestUtil.commit(basePath, "600");

    InputFormatTestUtil.setupIncremental(jobConf, "100", 1);
    FileStatus[] files = inputFormat.listStatus(jobConf);
    assertEquals("Pulling 1 commit from 100, should get us the 5 files committed at 200", 5, files.length);
    ensureFilesInCommit("Pulling 1 commit from 100, should get us the 5 files committed at 200", files, "200",
            5);

    InputFormatTestUtil.setupIncremental(jobConf, "100", 3);
    files = inputFormat.listStatus(jobConf);

    assertEquals("Pulling 3 commits from 100, should get us the 3 files from 400 commit, 1 file from 300 "
            + "commit and 1 file from 200 commit", 5, files.length);
    ensureFilesInCommit("Pulling 3 commits from 100, should get us the 3 files from 400 commit", files, "400",
    ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 300 commit", files, "300",
    ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 200 commit", files, "200",

    InputFormatTestUtil.setupIncremental(jobConf, "100", HoodieHiveUtil.MAX_COMMIT_ALL);
    files = inputFormat.listStatus(jobConf);

    assertEquals("Pulling all commits from 100, should get us the 1 file from each of 200,300,400,500,400 "
            + "commits", 5, files.length);
    ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 600 commit", files, "600",
    ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 500 commit", files, "500",
    ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 400 commit", files, "400",
    ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 300 commit", files, "300",
    ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 200 commit", files, "200",

From source file:com.uber.hoodie.hadoop.HoodieInputFormatTest.java

License:Apache License

public void testPredicatePushDown() throws IOException {
    // initial commit
    Schema schema = InputFormatTestUtil.readSchema("/sample1.avsc");
    String commit1 = "20160628071126";
    File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 10, commit1);
    InputFormatTestUtil.commit(basePath, commit1);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
    // check whether we have 10 records at this point
    ensureRecordsInCommit("We need to have 10 records at this point for commit " + commit1, commit1, 10, 10);

    // update 2 records in the original parquet file and save it as commit 200
    String commit2 = "20160629193623";
    InputFormatTestUtil.simulateParquetUpdates(partitionDir, schema, commit1, 10, 2, commit2);
    InputFormatTestUtil.commit(basePath, commit2);

    InputFormatTestUtil.setupIncremental(jobConf, commit1, 1);
    // check whether we have 2 records at this point
    ensureRecordsInCommit("We need to have 2 records that was modified at commit " + commit2 + " and no more",
            commit2, 2, 2);
    // Make sure we have the 10 records if we roll back the stattime
    InputFormatTestUtil.setupIncremental(jobConf, "0", 2);
    ensureRecordsInCommit("We need to have 8 records that was modified at commit " + commit1 + " and no more",
            commit1, 8, 10);
    ensureRecordsInCommit("We need to have 2 records that was modified at commit " + commit2 + " and no more",
            commit2, 2, 10);

From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java

License:Apache License

/**
 * Writes a base parquet dataset at instant "100", then for each configured log version writes a log
 * block, builds a realtime split over the base file, and asserts that every record read back through
 * {@code HoodieRealtimeRecordReader} carries the expected latest instant.
 *
 * <p>NOTE(review): this excerpt is truncated in several places: unterminated calls, missing
 * constructor arguments, and missing closing braces for the {@code else}, {@code if}, {@code while},
 * {@code try}/{@code catch}, the lambda and the method itself. It does not compile as shown;
 * consult the full source before reusing it.
 *
 * @param partitioned selects the partitioned or the non-partitioned dataset helper, and whether
 *     "partition_columns" is set on the job configuration
 * @throws Exception propagated from the helpers called in the body
 */
public void testReader(boolean partitioned) throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    // NOTE(review): the remaining argument(s) and closing of this call are missing from the excerpt.
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(),
    String baseInstant = "100";
    File partitionDir = partitioned
            ? InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 100, baseInstant)
            : InputFormatTestUtil.prepareNonPartitionedParquetDataset(basePath, schema, 1, 100, baseInstant);
    InputFormatTestUtil.commit(basePath, baseInstant);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // (action, log version) pairs to replay, in order, against the base file.
    List<Pair<String, Integer>> logVersionsWithAction = new ArrayList<>();
    logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 1));
    logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 2));
    // TODO: HUDI-154 Once Hive 2.x PR (PR-674) is merged, enable this change
    // logVersionsWithAction.add(Pair.of(HoodieTimeline.ROLLBACK_ACTION, 3));
    // The partition path is relative to the base path when partitioned, otherwise the literal "default".
    FileSlice fileSlice = new FileSlice(
            partitioned ? FSUtils.getRelativePartitionPath(new Path(basePath.getRoot().getAbsolutePath()),
                    new Path(partitionDir.getAbsolutePath())) : "default",
            baseInstant, "fileid0");
    logVersionsWithAction.stream().forEach(logVersionWithAction -> {
        try {
            // update files or generate new log file
            int logVersion = logVersionWithAction.getRight();
            String action = logVersionWithAction.getKey();
            int baseInstantTs = Integer.parseInt(baseInstant);
            // The instant for this log version is the base instant plus the version number.
            String instantTime = String.valueOf(baseInstantTs + logVersion);
            // For a rollback, the expected latest instant is two versions back; otherwise it is this one.
            String latestInstant = action.equals(HoodieTimeline.ROLLBACK_ACTION)
                    ? String.valueOf(baseInstantTs + logVersion - 2)
                    : instantTime;

            HoodieLogFormat.Writer writer = null;
            if (action.equals(HoodieTimeline.ROLLBACK_ACTION)) {
                writer = writeRollback(partitionDir, schema, "fileid0", baseInstant, instantTime,
                        String.valueOf(baseInstantTs + logVersion - 1), logVersion);
            } else {
                writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", baseInstant, instantTime, 100,
                        0, logVersion);
            // NOTE(review): the brace closing the else branch is missing from the excerpt.
            long size = writer.getCurrentSize();
            assertTrue("block - size should be > 0", size > 0);

            //create a split with baseFile (parquet file written earlier) and new log file(s)
            // NOTE(review): the rest of the FileSplit arguments and the start of the stream expression
            // that ends in `.map(...)` are missing from the excerpt.
            HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
                    new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1,
                            .map(h -> h.getPath().toString()).collect(Collectors.toList()),

            //create a RecordReader to be used by HoodieRealtimeRecordReader
            // NOTE(review): the final argument and closing of getRecordReader(...) are missing.
            RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
                    new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf,
            // From here on `jobConf` names this new local; the earlier uses presumably resolve to a
            // field of the enclosing test class (confirm against the full source).
            JobConf jobConf = new JobConf();
            // Project every schema field: names and positions are passed as comma-separated lists.
            List<Schema.Field> fields = schema.getFields();
            String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
            // NOTE(review): the terminal operation of this stream is missing from the excerpt.
            String postions = fields.stream().map(f -> String.valueOf(f.pos()))
            jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
            jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
            if (partitioned) {
                jobConf.set("partition_columns", "datestr");
            // NOTE(review): the brace closing this if is missing from the excerpt.

            //validate record reader compaction
            HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);

            //use reader to read base Parquet File and log file, merge in flight and return latest commit
            //here all 100 records should be updated, see above
            NullWritable key = recordReader.createKey();
            ArrayWritable value = recordReader.createValue();
            while (recordReader.next(key, value)) {
                Writable[] values = value.get();
                //check if the record written is with latest commit, here "101"
                Assert.assertEquals(latestInstant, values[0].toString());
                // Fresh key/value holders for the next iteration.
                key = recordReader.createKey();
                value = recordReader.createValue();
        // NOTE(review): the brace closing the while loop is missing before this catch.
        } catch (Exception ioe) {
            // Rethrown as an unchecked HoodieException with the original exception as the cause.
            throw new HoodieException(ioe.getMessage(), ioe);

    // Add Rollback last version to next log-file


From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java

License:Apache License

/**
 * Writes a base parquet dataset at commit "100" and a log block of new records at commit "101",
 * then reads both through {@code RealtimeUnmergedRecordReader} with merge skipping enabled and
 * checks the key ranges seen per commit.
 *
 * <p>NOTE(review): this excerpt is truncated. The {@code initTableType} call is unterminated, the
 * closing braces of the {@code else} branch, the {@code while} loop and the method are missing, and
 * no visible statement updates {@code numRecordsAtCommit1}, {@code numRecordsAtCommit2},
 * {@code seenKeys} or {@code lastSeenKeyFromLog}. As shown, the final three assertions could not
 * pass; the bookkeeping lines were presumably lost in extraction (confirm against the full source).
 *
 * @throws Exception propagated from the helpers called in the body
 */
public void testUnMergedReader() throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    // NOTE(review): the remaining argument(s) and closing of this call are missing from the excerpt.
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(),
    String commitTime = "100";
    final int numRecords = 1000;
    // Upper key bounds used by the range assertions in the read loop.
    final int firstBatchLastRecordKey = numRecords - 1;
    final int secondBatchLastRecordKey = 2 * numRecords - 1;
    File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numRecords, commitTime);
    InputFormatTestUtil.commit(basePath, commitTime);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // insert new records to log file
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", commitTime,
            newCommitTime, numRecords, numRecords, 0);
    // The size is captured before the writer is closed.
    long size = writer.getCurrentSize();
    writer.close();
    assertTrue("block - size should be > 0", size > 0);
    // NOTE(review): the next assertion is an exact duplicate of the previous one.
    assertTrue("block - size should be > 0", size > 0);

    //create a split with baseFile (parquet file written earlier) and new log file(s)
    // Note that the log file path is read from the writer after close() was called above.
    String logFilePath = writer.getLogFile().getPath().toString();
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
            new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, jobConf),
            basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime);

    //create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
            new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
    // From here on `jobConf` names this new local; the earlier uses presumably resolve to a field
    // of the enclosing test class (confirm against the full source).
    JobConf jobConf = new JobConf();
    // Project every schema field: names and positions are passed as comma-separated lists.
    List<Schema.Field> fields = schema.getFields();
    String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
    String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
    jobConf.set("partition_columns", "datestr");
    // Enable merge skipping.
    jobConf.set("hoodie.realtime.merge.skip", "true");

    //validate unmerged record reader
    RealtimeUnmergedRecordReader recordReader = new RealtimeUnmergedRecordReader(split, jobConf, reader);

    //use reader to read base Parquet File and log file
    //here all records should be present. Also ensure log records are in order.
    NullWritable key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    int numRecordsAtCommit1 = 0;
    int numRecordsAtCommit2 = 0;
    Set<Integer> seenKeys = new HashSet<>();
    Integer lastSeenKeyFromLog = firstBatchLastRecordKey;
    while (recordReader.next(key, value)) {
        Writable[] values = value.get();
        // Column 0 is compared against the commit time; column 2 is parsed as "key" + integer.
        String gotCommit = values[0].toString();
        String keyStr = values[2].toString();
        Integer gotKey = Integer.parseInt(keyStr.substring("key".length()));
        if (gotCommit.equals(newCommitTime)) {
            // Records from the new commit: keys lie in the second batch and arrive consecutively.
            Assert.assertTrue(gotKey > firstBatchLastRecordKey);
            Assert.assertTrue(gotKey <= secondBatchLastRecordKey);
            Assert.assertEquals(gotKey.intValue(), lastSeenKeyFromLog + 1);
        } else {
            // Records from the base commit: keys lie in the first batch.
            Assert.assertTrue(gotKey >= 0);
            Assert.assertTrue(gotKey <= firstBatchLastRecordKey);
        // NOTE(review): the brace closing the else branch is missing here.
        // Ensure unique key
        key = recordReader.createKey();
        value = recordReader.createValue();
    // NOTE(review): the brace closing the while loop is missing here.
    Assert.assertEquals(numRecords, numRecordsAtCommit1);
    Assert.assertEquals(numRecords, numRecordsAtCommit2);
    Assert.assertEquals(2 * numRecords, seenKeys.size());