List of usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
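Before the examples below, a minimal sketch of how this varargs overload is typically wired into a job. The class name, input/output directories, and the map-only identity setup are hypothetical illustrations, not taken from any of the source files listed here:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetInputPathsExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputPathsExample.class);
        conf.setJobName("set-input-paths-example");

        // The Path... overload replaces any previously configured input paths
        // with exactly the paths passed here (directories are hypothetical).
        FileInputFormat.setInputPaths(conf, new Path("/data/input-a"), new Path("/data/input-b"));

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // Map-only identity job: TextInputFormat yields <LongWritable, Text> pairs,
        // which the default IdentityMapper passes straight through to the output.
        conf.setNumReduceTasks(0);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        FileOutputFormat.setOutputPath(conf, new Path("/data/output"));
        JobClient.runJob(conf);
    }
}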
From source file:com.trace.hadoop.examples.Grep.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf grepJob = new JobConf(getConf(), Grep.class);
    try {
        grepJob.setJobName("grep-search");

        FileInputFormat.setInputPaths(grepJob, args[0]);

        grepJob.setMapperClass(RegexMapper.class);
        grepJob.set("mapred.mapper.regex", args[2]);
        if (args.length == 4)
            grepJob.set("mapred.mapper.regex.group", args[3]);

        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(grepJob, tempDir);
        grepJob.setOutputFormat(SequenceFileOutputFormat.class);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);

        JobClient.runJob(grepJob);

        JobConf sortJob = new JobConf(getConf(), Grep.class);
        sortJob.setJobName("grep-sort");

        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormat(SequenceFileInputFormat.class);

        sortJob.setMapperClass(InverseMapper.class);

        sortJob.setNumReduceTasks(1); // write a single file
        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        sortJob.setOutputKeyComparatorClass // sort by decreasing freq
                (LongWritable.DecreasingComparator.class);

        JobClient.runJob(sortJob);
    } finally {
        FileSystem.get(grepJob).delete(tempDir, true);
    }
    return 0;
}
From source file:com.twitter.maple.jdbc.JDBCTap.java
License:Open Source License
@Override
public void sourceConfInit(FlowProcess<JobConf> process, JobConf conf) {
    // a hack for MultiInputFormat to see that there is a child format
    FileInputFormat.setInputPaths(conf, getPath());

    if (username == null)
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl);
    else
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl, username, password);

    super.sourceConfInit(process, conf);
}
From source file:com.twitter.meatlocker.jdbc.JDBCTap.java
License:Open Source License
@Override
public void sourceConfInit(HadoopFlowProcess process, JobConf conf) {
    // a hack for MultiInputFormat to see that there is a child format
    FileInputFormat.setInputPaths(conf, getPath());

    if (username == null)
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl);
    else
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl, username, password);

    super.sourceConfInit(process, conf);
}
From source file:com.uber.hoodie.hadoop.HoodieInputFormatTest.java
License:Apache License
@Test
public void testInputFormatLoad() throws IOException {
    // initial commit
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    InputSplit[] inputSplits = inputFormat.getSplits(jobConf, 10);
    assertEquals(10, inputSplits.length);

    FileStatus[] files = inputFormat.listStatus(jobConf);
    assertEquals(10, files.length);
}
From source file:com.uber.hoodie.hadoop.HoodieInputFormatTest.java
License:Apache License
@Test
public void testInputFormatUpdates() throws IOException {
    // initial commit
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    FileStatus[] files = inputFormat.listStatus(jobConf);
    assertEquals(10, files.length);

    // update files
    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", true);

    // Before the commit
    files = inputFormat.listStatus(jobConf);
    assertEquals(10, files.length);
    ensureFilesInCommit("Commit 200 has not been committed. We should not see files from this commit",
            files, "200", 0);

    InputFormatTestUtil.commit(basePath, "200");

    files = inputFormat.listStatus(jobConf);
    assertEquals(10, files.length);
    ensureFilesInCommit("5 files have been updated to commit 200. We should see 5 files from commit 200 and 5 "
            + "files from 100 commit", files, "200", 5);
    ensureFilesInCommit("5 files have been updated to commit 200. We should see 5 files from commit 100 and 5 "
            + "files from 200 commit", files, "100", 5);
}
From source file:com.uber.hoodie.hadoop.HoodieInputFormatTest.java
License:Apache License
@Test
public void testIncrementalSimple() throws IOException {
    // initial commit
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    InputFormatTestUtil.setupIncremental(jobConf, "100", 1);

    FileStatus[] files = inputFormat.listStatus(jobConf);
    assertEquals(
            "We should exclude commit 100 when returning incremental pull with start commit time as " + "100",
            0, files.length);
}
From source file:com.uber.hoodie.hadoop.HoodieInputFormatTest.java
License:Apache License
@Test
public void testIncrementalWithMultipleCommits() throws IOException {
    // initial commit
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // update files
    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", false);
    InputFormatTestUtil.commit(basePath, "200");

    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 4, "300", false);
    InputFormatTestUtil.commit(basePath, "300");

    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 3, "400", false);
    InputFormatTestUtil.commit(basePath, "400");

    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 2, "500", false);
    InputFormatTestUtil.commit(basePath, "500");

    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 1, "600", false);
    InputFormatTestUtil.commit(basePath, "600");

    InputFormatTestUtil.setupIncremental(jobConf, "100", 1);
    FileStatus[] files = inputFormat.listStatus(jobConf);
    assertEquals("Pulling 1 commit from 100, should get us the 5 files committed at 200", 5, files.length);
    ensureFilesInCommit("Pulling 1 commit from 100, should get us the 5 files committed at 200", files,
            "200", 5);

    InputFormatTestUtil.setupIncremental(jobConf, "100", 3);
    files = inputFormat.listStatus(jobConf);
    assertEquals("Pulling 3 commits from 100, should get us the 3 files from 400 commit, 1 file from 300 "
            + "commit and 1 file from 200 commit", 5, files.length);
    ensureFilesInCommit("Pulling 3 commits from 100, should get us the 3 files from 400 commit", files,
            "400", 3);
    ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 300 commit", files,
            "300", 1);
    ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 200 commit", files,
            "200", 1);

    InputFormatTestUtil.setupIncremental(jobConf, "100", HoodieHiveUtil.MAX_COMMIT_ALL);
    files = inputFormat.listStatus(jobConf);
    assertEquals("Pulling all commits from 100, should get us the 1 file from each of 200,300,400,500,400 "
            + "commits", 5, files.length);
    ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 600 commit", files,
            "600", 1);
    ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 500 commit", files,
            "500", 1);
    ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 400 commit", files,
            "400", 1);
    ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 300 commit", files,
            "300", 1);
    ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 200 commit", files,
            "200", 1);
}
From source file:com.uber.hoodie.hadoop.HoodieInputFormatTest.java
License:Apache License
public void testPredicatePushDown() throws IOException {
    // initial commit
    Schema schema = InputFormatTestUtil.readSchema("/sample1.avsc");
    String commit1 = "20160628071126";
    File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 10, commit1);
    InputFormatTestUtil.commit(basePath, commit1);

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // check whether we have 10 records at this point
    ensureRecordsInCommit("We need to have 10 records at this point for commit " + commit1, commit1, 10, 10);

    // update 2 records in the original parquet file and save it as commit 200
    String commit2 = "20160629193623";
    InputFormatTestUtil.simulateParquetUpdates(partitionDir, schema, commit1, 10, 2, commit2);
    InputFormatTestUtil.commit(basePath, commit2);

    InputFormatTestUtil.setupIncremental(jobConf, commit1, 1);
    // check whether we have 2 records at this point
    ensureRecordsInCommit("We need to have 2 records that was modified at commit " + commit2 + " and no more",
            commit2, 2, 2);

    // Make sure we have the 10 records if we roll back the start time
    InputFormatTestUtil.setupIncremental(jobConf, "0", 2);
    ensureRecordsInCommit("We need to have 8 records that was modified at commit " + commit1 + " and no more",
            commit1, 8, 10);
    ensureRecordsInCommit("We need to have 2 records that was modified at commit " + commit2 + " and no more",
            commit2, 2, 10);
}
From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License:Apache License
public void testReader(boolean partitioned) throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(),
            HoodieTableType.MERGE_ON_READ);
    String baseInstant = "100";
    File partitionDir = partitioned
            ? InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 100, baseInstant)
            : InputFormatTestUtil.prepareNonPartitionedParquetDataset(basePath, schema, 1, 100, baseInstant);
    InputFormatTestUtil.commit(basePath, baseInstant);

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    List<Pair<String, Integer>> logVersionsWithAction = new ArrayList<>();
    logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 1));
    logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 2));
    // TODO: HUDI-154 Once Hive 2.x PR (PR-674) is merged, enable this change
    // logVersionsWithAction.add(Pair.of(HoodieTimeline.ROLLBACK_ACTION, 3));

    FileSlice fileSlice = new FileSlice(
            partitioned
                    ? FSUtils.getRelativePartitionPath(new Path(basePath.getRoot().getAbsolutePath()),
                            new Path(partitionDir.getAbsolutePath()))
                    : "default",
            baseInstant, "fileid0");

    logVersionsWithAction.stream().forEach(logVersionWithAction -> {
        try {
            // update files or generate new log file
            int logVersion = logVersionWithAction.getRight();
            String action = logVersionWithAction.getKey();
            int baseInstantTs = Integer.parseInt(baseInstant);
            String instantTime = String.valueOf(baseInstantTs + logVersion);
            String latestInstant = action.equals(HoodieTimeline.ROLLBACK_ACTION)
                    ? String.valueOf(baseInstantTs + logVersion - 2)
                    : instantTime;

            HoodieLogFormat.Writer writer = null;
            if (action.equals(HoodieTimeline.ROLLBACK_ACTION)) {
                writer = writeRollback(partitionDir, schema, "fileid0", baseInstant, instantTime,
                        String.valueOf(baseInstantTs + logVersion - 1), logVersion);
            } else {
                writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", baseInstant, instantTime,
                        100, 0, logVersion);
            }
            long size = writer.getCurrentSize();
            writer.close();
            assertTrue("block - size should be > 0", size > 0);

            // create a split with baseFile (parquet file written earlier) and new log file(s)
            fileSlice.addLogFile(writer.getLogFile());
            HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
                    new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1,
                            jobConf),
                    basePath.getRoot().getPath(),
                    fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
                            .map(h -> h.getPath().toString()).collect(Collectors.toList()),
                    instantTime);

            // create a RecordReader to be used by HoodieRealtimeRecordReader
            RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
                    new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null),
                    jobConf, null);
            JobConf jobConf = new JobConf();
            List<Schema.Field> fields = schema.getFields();
            String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
            String postions = fields.stream().map(f -> String.valueOf(f.pos()))
                    .collect(Collectors.joining(","));
            jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
            jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
            if (partitioned) {
                jobConf.set("partition_columns", "datestr");
            }

            // validate record reader compaction
            HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);

            // use reader to read base Parquet File and log file, merge in flight and return latest commit
            // here all 100 records should be updated, see above
            NullWritable key = recordReader.createKey();
            ArrayWritable value = recordReader.createValue();
            while (recordReader.next(key, value)) {
                Writable[] values = value.get();
                // check if the record written is with latest commit, here "101"
                Assert.assertEquals(latestInstant, values[0].toString());
                key = recordReader.createKey();
                value = recordReader.createValue();
            }
        } catch (Exception ioe) {
            throw new HoodieException(ioe.getMessage(), ioe);
        }
    });

    // Add Rollback last version to next log-file
}
From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License:Apache License
@Test
public void testUnMergedReader() throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(),
            HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    final int numRecords = 1000;
    final int firstBatchLastRecordKey = numRecords - 1;
    final int secondBatchLastRecordKey = 2 * numRecords - 1;
    File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numRecords, commitTime);
    InputFormatTestUtil.commit(basePath, commitTime);

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // insert new records to log file
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", commitTime,
            newCommitTime, numRecords, numRecords, 0);
    long size = writer.getCurrentSize();
    writer.close();
    assertTrue("block - size should be > 0", size > 0);

    // create a split with baseFile (parquet file written earlier) and new log file(s)
    String logFilePath = writer.getLogFile().getPath().toString();
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
            new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, jobConf),
            basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime);

    // create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
            new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
    JobConf jobConf = new JobConf();
    List<Schema.Field> fields = schema.getFields();
    String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
    String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
    jobConf.set("partition_columns", "datestr");
    // Enable merge skipping.
    jobConf.set("hoodie.realtime.merge.skip", "true");

    // validate unmerged record reader
    RealtimeUnmergedRecordReader recordReader = new RealtimeUnmergedRecordReader(split, jobConf, reader);

    // use reader to read base Parquet File and log file
    // here all records should be present. Also ensure log records are in order.
    NullWritable key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    int numRecordsAtCommit1 = 0;
    int numRecordsAtCommit2 = 0;
    Set<Integer> seenKeys = new HashSet<>();
    Integer lastSeenKeyFromLog = firstBatchLastRecordKey;
    while (recordReader.next(key, value)) {
        Writable[] values = value.get();
        String gotCommit = values[0].toString();
        String keyStr = values[2].toString();
        Integer gotKey = Integer.parseInt(keyStr.substring("key".length()));
        if (gotCommit.equals(newCommitTime)) {
            numRecordsAtCommit2++;
            Assert.assertTrue(gotKey > firstBatchLastRecordKey);
            Assert.assertTrue(gotKey <= secondBatchLastRecordKey);
            Assert.assertEquals(gotKey.intValue(), lastSeenKeyFromLog + 1);
            lastSeenKeyFromLog++;
        } else {
            numRecordsAtCommit1++;
            Assert.assertTrue(gotKey >= 0);
            Assert.assertTrue(gotKey <= firstBatchLastRecordKey);
        }
        // Ensure unique key
        Assert.assertFalse(seenKeys.contains(gotKey));
        seenKeys.add(gotKey);
        key = recordReader.createKey();
        value = recordReader.createValue();
    }
    Assert.assertEquals(numRecords, numRecordsAtCommit1);
    Assert.assertEquals(numRecords, numRecordsAtCommit2);
    Assert.assertEquals(2 * numRecords, seenKeys.size());
}