List of usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
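Before the examples below, a minimal sketch of how this varargs overload is typically wired into a job. The class name, input/output directories, and the map-only identity setup are hypothetical illustrations, not taken from any of the source files listed here:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetInputPathsExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputPathsExample.class);
        conf.setJobName("set-input-paths-example");

        // The Path... overload replaces any previously configured input paths
        // with exactly the paths passed here (directories are hypothetical).
        FileInputFormat.setInputPaths(conf, new Path("/data/input-a"), new Path("/data/input-b"));

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // Map-only identity job: TextInputFormat yields <LongWritable, Text> pairs,
        // which the default IdentityMapper passes straight through to the output.
        conf.setNumReduceTasks(0);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        FileOutputFormat.setOutputPath(conf, new Path("/data/output"));
        JobClient.runJob(conf);
    }
}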
From source file:com.trace.hadoop.examples.Grep.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf grepJob = new JobConf(getConf(), Grep.class);
    try {
        grepJob.setJobName("grep-search");

        FileInputFormat.setInputPaths(grepJob, args[0]);

        grepJob.setMapperClass(RegexMapper.class);
        grepJob.set("mapred.mapper.regex", args[2]);
        if (args.length == 4)
            grepJob.set("mapred.mapper.regex.group", args[3]);

        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(grepJob, tempDir);
        grepJob.setOutputFormat(SequenceFileOutputFormat.class);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);

        JobClient.runJob(grepJob);

        JobConf sortJob = new JobConf(getConf(), Grep.class);
        sortJob.setJobName("grep-sort");

        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormat(SequenceFileInputFormat.class);

        sortJob.setMapperClass(InverseMapper.class);

        sortJob.setNumReduceTasks(1); // write a single file
        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        sortJob.setOutputKeyComparatorClass // sort by decreasing freq
                (LongWritable.DecreasingComparator.class);

        JobClient.runJob(sortJob);
    } finally {
        FileSystem.get(grepJob).delete(tempDir, true);
    }
    return 0;
}
From source file:com.twitter.maple.jdbc.JDBCTap.java
License:Open Source License
@Override
public void sourceConfInit(FlowProcess<JobConf> process, JobConf conf) {
    // a hack for MultiInputFormat to see that there is a child format
    FileInputFormat.setInputPaths(conf, getPath());

    if (username == null)
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl);
    else
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl, username, password);

    super.sourceConfInit(process, conf);
}
From source file:com.twitter.meatlocker.jdbc.JDBCTap.java
License:Open Source License
@Override
public void sourceConfInit(HadoopFlowProcess process, JobConf conf) {
    // a hack for MultiInputFormat to see that there is a child format
    FileInputFormat.setInputPaths(conf, getPath());

    if (username == null)
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl);
    else
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl, username, password);

    super.sourceConfInit(process, conf);
}
From source file:com.uber.hoodie.hadoop.HoodieInputFormatTest.java
License:Apache License
@Test
public void testInputFormatLoad() throws IOException {
    // initial commit
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    InputSplit[] inputSplits = inputFormat.getSplits(jobConf, 10);
    assertEquals(10, inputSplits.length);

    FileStatus[] files = inputFormat.listStatus(jobConf);
    assertEquals(10, files.length);
}
From source file:com.uber.hoodie.hadoop.HoodieInputFormatTest.java
License:Apache License
@Test
public void testInputFormatUpdates() throws IOException {
    // initial commit
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    FileStatus[] files = inputFormat.listStatus(jobConf);
    assertEquals(10, files.length);

    // update files
    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", true);

    // Before the commit
    files = inputFormat.listStatus(jobConf);
    assertEquals(10, files.length);
    ensureFilesInCommit("Commit 200 has not been committed. We should not see files from this commit",
            files, "200", 0);

    InputFormatTestUtil.commit(basePath, "200");

    files = inputFormat.listStatus(jobConf);
    assertEquals(10, files.length);
    ensureFilesInCommit("5 files have been updated to commit 200. We should see 5 files from commit 200 and 5 "
            + "files from 100 commit", files, "200", 5);
    ensureFilesInCommit("5 files have been updated to commit 200. We should see 5 files from commit 100 and 5 "
            + "files from 200 commit", files, "100", 5);
}
From source file:com.uber.hoodie.hadoop.HoodieInputFormatTest.java
License:Apache License
@Test
public void testIncrementalSimple() throws IOException {
    // initial commit
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    InputFormatTestUtil.setupIncremental(jobConf, "100", 1);

    FileStatus[] files = inputFormat.listStatus(jobConf);
    assertEquals(
            "We should exclude commit 100 when returning incremental pull with start commit time as " + "100",
            0, files.length);
}
From source file:com.uber.hoodie.hadoop.HoodieInputFormatTest.java
License:Apache License
@Test
public void testIncrementalWithMultipleCommits() throws IOException {
    // initial commit
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // update files
    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", false);
    InputFormatTestUtil.commit(basePath, "200");

    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 4, "300", false);
    InputFormatTestUtil.commit(basePath, "300");

    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 3, "400", false);
    InputFormatTestUtil.commit(basePath, "400");

    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 2, "500", false);
    InputFormatTestUtil.commit(basePath, "500");

    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 1, "600", false);
    InputFormatTestUtil.commit(basePath, "600");

    InputFormatTestUtil.setupIncremental(jobConf, "100", 1);
    FileStatus[] files = inputFormat.listStatus(jobConf);
    assertEquals("Pulling 1 commit from 100, should get us the 5 files committed at 200", 5, files.length);
    ensureFilesInCommit("Pulling 1 commit from 100, should get us the 5 files committed at 200", files,
            "200", 5);

    InputFormatTestUtil.setupIncremental(jobConf, "100", 3);
    files = inputFormat.listStatus(jobConf);
    assertEquals("Pulling 3 commits from 100, should get us the 3 files from 400 commit, 1 file from 300 "
            + "commit and 1 file from 200 commit", 5, files.length);
    ensureFilesInCommit("Pulling 3 commits from 100, should get us the 3 files from 400 commit", files,
            "400", 3);
    ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 300 commit", files,
            "300", 1);
    ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 200 commit", files,
            "200", 1);

    InputFormatTestUtil.setupIncremental(jobConf, "100", HoodieHiveUtil.MAX_COMMIT_ALL);
    files = inputFormat.listStatus(jobConf);
    assertEquals("Pulling all commits from 100, should get us the 1 file from each of 200,300,400,500,400 "
            + "commits", 5, files.length);
    ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 600 commit", files,
            "600", 1);
    ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 500 commit", files,
            "500", 1);
    ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 400 commit", files,
            "400", 1);
    ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 300 commit", files,
            "300", 1);
    ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 200 commit", files,
            "200", 1);
}
From source file:com.uber.hoodie.hadoop.HoodieInputFormatTest.java
License:Apache License
public void testPredicatePushDown() throws IOException {
    // initial commit
    Schema schema = InputFormatTestUtil.readSchema("/sample1.avsc");
    String commit1 = "20160628071126";
    File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 10, commit1);
    InputFormatTestUtil.commit(basePath, commit1);

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // check whether we have 10 records at this point
    ensureRecordsInCommit("We need to have 10 records at this point for commit " + commit1, commit1, 10, 10);

    // update 2 records in the original parquet file and save it as commit 200
    String commit2 = "20160629193623";
    InputFormatTestUtil.simulateParquetUpdates(partitionDir, schema, commit1, 10, 2, commit2);
    InputFormatTestUtil.commit(basePath, commit2);

    InputFormatTestUtil.setupIncremental(jobConf, commit1, 1);
    // check whether we have 2 records at this point
    ensureRecordsInCommit("We need to have 2 records that was modified at commit " + commit2 + " and no more",
            commit2, 2, 2);

    // Make sure we have the 10 records if we roll back the start time
    InputFormatTestUtil.setupIncremental(jobConf, "0", 2);
    ensureRecordsInCommit("We need to have 8 records that was modified at commit " + commit1 + " and no more",
            commit1, 8, 10);
    ensureRecordsInCommit("We need to have 2 records that was modified at commit " + commit2 + " and no more",
            commit2, 2, 10);
}
From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License:Apache License
public void testReader(boolean partitioned) throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(),
            HoodieTableType.MERGE_ON_READ);
    String baseInstant = "100";
    File partitionDir = partitioned
            ? InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 100, baseInstant)
            : InputFormatTestUtil.prepareNonPartitionedParquetDataset(basePath, schema, 1, 100, baseInstant);
    InputFormatTestUtil.commit(basePath, baseInstant);

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    List<Pair<String, Integer>> logVersionsWithAction = new ArrayList<>();
    logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 1));
    logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 2));
    // TODO: HUDI-154 Once Hive 2.x PR (PR-674) is merged, enable this change
    // logVersionsWithAction.add(Pair.of(HoodieTimeline.ROLLBACK_ACTION, 3));

    FileSlice fileSlice = new FileSlice(
            partitioned
                    ? FSUtils.getRelativePartitionPath(new Path(basePath.getRoot().getAbsolutePath()),
                            new Path(partitionDir.getAbsolutePath()))
                    : "default",
            baseInstant, "fileid0");

    logVersionsWithAction.stream().forEach(logVersionWithAction -> {
        try {
            // update files or generate new log file
            int logVersion = logVersionWithAction.getRight();
            String action = logVersionWithAction.getKey();
            int baseInstantTs = Integer.parseInt(baseInstant);
            String instantTime = String.valueOf(baseInstantTs + logVersion);
            String latestInstant = action.equals(HoodieTimeline.ROLLBACK_ACTION)
                    ? String.valueOf(baseInstantTs + logVersion - 2)
                    : instantTime;

            HoodieLogFormat.Writer writer = null;
            if (action.equals(HoodieTimeline.ROLLBACK_ACTION)) {
                writer = writeRollback(partitionDir, schema, "fileid0", baseInstant, instantTime,
                        String.valueOf(baseInstantTs + logVersion - 1), logVersion);
            } else {
                writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", baseInstant, instantTime,
                        100, 0, logVersion);
            }
            long size = writer.getCurrentSize();
            writer.close();
            assertTrue("block - size should be > 0", size > 0);

            // create a split with baseFile (parquet file written earlier) and new log file(s)
            fileSlice.addLogFile(writer.getLogFile());
            HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
                    new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1,
                            jobConf),
                    basePath.getRoot().getPath(),
                    fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
                            .map(h -> h.getPath().toString()).collect(Collectors.toList()),
                    instantTime);

            // create a RecordReader to be used by HoodieRealtimeRecordReader
            RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
                    new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null),
                    jobConf, null);
            JobConf jobConf = new JobConf();
            List<Schema.Field> fields = schema.getFields();
            String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
            String postions = fields.stream().map(f -> String.valueOf(f.pos()))
                    .collect(Collectors.joining(","));
            jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
            jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
            if (partitioned) {
                jobConf.set("partition_columns", "datestr");
            }

            // validate record reader compaction
            HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);

            // use reader to read base Parquet File and log file, merge in flight and return latest commit
            // here all 100 records should be updated, see above
            NullWritable key = recordReader.createKey();
            ArrayWritable value = recordReader.createValue();
            while (recordReader.next(key, value)) {
                Writable[] values = value.get();
                // check if the record written is with latest commit, here "101"
                Assert.assertEquals(latestInstant, values[0].toString());
                key = recordReader.createKey();
                value = recordReader.createValue();
            }
        } catch (Exception ioe) {
            throw new HoodieException(ioe.getMessage(), ioe);
        }
    });

    // Add Rollback last version to next log-file
}
From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License:Apache License
@Test
public void testUnMergedReader() throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(),
            HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    final int numRecords = 1000;
    final int firstBatchLastRecordKey = numRecords - 1;
    final int secondBatchLastRecordKey = 2 * numRecords - 1;
    File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numRecords, commitTime);
    InputFormatTestUtil.commit(basePath, commitTime);

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // insert new records to log file
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", commitTime,
            newCommitTime, numRecords, numRecords, 0);
    long size = writer.getCurrentSize();
    writer.close();
    assertTrue("block - size should be > 0", size > 0);

    // create a split with baseFile (parquet file written earlier) and new log file(s)
    String logFilePath = writer.getLogFile().getPath().toString();
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
            new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, jobConf),
            basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime);

    // create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
            new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
    JobConf jobConf = new JobConf();
    List<Schema.Field> fields = schema.getFields();
    String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
    String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
    jobConf.set("partition_columns", "datestr");
    // Enable merge skipping.
    jobConf.set("hoodie.realtime.merge.skip", "true");

    // validate unmerged record reader
    RealtimeUnmergedRecordReader recordReader = new RealtimeUnmergedRecordReader(split, jobConf, reader);

    // use reader to read base Parquet File and log file
    // here all records should be present. Also ensure log records are in order.
    NullWritable key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    int numRecordsAtCommit1 = 0;
    int numRecordsAtCommit2 = 0;
    Set<Integer> seenKeys = new HashSet<>();
    Integer lastSeenKeyFromLog = firstBatchLastRecordKey;
    while (recordReader.next(key, value)) {
        Writable[] values = value.get();
        String gotCommit = values[0].toString();
        String keyStr = values[2].toString();
        Integer gotKey = Integer.parseInt(keyStr.substring("key".length()));
        if (gotCommit.equals(newCommitTime)) {
            numRecordsAtCommit2++;
            Assert.assertTrue(gotKey > firstBatchLastRecordKey);
            Assert.assertTrue(gotKey <= secondBatchLastRecordKey);
            Assert.assertEquals(gotKey.intValue(), lastSeenKeyFromLog + 1);
            lastSeenKeyFromLog++;
        } else {
            numRecordsAtCommit1++;
            Assert.assertTrue(gotKey >= 0);
            Assert.assertTrue(gotKey <= firstBatchLastRecordKey);
        }
        // Ensure unique key
        Assert.assertFalse(seenKeys.contains(gotKey));
        seenKeys.add(gotKey);
        key = recordReader.createKey();
        value = recordReader.createValue();
    }
    Assert.assertEquals(numRecords, numRecordsAtCommit1);
    Assert.assertEquals(numRecords, numRecordsAtCommit2);
    Assert.assertEquals(2 * numRecords, seenKeys.size());
}