List of usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
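Before the individual examples, a minimal sketch of the call in context may help. Everything here (the SetInputPathsDemo class, the identity-job setup, the args-based paths) is illustrative and not taken from the examples below. The Path... overload replaces any previously configured input list each time it is called, and a sibling String overload accepts a comma-separated list of paths (that form is used by the WordCount example further down).

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetInputPathsDemo {
    public static void main(String[] args) throws Exception {
        // old-API (org.apache.hadoop.mapred) job configuration
        JobConf conf = new JobConf(SetInputPathsDemo.class);
        conf.setJobName("set-input-paths-demo");
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // setInputPaths is varargs: one or more input directories can be passed in a
        // single call; each call replaces any previously configured input list
        // (FileInputFormat.addInputPath appends a single path instead).
        FileInputFormat.setInputPaths(conf, new Path(args[0]), new Path(args[1]));
        // output directory must not already exist
        FileOutputFormat.setOutputPath(conf, new Path(args[2]));

        // With no mapper/reducer set, the old API falls back to the identity classes,
        // so this job simply copies its input records to the output directory.
        JobClient.runJob(conf);
    }
}

When paths arrive as user input, the comma-separated String overload (FileInputFormat.setInputPaths(conf, "dir1,dir2")) is often more convenient than building Path objects by hand.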
From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License:Apache License
@Test
public void testReaderWithNestedAndComplexSchema() throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getComplexEvolvedSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    int numberOfRecords = 100;
    int numberOfLogRecords = numberOfRecords / 2;
    File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numberOfRecords, commitTime);
    InputFormatTestUtil.commit(basePath, commitTime);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // update files or generate new log file
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = writeLogFile(partitionDir, schema, "fileid0", commitTime, newCommitTime, numberOfLogRecords);
    long size = writer.getCurrentSize();
    writer.close();
    assertTrue("block - size should be > 0", size > 0);
    InputFormatTestUtil.deltaCommit(basePath, newCommitTime);

    // create a split with baseFile (parquet file written earlier) and new log file(s)
    String logFilePath = writer.getLogFile().getPath().toString();
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
        new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, jobConf),
        basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime);

    // create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
        new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
    JobConf jobConf = new JobConf();
    List<Schema.Field> fields = schema.getFields();
    String names = fields.stream().map(f -> f.name()).collect(Collectors.joining(","));
    String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
    jobConf.set("partition_columns", "datestr");

    // validate record reader compaction
    HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);

    // use reader to read base Parquet File and log file, merge in flight and return latest commit
    // here the first 50 records should be updated, see above
    NullWritable key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    int numRecordsRead = 0;
    while (recordReader.next(key, value)) {
        int currentRecordNo = numRecordsRead;
        ++numRecordsRead;
        Writable[] values = value.get();
        String recordCommitTime;
        // check if the record written is with latest commit, here "101"
        if (numRecordsRead > numberOfLogRecords) {
            recordCommitTime = commitTime;
        } else {
            recordCommitTime = newCommitTime;
        }
        String recordCommitTimeSuffix = "@" + recordCommitTime;

        Assert.assertEquals(values[0].toString(), recordCommitTime);
        key = recordReader.createKey();
        value = recordReader.createValue();

        // Assert type STRING
        Assert.assertEquals("test value for field: field1", values[5].toString(), "field" + currentRecordNo);
        Assert.assertEquals("test value for field: field2", values[6].toString(),
            "field" + currentRecordNo + recordCommitTimeSuffix);
        Assert.assertEquals("test value for field: name", values[7].toString(), "name" + currentRecordNo);

        // Assert type INT
        IntWritable intWritable = (IntWritable) values[8];
        Assert.assertEquals("test value for field: favoriteIntNumber", intWritable.get(),
            currentRecordNo + recordCommitTime.hashCode());

        // Assert type LONG
        LongWritable longWritable = (LongWritable) values[9];
        Assert.assertEquals("test value for field: favoriteNumber", longWritable.get(),
            currentRecordNo + recordCommitTime.hashCode());

        // Assert type FLOAT
        FloatWritable floatWritable = (FloatWritable) values[10];
        Assert.assertEquals("test value for field: favoriteFloatNumber", floatWritable.get(),
            (float) ((currentRecordNo + recordCommitTime.hashCode()) / 1024.0), 0);

        // Assert type DOUBLE
        DoubleWritable doubleWritable = (DoubleWritable) values[11];
        Assert.assertEquals("test value for field: favoriteDoubleNumber", doubleWritable.get(),
            (currentRecordNo + recordCommitTime.hashCode()) / 1024.0, 0);

        // Assert type MAP
        ArrayWritable mapItem = (ArrayWritable) values[12];
        Writable mapItemValue1 = mapItem.get()[0];
        Writable mapItemValue2 = mapItem.get()[1];
        Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue1).get()[0].toString(), "mapItem1");
        Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue2).get()[0].toString(), "mapItem2");
        Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue1).get().length, 2);
        Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue2).get().length, 2);
        Writable mapItemValue1value = ((ArrayWritable) mapItemValue1).get()[1];
        Writable mapItemValue2value = ((ArrayWritable) mapItemValue2).get()[1];
        Assert.assertEquals("test value for field: tags[\"mapItem1\"].item1",
            ((ArrayWritable) mapItemValue1value).get()[0].toString(), "item" + currentRecordNo);
        Assert.assertEquals("test value for field: tags[\"mapItem2\"].item1",
            ((ArrayWritable) mapItemValue2value).get()[0].toString(), "item2" + currentRecordNo);
        Assert.assertEquals("test value for field: tags[\"mapItem1\"].item2",
            ((ArrayWritable) mapItemValue1value).get()[1].toString(), "item" + currentRecordNo + recordCommitTimeSuffix);
        Assert.assertEquals("test value for field: tags[\"mapItem2\"].item2",
            ((ArrayWritable) mapItemValue2value).get()[1].toString(), "item2" + currentRecordNo + recordCommitTimeSuffix);

        // Assert type RECORD
        ArrayWritable recordItem = (ArrayWritable) values[13];
        Writable[] nestedRecord = recordItem.get();
        Assert.assertEquals("test value for field: testNestedRecord.isAdmin", ((BooleanWritable) nestedRecord[0]).get(), false);
        Assert.assertEquals("test value for field: testNestedRecord.userId", nestedRecord[1].toString(),
            "UserId" + currentRecordNo + recordCommitTimeSuffix);

        // Assert type ARRAY
        ArrayWritable arrayValue = (ArrayWritable) values[14];
        Writable[] arrayValues = arrayValue.get();
        for (int i = 0; i < arrayValues.length; i++) {
            Assert.assertEquals("test value for field: stringArray",
                "stringArray" + i + recordCommitTimeSuffix, arrayValues[i].toString());
        }
    }
}
From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License:Apache License
@Test
public void testSchemaEvolutionAndRollbackBlockInLastLogFile() throws Exception {
    // initial commit
    List<String> logFilePaths = new ArrayList<>();
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    int numberOfRecords = 100;
    int numberOfLogRecords = numberOfRecords / 2;
    File partitionDir = InputFormatTestUtil.prepareSimpleParquetDataset(basePath, schema, 1, numberOfRecords, commitTime);
    InputFormatTestUtil.commit(basePath, commitTime);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
    List<Field> firstSchemaFields = schema.getFields();

    // update files and generate new log file but don't commit
    schema = SchemaTestUtil.getComplexEvolvedSchema();
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", commitTime,
        newCommitTime, numberOfLogRecords, 0, 1);
    long size = writer.getCurrentSize();
    logFilePaths.add(writer.getLogFile().getPath().toString());
    writer.close();
    assertTrue("block - size should be > 0", size > 0);

    // write rollback for the previous block in new log file version
    newCommitTime = "102";
    writer = writeRollbackBlockToLogFile(partitionDir, schema, "fileid0", commitTime, newCommitTime, "101", 1);
    logFilePaths.add(writer.getLogFile().getPath().toString());
    writer.close();
    assertTrue("block - size should be > 0", size > 0);
    InputFormatTestUtil.deltaCommit(basePath, newCommitTime);

    // create a split with baseFile (parquet file written earlier) and new log file(s)
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
        new FileSplit(new Path(partitionDir + "/fileid0_1_" + commitTime + ".parquet"), 0, 1, jobConf),
        basePath.getRoot().getPath(), logFilePaths, newCommitTime);

    // create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
        new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
    JobConf jobConf = new JobConf();
    List<Schema.Field> fields = schema.getFields();
    assert (firstSchemaFields.containsAll(fields) == false);

    // Try to read all the fields passed by the new schema
    String names = fields.stream().map(f -> f.name()).collect(Collectors.joining(","));
    String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
    jobConf.set("partition_columns", "datestr");
    HoodieRealtimeRecordReader recordReader = null;
    try {
        // validate record reader compaction
        recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
        throw new RuntimeException("should've failed the previous line");
    } catch (HoodieException e) {
        // expected, field not found since the data written with the evolved schema was rolled back
    }

    // Now project only the fields from the first schema
    names = firstSchemaFields.stream().map(f -> f.name()).collect(Collectors.joining(","));
    positions = firstSchemaFields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
    jobConf.set("partition_columns", "datestr");
    // This time read only the fields which are part of parquet
    recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);

    // use reader to read base Parquet File and log file
    NullWritable key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    while (recordReader.next(key, value)) {
        // keep reading
    }
}
From source file:com.unstruct.demo.WordCount.java
License:Apache License
/**
 * The main driver for the word count map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WordCount.class);
    conf.setJobName("wordcount");

    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                conf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                conf.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}
From source file:com.yahoo.druid.hadoop.HiveDatasourceInputFormat.java
License:Apache License
private String[] getFrequentLocations(List<WindowedDataSegment> segments, InputFormat fio, JobConf conf)
        throws IOException {
    Iterable<String> locations = Collections.emptyList();
    for (WindowedDataSegment segment : segments) {
        FileInputFormat.setInputPaths(conf, new Path(JobHelper.getURIFromSegment(segment.getSegment())));
        logger.info("CheckPost 4" + fio.getClass());
        for (InputSplit split : fio.getSplits(conf, 1)) {
            locations = Iterables.concat(locations, Arrays.asList(split.getLocations()));
        }
    }
    return getFrequentLocations(locations);
}
From source file:com.zfylin.demo.bigdata.hadoop.mr.WordCount2.java
License:Apache License
public static void main(String[] args) throws Exception {
    // run the job as the hdfs user
    System.setProperty("HADOOP_USER_NAME", "hdfs");

    // input: an existing directory on HDFS
    String input = "hdfs://hadoop-master:8020/data/hive/warehouse/channel_test.db/tbl_student";
    // output: an HDFS directory that must not already exist
    String output = "hdfs://hadoop-master:8020/data/hive/warehouse/channel_test.db/tbl_student/output/";

    JobConf conf = new JobConf(WordCount2.class);
    // works around "ERROR: Exception message: /bin/bash: line 0: fg: no job control"
    // when submitting from a Windows client
    conf.set("mapreduce.app-submission.cross-platform", "true");
    conf.setJobName("WordCount");
    // conf.addResource("classpath:/hadoop/core-site.xml");
    // conf.addResource("classpath:/hadoop/hdfs-site.xml");
    // conf.addResource("classpath:/hadoop/mapred-site.xml");

    // output key type
    conf.setOutputKeyClass(Text.class);
    // output value type (int counts)
    conf.setOutputValueClass(IntWritable.class);

    // mapper
    conf.setMapperClass(WordCountMapper.class);
    // reuse the reducer as a combiner to pre-aggregate map output locally
    conf.setCombinerClass(WordCountReducer.class);
    // reducer
    conf.setReducerClass(WordCountReducer.class);

    // TextInputFormat: keys are LongWritable byte offsets, values are Text lines
    conf.setInputFormat(TextInputFormat.class);
    // TextOutputFormat: writes keys and values via toString()
    conf.setOutputFormat(TextOutputFormat.class);

    // input path
    FileInputFormat.setInputPaths(conf, new Path(input));
    // output path
    FileOutputFormat.setOutputPath(conf, new Path(output));

    // submit the MapReduce job and wait for completion
    JobClient.runJob(conf);
    System.exit(0);
}
From source file:combiner.CombinerDriver.java
public static void main(String[] args) {
    JobClient client = new JobClient();
    // Configurations for the job are set in this variable
    JobConf conf = new JobConf(combiner.CombinerDriver.class);

    // Name of the job
    conf.setJobName("BookCrossing1.0");

    // Data types of the output key and value
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    // Setting the Mapper, Combiner and Reducer classes
    conf.setMapperClass(combiner.CombinerMapper.class);
    conf.setCombinerClass(combiner.CombinerReducer.class);
    conf.setReducerClass(combiner.CombinerReducer.class);

    // Formats of the input and output data
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    // Specify input and output DIRECTORIES (not files)
    FileInputFormat.setInputPaths(conf, new Path(args[1]));
    FileOutputFormat.setOutputPath(conf, new Path(args[2]));

    client.setConf(conf);
    try {
        // Run the job with the configuration set in conf
        JobClient.runJob(conf);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:countTheGivenWords.searchAndCountJob.java
public static void start(String[] args) {
    try {
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("wordcount");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        searchAndCountMapper Map = new searchAndCountMapper();
        conf.setMapperClass(Map.getClass());

        searchAndCountReducer Reduce = new searchAndCountReducer();
        conf.setCombinerClass(Reduce.getClass());
        conf.setReducerClass(Reduce.getClass());

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[1]));

        Path outputDir = new Path(args[2]);
        outputDir.getFileSystem(conf).delete(outputDir, true);
        FileSystem fs = FileSystem.get(conf);
        fs.delete(outputDir, true);
        FileOutputFormat.setOutputPath(conf, outputDir);

        JobClient.runJob(conf);

        FileSystem FS = FileSystem.get(conf);
        Path src = new Path(FS.getWorkingDirectory() + "/output/part-00000");
        if (FS.exists(src)) {
            System.out.println("\t\t------ Results ------ ");
            /*
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(src)));
            String line;
            line = br.readLine();
            while (line != null) {
                System.out.println("\t" + line);
                line = br.readLine();
            }
            */
            List<String> FileList = (new fileInteractions()).readLines(src, conf);
            for (String LocString : FileList) {
                System.out.println(LocString);
            }
        }
    } catch (Exception Exp) {
        Exp.printStackTrace();
    }
}
From source file:crimecount.CrimeCount.java
/**
 * @param args the command line arguments
 * @throws java.io.IOException
 */
public static void main(String[] args) throws IOException {
    // TODO code application logic here
    JobConf conf = new JobConf(CrimeCount.class);
    conf.setJobName("crime count");

    conf.setMapperClass(DistrictCrimeMapper.class);
    conf.setCombinerClass(DistrictCrimeReducer.class);
    conf.setReducerClass(DistrictCrimeReducer.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
From source file:crimesbyblockbymonth.CrimesByBlockByMonth.java
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobConf job = new JobConf(conf, CrimesByBlockByMonth.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("CrimesByBlockByMonth");
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    // job.set("key.value.separator.in.input.line", "");

    JobClient.runJob(job);
    return 0;
}
From source file:crimesbyblockbyyear.CrimesByBlockByYear.java
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobConf job = new JobConf(conf, CrimesByBlockByYear.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("CrimesByBlockByYear");
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    // job.set("key.value.separator.in.input.line", "");

    JobClient.runJob(job);
    return 0;
}