List of usage examples for org.apache.hadoop.mapred.FileInputFormat#setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
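Before the examples, a minimal sketch of the call pattern they all share: build a JobConf, choose an input format, and point FileInputFormat at one or more input paths before splits are computed or the job is submitted. The class name, job name, and the /tmp/example/* paths below are placeholders for illustration, not values taken from the examples that follow.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class SetInputPathsSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputPathsSketch.class);
        conf.setJobName("set-input-paths-sketch");      // hypothetical job name
        conf.setInputFormat(TextInputFormat.class);
        // setInputPaths replaces any previously configured input paths;
        // FileInputFormat.addInputPath(conf, path) appends instead.
        FileInputFormat.setInputPaths(conf, new Path("/tmp/example/in"), new Path("/tmp/example/more"));
        // The String overload accepts a comma-separated list of paths and is equivalent:
        // FileInputFormat.setInputPaths(conf, "/tmp/example/in,/tmp/example/more");
    }
}

Several of the examples below use the String overload, FileInputFormat.setInputPaths(conf, testFilePath.toString()), which is interpreted as a comma-separated list of paths.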
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
@Test
public void testInOutFormat() throws Exception {
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
            properties, Reporter.NULL);
    ReaderWriterProfiler.setProfilerOptions(conf);
    writer.write(serde.serialize(new MyRow(1, 2), inspector));
    writer.write(serde.serialize(new MyRow(2, 2), inspector));
    writer.write(serde.serialize(new MyRow(3, 2), inspector));
    writer.close(true);
    serde = new OrcSerde();
    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    serde.initialize(conf, properties);
    assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
    inspector = (StructObjectInspector) serde.getObjectInspector();
    assertEquals("struct<x:int,y:int>", inspector.getTypeName());
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // test the validateInput method
    ArrayList<FileStatus> fileList = new ArrayList<FileStatus>(3);
    assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
    fileList.add(fs.getFileStatus(testFilePath));
    assertEquals(true, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
    fileList.add(fs.getFileStatus(workDir));
    assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));

    // read the whole file
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Writable value = (Writable) reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector();
    assertEquals(0.0, reader.getProgress(), 0.00001);
    assertEquals(0, reader.getPos());
    while (reader.next(key, value)) {
        assertEquals(++rowNum,
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(0))));
        assertEquals(2, intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
    }
    assertEquals(3, rowNum);
    assertEquals(1.0, reader.getProgress(), 0.00001);
    reader.close();

    // read just the first column
    conf.set("hive.io.file.readcolumn.ids", "0");
    reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    key = reader.createKey();
    value = (Writable) reader.createValue();
    rowNum = 0;
    fields = inspector.getAllStructFieldRefs();
    while (reader.next(key, value)) {
        assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(null, inspector.getStructFieldData(value, fields.get(1)));
    }
    assertEquals(3, rowNum);
    reader.close();

    // test the mapping of empty string to all columns
    conf.set("hive.io.file.readcolumn.ids", "");
    reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    key = reader.createKey();
    value = (Writable) reader.createValue();
    rowNum = 0;
    fields = inspector.getAllStructFieldRefs();
    while (reader.next(key, value)) {
        assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(2, intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
    }
    assertEquals(3, rowNum);
    reader.close();
}
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
@Test
public void testMROutput() throws Exception {
    JobConf job = new JobConf(conf);
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(NestedRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    OutputFormat<?, ?> outFormat = new OrcOutputFormat();
    RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(1, 2, 3), inspector));
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(4, 5, 6), inspector));
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(7, 8, 9), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    properties.setProperty("columns", "z,r");
    properties.setProperty("columns.types", "int:struct<x:int,y:int>");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);
    conf.set("hive.io.file.readcolumn.ids", "1");
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    StructObjectInspector inner = (StructObjectInspector) fields.get(1).getFieldObjectInspector();
    List<? extends StructField> inFields = inner.getAllStructFieldRefs();
    IntObjectInspector intInspector = (IntObjectInspector) inFields.get(0).getFieldObjectInspector();
    while (reader.next(key, value)) {
        assertEquals(null, inspector.getStructFieldData(value, fields.get(0)));
        Object sub = inspector.getStructFieldData(value, fields.get(1));
        assertEquals(3 * rowNum + 1, intInspector.get(inner.getStructFieldData(sub, inFields.get(0))));
        assertEquals(3 * rowNum + 2, intInspector.get(inner.getStructFieldData(sub, inFields.get(1))));
        rowNum += 1;
    }
    assertEquals(3, rowNum);
    reader.close();
}
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
@Test
public void testMROutput2() throws Exception {
    JobConf job = new JobConf(conf);
    // Test that you can set the output directory using this config
    job.set("mapred.work.output.dir", testFilePath.getParent().toString());
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    OutputFormat<?, ?> outFormat = new OrcOutputFormat();
    RecordWriter writer = outFormat.getRecordWriter(fs, job, testFilePath.getName(), Reporter.NULL);
    writer.write(NullWritable.get(), serde.serialize(new StringRow("a"), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    properties.setProperty("columns", "col");
    properties.setProperty("columns.types", "string");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    reader.next(key, value);
    assertEquals("a", ((StringObjectInspector) fields.get(0).getFieldObjectInspector())
            .getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    reader.close();
}
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
@Test
public void testEmptyFile() throws Exception {
    JobConf job = new JobConf(conf);
    Properties properties = new Properties();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
            properties, Reporter.NULL);
    writer.close(true);
    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    SerDe serde = new OrcSerde();
    serde.initialize(conf, properties);
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // read the whole file
    conf.set("hive.io.file.readcolumn.ids", "0,1");
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    assertEquals(0.0, reader.getProgress(), 0.00001);
    assertEquals(0, reader.getPos());
    assertEquals(false, reader.next(key, value));
    reader.close();
    assertEquals(null, serde.getSerDeStats());
}
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
@Test
public void testDefaultTypes() throws Exception {
    JobConf job = new JobConf(conf);
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, StringRow.class, true,
            properties, Reporter.NULL);
    writer.write(serde.serialize(new StringRow("owen"), inspector));
    writer.write(serde.serialize(new StringRow("beth"), inspector));
    writer.write(serde.serialize(new StringRow("laurel"), inspector));
    writer.write(serde.serialize(new StringRow("hazen"), inspector));
    writer.write(serde.serialize(new StringRow("colin"), inspector));
    writer.write(serde.serialize(new StringRow("miles"), inspector));
    writer.close(true);
    serde = new OrcSerde();
    properties.setProperty("columns", "str,str2");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    assertEquals("struct<str:string,str2:string>", inspector.getTypeName());
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // read the whole file
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Writable value = (Writable) reader.createValue();
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    StringObjectInspector strInspector = (StringObjectInspector) fields.get(0).getFieldObjectInspector();
    assertEquals(true, reader.next(key, value));
    assertEquals("owen", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("beth", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("laurel", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("hazen", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("colin", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("miles", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(false, reader.next(key, value));
    reader.close();
}
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
/**
 * Tests that passing null as the file system to getRecordWriter works; this is
 * to be compatible with the way Sequence and RC file tolerate nulls.
 * @throws Exception
 */
@Test
public void testNullFileSystem() throws Exception {
    conf.set("mapred.work.output.dir", testFilePath.getParent().toString());
    JobConf job = new JobConf(conf);
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    OrcSerde serde = new OrcSerde();
    OrcOutputFormat outFormat = new OrcOutputFormat();
    RecordWriter<NullWritable, OrcSerdeRow> writer = outFormat.getRecordWriter(null, conf,
            testFilePath.getName(), Reporter.NULL);
    writer.write(NullWritable.get(), (OrcSerdeRow) serde.serialize(new StringRow("a"), inspector));
    writer.write(NullWritable.get(), (OrcSerdeRow) serde.serialize(new StringRow("b"), inspector));
    writer.write(NullWritable.get(), (OrcSerdeRow) serde.serialize(new StringRow("c"), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    properties.setProperty("columns", "str,str2");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    OrcInputFormat in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // read the whole file
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcLazyRow> reader = in.getRecordReader(splits[0], conf,
            Reporter.NULL);
    NullWritable key = reader.createKey();
    OrcLazyRow value = (OrcLazyRow) reader.createValue();
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    StringObjectInspector strInspector = (StringObjectInspector) fields.get(0).getFieldObjectInspector();
    assertEquals(true, reader.next(key, value));
    assertEquals("a", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("b", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("c", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(false, reader.next(key, value));
    reader.close();
}
From source file:com.facebook.LinkBench.LinkBenchDriverMR.java
License:Apache License
/**
 * setup input files for map reduce job
 * @param jobconf configuration of the map reduce job
 * @param nmappers number of mappers (loader or requester)
 */
private static FileSystem setupInputFiles(JobConf jobconf, int nmappers)
        throws IOException, InterruptedException {
    // setup input/output directories
    final Path indir = new Path(TMP_DIR, "in");
    final Path outdir = new Path(TMP_DIR, "out");
    FileInputFormat.setInputPaths(jobconf, indir);
    FileOutputFormat.setOutputPath(jobconf, outdir);

    final FileSystem fs = FileSystem.get(jobconf);
    if (fs.exists(TMP_DIR)) {
        throw new IOException(
                "Tmp directory " + fs.makeQualified(TMP_DIR) + " already exists. Please remove it first.");
    }
    if (!fs.mkdirs(indir)) {
        throw new IOException("Cannot create input directory " + indir);
    }

    // generate an input file for each map task
    if (USE_INPUT_FILES) {
        for (int i = 0; i < nmappers; ++i) {
            final Path file = new Path(indir, "part" + i);
            final IntWritable mapperid = new IntWritable(i);
            final IntWritable nummappers = new IntWritable(nmappers);
            final SequenceFile.Writer writer = SequenceFile.createWriter(fs, jobconf, file, IntWritable.class,
                    IntWritable.class, CompressionType.NONE);
            try {
                writer.append(mapperid, nummappers);
            } finally {
                writer.close();
            }
            logger.info("Wrote input for Map #" + i);
        }
    }
    return fs;
}
From source file:com.facebook.presto.hive.BackgroundHiveSplitLoader.java
License:Apache License
private void loadPartition(HivePartitionMetadata partition) throws IOException {
    String partitionName = partition.getHivePartition().getPartitionId();
    Properties schema = getPartitionSchema(table, partition.getPartition());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
    TupleDomain<HiveColumnHandle> effectivePredicate = partition.getHivePartition().getEffectivePredicate();

    Path path = new Path(getPartitionLocation(table, partition.getPartition()));
    Configuration configuration = hdfsEnvironment.getConfiguration(path);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
    FileSystem fs = hdfsEnvironment.getFileSystem(session.getUser(), path);

    if (inputFormat instanceof SymlinkTextInputFormat) {
        if (bucketHandle.isPresent()) {
            throw new PrestoException(StandardErrorCode.NOT_SUPPORTED,
                    "Bucketed table in SymlinkTextInputFormat is not yet supported");
        }

        // TODO: This should use an iterator like the HiveFileIterator
        for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
            // The input should be in TextInputFormat.
            TextInputFormat targetInputFormat = new TextInputFormat();
            // get the configuration for the target path -- it may be a different hdfs instance
            Configuration targetConfiguration = hdfsEnvironment.getConfiguration(targetPath);
            JobConf targetJob = new JobConf(targetConfiguration);
            targetJob.setInputFormat(TextInputFormat.class);
            targetInputFormat.configure(targetJob);
            FileInputFormat.setInputPaths(targetJob, targetPath);
            InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);

            for (InputSplit inputSplit : targetSplits) {
                FileSplit split = (FileSplit) inputSplit;
                FileSystem targetFilesystem = hdfsEnvironment.getFileSystem(session.getUser(), split.getPath());
                FileStatus file = targetFilesystem.getFileStatus(split.getPath());
                hiveSplitSource.addToQueue(createHiveSplits(partitionName, file.getPath().toString(),
                        targetFilesystem.getFileBlockLocations(file, split.getStart(), split.getLength()),
                        split.getStart(), split.getLength(), schema, partitionKeys, false, session,
                        OptionalInt.empty(), effectivePredicate, partition.getColumnCoercions()));
                if (stopped) {
                    return;
                }
            }
        }
        return;
    }

    // If only one bucket could match: load that one file
    HiveFileIterator iterator = new HiveFileIterator(path, fs, directoryLister, namenodeStats, partitionName,
            inputFormat, schema, partitionKeys, effectivePredicate, partition.getColumnCoercions());
    if (!buckets.isEmpty()) {
        int bucketCount = buckets.get(0).getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);

        for (HiveBucket bucket : buckets) {
            int bucketNumber = bucket.getBucketNumber();
            LocatedFileStatus file = list.get(bucketNumber);
            boolean splittable = isSplittable(iterator.getInputFormat(),
                    hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());

            hiveSplitSource.addToQueue(createHiveSplits(iterator.getPartitionName(), file.getPath().toString(),
                    file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(),
                    iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketNumber),
                    effectivePredicate, partition.getColumnCoercions()));
        }
        return;
    }

    // If table is bucketed: list the directory, sort, tag with bucket id
    if (bucketHandle.isPresent()) {
        // HiveFileIterator skips hidden files automatically.
        int bucketCount = bucketHandle.get().getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);

        for (int bucketIndex = 0; bucketIndex < bucketCount; bucketIndex++) {
            LocatedFileStatus file = list.get(bucketIndex);
            boolean splittable = isSplittable(iterator.getInputFormat(),
                    hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());

            hiveSplitSource.addToQueue(createHiveSplits(iterator.getPartitionName(), file.getPath().toString(),
                    file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(),
                    iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketIndex),
                    iterator.getEffectivePredicate(), partition.getColumnCoercions()));
        }
        return;
    }

    fileIterators.addLast(iterator);
}
From source file:com.facebook.presto.hive.HiveSplitIterable.java
License:Apache License
private void loadPartitionSplits(final HiveSplitQueue hiveSplitQueue, SuspendingExecutor suspendingExecutor)
        throws InterruptedException {
    final Semaphore semaphore = new Semaphore(maxPartitionBatchSize);
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) {
        ImmutableList.Builder<ListenableFuture<Void>> futureBuilder = ImmutableList.builder();

        Iterator<String> nameIterator = partitionNames.iterator();
        for (Partition partition : partitions) {
            checkState(nameIterator.hasNext(), "different number of partitions and partition names!");
            semaphore.acquire();
            final String partitionName = nameIterator.next();
            final Properties schema = getPartitionSchema(table, partition);
            final List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition);

            Path path = new Path(getPartitionLocation(table, partition));
            final Configuration configuration = hdfsEnvironment.getConfiguration(path);
            final InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
            Path partitionPath = hdfsEnvironment.getFileSystemWrapper().wrap(path);

            FileSystem fs = partitionPath.getFileSystem(configuration);
            final LastSplitMarkingQueue markerQueue = new LastSplitMarkingQueue(hiveSplitQueue);

            if (inputFormat instanceof SymlinkTextInputFormat) {
                JobConf jobConf = new JobConf(configuration);
                FileInputFormat.setInputPaths(jobConf, partitionPath);
                InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
                for (InputSplit rawSplit : splits) {
                    FileSplit split = ((SymlinkTextInputFormat.SymlinkTextInputSplit) rawSplit).getTargetSplit();

                    // get the filesystem for the target path -- it may be a different hdfs instance
                    FileSystem targetFilesystem = split.getPath().getFileSystem(configuration);
                    FileStatus fileStatus = targetFilesystem.getFileStatus(split.getPath());
                    markerQueue.addToQueue(createHiveSplits(partitionName, fileStatus,
                            targetFilesystem.getFileBlockLocations(fileStatus, split.getStart(), split.getLength()),
                            split.getStart(), split.getLength(), schema, partitionKeys, false));
                }
                markerQueue.finish();
                continue;
            }

            ListenableFuture<Void> partitionFuture = new AsyncRecursiveWalker(fs, suspendingExecutor)
                    .beginWalk(partitionPath, new FileStatusCallback() {
                        @Override
                        public void process(FileStatus file, BlockLocation[] blockLocations) {
                            if (bucket.isPresent() && !fileMatchesBucket(file.getPath().getName(), bucket.get())) {
                                return;
                            }
                            try {
                                boolean splittable = isSplittable(inputFormat,
                                        file.getPath().getFileSystem(configuration), file.getPath());
                                markerQueue.addToQueue(createHiveSplits(partitionName, file, blockLocations, 0,
                                        file.getLen(), schema, partitionKeys, splittable));
                            } catch (IOException e) {
                                hiveSplitQueue.fail(e);
                            }
                        }
                    });

            // release the semaphore when the partition finishes
            Futures.addCallback(partitionFuture, new FutureCallback<Void>() {
                @Override
                public void onSuccess(Void result) {
                    markerQueue.finish();
                    semaphore.release();
                }

                @Override
                public void onFailure(Throwable t) {
                    markerQueue.finish();
                    semaphore.release();
                }
            });

            futureBuilder.add(partitionFuture);
        }

        // when all partitions finish, mark the queue as finished
        Futures.addCallback(Futures.allAsList(futureBuilder.build()), new FutureCallback<List<Void>>() {
            @Override
            public void onSuccess(List<Void> result) {
                hiveSplitQueue.finished();
            }

            @Override
            public void onFailure(Throwable t) {
                hiveSplitQueue.fail(t);
            }
        });
    } catch (Throwable e) {
        hiveSplitQueue.fail(e);
        Throwables.propagateIfInstanceOf(e, Error.class);
    }
}
From source file:com.facebook.presto.hive.HiveSplitSourceProvider.java
License:Apache License
private void loadPartitionSplits(final HiveSplitSource hiveSplitSource, SuspendingExecutor suspendingExecutor,
        final ConnectorSession session) {
    final Semaphore semaphore = new Semaphore(maxPartitionBatchSize);
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) {
        ImmutableList.Builder<ListenableFuture<Void>> futureBuilder = ImmutableList.builder();

        Iterator<String> nameIterator = partitionNames.iterator();
        for (Partition partition : partitions) {
            checkState(nameIterator.hasNext(), "different number of partitions and partition names!");
            final String partitionName = nameIterator.next();
            final Properties schema = getPartitionSchema(table, partition);
            final List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition);

            Path path = new Path(getPartitionLocation(table, partition));
            final Configuration configuration = hdfsEnvironment.getConfiguration(path);
            final InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);

            FileSystem fs = path.getFileSystem(configuration);

            if (inputFormat instanceof SymlinkTextInputFormat) {
                JobConf jobConf = new JobConf(configuration);
                FileInputFormat.setInputPaths(jobConf, path);
                InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
                for (InputSplit rawSplit : splits) {
                    FileSplit split = ((SymlinkTextInputFormat.SymlinkTextInputSplit) rawSplit).getTargetSplit();

                    // get the filesystem for the target path -- it may be a different hdfs instance
                    FileSystem targetFilesystem = split.getPath().getFileSystem(configuration);
                    FileStatus fileStatus = targetFilesystem.getFileStatus(split.getPath());
                    hiveSplitSource.addToQueue(createHiveSplits(partitionName, fileStatus,
                            targetFilesystem.getFileBlockLocations(fileStatus, split.getStart(), split.getLength()),
                            split.getStart(), split.getLength(), schema, partitionKeys, false, session));
                }
                continue;
            }

            // TODO: this is currently serial across all partitions and should be done in suspendingExecutor
            if (bucket.isPresent()) {
                Optional<FileStatus> bucketFile = getBucketFile(bucket.get(), fs, path);
                if (bucketFile.isPresent()) {
                    FileStatus file = bucketFile.get();
                    BlockLocation[] blockLocations = fs.getFileBlockLocations(file, 0, file.getLen());
                    boolean splittable = isSplittable(inputFormat, fs, file.getPath());
                    hiveSplitSource.addToQueue(createHiveSplits(partitionName, file, blockLocations, 0,
                            file.getLen(), schema, partitionKeys, splittable, session));
                    continue;
                }
            }

            // Acquire semaphore so that we only have a fixed number of outstanding partitions being processed asynchronously
            // NOTE: there must not be any calls that throw in the space between acquiring the semaphore and setting the Future
            // callback to release it. Otherwise, we will need a try-finally block around this section.
            try {
                semaphore.acquire();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }

            ListenableFuture<Void> partitionFuture = createAsyncWalker(fs, suspendingExecutor).beginWalk(path,
                    new FileStatusCallback() {
                        @Override
                        public void process(FileStatus file, BlockLocation[] blockLocations) {
                            try {
                                boolean splittable = isSplittable(inputFormat,
                                        file.getPath().getFileSystem(configuration), file.getPath());
                                hiveSplitSource.addToQueue(createHiveSplits(partitionName, file, blockLocations, 0,
                                        file.getLen(), schema, partitionKeys, splittable, session));
                            } catch (IOException e) {
                                hiveSplitSource.fail(e);
                            }
                        }
                    });

            // release the semaphore when the partition finishes
            Futures.addCallback(partitionFuture, new FutureCallback<Void>() {
                @Override
                public void onSuccess(Void result) {
                    semaphore.release();
                }

                @Override
                public void onFailure(Throwable t) {
                    semaphore.release();
                }
            });

            futureBuilder.add(partitionFuture);
        }

        // when all partitions finish, mark the queue as finished
        Futures.addCallback(Futures.allAsList(futureBuilder.build()), new FutureCallback<List<Void>>() {
            @Override
            public void onSuccess(List<Void> result) {
                hiveSplitSource.finished();
            }

            @Override
            public void onFailure(Throwable t) {
                hiveSplitSource.fail(t);
            }
        });
    } catch (Throwable e) {
        hiveSplitSource.fail(e);
        Throwables.propagateIfInstanceOf(e, Error.class);
    }
}