Usage examples for org.apache.hadoop.fs.FileSystem#getFileStatus

public abstract FileStatus getFileStatus(Path f) throws IOException;

getFileStatus returns a FileStatus describing the named path: its length, block size, replication, modification time, owner, group, permissions, and whether it is a directory. It throws FileNotFoundException (a subclass of IOException) when the path does not exist.
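Before the project examples below, here is a minimal, self-contained sketch of the call pattern. The class name BasicGetFileStatus and the default path are illustrative only and do not come from any project on this page.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BasicGetFileStatus {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path(args.length > 0 ? args[0] : "/tmp/example.txt");
        // Resolve the FileSystem that owns this path (HDFS, S3, local, ...)
        FileSystem fs = path.getFileSystem(conf);

        // getFileStatus throws FileNotFoundException if the path does not exist
        FileStatus status = fs.getFileStatus(path);
        System.out.println("path         = " + status.getPath());
        System.out.println("length       = " + status.getLen());
        System.out.println("isDirectory  = " + status.isDirectory());
        System.out.println("blockSize    = " + status.getBlockSize());
        System.out.println("replication  = " + status.getReplication());
        System.out.println("modification = " + status.getModificationTime());
        System.out.println("owner:group  = " + status.getOwner() + ":" + status.getGroup());
        System.out.println("permission   = " + status.getPermission());
    }
}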
From source file:com.facebook.hive.orc.ReaderImpl.java
License:Open Source License
public ReaderImpl(FileSystem fs, Path path, Configuration conf) throws IOException {
    try {
        this.fileSystem = fs;
        this.path = path;
        this.conf = conf;
        FSDataInputStream file = fs.open(path);
        long size = fs.getFileStatus(path).getLen();
        int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
        ByteBuffer buffer = ByteBuffer.allocate(readSize);
        InStream.read(file, size - readSize, buffer.array(), buffer.arrayOffset() + buffer.position(),
                buffer.remaining());
        int psLen = buffer.get(readSize - 1);
        int psOffset = readSize - 1 - psLen;
        CodedInputStream in = CodedInputStream.newInstance(buffer.array(), buffer.arrayOffset() + psOffset, psLen);
        OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(in);
        int footerSize = (int) ps.getFooterLength();
        bufferSize = (int) ps.getCompressionBlockSize();
        switch (ps.getCompression()) {
        case NONE:
            compressionKind = CompressionKind.NONE;
            break;
        case ZLIB:
            compressionKind = CompressionKind.ZLIB;
            break;
        case SNAPPY:
            compressionKind = CompressionKind.SNAPPY;
            break;
        case LZO:
            compressionKind = CompressionKind.LZO;
            break;
        default:
            throw new IllegalArgumentException("Unknown compression");
        }
        codec = WriterImpl.createCodec(compressionKind);
        InputStream instream = InStream.create("footer", file, size - 1 - psLen - footerSize, footerSize, codec,
                bufferSize);
        footer = OrcProto.Footer.parseFrom(instream);
        inspector = new OrcLazyRowObjectInspector(0, footer.getTypesList());
        file.close();
    } catch (IndexOutOfBoundsException e) {
        /**
         * When a non-ORC file is read by the ORC reader, we get an IndexOutOfBoundsException while
         * creating the reader. Catch that exception and check the file header to see whether the
         * input file is ORC. If it is not, throw a NotAnORCFileException naming the file being read
         * (which helps to figure out which table partition was being read).
         */
        checkIfORC(fs, path);
        throw new IOException("Failed to create record reader for file " + path, e);
    } catch (IOException e) {
        throw new IOException("Failed to create record reader for file " + path, e);
    }
}
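The constructor above uses fs.getFileStatus(path).getLen() to learn the file size and then reads the tail of the file, where the ORC postscript and footer live. The same size-then-positioned-read pattern, reduced to its essentials and independent of ORC, looks roughly like the sketch below; TailReader, readTail, and maxTailBytes (which plays the role of DIRECTORY_SIZE_GUESS) are illustrative names, not part of the ORC code.

import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class TailReader {
    /** Reads at most maxTailBytes from the end of the file and returns them. */
    public static byte[] readTail(FileSystem fs, Path path, int maxTailBytes) throws IOException {
        // The file length comes from the filesystem metadata, not from opening the stream
        long size = fs.getFileStatus(path).getLen();
        int readSize = (int) Math.min(size, maxTailBytes);
        byte[] tail = new byte[readSize];
        try (FSDataInputStream in = fs.open(path)) {
            // readFully(position, ...) is a positioned read starting at the given offset
            in.readFully(size - readSize, tail, 0, readSize);
        }
        return tail;
    }
}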
From source file:com.facebook.hive.orc.ReaderImpl.java
License:Open Source License
/**
 * Reads the file header (first 40 bytes) and checks if the first three characters are 'ORC'.
 */
public static void checkIfORC(FileSystem fs, Path path) throws IOException {
    // hardcoded to 40 because "SEQ-org.apache.hadoop.hive.ql.io.RCFile", the header, is of 40 chars
    final int buffLen = 40;
    final byte header[] = new byte[buffLen];
    final FSDataInputStream file = fs.open(path);
    final long fileLength = fs.getFileStatus(path).getLen();
    int sizeToBeRead = buffLen;
    if (buffLen > fileLength) {
        sizeToBeRead = (int) fileLength;
    }
    IOUtils.readFully(file, header, 0, sizeToBeRead);
    file.close();
    final String headerString = new String(header);
    if (headerString.startsWith("ORC")) {
        LOG.error("Error while parsing the footer of the file : " + path);
    } else {
        throw new NotAnORCFileException("Input file = " + path + " , header = " + headerString);
    }
}
From source file:com.facebook.hiveio.common.FileSystems.java
License:Apache License
/**
 * Move a file or directory from source to destination, recursively copying subdirectories.
 *
 * @param fs FileSystem
 * @param file path to copy (file or directory)
 * @param src path to source directory
 * @param dest path to destination directory
 * @throws IOException I/O problems
 */
public static void move(FileSystem fs, Path file, Path src, Path dest) throws IOException {
    Path destFilePath = pathInDestination(file, src, dest);
    if (fs.isFile(file)) {
        if (fs.exists(destFilePath)) {
            if (!fs.delete(destFilePath, true)) {
                throw new IllegalArgumentException("Could not remove existing file " + destFilePath);
            }
        }
        if (!fs.rename(file, destFilePath)) {
            throw new IllegalArgumentException("Could not move " + file + " to " + destFilePath);
        }
    } else if (fs.getFileStatus(file).isDir()) {
        FileStatus[] statuses = fs.listStatus(file);
        fs.mkdirs(destFilePath);
        if (statuses != null) {
            for (FileStatus status : statuses) {
                move(fs, status.getPath(), src, dest);
            }
        }
    }
}
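Note that move() branches on fs.getFileStatus(file).isDir(); in Hadoop 2.x, FileStatus.isDir() is deprecated in favour of isDirectory(). A minimal sketch of the same directory check written against the newer method follows; DirWalk and listRecursively are illustrative names, not part of the FileSystems class above.

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class DirWalk {
    /** Prints every file underneath root, recursing into directories. */
    public static void listRecursively(FileSystem fs, Path root) throws IOException {
        // isDirectory() is the non-deprecated replacement for isDir()
        if (fs.getFileStatus(root).isDirectory()) {
            // listStatus already returns a FileStatus per child, so no extra getFileStatus calls are needed
            for (FileStatus child : fs.listStatus(root)) {
                if (child.isDirectory()) {
                    listRecursively(fs, child.getPath());
                } else {
                    System.out.println(child.getPath());
                }
            }
        } else {
            System.out.println(root);
        }
    }
}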
From source file:com.facebook.presto.hdfs.HDFSPageSourceProvider.java
License:Apache License
private HdfsParquetDataSource buildHdfsParquetDataSource(FileSystem fileSystem, Path path, long start, long length) {
    try {
        long size = fileSystem.getFileStatus(path).getLen();
        FSDataInputStream inputStream = fileSystem.open(path);
        return new HdfsParquetDataSource(path, size, inputStream);
    } catch (IOException e) {
        throw new HdfsSplitNotOpenException(path);
    }
}
From source file:com.facebook.presto.hive.AbstractTestHiveClientS3.java
License:Apache License
@Test
public void testGetFileStatus() throws Exception {
    Path basePath = new Path("s3://presto-test-hive/");
    Path tablePath = new Path(basePath, "presto_test_s3");
    Path filePath = new Path(tablePath, "test1.csv");
    FileSystem fs = basePath.getFileSystem(hdfsEnvironment.getConfiguration(basePath));

    assertTrue(isDirectory(fs.getFileStatus(basePath)));
    assertTrue(isDirectory(fs.getFileStatus(tablePath)));
    assertFalse(isDirectory(fs.getFileStatus(filePath)));
    assertFalse(fs.exists(new Path(basePath, "foo")));
}
From source file:com.facebook.presto.hive.AbstractTestHiveFileSystem.java
License:Apache License
@Test
public void testGetFileStatus() throws Exception {
    Path basePath = getBasePath();
    Path tablePath = new Path(basePath, "presto_test_external_fs");
    Path filePath = new Path(tablePath, "test1.csv");
    FileSystem fs = hdfsEnvironment.getFileSystem(TESTING_CONTEXT, basePath);

    assertTrue(fs.getFileStatus(basePath).isDirectory());
    assertTrue(fs.getFileStatus(tablePath).isDirectory());
    assertFalse(fs.getFileStatus(filePath).isDirectory());
    assertFalse(fs.exists(new Path(basePath, "foo")));
}
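The two tests above mix getFileStatus(...).isDirectory() checks with fs.exists(...). The two calls are closely related: the base FileSystem.exists implementation is essentially a getFileStatus call that swallows FileNotFoundException, so when you need both the existence check and the metadata it can be cheaper to call getFileStatus once and handle the exception yourself. A sketch of that pattern is below; StatusUtil, statusOrNull, and isExistingDirectory are illustrative names, not part of the Presto test classes.

import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class StatusUtil {
    /** One metadata round trip: returns the FileStatus, or null if the path does not exist. */
    public static FileStatus statusOrNull(FileSystem fs, Path path) throws IOException {
        try {
            return fs.getFileStatus(path);
        } catch (FileNotFoundException e) {
            return null;
        }
    }

    /** True only when the path exists and is a directory. */
    public static boolean isExistingDirectory(FileSystem fs, Path path) throws IOException {
        FileStatus status = statusOrNull(fs, path);
        return status != null && status.isDirectory();
    }
}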
From source file:com.facebook.presto.hive.BackgroundHiveSplitLoader.java
License:Apache License
private void loadPartition(HivePartitionMetadata partition) throws IOException {
    String partitionName = partition.getHivePartition().getPartitionId();
    Properties schema = getPartitionSchema(table, partition.getPartition());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
    TupleDomain<HiveColumnHandle> effectivePredicate = partition.getHivePartition().getEffectivePredicate();

    Path path = new Path(getPartitionLocation(table, partition.getPartition()));
    Configuration configuration = hdfsEnvironment.getConfiguration(path);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
    FileSystem fs = hdfsEnvironment.getFileSystem(session.getUser(), path);

    if (inputFormat instanceof SymlinkTextInputFormat) {
        if (bucketHandle.isPresent()) {
            throw new PrestoException(StandardErrorCode.NOT_SUPPORTED,
                    "Bucketed table in SymlinkTextInputFormat is not yet supported");
        }

        // TODO: This should use an iterator like the HiveFileIterator
        for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
            // The input should be in TextInputFormat.
            TextInputFormat targetInputFormat = new TextInputFormat();
            // get the configuration for the target path -- it may be a different hdfs instance
            Configuration targetConfiguration = hdfsEnvironment.getConfiguration(targetPath);
            JobConf targetJob = new JobConf(targetConfiguration);
            targetJob.setInputFormat(TextInputFormat.class);
            targetInputFormat.configure(targetJob);
            FileInputFormat.setInputPaths(targetJob, targetPath);
            InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);

            for (InputSplit inputSplit : targetSplits) {
                FileSplit split = (FileSplit) inputSplit;
                FileSystem targetFilesystem = hdfsEnvironment.getFileSystem(session.getUser(), split.getPath());
                FileStatus file = targetFilesystem.getFileStatus(split.getPath());
                hiveSplitSource.addToQueue(createHiveSplits(partitionName, file.getPath().toString(),
                        targetFilesystem.getFileBlockLocations(file, split.getStart(), split.getLength()),
                        split.getStart(), split.getLength(), schema, partitionKeys, false, session,
                        OptionalInt.empty(), effectivePredicate, partition.getColumnCoercions()));
                if (stopped) {
                    return;
                }
            }
        }
        return;
    }

    // If only one bucket could match: load that one file
    HiveFileIterator iterator = new HiveFileIterator(path, fs, directoryLister, namenodeStats, partitionName,
            inputFormat, schema, partitionKeys, effectivePredicate, partition.getColumnCoercions());
    if (!buckets.isEmpty()) {
        int bucketCount = buckets.get(0).getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);

        for (HiveBucket bucket : buckets) {
            int bucketNumber = bucket.getBucketNumber();
            LocatedFileStatus file = list.get(bucketNumber);
            boolean splittable = isSplittable(iterator.getInputFormat(),
                    hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());

            hiveSplitSource.addToQueue(createHiveSplits(iterator.getPartitionName(), file.getPath().toString(),
                    file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(), iterator.getPartitionKeys(),
                    splittable, session, OptionalInt.of(bucketNumber), effectivePredicate,
                    partition.getColumnCoercions()));
        }
        return;
    }

    // If table is bucketed: list the directory, sort, tag with bucket id
    if (bucketHandle.isPresent()) {
        // HiveFileIterator skips hidden files automatically.
        int bucketCount = bucketHandle.get().getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);

        for (int bucketIndex = 0; bucketIndex < bucketCount; bucketIndex++) {
            LocatedFileStatus file = list.get(bucketIndex);
            boolean splittable = isSplittable(iterator.getInputFormat(),
                    hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());

            hiveSplitSource.addToQueue(createHiveSplits(iterator.getPartitionName(), file.getPath().toString(),
                    file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(), iterator.getPartitionKeys(),
                    splittable, session, OptionalInt.of(bucketIndex), iterator.getEffectivePredicate(),
                    partition.getColumnCoercions()));
        }
        return;
    }

    fileIterators.addLast(iterator);
}
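For each symlink target, the split loader above runs the same core sequence: getFileStatus for the file's length, then getFileBlockLocations so splits can be scheduled close to the data. Stripped of the Presto-specific plumbing, that sequence looks roughly like the sketch below; BlockLocations and printBlockLocations are illustrative names only.

import java.io.IOException;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class BlockLocations {
    public static void printBlockLocations(FileSystem fs, Path file) throws IOException {
        FileStatus status = fs.getFileStatus(file);
        // Ask for the block locations covering the whole file, i.e. the range [0, len)
        BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, status.getLen());
        for (BlockLocation block : blocks) {
            System.out.printf("offset=%d length=%d hosts=%s%n",
                    block.getOffset(), block.getLength(), String.join(",", block.getHosts()));
        }
    }
}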
From source file:com.facebook.presto.hive.HiveSplitIterable.java
License:Apache License
private void loadPartitionSplits(final HiveSplitQueue hiveSplitQueue, SuspendingExecutor suspendingExecutor)
        throws InterruptedException {
    final Semaphore semaphore = new Semaphore(maxPartitionBatchSize);
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) {
        ImmutableList.Builder<ListenableFuture<Void>> futureBuilder = ImmutableList.builder();

        Iterator<String> nameIterator = partitionNames.iterator();
        for (Partition partition : partitions) {
            checkState(nameIterator.hasNext(), "different number of partitions and partition names!");
            semaphore.acquire();
            final String partitionName = nameIterator.next();
            final Properties schema = getPartitionSchema(table, partition);
            final List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition);

            Path path = new Path(getPartitionLocation(table, partition));
            final Configuration configuration = hdfsEnvironment.getConfiguration(path);
            final InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
            Path partitionPath = hdfsEnvironment.getFileSystemWrapper().wrap(path);

            FileSystem fs = partitionPath.getFileSystem(configuration);
            final LastSplitMarkingQueue markerQueue = new LastSplitMarkingQueue(hiveSplitQueue);

            if (inputFormat instanceof SymlinkTextInputFormat) {
                JobConf jobConf = new JobConf(configuration);
                FileInputFormat.setInputPaths(jobConf, partitionPath);
                InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
                for (InputSplit rawSplit : splits) {
                    FileSplit split = ((SymlinkTextInputFormat.SymlinkTextInputSplit) rawSplit).getTargetSplit();

                    // get the filesystem for the target path -- it may be a different hdfs instance
                    FileSystem targetFilesystem = split.getPath().getFileSystem(configuration);
                    FileStatus fileStatus = targetFilesystem.getFileStatus(split.getPath());
                    markerQueue.addToQueue(createHiveSplits(partitionName, fileStatus,
                            targetFilesystem.getFileBlockLocations(fileStatus, split.getStart(), split.getLength()),
                            split.getStart(), split.getLength(), schema, partitionKeys, false));
                }
                markerQueue.finish();
                continue;
            }

            ListenableFuture<Void> partitionFuture = new AsyncRecursiveWalker(fs, suspendingExecutor)
                    .beginWalk(partitionPath, new FileStatusCallback() {
                        @Override
                        public void process(FileStatus file, BlockLocation[] blockLocations) {
                            if (bucket.isPresent() && !fileMatchesBucket(file.getPath().getName(), bucket.get())) {
                                return;
                            }
                            try {
                                boolean splittable = isSplittable(inputFormat,
                                        file.getPath().getFileSystem(configuration), file.getPath());
                                markerQueue.addToQueue(createHiveSplits(partitionName, file, blockLocations, 0,
                                        file.getLen(), schema, partitionKeys, splittable));
                            } catch (IOException e) {
                                hiveSplitQueue.fail(e);
                            }
                        }
                    });

            // release the semaphore when the partition finishes
            Futures.addCallback(partitionFuture, new FutureCallback<Void>() {
                @Override
                public void onSuccess(Void result) {
                    markerQueue.finish();
                    semaphore.release();
                }

                @Override
                public void onFailure(Throwable t) {
                    markerQueue.finish();
                    semaphore.release();
                }
            });

            futureBuilder.add(partitionFuture);
        }

        // when all partitions finish, mark the queue as finished
        Futures.addCallback(Futures.allAsList(futureBuilder.build()), new FutureCallback<List<Void>>() {
            @Override
            public void onSuccess(List<Void> result) {
                hiveSplitQueue.finished();
            }

            @Override
            public void onFailure(Throwable t) {
                hiveSplitQueue.fail(t);
            }
        });
    } catch (Throwable e) {
        hiveSplitQueue.fail(e);
        Throwables.propagateIfInstanceOf(e, Error.class);
    }
}
From source file:com.facebook.presto.hive.HiveSplitSourceProvider.java
License:Apache License
private void loadPartitionSplits(final HiveSplitSource hiveSplitSource, SuspendingExecutor suspendingExecutor,
        final ConnectorSession session) {
    final Semaphore semaphore = new Semaphore(maxPartitionBatchSize);
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) {
        ImmutableList.Builder<ListenableFuture<Void>> futureBuilder = ImmutableList.builder();

        Iterator<String> nameIterator = partitionNames.iterator();
        for (Partition partition : partitions) {
            checkState(nameIterator.hasNext(), "different number of partitions and partition names!");
            final String partitionName = nameIterator.next();
            final Properties schema = getPartitionSchema(table, partition);
            final List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition);

            Path path = new Path(getPartitionLocation(table, partition));
            final Configuration configuration = hdfsEnvironment.getConfiguration(path);
            final InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);

            FileSystem fs = path.getFileSystem(configuration);

            if (inputFormat instanceof SymlinkTextInputFormat) {
                JobConf jobConf = new JobConf(configuration);
                FileInputFormat.setInputPaths(jobConf, path);
                InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
                for (InputSplit rawSplit : splits) {
                    FileSplit split = ((SymlinkTextInputFormat.SymlinkTextInputSplit) rawSplit).getTargetSplit();

                    // get the filesystem for the target path -- it may be a different hdfs instance
                    FileSystem targetFilesystem = split.getPath().getFileSystem(configuration);
                    FileStatus fileStatus = targetFilesystem.getFileStatus(split.getPath());
                    hiveSplitSource.addToQueue(createHiveSplits(partitionName, fileStatus,
                            targetFilesystem.getFileBlockLocations(fileStatus, split.getStart(), split.getLength()),
                            split.getStart(), split.getLength(), schema, partitionKeys, false, session));
                }
                continue;
            }

            // TODO: this is currently serial across all partitions and should be done in suspendingExecutor
            if (bucket.isPresent()) {
                Optional<FileStatus> bucketFile = getBucketFile(bucket.get(), fs, path);
                if (bucketFile.isPresent()) {
                    FileStatus file = bucketFile.get();
                    BlockLocation[] blockLocations = fs.getFileBlockLocations(file, 0, file.getLen());
                    boolean splittable = isSplittable(inputFormat, fs, file.getPath());
                    hiveSplitSource.addToQueue(createHiveSplits(partitionName, file, blockLocations, 0,
                            file.getLen(), schema, partitionKeys, splittable, session));
                    continue;
                }
            }

            // Acquire semaphore so that we only have a fixed number of outstanding partitions being processed asynchronously
            // NOTE: there must not be any calls that throw in the space between acquiring the semaphore and setting the Future
            // callback to release it. Otherwise, we will need a try-finally block around this section.
            try {
                semaphore.acquire();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }

            ListenableFuture<Void> partitionFuture = createAsyncWalker(fs, suspendingExecutor).beginWalk(path,
                    new FileStatusCallback() {
                        @Override
                        public void process(FileStatus file, BlockLocation[] blockLocations) {
                            try {
                                boolean splittable = isSplittable(inputFormat,
                                        file.getPath().getFileSystem(configuration), file.getPath());
                                hiveSplitSource.addToQueue(createHiveSplits(partitionName, file, blockLocations,
                                        0, file.getLen(), schema, partitionKeys, splittable, session));
                            } catch (IOException e) {
                                hiveSplitSource.fail(e);
                            }
                        }
                    });

            // release the semaphore when the partition finishes
            Futures.addCallback(partitionFuture, new FutureCallback<Void>() {
                @Override
                public void onSuccess(Void result) {
                    semaphore.release();
                }

                @Override
                public void onFailure(Throwable t) {
                    semaphore.release();
                }
            });

            futureBuilder.add(partitionFuture);
        }

        // when all partitions finish, mark the queue as finished
        Futures.addCallback(Futures.allAsList(futureBuilder.build()), new FutureCallback<List<Void>>() {
            @Override
            public void onSuccess(List<Void> result) {
                hiveSplitSource.finished();
            }

            @Override
            public void onFailure(Throwable t) {
                hiveSplitSource.fail(t);
            }
        });
    } catch (Throwable e) {
        hiveSplitSource.fail(e);
        Throwables.propagateIfInstanceOf(e, Error.class);
    }
}
From source file:com.facebook.presto.hive.orc.OrcPageSourceFactory.java
License:Apache License
public static OrcPageSource createOrcPageSource(MetadataReader metadataReader, HdfsEnvironment hdfsEnvironment,
        String sessionUser, Configuration configuration, Path path, long start, long length,
        List<HiveColumnHandle> columns, boolean useOrcColumnNames, TupleDomain<HiveColumnHandle> effectivePredicate,
        DateTimeZone hiveStorageTimeZone, TypeManager typeManager, DataSize maxMergeDistance,
        DataSize maxBufferSize, DataSize streamBufferSize, boolean orcBloomFiltersEnabled) {
    OrcDataSource orcDataSource;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(sessionUser, path, configuration);
        long size = fileSystem.getFileStatus(path).getLen();
        FSDataInputStream inputStream = fileSystem.open(path);
        orcDataSource = new HdfsOrcDataSource(path.toString(), size, maxMergeDistance, maxBufferSize,
                streamBufferSize, inputStream);
    } catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }

    AggregatedMemoryContext systemMemoryUsage = new AggregatedMemoryContext();
    try {
        OrcReader reader = new OrcReader(orcDataSource, metadataReader, maxMergeDistance, maxBufferSize);

        List<HiveColumnHandle> physicalColumns = getPhysicalHiveColumnHandles(columns, useOrcColumnNames, reader,
                path);
        ImmutableMap.Builder<Integer, Type> includedColumns = ImmutableMap.builder();
        ImmutableList.Builder<ColumnReference<HiveColumnHandle>> columnReferences = ImmutableList.builder();
        for (HiveColumnHandle column : physicalColumns) {
            if (column.getColumnType() == REGULAR) {
                Type type = typeManager.getType(column.getTypeSignature());
                includedColumns.put(column.getHiveColumnIndex(), type);
                columnReferences.add(new ColumnReference<>(column, column.getHiveColumnIndex(), type));
            }
        }

        OrcPredicate predicate = new TupleDomainOrcPredicate<>(effectivePredicate, columnReferences.build(),
                orcBloomFiltersEnabled);

        OrcRecordReader recordReader = reader.createRecordReader(includedColumns.build(), predicate, start, length,
                hiveStorageTimeZone, systemMemoryUsage);

        return new OrcPageSource(recordReader, orcDataSource, physicalColumns, typeManager, systemMemoryUsage);
    } catch (Exception e) {
        try {
            orcDataSource.close();
        } catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e.getClass().getSimpleName().equals("BlockMissingException")) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}