Example usage for org.apache.hadoop.fs FileSystem getFileStatus

Introduction

This page collects example usages of org.apache.hadoop.fs.FileSystem#getFileStatus.

Prototype

public abstract FileStatus getFileStatus(Path f) throws IOException;

Document

Return a file status object that represents the path.
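
Before the per-project examples below, here is a minimal, self-contained sketch of the call; the path and configuration values are placeholders rather than something taken from the examples. getFileStatus returns a FileStatus exposing the path's length, modification time, and file/directory flag, and it throws FileNotFoundException (a subclass of IOException) when the path does not exist.

import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // placeholder path; pass a real file or directory as the first argument
        Path path = new Path(args.length > 0 ? args[0] : "/tmp/example.txt");
        FileSystem fs = path.getFileSystem(conf);
        try {
            FileStatus status = fs.getFileStatus(path);
            System.out.println("path      = " + status.getPath());
            System.out.println("length    = " + status.getLen());
            System.out.println("directory = " + status.isDirectory());
            System.out.println("modified  = " + status.getModificationTime());
        } catch (FileNotFoundException e) {
            // getFileStatus throws FileNotFoundException if the path does not exist
            System.err.println("No such path: " + path);
        }
    }
}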

Usage

From source file: com.facebook.hive.orc.ReaderImpl.java

License: Open Source License

public ReaderImpl(FileSystem fs, Path path, Configuration conf) throws IOException {
    try {
        this.fileSystem = fs;
        this.path = path;
        this.conf = conf;
        FSDataInputStream file = fs.open(path);
        long size = fs.getFileStatus(path).getLen();
        int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
        ByteBuffer buffer = ByteBuffer.allocate(readSize);
        InStream.read(file, size - readSize, buffer.array(), buffer.arrayOffset() + buffer.position(),
                buffer.remaining());
        int psLen = buffer.get(readSize - 1);
        int psOffset = readSize - 1 - psLen;
        CodedInputStream in = CodedInputStream.newInstance(buffer.array(), buffer.arrayOffset() + psOffset,
                psLen);
        OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(in);
        int footerSize = (int) ps.getFooterLength();
        bufferSize = (int) ps.getCompressionBlockSize();
        switch (ps.getCompression()) {
        case NONE:
            compressionKind = CompressionKind.NONE;
            break;
        case ZLIB:
            compressionKind = CompressionKind.ZLIB;
            break;
        case SNAPPY:
            compressionKind = CompressionKind.SNAPPY;
            break;
        case LZO:
            compressionKind = CompressionKind.LZO;
            break;
        default:
            throw new IllegalArgumentException("Unknown compression");
        }
        codec = WriterImpl.createCodec(compressionKind);

        InputStream instream = InStream.create("footer", file, size - 1 - psLen - footerSize, footerSize, codec,
                bufferSize);
        footer = OrcProto.Footer.parseFrom(instream);
        inspector = new OrcLazyRowObjectInspector(0, footer.getTypesList());
        file.close();
    } catch (IndexOutOfBoundsException e) {
        /**
         * When a non ORC file is read by ORC reader, we get IndexOutOfBoundsException exception while
         * creating a reader. Caught that exception and checked the file header to see if the input
         * file was ORC or not. If its not ORC, throw a NotAnORCFileException with the file
         * attempted to be reading (thus helping to figure out which table-partition was being read).
         */
        checkIfORC(fs, path);
        throw new IOException("Failed to create record reader for file " + path, e);
    } catch (IOException e) {
        throw new IOException("Failed to create record reader for file " + path, e);
    }
}

From source file: com.facebook.hive.orc.ReaderImpl.java

License: Open Source License

/**
 * Reads the file header (first 40 bytes) and checks if the first three characters are 'ORC'.
 */
public static void checkIfORC(FileSystem fs, Path path) throws IOException {
    // hardcoded to 40 because "SEQ-org.apache.hadoop.hive.ql.io.RCFile", the header, is of 40 chars
    final int buffLen = 40;
    final byte header[] = new byte[buffLen];
    final FSDataInputStream file = fs.open(path);
    final long fileLength = fs.getFileStatus(path).getLen();
    int sizeToBeRead = buffLen;
    if (buffLen > fileLength) {
        sizeToBeRead = (int) fileLength;
    }

    IOUtils.readFully(file, header, 0, sizeToBeRead);
    file.close();

    final String headerString = new String(header);
    if (headerString.startsWith("ORC")) {
        LOG.error("Error while parsing the footer of the file : " + path);
    } else {
        throw new NotAnORCFileException("Input file = " + path + " , header = " + headerString);
    }
}

From source file: com.facebook.hiveio.common.FileSystems.java

License: Apache License

/**
 * Move a file or directory from source to destination, recursively copying
 * subdirectories.
 *
 * @param fs FileSystem
 * @param file path to copy (file or directory)
 * @param src path to source directory
 * @param dest path to destination directory
 * @throws IOException I/O problems
 */
public static void move(FileSystem fs, Path file, Path src, Path dest) throws IOException {
    Path destFilePath = pathInDestination(file, src, dest);
    if (fs.isFile(file)) {
        if (fs.exists(destFilePath)) {
            if (!fs.delete(destFilePath, true)) {
                throw new IllegalArgumentException("Could not remove existing file " + destFilePath);
            }
        }
        if (!fs.rename(file, destFilePath)) {
            throw new IllegalArgumentException("Could not move " + file + " to " + destFilePath);
        }
    } else if (fs.getFileStatus(file).isDir()) {
        FileStatus[] statuses = fs.listStatus(file);
        fs.mkdirs(destFilePath);
        if (statuses != null) {
            for (FileStatus status : statuses) {
                move(fs, status.getPath(), src, dest);
            }
        }
    }
}

From source file: com.facebook.presto.hdfs.HDFSPageSourceProvider.java

License: Apache License

private HdfsParquetDataSource buildHdfsParquetDataSource(FileSystem fileSystem, Path path, long start,
        long length) {
    try {
        long size = fileSystem.getFileStatus(path).getLen();
        FSDataInputStream inputStream = fileSystem.open(path);
        return new HdfsParquetDataSource(path, size, inputStream);
    } catch (IOException e) {
        throw new HdfsSplitNotOpenException(path);
    }
}

From source file: com.facebook.presto.hive.AbstractTestHiveClientS3.java

License: Apache License

@Test
public void testGetFileStatus() throws Exception {
    Path basePath = new Path("s3://presto-test-hive/");
    Path tablePath = new Path(basePath, "presto_test_s3");
    Path filePath = new Path(tablePath, "test1.csv");
    FileSystem fs = basePath.getFileSystem(hdfsEnvironment.getConfiguration(basePath));

    assertTrue(isDirectory(fs.getFileStatus(basePath)));
    assertTrue(isDirectory(fs.getFileStatus(tablePath)));
    assertFalse(isDirectory(fs.getFileStatus(filePath)));
    assertFalse(fs.exists(new Path(basePath, "foo")));
}

From source file: com.facebook.presto.hive.AbstractTestHiveFileSystem.java

License: Apache License

@Test
public void testGetFileStatus() throws Exception {
    Path basePath = getBasePath();
    Path tablePath = new Path(basePath, "presto_test_external_fs");
    Path filePath = new Path(tablePath, "test1.csv");
    FileSystem fs = hdfsEnvironment.getFileSystem(TESTING_CONTEXT, basePath);

    assertTrue(fs.getFileStatus(basePath).isDirectory());
    assertTrue(fs.getFileStatus(tablePath).isDirectory());
    assertFalse(fs.getFileStatus(filePath).isDirectory());
    assertFalse(fs.exists(new Path(basePath, "foo")));
}

From source file: com.facebook.presto.hive.BackgroundHiveSplitLoader.java

License: Apache License

private void loadPartition(HivePartitionMetadata partition) throws IOException {
    String partitionName = partition.getHivePartition().getPartitionId();
    Properties schema = getPartitionSchema(table, partition.getPartition());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
    TupleDomain<HiveColumnHandle> effectivePredicate = partition.getHivePartition().getEffectivePredicate();

    Path path = new Path(getPartitionLocation(table, partition.getPartition()));
    Configuration configuration = hdfsEnvironment.getConfiguration(path);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
    FileSystem fs = hdfsEnvironment.getFileSystem(session.getUser(), path);

    if (inputFormat instanceof SymlinkTextInputFormat) {
        if (bucketHandle.isPresent()) {
            throw new PrestoException(StandardErrorCode.NOT_SUPPORTED,
                    "Bucketed table in SymlinkTextInputFormat is not yet supported");
        }

        // TODO: This should use an iterator like the HiveFileIterator
        for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
            // The input should be in TextInputFormat.
            TextInputFormat targetInputFormat = new TextInputFormat();
            // get the configuration for the target path -- it may be a different hdfs instance
            Configuration targetConfiguration = hdfsEnvironment.getConfiguration(targetPath);
            JobConf targetJob = new JobConf(targetConfiguration);
            targetJob.setInputFormat(TextInputFormat.class);
            targetInputFormat.configure(targetJob);
            FileInputFormat.setInputPaths(targetJob, targetPath);
            InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);

            for (InputSplit inputSplit : targetSplits) {
                FileSplit split = (FileSplit) inputSplit;
                FileSystem targetFilesystem = hdfsEnvironment.getFileSystem(session.getUser(), split.getPath());
                FileStatus file = targetFilesystem.getFileStatus(split.getPath());
                hiveSplitSource.addToQueue(createHiveSplits(partitionName, file.getPath().toString(),
                        targetFilesystem.getFileBlockLocations(file, split.getStart(), split.getLength()),
                        split.getStart(), split.getLength(), schema, partitionKeys, false, session,
                        OptionalInt.empty(), effectivePredicate, partition.getColumnCoercions()));
                if (stopped) {
                    return;
                }
            }
        }
        return;
    }

    // If only one bucket could match: load that one file
    HiveFileIterator iterator = new HiveFileIterator(path, fs, directoryLister, namenodeStats, partitionName,
            inputFormat, schema, partitionKeys, effectivePredicate, partition.getColumnCoercions());
    if (!buckets.isEmpty()) {
        int bucketCount = buckets.get(0).getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);

        for (HiveBucket bucket : buckets) {
            int bucketNumber = bucket.getBucketNumber();
            LocatedFileStatus file = list.get(bucketNumber);
            boolean splittable = isSplittable(iterator.getInputFormat(),
                    hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());

            hiveSplitSource.addToQueue(createHiveSplits(iterator.getPartitionName(), file.getPath().toString(),
                    file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(),
                    iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketNumber),
                    effectivePredicate, partition.getColumnCoercions()));
        }

        return;
    }

    // If table is bucketed: list the directory, sort, tag with bucket id
    if (bucketHandle.isPresent()) {
        // HiveFileIterator skips hidden files automatically.
        int bucketCount = bucketHandle.get().getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);

        for (int bucketIndex = 0; bucketIndex < bucketCount; bucketIndex++) {
            LocatedFileStatus file = list.get(bucketIndex);
            boolean splittable = isSplittable(iterator.getInputFormat(),
                    hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());

            hiveSplitSource.addToQueue(createHiveSplits(iterator.getPartitionName(), file.getPath().toString(),
                    file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(),
                    iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketIndex),
                    iterator.getEffectivePredicate(), partition.getColumnCoercions()));
        }

        return;
    }

    fileIterators.addLast(iterator);
}

From source file: com.facebook.presto.hive.HiveSplitIterable.java

License: Apache License

private void loadPartitionSplits(final HiveSplitQueue hiveSplitQueue, SuspendingExecutor suspendingExecutor)
        throws InterruptedException {
    final Semaphore semaphore = new Semaphore(maxPartitionBatchSize);
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) {
        ImmutableList.Builder<ListenableFuture<Void>> futureBuilder = ImmutableList.builder();

        Iterator<String> nameIterator = partitionNames.iterator();
        for (Partition partition : partitions) {
            checkState(nameIterator.hasNext(), "different number of partitions and partition names!");
            semaphore.acquire();
            final String partitionName = nameIterator.next();
            final Properties schema = getPartitionSchema(table, partition);
            final List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition);

            Path path = new Path(getPartitionLocation(table, partition));
            final Configuration configuration = hdfsEnvironment.getConfiguration(path);
            final InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
            Path partitionPath = hdfsEnvironment.getFileSystemWrapper().wrap(path);

            FileSystem fs = partitionPath.getFileSystem(configuration);
            final LastSplitMarkingQueue markerQueue = new LastSplitMarkingQueue(hiveSplitQueue);

            if (inputFormat instanceof SymlinkTextInputFormat) {
                JobConf jobConf = new JobConf(configuration);
                FileInputFormat.setInputPaths(jobConf, partitionPath);
                InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
                for (InputSplit rawSplit : splits) {
                    FileSplit split = ((SymlinkTextInputFormat.SymlinkTextInputSplit) rawSplit)
                            .getTargetSplit();

                    // get the filesystem for the target path -- it may be a different hdfs instance
                    FileSystem targetFilesystem = split.getPath().getFileSystem(configuration);
                    FileStatus fileStatus = targetFilesystem.getFileStatus(split.getPath());
                    markerQueue.addToQueue(createHiveSplits(partitionName, fileStatus,
                            targetFilesystem.getFileBlockLocations(fileStatus, split.getStart(),
                                    split.getLength()),
                            split.getStart(), split.getLength(), schema, partitionKeys, false));
                }
                markerQueue.finish();
                continue;
            }

            ListenableFuture<Void> partitionFuture = new AsyncRecursiveWalker(fs, suspendingExecutor)
                    .beginWalk(partitionPath, new FileStatusCallback() {
                        @Override
                        public void process(FileStatus file, BlockLocation[] blockLocations) {
                            if (bucket.isPresent()
                                    && !fileMatchesBucket(file.getPath().getName(), bucket.get())) {
                                return;
                            }

                            try {
                                boolean splittable = isSplittable(inputFormat,
                                        file.getPath().getFileSystem(configuration), file.getPath());

                                markerQueue.addToQueue(createHiveSplits(partitionName, file, blockLocations, 0,
                                        file.getLen(), schema, partitionKeys, splittable));
                            } catch (IOException e) {
                                hiveSplitQueue.fail(e);
                            }
                        }
                    });

            // release the semaphore when the partition finishes
            Futures.addCallback(partitionFuture, new FutureCallback<Void>() {
                @Override
                public void onSuccess(Void result) {
                    markerQueue.finish();
                    semaphore.release();
                }

                @Override
                public void onFailure(Throwable t) {
                    markerQueue.finish();
                    semaphore.release();
                }
            });
            futureBuilder.add(partitionFuture);
        }

        // when all partitions finish, mark the queue as finished
        Futures.addCallback(Futures.allAsList(futureBuilder.build()), new FutureCallback<List<Void>>() {
            @Override
            public void onSuccess(List<Void> result) {
                hiveSplitQueue.finished();
            }

            @Override
            public void onFailure(Throwable t) {
                hiveSplitQueue.fail(t);
            }
        });
    } catch (Throwable e) {
        hiveSplitQueue.fail(e);
        Throwables.propagateIfInstanceOf(e, Error.class);
    }
}

From source file: com.facebook.presto.hive.HiveSplitSourceProvider.java

License: Apache License

private void loadPartitionSplits(final HiveSplitSource hiveSplitSource, SuspendingExecutor suspendingExecutor,
        final ConnectorSession session) {
    final Semaphore semaphore = new Semaphore(maxPartitionBatchSize);
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) {
        ImmutableList.Builder<ListenableFuture<Void>> futureBuilder = ImmutableList.builder();

        Iterator<String> nameIterator = partitionNames.iterator();
        for (Partition partition : partitions) {
            checkState(nameIterator.hasNext(), "different number of partitions and partition names!");
            final String partitionName = nameIterator.next();
            final Properties schema = getPartitionSchema(table, partition);
            final List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition);

            Path path = new Path(getPartitionLocation(table, partition));
            final Configuration configuration = hdfsEnvironment.getConfiguration(path);
            final InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);

            FileSystem fs = path.getFileSystem(configuration);

            if (inputFormat instanceof SymlinkTextInputFormat) {
                JobConf jobConf = new JobConf(configuration);
                FileInputFormat.setInputPaths(jobConf, path);
                InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
                for (InputSplit rawSplit : splits) {
                    FileSplit split = ((SymlinkTextInputFormat.SymlinkTextInputSplit) rawSplit)
                            .getTargetSplit();

                    // get the filesystem for the target path -- it may be a different hdfs instance
                    FileSystem targetFilesystem = split.getPath().getFileSystem(configuration);
                    FileStatus fileStatus = targetFilesystem.getFileStatus(split.getPath());
                    hiveSplitSource.addToQueue(createHiveSplits(partitionName, fileStatus,
                            targetFilesystem.getFileBlockLocations(fileStatus, split.getStart(),
                                    split.getLength()),
                            split.getStart(), split.getLength(), schema, partitionKeys, false, session));
                }
                continue;
            }

            // TODO: this is currently serial across all partitions and should be done in suspendingExecutor
            if (bucket.isPresent()) {
                Optional<FileStatus> bucketFile = getBucketFile(bucket.get(), fs, path);
                if (bucketFile.isPresent()) {
                    FileStatus file = bucketFile.get();
                    BlockLocation[] blockLocations = fs.getFileBlockLocations(file, 0, file.getLen());
                    boolean splittable = isSplittable(inputFormat, fs, file.getPath());

                    hiveSplitSource.addToQueue(createHiveSplits(partitionName, file, blockLocations, 0,
                            file.getLen(), schema, partitionKeys, splittable, session));
                    continue;
                }
            }

            // Acquire semaphore so that we only have a fixed number of outstanding partitions being processed asynchronously
            // NOTE: there must not be any calls that throw in the space between acquiring the semaphore and setting the Future
            // callback to release it. Otherwise, we will need a try-finally block around this section.
            try {
                semaphore.acquire();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }

            ListenableFuture<Void> partitionFuture = createAsyncWalker(fs, suspendingExecutor).beginWalk(path,
                    new FileStatusCallback() {
                        @Override
                        public void process(FileStatus file, BlockLocation[] blockLocations) {
                            try {
                                boolean splittable = isSplittable(inputFormat,
                                        file.getPath().getFileSystem(configuration), file.getPath());

                                hiveSplitSource.addToQueue(createHiveSplits(partitionName, file, blockLocations,
                                        0, file.getLen(), schema, partitionKeys, splittable, session));
                            } catch (IOException e) {
                                hiveSplitSource.fail(e);
                            }
                        }
                    });

            // release the semaphore when the partition finishes
            Futures.addCallback(partitionFuture, new FutureCallback<Void>() {
                @Override
                public void onSuccess(Void result) {
                    semaphore.release();
                }

                @Override
                public void onFailure(Throwable t) {
                    semaphore.release();
                }
            });

            futureBuilder.add(partitionFuture);
        }

        // when all partitions finish, mark the queue as finished
        Futures.addCallback(Futures.allAsList(futureBuilder.build()), new FutureCallback<List<Void>>() {
            @Override
            public void onSuccess(List<Void> result) {
                hiveSplitSource.finished();
            }

            @Override
            public void onFailure(Throwable t) {
                hiveSplitSource.fail(t);
            }
        });
    } catch (Throwable e) {
        hiveSplitSource.fail(e);
        Throwables.propagateIfInstanceOf(e, Error.class);
    }
}

From source file: com.facebook.presto.hive.orc.OrcPageSourceFactory.java

License: Apache License

public static OrcPageSource createOrcPageSource(MetadataReader metadataReader, HdfsEnvironment hdfsEnvironment,
        String sessionUser, Configuration configuration, Path path, long start, long length,
        List<HiveColumnHandle> columns, boolean useOrcColumnNames,
        TupleDomain<HiveColumnHandle> effectivePredicate, DateTimeZone hiveStorageTimeZone,
        TypeManager typeManager, DataSize maxMergeDistance, DataSize maxBufferSize, DataSize streamBufferSize,
        boolean orcBloomFiltersEnabled) {
    OrcDataSource orcDataSource;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(sessionUser, path, configuration);
        long size = fileSystem.getFileStatus(path).getLen();
        FSDataInputStream inputStream = fileSystem.open(path);
        orcDataSource = new HdfsOrcDataSource(path.toString(), size, maxMergeDistance, maxBufferSize,
                streamBufferSize, inputStream);
    } catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed")
                || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }

    AggregatedMemoryContext systemMemoryUsage = new AggregatedMemoryContext();
    try {
        OrcReader reader = new OrcReader(orcDataSource, metadataReader, maxMergeDistance, maxBufferSize);

        List<HiveColumnHandle> physicalColumns = getPhysicalHiveColumnHandles(columns, useOrcColumnNames,
                reader, path);
        ImmutableMap.Builder<Integer, Type> includedColumns = ImmutableMap.builder();
        ImmutableList.Builder<ColumnReference<HiveColumnHandle>> columnReferences = ImmutableList.builder();
        for (HiveColumnHandle column : physicalColumns) {
            if (column.getColumnType() == REGULAR) {
                Type type = typeManager.getType(column.getTypeSignature());
                includedColumns.put(column.getHiveColumnIndex(), type);
                columnReferences.add(new ColumnReference<>(column, column.getHiveColumnIndex(), type));
            }
        }

        OrcPredicate predicate = new TupleDomainOrcPredicate<>(effectivePredicate, columnReferences.build(),
                orcBloomFiltersEnabled);

        OrcRecordReader recordReader = reader.createRecordReader(includedColumns.build(), predicate, start,
                length, hiveStorageTimeZone, systemMemoryUsage);

        return new OrcPageSource(recordReader, orcDataSource, physicalColumns, typeManager, systemMemoryUsage);
    } catch (Exception e) {
        try {
            orcDataSource.close();
        } catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e.getClass().getSimpleName().equals("BlockMissingException")) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}