Example usage for org.apache.hadoop.fs FileSystem listStatus

List of usage examples for org.apache.hadoop.fs FileSystem listStatus

Introduction

This page shows example usages of org.apache.hadoop.fs.FileSystem listStatus.

Prototype

public FileStatus[] listStatus(Path f, PathFilter filter) throws FileNotFoundException, IOException

Document

Filter files/directories in the given path using the user-supplied path filter.
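
Before the real-world examples below, here is a minimal usage sketch. The directory and the ".parquet" suffix in the filter are hypothetical, not taken from any of the sources.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Minimal sketch: list only the entries whose name ends with ".parquet"
// (hypothetical path and filter).
public static void printParquetFiles() throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    FileStatus[] matches = fs.listStatus(new Path("/data/input"),
            path -> path.getName().endsWith(".parquet"));
    for (FileStatus status : matches) {
        System.out.println(status.getPath() + "\t" + status.getLen());
    }
}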

Usage

From source file:com.twitter.algebra.matrix.format.MapDir.java

License:Apache License

/**
 * Disk usage of a MapDir or a directory of sequence files
 * @param mapDirPath the path to the MapDir or directory of sequence files
 * @param fs the file system
 * @return total size in bytes
 * @throws FileNotFoundException
 * @throws IOException
 */
public static long du(Path mapDirPath, FileSystem fs) throws FileNotFoundException, IOException {
    FileStatus[] dirs = fs.listStatus(mapDirPath, mapFilter);
    if (dirs.length == 0) // not a MapDir; fall back to a plain listing
        dirs = fs.listStatus(mapDirPath);
    long size = 0;
    for (FileStatus dirStatus : dirs) {
        //if it is a sequence file
        if (dirStatus.isFile())
            size += dirStatus.getLen();
        else
            //or if it is a MapFile, which is a directory
            size += dirSize(dirStatus, fs);
    }
    return size;
}
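
A sketch of how du might be invoked; the matrix path and the way the FileSystem is obtained are assumptions, not part of the MapDir source.

// Hypothetical caller of MapDir.du
FileSystem fs = FileSystem.get(new Configuration());
long bytes = MapDir.du(new Path("/data/matrices/A"), fs); // path is hypothetical
System.out.println("disk usage: " + bytes + " bytes");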

From source file:com.twitter.algebra.matrix.format.MapDir.java

License:Apache License

private static long dirSize(FileStatus dirStatus, FileSystem fs) throws FileNotFoundException, IOException {
    FileStatus[] files = fs.listStatus(dirStatus.getPath(), new PathFilter() {
        @Override
        public boolean accept(final Path file) {
            return true;
        }
    });
    long size = 0;
    for (FileStatus file : files)
        size += file.getLen();
    return size;
}

From source file:com.twitter.elephanttwin.lucene.indexing.AbstractLuceneIndexingJob.java

License:Apache License

protected void addInputPathRecursively(List<FileStatus> result, FileSystem fs, Path path,
        PathFilter inputFilter) throws IOException {
    for (FileStatus stat : fs.listStatus(path, inputFilter)) {
        if (stat.isDir()) {
            addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
        } else {
            result.add(stat);
        }
    }
}
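
A sketch of driving this recursive helper from inside the job; the input path, the Configuration named conf, and the filter that skips hidden entries are assumptions, not from the ElephantTwin source.

// Hypothetical caller: recursively collect all visible files under /index/input
List<FileStatus> inputs = new ArrayList<FileStatus>();
PathFilter visibleOnly = path ->
        !path.getName().startsWith("_") && !path.getName().startsWith(".");
addInputPathRecursively(inputs, FileSystem.get(conf), new Path("/index/input"), visibleOnly);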

From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License:Apache License

/**
 * @param context
 * @param file
 *          the input file provided to the job to work on
 * @param columnName
 * @return the list of index files if there is an index directory created for
 *         the input file
 * @throws IOException
 */
protected static FileStatus[] listIndexFiles(JobContext context, Path file, String columnName)
        throws IOException {

    Path indexFilePath = new Path(getIndexDir(context) + file.toUri().getRawPath() + "/" + columnName);

    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FileStatus[] dirlist = fs.listStatus(indexFilePath, indexFileFilter);
    return dirlist;
}

From source file:com.twitter.hraven.etl.FileLister.java

License:Apache License

/**
 * Gets the list of files for a given path filtered as per the input path range filter
 * Can go into directories recursively
 *
 * @param recurse - whether or not to traverse recursively
 * @param hdfs - the file system
 * @param inputPath - the path to traverse for getting the list of files
 * @param jobFileModifiedRangePathFilter - the filter to include/exclude certain files
 *
 * @return array of file status.
 * @throws IOException
 */
public static FileStatus[] listFiles(boolean recurse, FileSystem hdfs, Path inputPath,
        JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter) throws IOException {
    if (recurse) {
        List<FileStatus> fileStatusesList = new ArrayList<FileStatus>();
        traverseDirs(fileStatusesList, hdfs, inputPath, jobFileModifiedRangePathFilter);
        FileStatus[] fileStatuses = (FileStatus[]) fileStatusesList
                .toArray(new FileStatus[fileStatusesList.size()]);
        return fileStatuses;
    } else {
        return hdfs.listStatus(inputPath, jobFileModifiedRangePathFilter);
    }
}

From source file:com.uber.hoodie.common.table.HoodieTableMetaClient.java

License:Apache License

public static FileStatus[] scanFiles(FileSystem fs, Path metaPath, PathFilter nameFilter) throws IOException {
    return fs.listStatus(metaPath, nameFilter);
}

From source file:com.uber.hoodie.common.util.FSUtils.java

License:Apache License

/**
 * Get all the log files for the passed in FileId in the partition path
 */
public static Stream<HoodieLogFile> getAllLogFiles(FileSystem fs, Path partitionPath, final String fileId,
        final String logFileExtension, final String baseCommitTime) throws IOException {
    return Arrays
            .stream(fs.listStatus(partitionPath,
                    path -> path.getName().startsWith("." + fileId)
                            && path.getName().contains(logFileExtension)))
            .map(HoodieLogFile::new).filter(s -> s.getBaseCommitTime().equals(baseCommitTime));
}
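
A usage sketch of the stream returned above; the partition path, file id, extension, and base commit time are hypothetical values.

// Hypothetical caller (requires java.util.stream.Collectors)
List<HoodieLogFile> logFiles = FSUtils
        .getAllLogFiles(fs, new Path("/table/2017/07/14"), "some-file-id", ".log", "001")
        .collect(Collectors.toList());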

From source file:com.uber.hoodie.HoodieWriteClient.java

License:Apache License

private void rollback(List<String> commits) {
    if (commits.isEmpty()) {
        logger.info("List of commits to rollback is empty");
        return;
    }

    final Timer.Context context = metrics.getRollbackCtx();
    String startRollbackTime = HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date());

    // Create a Hoodie table which encapsulated the commits and files visible
    HoodieTable<T> table = HoodieTable.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true),
            config);
    HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
    HoodieTimeline inflightTimeline = table.getInflightCommitTimeline();
    HoodieTimeline commitTimeline = table.getCompletedCommitTimeline();

    // Check if any of the commits is a savepoint - do not allow rollback on those commits
    List<String> savepoints = table.getCompletedSavepointTimeline().getInstants()
            .map(HoodieInstant::getTimestamp).collect(Collectors.toList());
    commits.forEach(s -> {
        if (savepoints.contains(s)) {
            throw new HoodieRollbackException(
                    "Could not rollback a savepointed commit. Delete savepoint first before rolling back" + s);
        }
    });

    try {
        if (commitTimeline.empty() && inflightTimeline.empty()) {
            // nothing to rollback
            logger.info("No commits to rollback " + commits);
        }

        // Make sure only the last n commits are being rolled back
        // If there is a commit in-between or after that is not rolled back, then abort
        String lastCommit = commits.get(commits.size() - 1);
        if (!commitTimeline.empty()
                && !commitTimeline.findInstantsAfter(lastCommit, Integer.MAX_VALUE).empty()) {
            throw new HoodieRollbackException(
                    "Found commits after time :" + lastCommit + ", please rollback greater commits first");
        }

        List<String> inflights = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp)
                .collect(Collectors.toList());
        if (!inflights.isEmpty() && inflights.indexOf(lastCommit) != inflights.size() - 1) {
            throw new HoodieRollbackException("Found in-flight commits after time :" + lastCommit
                    + ", please rollback greater commits first");
        }

        // Atomically unpublish all the commits
        commits.stream().filter(s -> !inflights.contains(s))
                .map(s -> new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, s))
                .forEach(activeTimeline::revertToInflight);
        logger.info("Unpublished " + commits);

        // cleanup index entries
        commits.stream().forEach(s -> {
            if (!index.rollbackCommit(s)) {
                throw new HoodieRollbackException("Clean out index changes failed, for time :" + s);
            }
        });
        logger.info("Index rolled back for commits " + commits);

        // delete all the data files for all these commits
        logger.info("Clean out all parquet files generated for commits: " + commits);
        final LongAccumulator numFilesDeletedCounter = jsc.sc().longAccumulator();
        List<HoodieRollbackStat> stats = jsc
                .parallelize(FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(),
                        config.shouldAssumeDatePartitioning()))
                .map((Function<String, HoodieRollbackStat>) partitionPath -> {
                    // Scan all partitions files with this commit time
                    logger.info("Cleaning path " + partitionPath);
                    FileSystem fs1 = FSUtils.getFs();
                    FileStatus[] toBeDeleted = fs1.listStatus(new Path(config.getBasePath(), partitionPath),
                            path -> {
                                if (!path.toString().contains(".parquet")) {
                                    return false;
                                }
                                String fileCommitTime = FSUtils.getCommitTime(path.getName());
                                return commits.contains(fileCommitTime);
                            });
                    Map<FileStatus, Boolean> results = Maps.newHashMap();
                    for (FileStatus file : toBeDeleted) {
                        boolean success = fs1.delete(file.getPath(), false);
                        results.put(file, success);
                        logger.info("Delete file " + file.getPath() + "\t" + success);
                        if (success) {
                            numFilesDeletedCounter.add(1);
                        }
                    }
                    return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
                            .withDeletedFileResults(results).build();
                }).collect();

        // Remove the rolled back inflight commits
        commits.stream().map(s -> new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, s))
                .forEach(activeTimeline::deleteInflight);
        logger.info("Deleted inflight commits " + commits);

        Optional<Long> durationInMs = Optional.empty();
        if (context != null) {
            durationInMs = Optional.of(metrics.getDurationInMs(context.stop()));
            Long numFilesDeleted = numFilesDeletedCounter.value();
            metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted);
        }
        HoodieRollbackMetadata rollbackMetadata = AvroUtils.convertRollbackMetadata(startRollbackTime,
                durationInMs, commits, stats);
        table.getActiveTimeline().saveAsComplete(
                new HoodieInstant(true, HoodieTimeline.ROLLBACK_ACTION, startRollbackTime),
                AvroUtils.serializeRollbackMetadata(rollbackMetadata));
        logger.info("Commits " + commits + " rollback is complete");

        if (!table.getActiveTimeline().getCleanerTimeline().empty()) {
            logger.info("Cleaning up older rollback meta files");
            // Cleanup of older cleaner meta files
            // TODO - make the commit archival generic and archive rollback metadata
            FSUtils.deleteOlderRollbackMetaFiles(fs, table.getMetaClient().getMetaPath(),
                    table.getActiveTimeline().getRollbackTimeline().getInstants());
        }
    } catch (IOException e) {
        throw new HoodieRollbackException("Failed to rollback " + config.getBasePath() + " commits " + commits,
                e);
    }
}

From source file:com.uber.hoodie.table.HoodieCopyOnWriteTable.java

License:Apache License

/**
 * Common method used for cleaning out parquet files under a partition path during rollback of a
 * set of commits
 */
protected Map<FileStatus, Boolean> deleteCleanedFiles(Map<FileStatus, Boolean> results, String partitionPath,
        PathFilter filter) throws IOException {
    logger.info("Cleaning path " + partitionPath);
    FileSystem fs = getMetaClient().getFs();
    FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath),
            filter);
    for (FileStatus file : toBeDeleted) {
        boolean success = fs.delete(file.getPath(), false);
        results.put(file, success);
        logger.info("Delete file " + file.getPath() + "\t" + success);
    }
    return results;
}

From source file:com.uber.hoodie.table.HoodieCopyOnWriteTable.java

License:Apache License

/**
 * Common method used for cleaning out parquet files under a partition path during rollback of a
 * set of commits
 */
protected Map<FileStatus, Boolean> deleteCleanedFiles(Map<FileStatus, Boolean> results, String commit,
        String partitionPath) throws IOException {
    logger.info("Cleaning path " + partitionPath);
    FileSystem fs = getMetaClient().getFs();
    PathFilter filter = (path) -> {
        if (path.toString().contains(".parquet")) {
            String fileCommitTime = FSUtils.getCommitTime(path.getName());
            return commit.equals(fileCommitTime);
        }
        return false;
    };
    FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath),
            filter);
    for (FileStatus file : toBeDeleted) {
        boolean success = fs.delete(file.getPath(), false);
        results.put(file, success);
        logger.info("Delete file " + file.getPath() + "\t" + success);
    }
    return results;
}