Example usage for org.apache.hadoop.fs FileSystem listStatus

List of usage examples for org.apache.hadoop.fs FileSystem listStatus

Introduction

This page shows example usages of org.apache.hadoop.fs.FileSystem listStatus.

Prototype

public FileStatus[] listStatus(Path f, PathFilter filter) throws FileNotFoundException, IOException

Document

Filter files/directories in the given path using the user-supplied path filter.
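
Before the real-world examples below, here is a minimal usage sketch. The directory and the ".parquet" suffix in the filter are hypothetical, not taken from any of the sources.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Minimal sketch: list only the entries whose name ends with ".parquet"
// (hypothetical path and filter).
public static void printParquetFiles() throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    FileStatus[] matches = fs.listStatus(new Path("/data/input"),
            path -> path.getName().endsWith(".parquet"));
    for (FileStatus status : matches) {
        System.out.println(status.getPath() + "\t" + status.getLen());
    }
}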

Usage

From source file:com.twitter.algebra.matrix.format.MapDir.java

License:Apache License

/**
 * Disk usage of a MapDir or a directory of sequence files
 * @param mapDirPath the path to the MapDir or directory of sequence files
 * @param fs the file system
 * @return total size in bytes
 * @throws FileNotFoundException
 * @throws IOException
 */
public static long du(Path mapDirPath, FileSystem fs) throws FileNotFoundException, IOException {
    FileStatus[] dirs = fs.listStatus(mapDirPath, mapFilter);
    if (dirs.length == 0) // not a MapDir; fall back to a plain listing
        dirs = fs.listStatus(mapDirPath);
    long size = 0;
    for (FileStatus dirStatus : dirs) {
        //if it is a sequence file
        if (dirStatus.isFile())
            size += dirStatus.getLen();
        else
            //or if it is a MapFile, which is a directory
            size += dirSize(dirStatus, fs);
    }
    return size;
}
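
A sketch of how du might be invoked; the matrix path and the way the FileSystem is obtained are assumptions, not part of the MapDir source.

// Hypothetical caller of MapDir.du
FileSystem fs = FileSystem.get(new Configuration());
long bytes = MapDir.du(new Path("/data/matrices/A"), fs); // path is hypothetical
System.out.println("disk usage: " + bytes + " bytes");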

From source file:com.twitter.algebra.matrix.format.MapDir.java

License:Apache License

private static long dirSize(FileStatus dirStatus, FileSystem fs) throws FileNotFoundException, IOException {
    FileStatus[] files = fs.listStatus(dirStatus.getPath(), new PathFilter() {
        @Override
        public boolean accept(final Path file) {
            return true;
        }
    });
    long size = 0;
    for (FileStatus file : files)
        size += file.getLen();
    return size;
}

From source file:com.twitter.elephanttwin.lucene.indexing.AbstractLuceneIndexingJob.java

License:Apache License

protected void addInputPathRecursively(List<FileStatus> result, FileSystem fs, Path path,
        PathFilter inputFilter) throws IOException {
    for (FileStatus stat : fs.listStatus(path, inputFilter)) {
        if (stat.isDir()) {
            addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
        } else {
            result.add(stat);
        }
    }
}
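
A sketch of driving this recursive helper from inside the job; the input path, the Configuration named conf, and the filter that skips hidden entries are assumptions, not from the ElephantTwin source.

// Hypothetical caller: recursively collect all visible files under /index/input
List<FileStatus> inputs = new ArrayList<FileStatus>();
PathFilter visibleOnly = path ->
        !path.getName().startsWith("_") && !path.getName().startsWith(".");
addInputPathRecursively(inputs, FileSystem.get(conf), new Path("/index/input"), visibleOnly);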

From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License:Apache License

/**
 * @param context
 * @param file
 *          the input file provided to the job to work on
 * @param columnName
 * @return the list of index files if there is an index directory created for
 *         the input file
 * @throws IOException
 */
protected static FileStatus[] listIndexFiles(JobContext context, Path file, String columnName)
        throws IOException {

    Path indexFilePath = new Path(getIndexDir(context) + file.toUri().getRawPath() + "/" + columnName);

    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FileStatus[] dirlist = fs.listStatus(indexFilePath, indexFileFilter);
    return dirlist;
}

From source file:com.twitter.hraven.etl.FileLister.java

License:Apache License

/**
 * Gets the list of files for a given path filtered as per the input path range filter
 * Can go into directories recursively
 *
 * @param recurse - whether or not to traverse recursively
 * @param hdfs - the file system
 * @param inputPath - the path to traverse for getting the list of files
 * @param jobFileModifiedRangePathFilter - the filter to include/exclude certain files
 *
 * @return array of file status.
 * @throws IOException
 */
public static FileStatus[] listFiles(boolean recurse, FileSystem hdfs, Path inputPath,
        JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter) throws IOException {
    if (recurse) {
        List<FileStatus> fileStatusesList = new ArrayList<FileStatus>();
        traverseDirs(fileStatusesList, hdfs, inputPath, jobFileModifiedRangePathFilter);
        FileStatus[] fileStatuses = (FileStatus[]) fileStatusesList
                .toArray(new FileStatus[fileStatusesList.size()]);
        return fileStatuses;
    } else {
        return hdfs.listStatus(inputPath, jobFileModifiedRangePathFilter);
    }
}

From source file:com.uber.hoodie.common.table.HoodieTableMetaClient.java

License:Apache License

public static FileStatus[] scanFiles(FileSystem fs, Path metaPath, PathFilter nameFilter) throws IOException {
    return fs.listStatus(metaPath, nameFilter);
}

From source file:com.uber.hoodie.common.util.FSUtils.java

License:Apache License

/**
 * Get all the log files for the passed in FileId in the partition path
 */
public static Stream<HoodieLogFile> getAllLogFiles(FileSystem fs, Path partitionPath, final String fileId,
        final String logFileExtension, final String baseCommitTime) throws IOException {
    return Arrays
            .stream(fs.listStatus(partitionPath,
                    path -> path.getName().startsWith("." + fileId)
                            && path.getName().contains(logFileExtension)))
            .map(HoodieLogFile::new).filter(s -> s.getBaseCommitTime().equals(baseCommitTime));
}
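
A usage sketch of the stream returned above; the partition path, file id, extension, and base commit time are hypothetical values.

// Hypothetical caller (requires java.util.stream.Collectors)
List<HoodieLogFile> logFiles = FSUtils
        .getAllLogFiles(fs, new Path("/table/2017/07/14"), "some-file-id", ".log", "001")
        .collect(Collectors.toList());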

From source file:com.uber.hoodie.HoodieWriteClient.java

License:Apache License

private void rollback(List<String> commits) {
    if (commits.isEmpty()) {
        logger.info("List of commits to rollback is empty");
        return;
    }

    final Timer.Context context = metrics.getRollbackCtx();
    String startRollbackTime = HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date());

    // Create a Hoodie table which encapsulated the commits and files visible
    HoodieTable<T> table = HoodieTable.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true),
            config);
    HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
    HoodieTimeline inflightTimeline = table.getInflightCommitTimeline();
    HoodieTimeline commitTimeline = table.getCompletedCommitTimeline();

    // Check if any of the commits is a savepoint - do not allow rollback on those commits
    List<String> savepoints = table.getCompletedSavepointTimeline().getInstants()
            .map(HoodieInstant::getTimestamp).collect(Collectors.toList());
    commits.forEach(s -> {
        if (savepoints.contains(s)) {
            throw new HoodieRollbackException(
                    "Could not rollback a savepointed commit. Delete savepoint first before rolling back" + s);
        }
    });

    try {
        if (commitTimeline.empty() && inflightTimeline.empty()) {
            // nothing to rollback
            logger.info("No commits to rollback " + commits);
        }

        // Make sure only the last n commits are being rolled back
        // If there is a commit in-between or after that is not rolled back, then abort
        String lastCommit = commits.get(commits.size() - 1);
        if (!commitTimeline.empty()
                && !commitTimeline.findInstantsAfter(lastCommit, Integer.MAX_VALUE).empty()) {
            throw new HoodieRollbackException(
                    "Found commits after time :" + lastCommit + ", please rollback greater commits first");
        }

        List<String> inflights = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp)
                .collect(Collectors.toList());
        if (!inflights.isEmpty() && inflights.indexOf(lastCommit) != inflights.size() - 1) {
            throw new HoodieRollbackException("Found in-flight commits after time :" + lastCommit
                    + ", please rollback greater commits first");
        }

        // Atomically unpublish all the commits
        commits.stream().filter(s -> !inflights.contains(s))
                .map(s -> new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, s))
                .forEach(activeTimeline::revertToInflight);
        logger.info("Unpublished " + commits);

        // cleanup index entries
        commits.stream().forEach(s -> {
            if (!index.rollbackCommit(s)) {
                throw new HoodieRollbackException("Clean out index changes failed, for time :" + s);
            }
        });
        logger.info("Index rolled back for commits " + commits);

        // delete all the data files for all these commits
        logger.info("Clean out all parquet files generated for commits: " + commits);
        final LongAccumulator numFilesDeletedCounter = jsc.sc().longAccumulator();
        List<HoodieRollbackStat> stats = jsc
                .parallelize(FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(),
                        config.shouldAssumeDatePartitioning()))
                .map((Function<String, HoodieRollbackStat>) partitionPath -> {
                    // Scan all partitions files with this commit time
                    logger.info("Cleaning path " + partitionPath);
                    FileSystem fs1 = FSUtils.getFs();
                    FileStatus[] toBeDeleted = fs1.listStatus(new Path(config.getBasePath(), partitionPath),
                            path -> {
                                if (!path.toString().contains(".parquet")) {
                                    return false;
                                }
                                String fileCommitTime = FSUtils.getCommitTime(path.getName());
                                return commits.contains(fileCommitTime);
                            });
                    Map<FileStatus, Boolean> results = Maps.newHashMap();
                    for (FileStatus file : toBeDeleted) {
                        boolean success = fs1.delete(file.getPath(), false);
                        results.put(file, success);
                        logger.info("Delete file " + file.getPath() + "\t" + success);
                        if (success) {
                            numFilesDeletedCounter.add(1);
                        }
                    }
                    return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
                            .withDeletedFileResults(results).build();
                }).collect();

        // Remove the rolled back inflight commits
        commits.stream().map(s -> new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, s))
                .forEach(activeTimeline::deleteInflight);
        logger.info("Deleted inflight commits " + commits);

        Optional<Long> durationInMs = Optional.empty();
        if (context != null) {
            durationInMs = Optional.of(metrics.getDurationInMs(context.stop()));
            Long numFilesDeleted = numFilesDeletedCounter.value();
            metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted);
        }
        HoodieRollbackMetadata rollbackMetadata = AvroUtils.convertRollbackMetadata(startRollbackTime,
                durationInMs, commits, stats);
        table.getActiveTimeline().saveAsComplete(
                new HoodieInstant(true, HoodieTimeline.ROLLBACK_ACTION, startRollbackTime),
                AvroUtils.serializeRollbackMetadata(rollbackMetadata));
        logger.info("Commits " + commits + " rollback is complete");

        if (!table.getActiveTimeline().getCleanerTimeline().empty()) {
            logger.info("Cleaning up older rollback meta files");
            // Cleanup of older cleaner meta files
            // TODO - make the commit archival generic and archive rollback metadata
            FSUtils.deleteOlderRollbackMetaFiles(fs, table.getMetaClient().getMetaPath(),
                    table.getActiveTimeline().getRollbackTimeline().getInstants());
        }
    } catch (IOException e) {
        throw new HoodieRollbackException("Failed to rollback " + config.getBasePath() + " commits " + commits,
                e);
    }
}

From source file:com.uber.hoodie.table.HoodieCopyOnWriteTable.java

License:Apache License

/**
 * Common method used for cleaning out parquet files under a partition path during rollback of a
 * set of commits
 */
protected Map<FileStatus, Boolean> deleteCleanedFiles(Map<FileStatus, Boolean> results, String partitionPath,
        PathFilter filter) throws IOException {
    logger.info("Cleaning path " + partitionPath);
    FileSystem fs = getMetaClient().getFs();
    FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath),
            filter);
    for (FileStatus file : toBeDeleted) {
        boolean success = fs.delete(file.getPath(), false);
        results.put(file, success);
        logger.info("Delete file " + file.getPath() + "\t" + success);
    }
    return results;
}

From source file:com.uber.hoodie.table.HoodieCopyOnWriteTable.java

License:Apache License

/**
 * Common method used for cleaning out parquet files under a partition path during rollback of a
 * set of commits
 */
protected Map<FileStatus, Boolean> deleteCleanedFiles(Map<FileStatus, Boolean> results, String commit,
        String partitionPath) throws IOException {
    logger.info("Cleaning path " + partitionPath);
    FileSystem fs = getMetaClient().getFs();
    PathFilter filter = (path) -> {
        if (path.toString().contains(".parquet")) {
            String fileCommitTime = FSUtils.getCommitTime(path.getName());
            return commit.equals(fileCommitTime);
        }
        return false;
    };
    FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath),
            filter);
    for (FileStatus file : toBeDeleted) {
        boolean success = fs.delete(file.getPath(), false);
        results.put(file, success);
        logger.info("Delete file " + file.getPath() + "\t" + success);
    }
    return results;
}