List of usage examples for org.apache.hadoop.fs FileSystem listStatus
public FileStatus[] listStatus(Path f, PathFilter filter) throws FileNotFoundException, IOException
public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException
(All of the examples on this page call the single-Path overload.)
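Before the project-specific snippets below, here is a minimal self-contained sketch of calling the single-Path overload with a PathFilter. The directory /tmp/data and the .parquet suffix are illustrative assumptions, not taken from any of the examples on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Illustrative path and suffix; adjust to your cluster layout.
        Path dir = new Path("/tmp/data");
        PathFilter parquetOnly = path -> path.getName().endsWith(".parquet");
        for (FileStatus status : fs.listStatus(dir, parquetOnly)) {
            System.out.println(status.getPath() + "\t" + status.getLen());
        }
    }
}

PathFilter declares a single accept(Path) method, so a lambda works on Java 8+; on older Java an anonymous class, as in several examples below, does the same job.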
From source file:com.twitter.algebra.matrix.format.MapDir.java
License:Apache License
/**
 * Disk usage of the MapDir or a dir of sequence files
 * @param mapDirPath the path to MapDir or a directory of sequence files
 * @param fs the file system
 * @return total size in bytes
 * @throws FileNotFoundException
 * @throws IOException
 */
public static long du(Path mapDirPath, FileSystem fs) throws FileNotFoundException, IOException {
    FileStatus[] dirs = fs.listStatus(mapDirPath, mapFilter);
    if (dirs.length == 0) // it is not a mapdir then, do a simple ls
        dirs = fs.listStatus(mapDirPath);
    long size = 0;
    for (FileStatus dirStatus : dirs) {
        // if it is a sequence file
        if (dirStatus.isFile())
            size += dirStatus.getLen();
        else // or if it is a mapfile, which is a directory
            size += dirSize(dirStatus, fs);
    }
    return size;
}
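A possible simplification, offered as a hedged alternative rather than as part of MapDir: when no MapDir-specific filtering is needed, FileSystem.getContentSummary already aggregates the size of everything under a path, so the manual listStatus loop can be skipped.

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/** Sketch: total size in bytes of everything under dirPath, letting the FileSystem recurse. */
public static long duSimple(Path dirPath, FileSystem fs) throws IOException {
    return fs.getContentSummary(dirPath).getLength();
}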
From source file:com.twitter.algebra.matrix.format.MapDir.java
License:Apache License
private static long dirSize(FileStatus dirStatus, FileSystem fs) throws FileNotFoundException, IOException {
    FileStatus[] files = fs.listStatus(dirStatus.getPath(), new PathFilter() {
        @Override
        public boolean accept(final Path file) {
            return true;
        }
    });
    long size = 0;
    for (FileStatus file : files)
        size += file.getLen();
    return size;
}
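Because PathFilter has a single accept(Path) method, the accept-all anonymous class above can be replaced by a lambda on Java 8+, or dropped entirely, with the same result. A sketch under the same assumptions (dirStatus and fs as in dirSize above):

// Equivalent listings to the anonymous accept-all filter above (Java 8+):
FileStatus[] unfiltered = fs.listStatus(dirStatus.getPath());             // no filter argument
FileStatus[] acceptAll  = fs.listStatus(dirStatus.getPath(), p -> true);  // lambda PathFilter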
From source file:com.twitter.elephanttwin.lucene.indexing.AbstractLuceneIndexingJob.java
License:Apache License
protected void addInputPathRecursively(List<FileStatus> result, FileSystem fs, Path path, PathFilter inputFilter)
        throws IOException {
    for (FileStatus stat : fs.listStatus(path, inputFilter)) {
        if (stat.isDir()) {
            addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
        } else {
            result.add(stat);
        }
    }
}
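A hedged alternative to the hand-rolled recursion above: newer Hadoop releases (2.x and later) expose FileSystem.listFiles(path, true), which recurses for you and returns only files through a RemoteIterator, so the PathFilter can simply be applied per entry. A minimal sketch, not part of AbstractLuceneIndexingJob:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;

/** Sketch: recursive listing via the built-in iterator instead of manual recursion. */
public static List<FileStatus> listRecursively(FileSystem fs, Path path, PathFilter inputFilter)
        throws IOException {
    List<FileStatus> result = new ArrayList<>();
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(path, true); // true = recurse into subdirectories
    while (it.hasNext()) {
        LocatedFileStatus stat = it.next();
        if (inputFilter.accept(stat.getPath())) {
            result.add(stat);
        }
    }
    return result;
}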
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
/**
 * @param context the job context
 * @param file the input file provided to the job to work on
 * @param columnName the column the index was built on
 * @return the list of index files if there is an index directory created for the input file
 * @throws IOException
 */
protected static FileStatus[] listIndexFiles(JobContext context, Path file, String columnName) throws IOException {
    Path indexFilePath = new Path(getIndexDir(context) + file.toUri().getRawPath() + "/" + columnName);
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FileStatus[] dirlist = fs.listStatus(indexFilePath, indexFileFilter);
    return dirlist;
}
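The indexFileFilter field referenced above is defined elsewhere in BlockIndexedFileInputFormat and is not shown on this page. Purely as an illustration of the shape such a filter takes, a hypothetical name-based version could look like the following; the "part-" prefix is an assumption, not the project's actual rule:

// Hypothetical stand-in for the indexFileFilter field used above; the real filter
// in BlockIndexedFileInputFormat may apply a different naming rule.
private static final PathFilter indexFileFilter = path -> path.getName().startsWith("part-");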
From source file:com.twitter.hraven.etl.FileLister.java
License:Apache License
/**
 * Gets the list of files for a given path, filtered as per the input path range filter.
 * Can go into directories recursively.
 *
 * @param recurse - whether or not to traverse recursively
 * @param hdfs - the file system
 * @param inputPath - the path to traverse for getting the list of files
 * @param jobFileModifiedRangePathFilter - the filter to include/exclude certain files
 *
 * @return array of file status.
 * @throws IOException
 */
public static FileStatus[] listFiles(boolean recurse, FileSystem hdfs, Path inputPath,
        JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter) throws IOException {
    if (recurse) {
        List<FileStatus> fileStatusesList = new ArrayList<FileStatus>();
        traverseDirs(fileStatusesList, hdfs, inputPath, jobFileModifiedRangePathFilter);
        FileStatus[] fileStatuses = (FileStatus[]) fileStatusesList
                .toArray(new FileStatus[fileStatusesList.size()]);
        return fileStatuses;
    } else {
        return hdfs.listStatus(inputPath, jobFileModifiedRangePathFilter);
    }
}
From source file:com.uber.hoodie.common.table.HoodieTableMetaClient.java
License:Apache License
public static FileStatus[] scanFiles(FileSystem fs, Path metaPath, PathFilter nameFilter) throws IOException {
    return fs.listStatus(metaPath, nameFilter);
}
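A hedged usage sketch for the wrapper above; the meta path layout and the ".commit" suffix are assumptions for illustration, and fs is an already-initialized FileSystem:

// Illustration only: list completed-commit metadata files under the table's meta path.
Path metaPath = new Path("/data/hoodie_table/.hoodie");                  // assumed layout
PathFilter commitFiles = path -> path.getName().endsWith(".commit");     // assumed suffix
FileStatus[] commitMetaFiles = HoodieTableMetaClient.scanFiles(fs, metaPath, commitFiles);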
From source file:com.uber.hoodie.common.util.FSUtils.java
License:Apache License
/**
 * Get all the log files for the passed in FileId in the partition path
 */
public static Stream<HoodieLogFile> getAllLogFiles(FileSystem fs, Path partitionPath, final String fileId,
        final String logFileExtension, final String baseCommitTime) throws IOException {
    return Arrays
            .stream(fs.listStatus(partitionPath,
                    path -> path.getName().startsWith("." + fileId) && path.getName().contains(logFileExtension)))
            .map(HoodieLogFile::new)
            .filter(s -> s.getBaseCommitTime().equals(baseCommitTime));
}
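A hedged usage sketch for getAllLogFiles; every argument value below is a placeholder chosen for illustration:

// Illustration only; the partition path, file id, extension and base commit time are placeholders.
Stream<HoodieLogFile> logFiles = FSUtils.getAllLogFiles(fs, new Path("/data/hoodie_table/2017/11/01"),
        "some-file-id", ".log", "20171101010101");
logFiles.forEach(System.out::println);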
From source file:com.uber.hoodie.HoodieWriteClient.java
License:Apache License
private void rollback(List<String> commits) {
    if (commits.isEmpty()) {
        logger.info("List of commits to rollback is empty");
        return;
    }
    final Timer.Context context = metrics.getRollbackCtx();
    String startRollbackTime = HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date());
    // Create a Hoodie table which encapsulated the commits and files visible
    HoodieTable<T> table = HoodieTable.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true),
            config);
    HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
    HoodieTimeline inflightTimeline = table.getInflightCommitTimeline();
    HoodieTimeline commitTimeline = table.getCompletedCommitTimeline();
    // Check if any of the commits is a savepoint - do not allow rollback on those commits
    List<String> savepoints = table.getCompletedSavepointTimeline().getInstants()
            .map(HoodieInstant::getTimestamp).collect(Collectors.toList());
    commits.forEach(s -> {
        if (savepoints.contains(s)) {
            throw new HoodieRollbackException(
                    "Could not rollback a savepointed commit. Delete savepoint first before rolling back" + s);
        }
    });
    try {
        if (commitTimeline.empty() && inflightTimeline.empty()) {
            // nothing to rollback
            logger.info("No commits to rollback " + commits);
        }
        // Make sure only the last n commits are being rolled back
        // If there is a commit in-between or after that is not rolled back, then abort
        String lastCommit = commits.get(commits.size() - 1);
        if (!commitTimeline.empty() && !commitTimeline.findInstantsAfter(lastCommit, Integer.MAX_VALUE).empty()) {
            throw new HoodieRollbackException(
                    "Found commits after time :" + lastCommit + ", please rollback greater commits first");
        }
        List<String> inflights = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp)
                .collect(Collectors.toList());
        if (!inflights.isEmpty() && inflights.indexOf(lastCommit) != inflights.size() - 1) {
            throw new HoodieRollbackException("Found in-flight commits after time :" + lastCommit
                    + ", please rollback greater commits first");
        }
        // Atomically unpublish all the commits
        commits.stream().filter(s -> !inflights.contains(s))
                .map(s -> new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, s))
                .forEach(activeTimeline::revertToInflight);
        logger.info("Unpublished " + commits);
        // cleanup index entries
        commits.stream().forEach(s -> {
            if (!index.rollbackCommit(s)) {
                throw new HoodieRollbackException("Clean out index changes failed, for time :" + s);
            }
        });
        logger.info("Index rolled back for commits " + commits);
        // delete all the data files for all these commits
        logger.info("Clean out all parquet files generated for commits: " + commits);
        final LongAccumulator numFilesDeletedCounter = jsc.sc().longAccumulator();
        List<HoodieRollbackStat> stats = jsc
                .parallelize(FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(),
                        config.shouldAssumeDatePartitioning()))
                .map((Function<String, HoodieRollbackStat>) partitionPath -> {
                    // Scan all partitions files with this commit time
                    logger.info("Cleaning path " + partitionPath);
                    FileSystem fs1 = FSUtils.getFs();
                    FileStatus[] toBeDeleted = fs1.listStatus(new Path(config.getBasePath(), partitionPath),
                            path -> {
                                if (!path.toString().contains(".parquet")) {
                                    return false;
                                }
                                String fileCommitTime = FSUtils.getCommitTime(path.getName());
                                return commits.contains(fileCommitTime);
                            });
                    Map<FileStatus, Boolean> results = Maps.newHashMap();
                    for (FileStatus file : toBeDeleted) {
                        boolean success = fs1.delete(file.getPath(), false);
                        results.put(file, success);
                        logger.info("Delete file " + file.getPath() + "\t" + success);
                        if (success) {
                            numFilesDeletedCounter.add(1);
                        }
                    }
                    return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
                            .withDeletedFileResults(results).build();
                }).collect();
        // Remove the rolled back inflight commits
        commits.stream().map(s -> new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, s))
                .forEach(activeTimeline::deleteInflight);
        logger.info("Deleted inflight commits " + commits);
        Optional<Long> durationInMs = Optional.empty();
        if (context != null) {
            durationInMs = Optional.of(metrics.getDurationInMs(context.stop()));
            Long numFilesDeleted = numFilesDeletedCounter.value();
            metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted);
        }
        HoodieRollbackMetadata rollbackMetadata = AvroUtils.convertRollbackMetadata(startRollbackTime,
                durationInMs, commits, stats);
        table.getActiveTimeline().saveAsComplete(
                new HoodieInstant(true, HoodieTimeline.ROLLBACK_ACTION, startRollbackTime),
                AvroUtils.serializeRollbackMetadata(rollbackMetadata));
        logger.info("Commits " + commits + " rollback is complete");
        if (!table.getActiveTimeline().getCleanerTimeline().empty()) {
            logger.info("Cleaning up older rollback meta files");
            // Cleanup of older cleaner meta files
            // TODO - make the commit archival generic and archive rollback metadata
            FSUtils.deleteOlderRollbackMetaFiles(fs, table.getMetaClient().getMetaPath(),
                    table.getActiveTimeline().getRollbackTimeline().getInstants());
        }
    } catch (IOException e) {
        throw new HoodieRollbackException("Failed to rollback " + config.getBasePath() + " commits " + commits, e);
    }
}
From source file:com.uber.hoodie.table.HoodieCopyOnWriteTable.java
License:Apache License
/**
 * Common method used for cleaning out parquet files under a partition path during rollback of a
 * set of commits
 */
protected Map<FileStatus, Boolean> deleteCleanedFiles(Map<FileStatus, Boolean> results, String partitionPath,
        PathFilter filter) throws IOException {
    logger.info("Cleaning path " + partitionPath);
    FileSystem fs = getMetaClient().getFs();
    FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter);
    for (FileStatus file : toBeDeleted) {
        boolean success = fs.delete(file.getPath(), false);
        results.put(file, success);
        logger.info("Delete file " + file.getPath() + "\t" + success);
    }
    return results;
}
From source file:com.uber.hoodie.table.HoodieCopyOnWriteTable.java
License:Apache License
/**
 * Common method used for cleaning out parquet files under a partition path during rollback of a
 * set of commits
 */
protected Map<FileStatus, Boolean> deleteCleanedFiles(Map<FileStatus, Boolean> results, String commit,
        String partitionPath) throws IOException {
    logger.info("Cleaning path " + partitionPath);
    FileSystem fs = getMetaClient().getFs();
    PathFilter filter = (path) -> {
        if (path.toString().contains(".parquet")) {
            String fileCommitTime = FSUtils.getCommitTime(path.getName());
            return commit.equals(fileCommitTime);
        }
        return false;
    };
    FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter);
    for (FileStatus file : toBeDeleted) {
        boolean success = fs.delete(file.getPath(), false);
        results.put(file, success);
        logger.info("Delete file " + file.getPath() + "\t" + success);
    }
    return results;
}
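The two deleteCleanedFiles variants above differ only in how the PathFilter is built. As a closing sketch, the inline commit-time filter can be pulled out into a small reusable factory; the method name is illustrative and not part of the Hudi codebase, while FSUtils.getCommitTime is the helper already used in the snippets above:

// Sketch: filter matching parquet files written by any of the given commit times.
private static PathFilter parquetWrittenBy(final java.util.Collection<String> commitTimes) {
    return path -> path.toString().contains(".parquet")
            && commitTimes.contains(FSUtils.getCommitTime(path.getName()));
}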