List of usage examples for org.apache.hadoop.fs FileSystem listStatus
public FileStatus[] listStatus(Path f, PathFilter filter) throws FileNotFoundException, IOException
public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException
Note: every example below uses the single-path overload; the Path[] overload applies the same filter across several directories and concatenates the results.
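Before the source-file examples, here is a minimal, self-contained sketch of calling listStatus with a PathFilter. The directory /tmp/example and the ".avro" suffix are illustrative assumptions, not taken from any example below.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Illustrative path; replace with a directory that exists in your cluster.
        Path dir = new Path("/tmp/example");
        // PathFilter has a single accept(Path) method, so a lambda works on Java 8+.
        FileStatus[] matches = fs.listStatus(dir, (Path p) -> p.getName().endsWith(".avro"));
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
    }
}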
From source file:com.uber.hoodie.utilities.HoodieSnapshotCopier.java
License:Apache License
public void snapshot(JavaSparkContext jsc, String baseDir, final String outputDir,
        final boolean shouldAssumeDatePartitioning) throws IOException {
    FileSystem fs = FSUtils.getFs(baseDir, jsc.hadoopConfiguration());
    final SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());
    final HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs.getConf(), baseDir);
    final TableFileSystemView.ReadOptimizedView fsView = new HoodieTableFileSystemView(tableMetadata,
            tableMetadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants());

    // Get the latest commit
    Optional<HoodieInstant> latestCommit = tableMetadata.getActiveTimeline().getCommitsTimeline()
            .filterCompletedInstants().lastInstant();
    if (!latestCommit.isPresent()) {
        logger.warn("No commits present. Nothing to snapshot");
        return;
    }
    final String latestCommitTimestamp = latestCommit.get().getTimestamp();
    logger.info(String.format("Starting to snapshot latest version files which are also no later than %s.",
            latestCommitTimestamp));

    List<String> partitions = FSUtils.getAllPartitionPaths(fs, baseDir, shouldAssumeDatePartitioning);
    if (partitions.size() > 0) {
        logger.info(String.format("The job needs to copy %d partitions.", partitions.size()));

        // Make sure the output directory is empty
        Path outputPath = new Path(outputDir);
        if (fs.exists(outputPath)) {
            logger.warn(String.format("The output path %s already exists; deleting it", outputPath));
            fs.delete(new Path(outputDir), true);
        }

        jsc.parallelize(partitions, partitions.size()).flatMap(partition -> {
            // Only take latest version files <= latestCommit.
            FileSystem fs1 = FSUtils.getFs(baseDir, serConf.get());
            List<Tuple2<String, String>> filePaths = new ArrayList<>();
            Stream<HoodieDataFile> dataFiles = fsView.getLatestDataFilesBeforeOrOn(partition,
                    latestCommitTimestamp);
            dataFiles.forEach(
                    hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())));

            // Also need to copy over partition metadata
            Path partitionMetaFile = new Path(new Path(baseDir, partition),
                    HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
            if (fs1.exists(partitionMetaFile)) {
                filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString()));
            }
            return filePaths.iterator();
        }).foreach(tuple -> {
            String partition = tuple._1();
            Path sourceFilePath = new Path(tuple._2());
            Path toPartitionPath = new Path(outputDir, partition);
            FileSystem ifs = FSUtils.getFs(baseDir, serConf.get());

            if (!ifs.exists(toPartitionPath)) {
                ifs.mkdirs(toPartitionPath);
            }
            FileUtil.copy(ifs, sourceFilePath, ifs, new Path(toPartitionPath, sourceFilePath.getName()),
                    false, ifs.getConf());
        });

        // Also copy the .commit files
        logger.info(String.format("Copying .commit files which are no later than %s.", latestCommitTimestamp));
        FileStatus[] commitFilesToCopy = fs.listStatus(
                new Path(baseDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
                    if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) {
                        return true;
                    } else {
                        String commitTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName());
                        return HoodieTimeline.compareTimestamps(commitTime, latestCommitTimestamp,
                                HoodieTimeline.LESSER_OR_EQUAL);
                    }
                });
        for (FileStatus commitStatus : commitFilesToCopy) {
            Path targetFilePath = new Path(outputDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
                    + commitStatus.getPath().getName());
            if (!fs.exists(targetFilePath.getParent())) {
                fs.mkdirs(targetFilePath.getParent());
            }
            if (fs.exists(targetFilePath)) {
                logger.error(String.format("The target output commit file (%s) already exists.",
                        targetFilePath));
            }
            FileUtil.copy(fs, commitStatus.getPath(), fs, targetFilePath, false, fs.getConf());
        }
    } else {
        logger.info("The job has 0 partitions to copy.");
    }

    // Create the _SUCCESS tag
    Path successTagPath = new Path(outputDir + "/_SUCCESS");
    if (!fs.exists(successTagPath)) {
        logger.info(String.format("Creating _SUCCESS under targetBasePath: %s", outputDir));
        fs.createNewFile(successTagPath);
    }
}
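Note the PathFilter lambda passed to listStatus here: it admits hoodie.properties unconditionally and otherwise keeps only commit files whose timestamp is no later than latestCommitTimestamp, so the copied timeline stays consistent with the data files selected above.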
From source file:com.vertica.hadoop.FixedSplitFileInputFormat.java
License:Apache License
/**
 * Add files in the input path recursively into the results.
 * @param result
 *          The List to store all files.
 * @param fs
 *          The FileSystem.
 * @param path
 *          The input path.
 * @param inputFilter
 *          The input filter that can be used to filter files/dirs.
 * @throws IOException
 */
protected void addInputPathRecursively(List<FileStatus> result, FileSystem fs, Path path,
        PathFilter inputFilter) throws IOException {
    for (FileStatus stat : fs.listStatus(path, inputFilter)) {
        if (stat.isDirectory()) {
            addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
        } else {
            result.add(stat);
        }
    }
}
From source file:com.vertica.hadoop.FixedSplitFileInputFormat.java
License:Apache License
/**
 * List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected FileStatus[] listStatus(JobConf job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job);

    // Whether we need to look recursively into the directory structure
    boolean recursive = job.getBoolean("mapred.input.dir.recursive", false);

    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (Path p : dirs) {
        FileSystem fs = p.getFileSystem(job);
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) {
                        if (recursive && stat.isDirectory()) {
                            addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                        } else {
                            result.add(stat);
                        }
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result.toArray(new FileStatus[result.size()]);
}
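The snippet above relies on hiddenFileFilter and MultiPathFilter, which are defined elsewhere in the class and not shown here. A plausible sketch of what they look like, based on the standard Hadoop FileInputFormat pattern rather than the actual Vertica source:

import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

class FilterSketch {
    // Rejects hidden files and directories (names starting with "_" or ".").
    static final PathFilter hiddenFileFilter = new PathFilter() {
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    };

    // Composite filter: accepts a path only if every wrapped filter accepts it.
    static class MultiPathFilter implements PathFilter {
        private final List<PathFilter> filters;

        MultiPathFilter(List<PathFilter> filters) {
            this.filters = filters;
        }

        public boolean accept(Path path) {
            for (PathFilter filter : filters) {
                if (!filter.accept(path)) {
                    return false;
                }
            }
            return true;
        }
    }
}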
From source file:crunch.MaxTemperature.java
License:Apache License
private void checkOutput(Configuration conf, Path output) throws IOException {
    FileSystem fs = FileSystem.getLocal(conf);
    Path[] outputFiles = FileUtil.stat2Paths(fs.listStatus(output, new OutputLogFilter()));
    assertThat(outputFiles.length, is(1));

    BufferedReader actual = asBufferedReader(fs.open(outputFiles[0]));
    BufferedReader expected = asBufferedReader(getClass().getResourceAsStream("/expected.txt"));
    String expectedLine;
    while ((expectedLine = expected.readLine()) != null) {
        assertThat(actual.readLine(), is(expectedLine));
    }
    assertThat(actual.readLine(), nullValue());
    actual.close();
    expected.close();
}
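The OutputLogFilter used here is, as far as I can tell, the old org.apache.hadoop.mapred.OutputLogFilter, which excludes the _logs directory the MapReduce framework used to write alongside job output; later Hadoop versions deprecated it in favor of Utils.OutputFileUtils.OutputLogFilter.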
From source file:datafu.hourglass.avro.AvroDateRangeMetadata.java
License:Apache License
/**
 * Reads the date range from the metadata stored in an Avro file.
 *
 * @param fs file system to access path
 * @param path path to get date range for
 * @return date range
 * @throws IOException
 */
public static DateRange getOutputFileDateRange(FileSystem fs, Path path) throws IOException {
    path = fs.listStatus(path, PathUtils.nonHiddenPathFilter)[0].getPath();
    FSDataInputStream dataInputStream = fs.open(path);
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(dataInputStream, reader);

    try {
        return new DateRange(new Date(Long.parseLong(dataFileStream.getMetaString(METADATA_DATE_START))),
                new Date(Long.parseLong(dataFileStream.getMetaString(METADATA_DATE_END))));
    } finally {
        dataFileStream.close();
        dataInputStream.close();
    }
}
From source file:datafu.hourglass.demo.Examples.java
License:Apache License
private int countOutputFolders(Path path) throws IOException {
    FileSystem fs = getFileSystem();
    return fs.listStatus(path, PathUtils.nonHiddenPathFilter).length;
}
From source file:datafu.hourglass.fs.PathUtils.java
License:Apache License
/**
 * List all paths matching the "yyyyMMdd" format under a given path.
 *
 * @param fs file system
 * @param path path to search under
 * @return paths
 * @throws IOException
 */
public static List<DatePath> findDatedPaths(FileSystem fs, Path path) throws IOException {
    FileStatus[] outputPaths = fs.listStatus(path, nonHiddenPathFilter);

    List<DatePath> outputs = new ArrayList<DatePath>();
    if (outputPaths != null) {
        for (FileStatus outputPath : outputPaths) {
            Date date;
            try {
                date = datedPathFormat.parse(outputPath.getPath().getName());
            } catch (ParseException e) {
                continue;
            }
            outputs.add(new DatePath(date, outputPath.getPath()));
        }
    }
    Collections.sort(outputs);
    return outputs;
}
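The null check on listStatus's return value reflects older Hadoop behavior, where a missing path yielded null; newer releases throw FileNotFoundException instead. The method also depends on datedPathFormat and nonHiddenPathFilter, both defined elsewhere in PathUtils. Plausible definitions consistent with the javadoc and the other snippets (assumptions, not the verbatim datafu source):

import java.text.SimpleDateFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

class PathUtilsSketch {
    // Parses directory names such as "20240131" into dates.
    // Note: SimpleDateFormat is not thread-safe; guard it if shared across threads.
    private static final SimpleDateFormat datedPathFormat = new SimpleDateFormat("yyyyMMdd");

    // Skips hidden paths (names starting with "_" or "."), e.g. _SUCCESS and _logs.
    public static final PathFilter nonHiddenPathFilter = new PathFilter() {
        public boolean accept(Path path) {
            String name = path.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    };
}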
From source file:datafu.hourglass.fs.PathUtils.java
License:Apache License
/**
 * Gets the schema for the first Avro file under the given path.
 *
 * @param fs file system to access path
 * @param path path to fetch schema for
 * @return Avro schema
 * @throws IOException
 */
public static Schema getSchemaFromPath(FileSystem fs, Path path) throws IOException {
    return getSchemaFromFile(fs, fs.listStatus(path, nonHiddenPathFilter)[0].getPath());
}
From source file:datafu.hourglass.fs.PathUtils.java
License:Apache License
/**
 * Sums the size of all files listed under a given path.
 *
 * @param fs file system
 * @param path path to count bytes for
 * @return total bytes under path
 * @throws IOException
 */
public static long countBytes(FileSystem fs, Path path) throws IOException {
    FileStatus[] files = fs.listStatus(path, nonHiddenPathFilter);
    long totalForPath = 0L;
    for (FileStatus file : files) {
        totalForPath += file.getLen();
    }
    return totalForPath;
}
From source file:datafu.hourglass.test.FirstAndSecondPassJobTests.java
License:Apache License
private int countOutputFolders() throws IOException {
    FileSystem fs = getFileSystem();
    return fs.listStatus(_outputPath, PathUtils.nonHiddenPathFilter).length;
}