Example usage for org.apache.hadoop.fs FileSystem listStatus

List of usage examples for org.apache.hadoop.fs FileSystem listStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem.listStatus.

Prototype

public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException 

Source Link

Document

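Filters files/directories in the given list of paths using a user-supplied path filter. Note that the usage examples below all call the single-path overload, listStatus(Path, PathFilter), rather than the Path[] prototype shown above.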

Usage

From source file:com.uber.hoodie.utilities.HoodieSnapshotCopier.java

License:Apache License

/**
 * Copies the files belonging to the latest completed commit of the dataset at {@code baseDir}
 * into {@code outputDir}.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Resolve the last completed commit; if there is none, log a warning and return without
 *       touching {@code outputDir}.</li>
 *   <li>List all partition paths. When at least one exists: recursively delete
 *       {@code outputDir} if it is already present, copy (through a Spark job) the latest data
 *       files at or before that commit together with each partition's metadata file, then copy
 *       the properties file and the qualifying commit files from the meta folder on the
 *       driver.</li>
 *   <li>Create a {@code _SUCCESS} marker under {@code outputDir} if it does not exist yet. This
 *       happens even when there were no partitions to copy.</li>
 * </ol>
 *
 * @param jsc Spark context used to parallelize the per-partition copy
 * @param baseDir base path of the source dataset
 * @param outputDir destination directory of the snapshot
 * @param shouldAssumeDatePartitioning passed through to {@code FSUtils.getAllPartitionPaths}
 * @throws IOException if a file system operation performed on the driver fails
 */
public void snapshot(JavaSparkContext jsc, String baseDir, final String outputDir,
        final boolean shouldAssumeDatePartitioning) throws IOException {
    FileSystem fs = FSUtils.getFs(baseDir, jsc.hadoopConfiguration());
    // The configuration is wrapped so that it can be captured by the Spark closures below.
    final SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());
    final HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs.getConf(), baseDir);
    final TableFileSystemView.ReadOptimizedView fsView = new HoodieTableFileSystemView(tableMetadata,
            tableMetadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants());
    // Get the latest commit
    Optional<HoodieInstant> latestCommit = tableMetadata.getActiveTimeline().getCommitsTimeline()
            .filterCompletedInstants().lastInstant();
    if (!latestCommit.isPresent()) {
        logger.warn("No commits present. Nothing to snapshot");
        return;
    }
    final String latestCommitTimestamp = latestCommit.get().getTimestamp();
    logger.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.",
            latestCommitTimestamp));

    List<String> partitions = FSUtils.getAllPartitionPaths(fs, baseDir, shouldAssumeDatePartitioning);
    if (!partitions.isEmpty()) {
        logger.info(String.format("The job needs to copy %d partitions.", partitions.size()));

        // Make sure the output directory is empty
        Path outputPath = new Path(outputDir);
        if (fs.exists(outputPath)) {
            logger.warn(
                    String.format("The output path %s targetBasePath already exists, deleting", outputPath));
            fs.delete(outputPath, true);
        }

        // One Spark partition per dataset partition: first enumerate the files to copy,
        // then copy each (partition, file) pair.
        jsc.parallelize(partitions, partitions.size()).flatMap(partition -> {
            // Only take latest version files <= latestCommit.
            FileSystem fs1 = FSUtils.getFs(baseDir, serConf.get());
            List<Tuple2<String, String>> filePaths = new ArrayList<>();
            Stream<HoodieDataFile> dataFiles = fsView.getLatestDataFilesBeforeOrOn(partition,
                    latestCommitTimestamp);
            dataFiles.forEach(
                    hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())));

            // also need to copy over partition metadata
            Path partitionMetaFile = new Path(new Path(baseDir, partition),
                    HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
            if (fs1.exists(partitionMetaFile)) {
                filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString()));
            }

            return filePaths.iterator();
        }).foreach(tuple -> {
            String partition = tuple._1();
            Path sourceFilePath = new Path(tuple._2());
            Path toPartitionPath = new Path(outputDir, partition);
            FileSystem ifs = FSUtils.getFs(baseDir, serConf.get());

            if (!ifs.exists(toPartitionPath)) {
                ifs.mkdirs(toPartitionPath);
            }
            FileUtil.copy(ifs, sourceFilePath, ifs, new Path(toPartitionPath, sourceFilePath.getName()), false,
                    ifs.getConf());
        });

        // Also copy the .commit files
        logger.info(String.format("Copying .commit files which are no-late-than %s.", latestCommitTimestamp));
        FileStatus[] commitFilesToCopy = fs.listStatus(
                new Path(baseDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
                    // The properties file is always copied; any other file in the meta folder is
                    // copied only when its commit time is <= the latest completed commit.
                    if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) {
                        return true;
                    } else {
                        String commitTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName());
                        return HoodieTimeline.compareTimestamps(commitTime, latestCommitTimestamp,
                                HoodieTimeline.LESSER_OR_EQUAL);
                    }
                });
        for (FileStatus commitStatus : commitFilesToCopy) {
            Path targetFilePath = new Path(outputDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
                    + commitStatus.getPath().getName());
            if (!fs.exists(targetFilePath.getParent())) {
                fs.mkdirs(targetFilePath.getParent());
            }
            if (fs.exists(targetFilePath)) {
                // Only reported; the copy below is still attempted.
                logger.error(String.format("The target output commit file (%s targetBasePath) already exists.",
                        targetFilePath));
            }
            FileUtil.copy(fs, commitStatus.getPath(), fs, targetFilePath, false, fs.getConf());
        }
    } else {
        logger.info("The job has 0 partition to copy.");
    }

    // Create the _SUCCESS tag
    Path successTagPath = new Path(outputDir + "/_SUCCESS");
    if (!fs.exists(successTagPath)) {
        // "%s" (not "$s") so that the output directory actually appears in the log line.
        logger.info(String.format("Creating _SUCCESS under targetBasePath: %s", outputDir));
        fs.createNewFile(successTagPath);
    }
}

From source file:com.vertica.hadoop.FixedSplitFileInputFormat.java

License:Apache License

/**
 * Walks {@code path} depth-first and appends every non-directory entry to {@code result}.
 *
 * @param result
 *          the list that receives the files found
 * @param fs
 *          the file system used to list {@code path} and its sub-directories
 * @param path
 *          the path whose entries are listed
 * @param inputFilter
 *          the filter handed to every {@code listStatus} call
 * @throws IOException
 *           propagated from the {@code listStatus} calls
 */
protected void addInputPathRecursively(List<FileStatus> result, FileSystem fs, Path path,
        PathFilter inputFilter) throws IOException {
    final FileStatus[] children = fs.listStatus(path, inputFilter);
    for (int i = 0; i < children.length; i++) {
        final FileStatus child = children[i];
        if (!child.isDirectory()) {
            result.add(child);
            continue;
        }
        // Directories are never added themselves; only their contents are.
        addInputPathRecursively(result, fs, child.getPath(), inputFilter);
    }
}

From source file:com.vertica.hadoop.FixedSplitFileInputFormat.java

License:Apache License

/**
 * Lists the input files of a job.
 *
 * <p>Each configured input path is expanded with {@code globStatus}. Matches that are plain
 * files are returned as they are; matches that are directories are listed one level deep, and
 * deeper only when the job property {@code mapred.input.dir.recursive} is {@code true}. Every
 * listing uses a filter combining {@code hiddenFileFilter} with the job's own input path filter,
 * when one is set.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if no input paths are configured; an {@code InvalidInputException}
 *           collecting the per-path errors is thrown when any input path does not exist or
 *           matches no files
 */
protected FileStatus[] listStatus(JobConf job) throws IOException {
    final Path[] inputDirs = getInputPaths(job);
    if (inputDirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // Tokens are obtained up front for all file systems involved.
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), inputDirs, job);

    // Decides whether sub-directories of a listed directory are descended into.
    final boolean descend = job.getBoolean("mapred.input.dir.recursive", false);

    final List<FileStatus> collected = new ArrayList<FileStatus>();
    final List<IOException> problems = new ArrayList<IOException>();

    // Combine the hidden-file filter with the user-provided filter, if there is one.
    final List<PathFilter> filterChain = new ArrayList<PathFilter>();
    filterChain.add(hiddenFileFilter);
    final PathFilter userFilter = getInputPathFilter(job);
    if (userFilter != null) {
        filterChain.add(userFilter);
    }
    final PathFilter combinedFilter = new MultiPathFilter(filterChain);

    for (Path inputDir : inputDirs) {
        final FileSystem fs = inputDir.getFileSystem(job);
        final FileStatus[] globbed = fs.globStatus(inputDir, combinedFilter);

        // Problems are collected rather than thrown so that all bad paths are reported together.
        if (globbed == null) {
            problems.add(new IOException("Input path does not exist: " + inputDir));
            continue;
        }
        if (globbed.length == 0) {
            problems.add(new IOException("Input Pattern " + inputDir + " matches 0 files"));
            continue;
        }

        for (FileStatus match : globbed) {
            if (!match.isDirectory()) {
                collected.add(match);
                continue;
            }
            for (FileStatus child : fs.listStatus(match.getPath(), combinedFilter)) {
                if (descend && child.isDirectory()) {
                    addInputPathRecursively(collected, fs, child.getPath(), combinedFilter);
                } else {
                    collected.add(child);
                }
            }
        }
    }

    if (!problems.isEmpty()) {
        throw new InvalidInputException(problems);
    }
    LOG.info("Total input paths to process : " + collected.size());
    return collected.toArray(new FileStatus[collected.size()]);
}

From source file:crunch.MaxTemperature.java

License:Apache License

/**
 * Asserts that {@code output}, listed on the local file system through an
 * {@code OutputLogFilter}, contains exactly one file and that this file matches the classpath
 * resource {@code /expected.txt} line by line, with no extra trailing lines.
 *
 * @param conf configuration used to obtain the local file system
 * @param output directory holding the job output
 * @throws IOException if listing or reading fails
 */
private void checkOutput(Configuration conf, Path output) throws IOException {
        FileSystem fs = FileSystem.getLocal(conf);
        Path[] outputFiles = FileUtil.stat2Paths(fs.listStatus(output, new OutputLogFilter()));
        assertThat(outputFiles.length, is(1));

        // try-with-resources: both readers are closed even when an assertion or a read fails.
        try (BufferedReader actual = asBufferedReader(fs.open(outputFiles[0]));
                BufferedReader expected = asBufferedReader(getClass().getResourceAsStream("/expected.txt"))) {
            String expectedLine;
            while ((expectedLine = expected.readLine()) != null) {
                assertThat(actual.readLine(), is(expectedLine));
            }
            // The actual output must not be longer than the expected one.
            assertThat(actual.readLine(), nullValue());
        }
    }

From source file:datafu.hourglass.avro.AvroDateRangeMetadata.java

License:Apache License

/**
 * Reads the date range from the metadata stored in an Avro file.
 *
 * <p>The file read is the first entry returned by listing {@code path} with
 * {@code PathUtils.nonHiddenPathFilter}. The start and end of the range are taken from the
 * {@code METADATA_DATE_START} and {@code METADATA_DATE_END} metadata strings, each parsed as a
 * {@code long} and wrapped in a {@link Date}.
 *
 * @param fs file system to access path
 * @param path path to get date range for
 * @return date range
 * @throws IOException if the listing is empty, or if listing, opening or reading the file fails
 */
public static DateRange getOutputFileDateRange(FileSystem fs, Path path) throws IOException {
    Path avroFile = firstListedEntry(fs.listStatus(path, PathUtils.nonHiddenPathFilter), path).getPath();
    FSDataInputStream dataInputStream = fs.open(avroFile);
    try {
        // Constructed inside the try so the raw stream is still closed if the constructor throws.
        DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(dataInputStream,
                new GenericDatumReader<GenericRecord>());
        try {
            return new DateRange(new Date(Long.parseLong(dataFileStream.getMetaString(METADATA_DATE_START))),
                    new Date(Long.parseLong(dataFileStream.getMetaString(METADATA_DATE_END))));
        } finally {
            dataFileStream.close();
        }
    } finally {
        dataInputStream.close();
    }
}

/**
 * Returns the first element of a directory listing, failing with a descriptive
 * {@link IOException} instead of an index error when the listing is null or empty.
 */
private static <T> T firstListedEntry(T[] listing, Path listedPath) throws IOException {
    if (listing == null || listing.length == 0) {
        throw new IOException("No files found under " + listedPath);
    }
    return listing[0];
}

From source file:datafu.hourglass.demo.Examples.java

License:Apache License

/**
 * Returns the number of entries obtained by listing {@code path} with
 * {@code PathUtils.nonHiddenPathFilter}.
 */
private int countOutputFolders(Path path) throws IOException {
    return getFileSystem().listStatus(path, PathUtils.nonHiddenPathFilter).length;
}

From source file:datafu.hourglass.fs.PathUtils.java

License:Apache License

/**
 * List all paths matching the "yyyyMMdd" format under a given path.
 * /*from  www.j  a v  a  2  s  .c o m*/
 * @param fs file system
 * @param path path to search under
 * @return paths
 * @throws IOException
 */
public static List<DatePath> findDatedPaths(FileSystem fs, Path path) throws IOException {
    FileStatus[] outputPaths = fs.listStatus(path, nonHiddenPathFilter);

    List<DatePath> outputs = new ArrayList<DatePath>();

    if (outputPaths != null) {
        for (FileStatus outputPath : outputPaths) {
            Date date;
            try {
                date = datedPathFormat.parse(outputPath.getPath().getName());
            } catch (ParseException e) {
                continue;
            }

            outputs.add(new DatePath(date, outputPath.getPath()));
        }
    }

    Collections.sort(outputs);

    return outputs;
}

From source file:datafu.hourglass.fs.PathUtils.java

License:Apache License

/**
 * Gets the schema for the first Avro file under the given path.
 *
 * <p>"First" means the first entry returned by listing {@code path} with
 * {@code nonHiddenPathFilter}; the schema itself is read by {@code getSchemaFromFile}.
 *
 * @param fs file system used to list {@code path} and read the file
 * @param path path to fetch schema for
 * @return Avro schema
 * @throws IOException if the listing is empty, or if listing or reading fails
 */
public static Schema getSchemaFromPath(FileSystem fs, Path path) throws IOException {
    return getSchemaFromFile(fs, firstEntryUnder(fs.listStatus(path, nonHiddenPathFilter), path).getPath());
}

/**
 * Returns the first element of a directory listing, failing with a descriptive
 * {@link IOException} instead of an index error when the listing is null or empty.
 */
private static <T> T firstEntryUnder(T[] listing, Path listedPath) throws IOException {
    if (listing == null || listing.length == 0) {
        throw new IOException("No files found under " + listedPath);
    }
    return listing[0];
}

From source file:datafu.hourglass.fs.PathUtils.java

License:Apache License

/**
 * Sums the size of all files listed under a given path. 
 * /*from  w  w  w.j  ava 2s.c o  m*/
 * @param fs file system
 * @param path path to count bytes for
 * @return total bytes under path
 * @throws IOException
 */
public static long countBytes(FileSystem fs, Path path) throws IOException {
    FileStatus[] files = fs.listStatus(path, nonHiddenPathFilter);
    long totalForPath = 0L;
    for (FileStatus file : files) {
        totalForPath += file.getLen();
    }
    return totalForPath;
}

From source file:datafu.hourglass.test.FirstAndSecondPassJobTests.java

License:Apache License

/**
 * Returns the number of entries obtained by listing {@code _outputPath} with
 * {@code PathUtils.nonHiddenPathFilter}.
 */
private int countOutputFolders() throws IOException {
    return getFileSystem().listStatus(_outputPath, PathUtils.nonHiddenPathFilter).length;
}