List of usage examples for org.apache.hadoop.fs FileSystem listStatus
public FileStatus[] listStatus(Path f, PathFilter filter) throws FileNotFoundException, IOException
public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException
Note: every example below uses the single-path overload; the Path[] overload applies the same filter across several directories and concatenates the results.
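Before the source-file examples, here is a minimal, self-contained sketch of calling listStatus with a PathFilter. The directory /tmp/example and the ".avro" suffix are illustrative assumptions, not taken from any example below.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Illustrative path; replace with a directory that exists in your cluster.
        Path dir = new Path("/tmp/example");
        // PathFilter has a single accept(Path) method, so a lambda works on Java 8+.
        FileStatus[] matches = fs.listStatus(dir, (Path p) -> p.getName().endsWith(".avro"));
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
    }
}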
From source file:com.uber.hoodie.utilities.HoodieSnapshotCopier.java
License:Apache License
public void snapshot(JavaSparkContext jsc, String baseDir, final String outputDir,
        final boolean shouldAssumeDatePartitioning) throws IOException {
    FileSystem fs = FSUtils.getFs(baseDir, jsc.hadoopConfiguration());
    final SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());
    final HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs.getConf(), baseDir);
    final TableFileSystemView.ReadOptimizedView fsView = new HoodieTableFileSystemView(tableMetadata,
            tableMetadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants());

    // Get the latest commit
    Optional<HoodieInstant> latestCommit = tableMetadata.getActiveTimeline().getCommitsTimeline()
            .filterCompletedInstants().lastInstant();
    if (!latestCommit.isPresent()) {
        logger.warn("No commits present. Nothing to snapshot");
        return;
    }
    final String latestCommitTimestamp = latestCommit.get().getTimestamp();
    logger.info(String.format("Starting to snapshot latest version files which are also no later than %s.",
            latestCommitTimestamp));

    List<String> partitions = FSUtils.getAllPartitionPaths(fs, baseDir, shouldAssumeDatePartitioning);
    if (partitions.size() > 0) {
        logger.info(String.format("The job needs to copy %d partitions.", partitions.size()));

        // Make sure the output directory is empty
        Path outputPath = new Path(outputDir);
        if (fs.exists(outputPath)) {
            logger.warn(String.format("The output path %s already exists; deleting it", outputPath));
            fs.delete(new Path(outputDir), true);
        }

        jsc.parallelize(partitions, partitions.size()).flatMap(partition -> {
            // Only take latest version files <= latestCommit.
            FileSystem fs1 = FSUtils.getFs(baseDir, serConf.get());
            List<Tuple2<String, String>> filePaths = new ArrayList<>();
            Stream<HoodieDataFile> dataFiles = fsView.getLatestDataFilesBeforeOrOn(partition,
                    latestCommitTimestamp);
            dataFiles.forEach(
                    hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())));

            // Also need to copy over partition metadata
            Path partitionMetaFile = new Path(new Path(baseDir, partition),
                    HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
            if (fs1.exists(partitionMetaFile)) {
                filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString()));
            }
            return filePaths.iterator();
        }).foreach(tuple -> {
            String partition = tuple._1();
            Path sourceFilePath = new Path(tuple._2());
            Path toPartitionPath = new Path(outputDir, partition);
            FileSystem ifs = FSUtils.getFs(baseDir, serConf.get());

            if (!ifs.exists(toPartitionPath)) {
                ifs.mkdirs(toPartitionPath);
            }
            FileUtil.copy(ifs, sourceFilePath, ifs, new Path(toPartitionPath, sourceFilePath.getName()),
                    false, ifs.getConf());
        });

        // Also copy the .commit files
        logger.info(String.format("Copying .commit files which are no later than %s.", latestCommitTimestamp));
        FileStatus[] commitFilesToCopy = fs.listStatus(
                new Path(baseDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
                    if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) {
                        return true;
                    } else {
                        String commitTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName());
                        return HoodieTimeline.compareTimestamps(commitTime, latestCommitTimestamp,
                                HoodieTimeline.LESSER_OR_EQUAL);
                    }
                });
        for (FileStatus commitStatus : commitFilesToCopy) {
            Path targetFilePath = new Path(outputDir + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
                    + commitStatus.getPath().getName());
            if (!fs.exists(targetFilePath.getParent())) {
                fs.mkdirs(targetFilePath.getParent());
            }
            if (fs.exists(targetFilePath)) {
                logger.error(String.format("The target output commit file (%s) already exists.",
                        targetFilePath));
            }
            FileUtil.copy(fs, commitStatus.getPath(), fs, targetFilePath, false, fs.getConf());
        }
    } else {
        logger.info("The job has 0 partitions to copy.");
    }

    // Create the _SUCCESS tag
    Path successTagPath = new Path(outputDir + "/_SUCCESS");
    if (!fs.exists(successTagPath)) {
        logger.info(String.format("Creating _SUCCESS under targetBasePath: %s", outputDir));
        fs.createNewFile(successTagPath);
    }
}
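Note the PathFilter lambda passed to listStatus here: it admits hoodie.properties unconditionally and otherwise keeps only commit files whose timestamp is no later than latestCommitTimestamp, so the copied timeline stays consistent with the data files selected above.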
From source file:com.vertica.hadoop.FixedSplitFileInputFormat.java
License:Apache License
/**
 * Add files in the input path recursively into the results.
 * @param result
 *          The List to store all files.
 * @param fs
 *          The FileSystem.
 * @param path
 *          The input path.
 * @param inputFilter
 *          The input filter that can be used to filter files/dirs.
 * @throws IOException
 */
protected void addInputPathRecursively(List<FileStatus> result, FileSystem fs, Path path,
        PathFilter inputFilter) throws IOException {
    for (FileStatus stat : fs.listStatus(path, inputFilter)) {
        if (stat.isDirectory()) {
            addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
        } else {
            result.add(stat);
        }
    }
}
From source file:com.vertica.hadoop.FixedSplitFileInputFormat.java
License:Apache License
/**
 * List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected FileStatus[] listStatus(JobConf job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job);

    // Whether we need to look recursively into the directory structure
    boolean recursive = job.getBoolean("mapred.input.dir.recursive", false);

    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (Path p : dirs) {
        FileSystem fs = p.getFileSystem(job);
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) {
                        if (recursive && stat.isDirectory()) {
                            addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                        } else {
                            result.add(stat);
                        }
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result.toArray(new FileStatus[result.size()]);
}
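The snippet above relies on hiddenFileFilter and MultiPathFilter, which are defined elsewhere in the class and not shown here. A plausible sketch of what they look like, based on the standard Hadoop FileInputFormat pattern rather than the actual Vertica source:

import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

class FilterSketch {
    // Rejects hidden files and directories (names starting with "_" or ".").
    static final PathFilter hiddenFileFilter = new PathFilter() {
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    };

    // Composite filter: accepts a path only if every wrapped filter accepts it.
    static class MultiPathFilter implements PathFilter {
        private final List<PathFilter> filters;

        MultiPathFilter(List<PathFilter> filters) {
            this.filters = filters;
        }

        public boolean accept(Path path) {
            for (PathFilter filter : filters) {
                if (!filter.accept(path)) {
                    return false;
                }
            }
            return true;
        }
    }
}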
From source file:crunch.MaxTemperature.java
License:Apache License
private void checkOutput(Configuration conf, Path output) throws IOException {
    FileSystem fs = FileSystem.getLocal(conf);
    Path[] outputFiles = FileUtil.stat2Paths(fs.listStatus(output, new OutputLogFilter()));
    assertThat(outputFiles.length, is(1));

    BufferedReader actual = asBufferedReader(fs.open(outputFiles[0]));
    BufferedReader expected = asBufferedReader(getClass().getResourceAsStream("/expected.txt"));
    String expectedLine;
    while ((expectedLine = expected.readLine()) != null) {
        assertThat(actual.readLine(), is(expectedLine));
    }
    assertThat(actual.readLine(), nullValue());
    actual.close();
    expected.close();
}
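The OutputLogFilter used here is, as far as I can tell, the old org.apache.hadoop.mapred.OutputLogFilter, which excludes the _logs directory the MapReduce framework used to write alongside job output; later Hadoop versions deprecated it in favor of Utils.OutputFileUtils.OutputLogFilter.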
From source file:datafu.hourglass.avro.AvroDateRangeMetadata.java
License:Apache License
/**
 * Reads the date range from the metadata stored in an Avro file.
 *
 * @param fs file system to access path
 * @param path path to get date range for
 * @return date range
 * @throws IOException
 */
public static DateRange getOutputFileDateRange(FileSystem fs, Path path) throws IOException {
    path = fs.listStatus(path, PathUtils.nonHiddenPathFilter)[0].getPath();
    FSDataInputStream dataInputStream = fs.open(path);
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(dataInputStream, reader);

    try {
        return new DateRange(new Date(Long.parseLong(dataFileStream.getMetaString(METADATA_DATE_START))),
                new Date(Long.parseLong(dataFileStream.getMetaString(METADATA_DATE_END))));
    } finally {
        dataFileStream.close();
        dataInputStream.close();
    }
}
From source file:datafu.hourglass.demo.Examples.java
License:Apache License
private int countOutputFolders(Path path) throws IOException {
    FileSystem fs = getFileSystem();
    return fs.listStatus(path, PathUtils.nonHiddenPathFilter).length;
}
From source file:datafu.hourglass.fs.PathUtils.java
License:Apache License
/**
 * List all paths matching the "yyyyMMdd" format under a given path.
 *
 * @param fs file system
 * @param path path to search under
 * @return paths
 * @throws IOException
 */
public static List<DatePath> findDatedPaths(FileSystem fs, Path path) throws IOException {
    FileStatus[] outputPaths = fs.listStatus(path, nonHiddenPathFilter);

    List<DatePath> outputs = new ArrayList<DatePath>();
    if (outputPaths != null) {
        for (FileStatus outputPath : outputPaths) {
            Date date;
            try {
                date = datedPathFormat.parse(outputPath.getPath().getName());
            } catch (ParseException e) {
                continue;
            }
            outputs.add(new DatePath(date, outputPath.getPath()));
        }
    }
    Collections.sort(outputs);
    return outputs;
}
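The null check on listStatus's return value reflects older Hadoop behavior, where a missing path yielded null; newer releases throw FileNotFoundException instead. The method also depends on datedPathFormat and nonHiddenPathFilter, both defined elsewhere in PathUtils. Plausible definitions consistent with the javadoc and the other snippets (assumptions, not the verbatim datafu source):

import java.text.SimpleDateFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

class PathUtilsSketch {
    // Parses directory names such as "20240131" into dates.
    // Note: SimpleDateFormat is not thread-safe; guard it if shared across threads.
    private static final SimpleDateFormat datedPathFormat = new SimpleDateFormat("yyyyMMdd");

    // Skips hidden paths (names starting with "_" or "."), e.g. _SUCCESS and _logs.
    public static final PathFilter nonHiddenPathFilter = new PathFilter() {
        public boolean accept(Path path) {
            String name = path.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    };
}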
From source file:datafu.hourglass.fs.PathUtils.java
License:Apache License
/**
 * Gets the schema for the first Avro file under the given path.
 *
 * @param fs file system to access path
 * @param path path to fetch schema for
 * @return Avro schema
 * @throws IOException
 */
public static Schema getSchemaFromPath(FileSystem fs, Path path) throws IOException {
    return getSchemaFromFile(fs, fs.listStatus(path, nonHiddenPathFilter)[0].getPath());
}
From source file:datafu.hourglass.fs.PathUtils.java
License:Apache License
/**
 * Sums the size of all files listed under a given path.
 *
 * @param fs file system
 * @param path path to count bytes for
 * @return total bytes under path
 * @throws IOException
 */
public static long countBytes(FileSystem fs, Path path) throws IOException {
    FileStatus[] files = fs.listStatus(path, nonHiddenPathFilter);
    long totalForPath = 0L;
    for (FileStatus file : files) {
        totalForPath += file.getLen();
    }
    return totalForPath;
}
From source file:datafu.hourglass.test.FirstAndSecondPassJobTests.java
License:Apache License
private int countOutputFolders() throws IOException {
    FileSystem fs = getFileSystem();
    return fs.listStatus(_outputPath, PathUtils.nonHiddenPathFilter).length;
}