Example usage for org.apache.hadoop.fs FileSystem listStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileSystem listStatus, collected from open-source projects.

Prototype

public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException 

Document

Filter files/directories in the given list of paths using a user-supplied path filter.
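
As a quick orientation before the project examples below, here is a minimal, self-contained sketch of the prototype above. The input paths and the .avro suffix filter are hypothetical placeholders, not taken from any project on this page:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListStatusSketch {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical input directories; replace with real paths.
        Path[] inputs = { new Path("/data/in"), new Path("/data/archive") };

        // User-supplied filter: keep only entries whose name ends in .avro.
        PathFilter avroFilter = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().endsWith(".avro");
            }
        };

        // Lists the children of every path in the array and returns the
        // filtered, concatenated result as a single FileStatus[].
        FileStatus[] statuses = fs.listStatus(inputs, avroFilter);
        for (FileStatus status : statuses) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
    }
}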

Usage

From source file: com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorage.java

License: Apache License

/**
 * Get avro schema of input path. There are three cases:
 * 1. if path is a file, then return its avro schema;
 * 2. if path is a first-level directory (no sub-directories), then
 * return the avro schema of one underlying file;
 * 3. if path contains sub-directories, then recursively check
 * whether all of them share the same schema and return it
 * if so or throw an exception if not.
 *
 * @param path input path
 * @param fs file system
 * @return avro schema of data
 * @throws IOException if underlying sub-directories do not share the same schema; or if input path is empty or does not exist
 */
@SuppressWarnings("deprecation")
protected Schema getAvroSchema(Path path, FileSystem fs) throws IOException {
    if (!fs.exists(path) || !AvroStorageUtils.PATH_FILTER.accept(path))
        return null;

    /* if path is first level directory or is a file */
    if (!fs.isDirectory(path)) {
        return getSchema(path, fs);
    }

    FileStatus[] ss = fs.listStatus(path, AvroStorageUtils.PATH_FILTER);
    Schema schema = null;
    if (ss.length > 0) {
        if (AvroStorageUtils.noDir(ss))
            return getSchema(path, fs);

        /*otherwise, check whether schemas of underlying directories are the same */
        for (FileStatus s : ss) {
            Schema newSchema = getAvroSchema(s.getPath(), fs);
            if (schema == null) {
                schema = newSchema;
                if (!checkSchema) {
                    System.out.println("Do not check schema; use schema of " + s.getPath());
                    return schema;
                }
            } else if (newSchema != null && !schema.equals(newSchema)) {
                throw new IOException("Input path is " + path + ". Sub-direcotry " + s.getPath()
                        + " contains different schema " + newSchema + " than " + schema);
            }
        }
    }

    if (schema == null)
        System.err.println("Cannot get avro schema! Input path " + path + " might be empty.");

    return schema;
}

From source file: com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorageUtils.java

License: Apache License

private static void getAllFilesInternal(FileStatus file, Configuration conf, Set<Path> paths, FileSystem fs)
        throws IOException {
    for (FileStatus f : fs.listStatus(file.getPath(), PATH_FILTER)) {
        if (f.isDir()) {
            getAllFilesInternal(f, conf, paths, fs);
        } else {
            paths.add(f.getPath());
        }
    }
}
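
Note that FileStatus.isDir(), used above, is deprecated in Hadoop 2 in favor of isDirectory(). As a sketch of an alternative on Hadoop 2+ (assuming the same fs, file, paths, and PATH_FILTER as above), FileSystem.listFiles(path, true) performs the recursive walk itself and yields files only; since it takes no PathFilter, the filter must be applied per entry:

// Hadoop 2+ alternative: listFiles walks the tree recursively and
// returns files only, so the PathFilter is applied manually.
RemoteIterator<LocatedFileStatus> it = fs.listFiles(file.getPath(), true);
while (it.hasNext()) {
    LocatedFileStatus f = it.next();
    if (PATH_FILTER.accept(f.getPath())) {
        paths.add(f.getPath());
    }
}

This is not fully equivalent: the recursive version above prunes whole subtrees whose directory names fail the filter, while this sketch filters only the leaf files.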

From source file: com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorageUtils.java

License: Apache License

/** Get the last file of an HDFS path if it is a directory,
 *  or return the file itself if the path is a file.
 */
public static Path getLast(Path path, FileSystem fs) throws IOException {

    FileStatus status = fs.getFileStatus(path);
    if (!status.isDir()) {
        return path;
    }
    FileStatus[] statuses = fs.listStatus(path, PATH_FILTER);

    if (statuses.length == 0) {
        return null;
    } else {
        Arrays.sort(statuses);
        for (int i = statuses.length - 1; i >= 0; i--) {
            if (!statuses[i].isDir()) {
                return statuses[i].getPath();
            }
        }
        return null;
    }
}
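
The Arrays.sort call works because FileStatus implements Comparable and orders entries by path, so the backward loop returns the last non-directory entry in path order (for reducer output such as part-00000, part-00001, ..., that is the highest-numbered part file).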

From source file: com.linkedin.mapred.AvroUtils.java

License: Open Source License

public static FileStatus[] getAvroPartFiles(JobConf conf, Path outPath) throws IOException {
    Path outputPath = outPath;
    FileSystem fileSystem = outputPath.getFileSystem(conf);

    FileStatus[] partFiles = fileSystem.listStatus(outputPath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(".avro");
        }
    });

    return partFiles;
}
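
Because PathFilter declares a single abstract method, on Java 8+ the anonymous class above can be shortened to a lambda:

FileStatus[] partFiles = fileSystem.listStatus(outputPath,
        path -> path.getName().endsWith(".avro"));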

From source file: com.marklogic.contentpump.FileAndDirectoryInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Configuration conf = job.getConfiguration();
    try {
        List<FileStatus> files = listStatus(job);

        long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
        long maxSize = getMaxSplitSize(job);
        for (FileStatus child : files) {
            Path path = child.getPath();
            FileSystem fs = path.getFileSystem(conf);
            // length is 0 for a dir according to FSDirectory.java in 0.20;
            // however, with Hadoop 2, a dir in the local fs has non-zero length
            long length = child.getLen();
            BlockLocation[] blkLocations = null;
            if (!child.isDirectory() || !(fs instanceof DistributedFileSystem)) {
                blkLocations = fs.getFileBlockLocations(child, 0, length);
            } else if (length != 0) {
                throw new IOException("non-zero length directory on HDFS:" + path.toUri().toString());
            }

            if ((length != 0) && isSplitable(job, path)) {
                long blockSize = child.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkLocations.length - 1].getHosts()));
                }
            } else if (length != 0) {
                splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
            } else {
                // Create empty hosts array for zero length files
                splits.add(new FileSplit(path, 0, length, new String[0]));
            }
        }
    } catch (InvalidInputException ex) {
        String inPath = conf.get(ConfigConstants.CONF_INPUT_DIRECTORY);
        String pattern = conf.get(ConfigConstants.CONF_INPUT_FILE_PATTERN, ".*");
        throw new IOException("No input files found with the specified input path " + inPath
                + " and input file pattern " + pattern, ex);
    }

    PathFilter jobFilter = getInputPathFilter(job);
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);
    // take a second pass of the splits generated to extract files from
    // directories
    int count = 0;
    // flatten directories until reaching SPLIT_COUNT_LIMIT
    while (count < splits.size() && splits.size() < SPLIT_COUNT_LIMIT) {
        FileSplit split = (FileSplit) splits.get(count);
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(file);
        if (status.isDirectory()) {
            FileStatus[] children = fs.listStatus(file, inputFilter);
            if (children.length + count < SPLIT_COUNT_LIMIT) {
                splits.remove(count);
                for (FileStatus stat : children) {
                    FileSplit child = new FileSplit(stat.getPath(), 0, stat.getLen(), null);
                    splits.add(child);
                }
            } else {
                count++;
            }
        } else {
            count++;
        }
    }
    return splits;
}

From source file: com.marklogic.contentpump.FileAndDirectoryInputFormat.java

License: Apache License

private List<FileStatus> simpleListStatus(JobContext job, Path[] dirs, PathFilter inputFilter,
        boolean recursive) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();
    Configuration conf = job.getConfiguration();
    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(conf);
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    FileStatus[] files = fs.listStatus(globStat.getPath(), inputFilter);
                    for (int j = 0; j < files.length; j++) {
                        if (recursive && files[j].isDirectory()) {
                            simpleAddInputPathRecursively(result, fs, files[j].getPath(), inputFilter);
                        } else {
                            result.add(files[j]);
                        }
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    return result;
}

From source file: com.marklogic.contentpump.FileAndDirectoryInputFormat.java

License: Apache License

protected void simpleAddInputPathRecursively(List<FileStatus> result, FileSystem fs, Path path,
        PathFilter inputFilter) throws IOException {
    FileStatus[] files = fs.listStatus(path, inputFilter);
    for (int j = 0; j < files.length; j++) {
        if (files[j].isDirectory()) {
            simpleAddInputPathRecursively(result, fs, files[j].getPath(), inputFilter);
        } else {
            result.add(files[j]);
        }
    }
}

From source file: com.marklogic.contentpump.utilities.FileIterator.java

License: Apache License

@Override
public FileSplit next() {
    while (iterator.hasNext() || !fileDirSplits.isEmpty()) {
        try {
            if (iterator.hasNext()) {
                FileSplit split = iterator.next();
                Path file = split.getPath();

                FileSystem fs = file.getFileSystem(conf);

                FileStatus status = fs.getFileStatus(file);
                if (status.isDirectory()) {
                    FileStatus[] children = fs.listStatus(status.getPath(), inputFilter);
                    for (FileStatus stat : children) {
                        FileSplit child = new FileSplit(stat.getPath(), 0, stat.getLen(), null);
                        fileDirSplits.add(child);
                    }
                } else
                    return split;

            } else if (!fileDirSplits.isEmpty()) {
                FileSplit split = (FileSplit) fileDirSplits.remove(0);
                Path file = split.getPath();
                FileSystem fs = file.getFileSystem(conf);
                FileStatus status = fs.getFileStatus(file);

                if (!status.isDirectory()) {
                    return split;
                }
                FileStatus[] children = fs.listStatus(status.getPath(), inputFilter);

                List<FileSplit> expdFileSpts = new LinkedList<FileSplit>();
                for (FileStatus stat : children) {
                    FileSplit child = new FileSplit(stat.getPath(), 0, stat.getLen(), null);
                    expdFileSpts.add(child);
                }
                iterator = expdFileSpts.iterator();
                continue;
            }
        } catch (IOException e) {
            LOG.error("Invalid next file", e);
        }
    }
    return null;
}
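
Here listStatus is used to flatten directories lazily: when the current split is a directory, its children are expanded into fileDirSplits (and, one level deeper, into a fresh iterator) rather than being walked up front, so next() only touches the file system as the caller advances.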

From source file: com.msd.gin.halyard.sail.HBaseSail.java

License: Apache License

@Override
public synchronized long size(Resource... contexts) throws SailException {
    if (contexts != null && contexts.length > 0 && contexts[0] != null) {
        throw new SailException("Size calculation is not supported for named graphs");
    }
    if (sizeTimestamp < 0
            || (isWritable() && sizeTimestamp + STATUS_CACHING_TIMEOUT < System.currentTimeMillis()))
        try {
            long entries = 0;
            FileSystem fs = FileSystem.get(config);
            Collection<HColumnDescriptor> families = table.getTableDescriptor().getFamilies();
            Set<String> familyNames = new HashSet<>(families.size());
            for (HColumnDescriptor hcd : families) {
                familyNames.add(hcd.getNameAsString());
            }
            Path tableDir = FSUtils.getTableDir(FSUtils.getRootDir(config), table.getName());
            PathFilter dirFilter = new FSUtils.DirFilter(fs);
            int divider = 1;
            for (HRegionLocation hrl : table.getRegionLocator().getAllRegionLocations()) {
                HRegionInfo hri = hrl.getRegionInfo();
                byte[] skey = hri.getStartKey();
                if (skey.length == 0 || skey[0] == HalyardTableUtils.SPO_PREFIX) {
                    byte[] ekey = hri.getEndKey();
                    if (ekey.length == 0 || ekey[0] > HalyardTableUtils.POS_PREFIX) {
                        divider = 3;
                    }
                    for (FileStatus familyDir : fs.listStatus(new Path(tableDir, hri.getEncodedName()),
                            dirFilter)) {
                        if (familyNames.contains(familyDir.getPath().getName())) {
                            for (FileStatus file : fs.listStatus(familyDir.getPath())) {
                                if (file.isFile()) {
                                    try (FSDataInputStream in = fs.open(file.getPath())) {
                                        entries += FixedFileTrailer.readFromStream(in, file.getLen())
                                                .getEntryCount();
                                    } catch (Exception e) {
                                        LOG.log(Level.WARNING,
                                                "Exception while reading trailer from hfile: " + file.getPath(),
                                                e);
                                    }
                                }
                            }
                        }
                    }
                }
            }
            size = entries / divider;
            sizeTimestamp = System.currentTimeMillis();
        } catch (IOException e) {
            throw new SailException(e);
        }
    return size;
}

From source file: com.netease.news.text.SequenceFilesFromDirectory.java

License: Apache License

private int runSequential(Configuration conf, Path input, Path output, Map<String, String> options)
        throws IOException, InterruptedException, NoSuchMethodException {
    // Running sequentially
    Charset charset = Charset.forName(getOption(CHARSET_OPTION[0]));
    String keyPrefix = getOption(KEY_PREFIX_OPTION[0]);
    FileSystem fs = FileSystem.get(input.toUri(), conf);
    ChunkedWriter writer = new ChunkedWriter(conf, Integer.parseInt(options.get(CHUNK_SIZE_OPTION[0])), output);

    try {
        SequenceFilesFromDirectoryFilter pathFilter;
        String fileFilterClassName = options.get(FILE_FILTER_CLASS_OPTION[0]);
        if (PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) {
            pathFilter = new PrefixAdditionFilter(conf, keyPrefix, options, writer, charset, fs);
        } else {
            pathFilter = ClassUtils.instantiateAs(fileFilterClassName, SequenceFilesFromDirectoryFilter.class,
                    new Class[] { Configuration.class, String.class, Map.class, ChunkedWriter.class,
                            Charset.class, FileSystem.class },
                    new Object[] { conf, keyPrefix, options, writer, charset, fs });
        }
        fs.listStatus(input, pathFilter);
    } finally {
        Closeables.close(writer, false);
    }
    return 0;
}