List of usage examples for org.apache.hadoop.fs FileSystem listStatus
public FileStatus[] listStatus(Path f, PathFilter filter) throws FileNotFoundException, IOException
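Before the project examples below, here is a minimal, self-contained sketch of calling listStatus with a PathFilter. The input path /data/input and the hidden-file filter are illustrative assumptions, not taken from any of the projects listed.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical input directory; adjust to a path that exists.
        Path input = new Path("/data/input");
        // Skip hidden files and directories (names starting with '.' or '_').
        PathFilter visibleOnly = new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith(".") && !name.startsWith("_");
            }
        };
        for (FileStatus status : fs.listStatus(input, visibleOnly)) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
    }
}

Note that listStatus is not recursive: it returns only the immediate children of the path, so directory trees must be walked explicitly, as several of the examples below do.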
From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorage.java
License:Apache License
/**
 * Get the avro schema of the input path. There are three cases:
 * 1. if path is a file, then return its avro schema;
 * 2. if path is a first-level directory (no sub-directories), then
 *    return the avro schema of one underlying file;
 * 3. if path contains sub-directories, then recursively check
 *    whether all of them share the same schema and return it
 *    if so, or throw an exception if not.
 *
 * @param path input path
 * @param fs file system
 * @return avro schema of data
 * @throws IOException if underlying sub-directories do not share the same schema,
 *         or if the input path is empty or does not exist
 */
@SuppressWarnings("deprecation")
protected Schema getAvroSchema(Path path, FileSystem fs) throws IOException {
    if (!fs.exists(path) || !AvroStorageUtils.PATH_FILTER.accept(path))
        return null;

    /* if path is a first-level directory or a file */
    if (!fs.isDirectory(path)) {
        return getSchema(path, fs);
    }

    FileStatus[] ss = fs.listStatus(path, AvroStorageUtils.PATH_FILTER);
    Schema schema = null;
    if (ss.length > 0) {
        if (AvroStorageUtils.noDir(ss))
            return getSchema(path, fs);

        /* otherwise, check whether schemas of underlying directories are the same */
        for (FileStatus s : ss) {
            Schema newSchema = getAvroSchema(s.getPath(), fs);
            if (schema == null) {
                schema = newSchema;
                if (!checkSchema) {
                    System.out.println("Do not check schema; use schema of " + s.getPath());
                    return schema;
                }
            } else if (newSchema != null && !schema.equals(newSchema)) {
                throw new IOException("Input path is " + path + ". Sub-directory " + s.getPath()
                        + " contains different schema " + newSchema + " than " + schema);
            }
        }
    }

    if (schema == null)
        System.err.println("Cannot get avro schema! Input path " + path + " might be empty.");
    return schema;
}
From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorageUtils.java
License:Apache License
private static void getAllFilesInternal(FileStatus file, Configuration conf, Set<Path> paths, FileSystem fs)
        throws IOException {
    for (FileStatus f : fs.listStatus(file.getPath(), PATH_FILTER)) {
        if (f.isDir()) {
            getAllFilesInternal(f, conf, paths, fs);
        } else {
            paths.add(f.getPath());
        }
    }
}
From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorageUtils.java
License:Apache License
/**
 * Get the last file of an HDFS path if it is a directory,
 * or return the path itself if it is a file.
 */
public static Path getLast(Path path, FileSystem fs) throws IOException {
    FileStatus status = fs.getFileStatus(path);
    if (!status.isDir()) {
        return path;
    }
    FileStatus[] statuses = fs.listStatus(path, PATH_FILTER);
    if (statuses.length == 0) {
        return null;
    } else {
        Arrays.sort(statuses);
        for (int i = statuses.length - 1; i >= 0; i--) {
            if (!statuses[i].isDir()) {
                return statuses[i].getPath();
            }
        }
        return null;
    }
}
From source file:com.linkedin.mapred.AvroUtils.java
License:Open Source License
public static FileStatus[] getAvroPartFiles(JobConf conf, Path outPath) throws IOException {
    FileSystem fileSystem = outPath.getFileSystem(conf);
    // Keep only part files with the .avro extension.
    return fileSystem.listStatus(outPath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(".avro");
        }
    });
}
From source file:com.marklogic.contentpump.FileAndDirectoryInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Configuration conf = job.getConfiguration();
    try {
        List<FileStatus> files = listStatus(job);
        long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
        long maxSize = getMaxSplitSize(job);
        for (FileStatus child : files) {
            Path path = child.getPath();
            FileSystem fs = path.getFileSystem(conf);
            // length is 0 for a dir according to FSDirectory.java in 0.20;
            // however, with Hadoop 2, a dir in the local fs has non-zero length
            long length = child.getLen();
            BlockLocation[] blkLocations = null;
            if (!child.isDirectory() || !(fs instanceof DistributedFileSystem)) {
                blkLocations = fs.getFileBlockLocations(child, 0, length);
            } else if (length != 0) {
                throw new IOException("non-zero length directory on HDFS: " + path.toUri().toString());
            }
            if ((length != 0) && isSplitable(job, path)) {
                long blockSize = child.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);
                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }
                if (bytesRemaining != 0) {
                    splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkLocations.length - 1].getHosts()));
                }
            } else if (length != 0) {
                splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
            } else {
                // Create an empty hosts array for zero-length files
                splits.add(new FileSplit(path, 0, length, new String[0]));
            }
        }
    } catch (InvalidInputException ex) {
        String inPath = conf.get(ConfigConstants.CONF_INPUT_DIRECTORY);
        String pattern = conf.get(ConfigConstants.CONF_INPUT_FILE_PATTERN, ".*");
        throw new IOException("No input files found with the specified input path " + inPath
                + " and input file pattern " + pattern, ex);
    }
    PathFilter jobFilter = getInputPathFilter(job);
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);
    // take a second pass over the generated splits to extract files from directories
    int count = 0;
    // flatten directories until reaching SPLIT_COUNT_LIMIT
    while (count < splits.size() && splits.size() < SPLIT_COUNT_LIMIT) {
        FileSplit split = (FileSplit) splits.get(count);
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(file);
        if (status.isDirectory()) {
            FileStatus[] children = fs.listStatus(file, inputFilter);
            if (children.length + count < SPLIT_COUNT_LIMIT) {
                splits.remove(count);
                for (FileStatus stat : children) {
                    FileSplit child = new FileSplit(stat.getPath(), 0, stat.getLen(), null);
                    splits.add(child);
                }
            } else {
                count++;
            }
        } else {
            count++;
        }
    }
    return splits;
}
From source file:com.marklogic.contentpump.FileAndDirectoryInputFormat.java
License:Apache License
private List<FileStatus> simpleListStatus(JobContext job, Path[] dirs, PathFilter inputFilter, boolean recursive)
        throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();
    Configuration conf = job.getConfiguration();
    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(conf);
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    FileStatus[] files = fs.listStatus(globStat.getPath(), inputFilter);
                    for (int j = 0; j < files.length; j++) {
                        if (recursive && files[j].isDirectory()) {
                            simpleAddInputPathRecursively(result, fs, files[j].getPath(), inputFilter);
                        } else {
                            result.add(files[j]);
                        }
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }
    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    return result;
}
From source file:com.marklogic.contentpump.FileAndDirectoryInputFormat.java
License:Apache License
protected void simpleAddInputPathRecursively(List<FileStatus> result, FileSystem fs, Path path,
        PathFilter inputFilter) throws IOException {
    FileStatus[] files = fs.listStatus(path, inputFilter);
    for (int j = 0; j < files.length; j++) {
        if (files[j].isDirectory()) {
            simpleAddInputPathRecursively(result, fs, files[j].getPath(), inputFilter);
        } else {
            result.add(files[j]);
        }
    }
}
From source file:com.marklogic.contentpump.utilities.FileIterator.java
License:Apache License
@Override
public FileSplit next() {
    while (iterator.hasNext() || !fileDirSplits.isEmpty()) {
        try {
            if (iterator.hasNext()) {
                FileSplit split = iterator.next();
                Path file = split.getPath();
                FileSystem fs = file.getFileSystem(conf);
                FileStatus status = fs.getFileStatus(file);
                if (status.isDirectory()) {
                    // Expand the directory into per-file splits for later iterations.
                    FileStatus[] children = fs.listStatus(status.getPath(), inputFilter);
                    for (FileStatus stat : children) {
                        FileSplit child = new FileSplit(stat.getPath(), 0, stat.getLen(), null);
                        fileDirSplits.add(child);
                    }
                } else {
                    return split;
                }
            } else if (!fileDirSplits.isEmpty()) {
                FileSplit split = (FileSplit) fileDirSplits.remove(0);
                Path file = split.getPath();
                FileSystem fs = file.getFileSystem(conf);
                FileStatus status = fs.getFileStatus(file);
                if (!status.isDirectory()) {
                    return split;
                }
                FileStatus[] children = fs.listStatus(status.getPath(), inputFilter);
                List<FileSplit> expdFileSpts = new LinkedList<FileSplit>();
                for (FileStatus stat : children) {
                    FileSplit child = new FileSplit(stat.getPath(), 0, stat.getLen(), null);
                    expdFileSpts.add(child);
                }
                iterator = expdFileSpts.iterator();
            }
        } catch (IOException e) {
            LOG.error("Invalid next file", e);
        }
    }
    return null;
}
From source file:com.msd.gin.halyard.sail.HBaseSail.java
License:Apache License
@Override
public synchronized long size(Resource... contexts) throws SailException {
    if (contexts != null && contexts.length > 0 && contexts[0] != null) {
        throw new SailException("Size calculation is not supported for named graphs");
    }
    if (sizeTimestamp < 0
            || (isWritable() && sizeTimestamp + STATUS_CACHING_TIMEOUT < System.currentTimeMillis())) {
        try {
            long entries = 0;
            FileSystem fs = FileSystem.get(config);
            Collection<HColumnDescriptor> families = table.getTableDescriptor().getFamilies();
            Set<String> familyNames = new HashSet<>(families.size());
            for (HColumnDescriptor hcd : families) {
                familyNames.add(hcd.getNameAsString());
            }
            Path tableDir = FSUtils.getTableDir(FSUtils.getRootDir(config), table.getName());
            PathFilter dirFilter = new FSUtils.DirFilter(fs);
            int divider = 1;
            for (HRegionLocation hrl : table.getRegionLocator().getAllRegionLocations()) {
                HRegionInfo hri = hrl.getRegionInfo();
                byte[] skey = hri.getStartKey();
                if (skey.length == 0 || skey[0] == HalyardTableUtils.SPO_PREFIX) {
                    byte[] ekey = hri.getEndKey();
                    if (ekey.length == 0 || ekey[0] > HalyardTableUtils.POS_PREFIX) {
                        divider = 3;
                    }
                    for (FileStatus familyDir : fs.listStatus(new Path(tableDir, hri.getEncodedName()),
                            dirFilter)) {
                        if (familyNames.contains(familyDir.getPath().getName())) {
                            for (FileStatus file : fs.listStatus(familyDir.getPath())) {
                                if (file.isFile()) {
                                    try (FSDataInputStream in = fs.open(file.getPath())) {
                                        entries += FixedFileTrailer.readFromStream(in, file.getLen())
                                                .getEntryCount();
                                    } catch (Exception e) {
                                        LOG.log(Level.WARNING,
                                                "Exception while reading trailer from hfile: " + file.getPath(),
                                                e);
                                    }
                                }
                            }
                        }
                    }
                }
            }
            size = entries / divider;
            sizeTimestamp = System.currentTimeMillis();
        } catch (IOException e) {
            throw new SailException(e);
        }
    }
    return size;
}
From source file:com.netease.news.text.SequenceFilesFromDirectory.java
License:Apache License
private int runSequential(Configuration conf, Path input, Path output, Map<String, String> options)
        throws IOException, InterruptedException, NoSuchMethodException {
    // Running sequentially
    Charset charset = Charset.forName(getOption(CHARSET_OPTION[0]));
    String keyPrefix = getOption(KEY_PREFIX_OPTION[0]);
    FileSystem fs = FileSystem.get(input.toUri(), conf);
    ChunkedWriter writer = new ChunkedWriter(conf, Integer.parseInt(options.get(CHUNK_SIZE_OPTION[0])), output);
    try {
        SequenceFilesFromDirectoryFilter pathFilter;
        String fileFilterClassName = options.get(FILE_FILTER_CLASS_OPTION[0]);
        if (PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) {
            pathFilter = new PrefixAdditionFilter(conf, keyPrefix, options, writer, charset, fs);
        } else {
            pathFilter = ClassUtils.instantiateAs(fileFilterClassName, SequenceFilesFromDirectoryFilter.class,
                    new Class[] { Configuration.class, String.class, Map.class, ChunkedWriter.class,
                            Charset.class, FileSystem.class },
                    new Object[] { conf, keyPrefix, options, writer, charset, fs });
        }
        // listStatus is called for its side effect: the filter's accept()
        // writes each visited file into the chunked output.
        fs.listStatus(input, pathFilter);
    } finally {
        Closeables.close(writer, false);
    }
    return 0;
}