Usage examples for org.apache.hadoop.fs.FileSystem.listStatus
public FileStatus[] listStatus(Path f, PathFilter filter) throws FileNotFoundException, IOException
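Before the project examples below, a minimal self-contained sketch of the single-path overload they all use. The directory path and the hidden-file PathFilter here are illustrative assumptions, not taken from any of the source files.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Illustrative directory; pass a real HDFS or local path as the first argument.
        Path dir = new Path(args.length > 0 ? args[0] : "/tmp");
        FileSystem fs = dir.getFileSystem(conf);

        // Skip hidden entries (names starting with '.' or '_'), the same
        // convention several of the examples below apply.
        PathFilter hiddenFilter = new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith(".") && !name.startsWith("_");
            }
        };

        // listStatus returns one FileStatus per direct child that passes the filter.
        for (FileStatus status : fs.listStatus(dir, hiddenFilter)) {
            System.out.println(status.getPath() + "\t" + status.getLen());
        }
    }
}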
From source file:com.netease.news.utils.SequenceFileDumper.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("substring", "b", "The number of chars to print out per value", false);
    addOption(buildOption("count", "c", "Report the count only", false, false, null));
    addOption("numItems", "n", "Output at most <n> key value pairs", false);
    addOption(buildOption("facets", "fa",
            "Output the counts per key. Note, if there are a lot of unique keys, "
                    + "this can take up a fair amount of memory", false, false, null));
    addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null));

    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    Path input = getInputPath();
    FileSystem fs = input.getFileSystem(conf);
    if (fs.getFileStatus(input).isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, new OutputFilesFilter()));
    } else {
        pathArr = new Path[1];
        pathArr[0] = input;
    }

    Writer writer;
    boolean shouldClose;
    if (hasOption("output")) {
        shouldClose = true;
        writer = Files.newWriter(new File(getOption("output")), Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        for (Path path : pathArr) {
            if (!hasOption("quiet")) {
                writer.append("Input Path: ").append(String.valueOf(path)).append('\n');
            }
            int sub = Integer.MAX_VALUE;
            if (hasOption("substring")) {
                sub = Integer.parseInt(getOption("substring"));
            }
            boolean countOnly = hasOption("count");
            SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<Writable, Writable>(path, true, conf);
            if (!hasOption("quiet")) {
                writer.append("Key class: ").append(iterator.getKeyClass().toString());
                writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n');
            }
            OpenObjectIntHashMap<String> facets = null;
            if (hasOption("facets")) {
                facets = new OpenObjectIntHashMap<String>();
            }
            long count = 0;
            if (countOnly) {
                while (iterator.hasNext()) {
                    Pair<?, ?> record = iterator.next();
                    String key = record.getFirst().toString();
                    if (facets != null) {
                        facets.adjustOrPutValue(key, 1, 1); // either insert or add 1
                    }
                    count++;
                }
                writer.append("Count: ").append(String.valueOf(count)).append('\n');
            } else {
                long numItems = Long.MAX_VALUE;
                if (hasOption("numItems")) {
                    numItems = Long.parseLong(getOption("numItems"));
                    if (!hasOption("quiet")) {
                        writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n");
                    }
                }
                while (iterator.hasNext() && count < numItems) {
                    Pair<?, ?> record = iterator.next();
                    String key = record.getFirst().toString();
                    writer.append("Key: ").append(key);
                    String str = record.getSecond().toString();
                    writer.append(": Value: ").append(str.length() > sub ? str.substring(0, sub) : str);
                    writer.write('\n');
                    if (facets != null) {
                        facets.adjustOrPutValue(key, 1, 1); // either insert or add 1
                    }
                    count++;
                }
                if (!hasOption("quiet")) {
                    writer.append("Count: ").append(String.valueOf(count)).append('\n');
                }
            }
            if (facets != null) {
                List<String> keyList = Lists.newArrayListWithCapacity(facets.size());
                IntArrayList valueList = new IntArrayList(facets.size());
                facets.pairsSortedByKey(keyList, valueList);
                writer.append("-----Facets---\n");
                writer.append("Key\t\tCount\n");
                int i = 0;
                for (String key : keyList) {
                    writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n');
                }
            }
        }
        writer.flush();
    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }
    return 0;
}
From source file:com.netease.news.utils.SplitInput.java
License:Apache License
public void splitDirectory(Configuration conf, Path inputDir)
        throws IOException, ClassNotFoundException, InterruptedException {
    FileSystem fs = inputDir.getFileSystem(conf);
    if (fs.getFileStatus(inputDir) == null) {
        throw new IOException(inputDir + " does not exist");
    }
    if (!fs.getFileStatus(inputDir).isDir()) {
        throw new IOException(inputDir + " is not a directory");
    }

    if (useMapRed) {
        SplitInputJob.run(conf, inputDir, mapRedOutputDirectory, keepPct, testRandomSelectionPct);
    } else {
        // input dir contains one file per category.
        FileStatus[] fileStats = fs.listStatus(inputDir, PathFilters.logsCRCFilter());
        for (FileStatus inputFile : fileStats) {
            if (!inputFile.isDir()) {
                splitFile(inputFile.getPath());
            }
        }
    }
}
From source file:com.netflix.bdp.s3.S3MultipartOutputCommitter.java
License:Apache License
/**
 * Lists the output of a task under the task attempt path. Subclasses can
 * override this method to change how output files are identified.
 * <p>
 * This implementation lists the files that are direct children of the output
 * path and filters hidden files (file names starting with '.' or '_').
 * <p>
 * The task attempt path is provided by
 * {@link #getTaskAttemptPath(TaskAttemptContext)}
 *
 * @param context this task's {@link TaskAttemptContext}
 * @return the output files produced by this task in the task attempt path
 * @throws IOException
 */
protected Iterable<FileStatus> getTaskOutput(TaskAttemptContext context) throws IOException {
    // get files on the local FS in the attempt path
    Path attemptPath = getTaskAttemptPath(context);
    FileSystem attemptFS = attemptPath.getFileSystem(context.getConfiguration());
    FileStatus[] stats = attemptFS.listStatus(attemptPath, HiddenPathFilter.get());
    return Arrays.asList(stats);
}
From source file:com.netflix.bdp.s3.S3MultipartOutputCommitter.java
License:Apache License
private List<S3Util.PendingUpload> getPendingUploads(JobContext context, boolean suppressExceptions)
        throws IOException {
    Path jobAttemptPath = wrappedCommitter.getJobAttemptPath(context);
    final FileSystem attemptFS = jobAttemptPath.getFileSystem(context.getConfiguration());
    FileStatus[] pendingCommitFiles = attemptFS.listStatus(jobAttemptPath, HiddenPathFilter.get());

    final List<S3Util.PendingUpload> pending = Lists.newArrayList();

    // try to read every pending file and add all results to pending.
    // in the case of a failure to read the file, exceptions are held until all
    // reads have been attempted.
    Tasks.foreach(pendingCommitFiles)
            .throwFailureWhenFinished(!suppressExceptions)
            .executeWith(getThreadPool(context))
            .run(new Task<FileStatus, IOException>() {
                @Override
                public void run(FileStatus pendingCommitFile) throws IOException {
                    pending.addAll(S3Util.readPendingCommits(attemptFS, pendingCommitFile.getPath()));
                }
            });

    return pending;
}
From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java
License:Apache License
/**
 * List input directories.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // Get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    List<IOException> errors = new ArrayList<IOException>();
    for (Path p : dirs) {
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        final SmilePathFilter filter = new SmilePathFilter();
        FileStatus[] matches = fs.globStatus(p, filter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    Collections.addAll(result, fs.listStatus(globStat.getPath(), filter));
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    return result;
}
From source file:com.peer2gear.nutch.xquery.ParseResult.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("Usage: %s [generic options] (<segment> ... | -dir <segments>) <output>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Job job = new Job(getConf());

    for (int i = 0; i < args.length - 1; i++) {
        if ("-dir".equals(args[i])) {
            Path dir = new Path(args[++i]);
            FileSystem fs = dir.getFileSystem(getConf());
            FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
            Path[] segments = HadoopFSUtil.getPaths(fstats);
            for (Path segment : segments) {
                FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
            }
        } else {
            FileInputFormat.addInputPath(job, new Path(args[i], ParseData.DIR_NAME));
        }
    }
    FileOutputFormat.setOutputPath(job, new Path(args[args.length - 1]));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(GetResultMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.redsqirl.workflow.server.datatype.MapRedDir.java
License:Open Source License
public String isPathValid(String path, List<String> shouldNotHaveExt, List<String> shouldHaveExt,
        boolean fileExtension) throws RemoteException {
    String error = null;
    HdfsFileChecker hCh = new HdfsFileChecker(path);
    if (shouldHaveExt != null && !shouldHaveExt.isEmpty()) {
        boolean found = false;
        for (String extCur : shouldHaveExt) {
            found |= path.endsWith(extCur);
        }
        if (!found) {
            error = LanguageManagerWF.getText("mapredtexttype.shouldhaveextcompresssile",
                    new Object[] { path, shouldHaveExt });
        }
    } else if (shouldNotHaveExt != null && !shouldNotHaveExt.isEmpty()) {
        boolean found = false;
        for (String extCur : shouldNotHaveExt) {
            found |= path.endsWith(extCur);
        }
        if (found) {
            error = LanguageManagerWF.getText("mapredtexttype.shouldnothaveextcompresssile",
                    new Object[] { path, shouldNotHaveExt });
        }
    }

    if (!hCh.isInitialized() || hCh.isFile()) {
        error = LanguageManagerWF.getText("mapredtexttype.dirisfile");
    } else if (isPathExist()) {
        FileSystem fs;
        try {
            fs = NameNodeVar.getFS();

            hCh.setPath(new Path(path).getParent());
            if (!hCh.isDirectory()) {
                error = LanguageManagerWF.getText("mapredtexttype.nodir",
                        new String[] { hCh.getPath().toString() });
            }

            FileStatus[] stat = null;
            if (error == null) {
                try {
                    stat = fs.listStatus(new Path(path), new PathFilter() {
                        @Override
                        public boolean accept(Path arg0) {
                            return !arg0.getName().startsWith("_") && !arg0.getName().startsWith(".");
                        }
                    });
                } catch (Exception e) {
                    stat = null;
                    error = LanguageManagerWF.getText("mapredtexttype.notmrdir", new Object[] { path });
                }
            }

            if (stat != null) {
                for (int i = 0; i < stat.length && error == null; ++i) {
                    if (stat[i].isDir()) {
                        error = LanguageManagerWF.getText("mapredtexttype.notmrdir", new Object[] { path });
                    } else {
                        if (fileExtension) {
                            if (shouldHaveExt != null && !shouldHaveExt.isEmpty()) {
                                boolean found = false;
                                for (String extCur : shouldHaveExt) {
                                    found |= stat[i].getPath().getName().endsWith(extCur);
                                }
                                if (!found) {
                                    error = LanguageManagerWF.getText("mapredtexttype.shouldhaveextcompresssile",
                                            new Object[] { path, shouldHaveExt });
                                }
                            } else if (shouldNotHaveExt != null && !shouldNotHaveExt.isEmpty()) {
                                boolean found = false;
                                for (String extCur : shouldNotHaveExt) {
                                    found |= stat[i].getPath().getName().endsWith(extCur);
                                }
                                if (found) {
                                    error = LanguageManagerWF.getText("mapredtexttype.shouldnothaveextcompresssile",
                                            new Object[] { path, shouldNotHaveExt });
                                }
                            }
                        }

                        try {
                            hdfsInt.select(stat[i].getPath().toString(), "", 1);
                        } catch (Exception e) {
                            error = LanguageManagerWF.getText("mapredtexttype.notmrdir");
                            logger.error(error, e);
                        }
                    }
                }
            }
        } catch (IOException e) {
            error = LanguageManagerWF.getText("unexpectedexception", new Object[] { e.getMessage() });
            logger.error(error, e);
        }
    }
    // hCh.close();

    return error;
}
From source file:com.redsqirl.workflow.server.datatype.MapRedDir.java
License:Open Source License
public List<String> selectLine(int maxToRead) throws RemoteException {
    List<String> ans = null;
    if (isPathExist()) {
        try {
            FileSystem fs = NameNodeVar.getFS();

            FileStatus[] stat = fs.listStatus(new Path(getPath()), new PathFilter() {
                @Override
                public boolean accept(Path arg0) {
                    return !arg0.getName().startsWith("_") && !arg0.getName().startsWith(".");
                }
            });

            if (stat != null && stat.length > 0) {
                ans = new ArrayList<String>(maxToRead);

                SortedSet<Map.Entry<FileStatus, Long>> filesSortedBySize = new TreeSet<Map.Entry<FileStatus, Long>>(
                        new Comparator<Map.Entry<FileStatus, Long>>() {
                            @Override
                            public int compare(Map.Entry<FileStatus, Long> e1, Map.Entry<FileStatus, Long> e2) {
                                return -e1.getValue().compareTo(e2.getValue());
                            }
                        });

                // We limit the number of file to be 100
                for (int k = 0; k < stat.length; ++k) {
                    filesSortedBySize.add(new AbstractMap.SimpleEntry<FileStatus, Long>(stat[k], stat[k].getLen()));
                }

                // Read the biggest files first
                Iterator<Map.Entry<FileStatus, Long>> fileIt = filesSortedBySize.iterator();
                int k = 0;
                while (fileIt.hasNext() && ans.size() < maxToRead && k < NB_FILE_TO_READ_MAX) {
                    Map.Entry<FileStatus, Long> cur = fileIt.next();
                    FileStatus file = cur.getKey();
                    logger.debug("Number of line already read: " + ans.size());
                    ans.addAll(hdfsInt.select(file.getPath().toString(), ",", maxToRead - ans.size()));
                    ++k;
                }

                logger.debug("Number of line read in " + getPath() + ": " + ans.size());
            }
        } catch (IOException e) {
            String error = "Unexpected error: " + e.getMessage();
            logger.error(error, e);
            ans = null;
        } catch (Exception e) {
            logger.error(e, e);
            ans = null;
        }
    }
    return ans;
}
From source file:com.redsqirl.workflow.server.OozieManager.java
License:Open Source License
/**
 * Clean the directory where the Job details are stored
 *
 * @param nameWf
 * @throws RemoteException
 */
public void cleanJobDirectory(final String nameWf) throws RemoteException {
    Path hdfsWfPath = new Path(WorkflowPrefManager.getHDFSPathJobs());
    FileSystem fs = null;
    int numberToKeep = WorkflowPrefManager.getNbOozieDirToKeep();
    try {
        fs = NameNodeVar.getFS();
        FileStatus[] children = fs.listStatus(hdfsWfPath, new PathFilter() {
            @Override
            public boolean accept(Path arg0) {
                return arg0.getName().startsWith(nameWf + "_");
            }
        });
        Arrays.sort(children, 0, children.length, new Comparator<FileStatus>() {
            @Override
            public int compare(FileStatus arg0, FileStatus arg1) {
                return (int) ((arg0.getModificationTime() - arg1.getModificationTime()) / 10000);
            }
        });
        for (int i = 0; i < children.length - numberToKeep; ++i) {
            fs.delete(children[i].getPath(), true);
        }
    } catch (Exception e1) {
        logger.error(e1);
    }
}
From source file:com.redsqirl.workflow.server.OozieManager.java
License:Open Source License
/**
 * Get a name for a directory to store all the jobs files and configuration
 *
 * @param df
 * @return The name for a directory to store all the jobs files and configuration
 * @throws RemoteException
 */
protected String buildFileName(DataFlow df) throws RemoteException {
    final String nameWf = df.getName();
    if (nameWf == null) {
        logger.warn("The workflow to run has no name");
        df.setName(RandomString.getRandomName(8));
    }
    String ans = null;
    Path hdfsWfPath = new Path(WorkflowPrefManager.getHDFSPathJobs());
    FileSystem fs = null;
    int number = -1;
    try {
        fs = NameNodeVar.getFS();
        FileStatus[] children = fs.listStatus(hdfsWfPath, new PathFilter() {
            @Override
            public boolean accept(Path arg0) {
                if (arg0.getName().startsWith(nameWf)) {
                    try {
                        @SuppressWarnings("unused")
                        int i = Integer.valueOf(arg0.getName().substring(nameWf.length() + 1));
                        return true;
                    } catch (Exception e) {
                    }
                }
                return false;
            }
        });
        if (children != null && children.length > 0) {
            for (FileStatus child : children) {
                number = Math.max(number,
                        Integer.valueOf(child.getPath().getName().substring(nameWf.length() + 1)));
            }
        }
    } catch (Exception e) {
        logger.error(e, e);
    }
    ans = nameWf + "_" + (number + 1);
    return ans;
}