Example usage for org.apache.hadoop.fs FileSystem listStatus

Introduction

On this page you can find example usage for org.apache.hadoop.fs.FileSystem.listStatus.

Prototype

public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException 

Document

Filter files/directories in the given list of paths using a user-supplied path filter.
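
Before the full examples below, here is a minimal, self-contained sketch of this overload (not taken from any of the source files below): it lists two hypothetical directories, /data/in1 and /data/in2, in a single call and keeps only entries whose names end in ".txt". The paths and the extension filter are illustrative assumptions, and the directories must already exist on the configured file system.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListStatusExample {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();

        // Hypothetical input directories; they must exist on the configured file system.
        Path[] inputs = { new Path("/data/in1"), new Path("/data/in2") };
        FileSystem fs = inputs[0].getFileSystem(conf);

        // User-supplied filter: keep only entries whose name ends with ".txt".
        PathFilter txtOnly = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().endsWith(".txt");
            }
        };

        // listStatus(Path[], PathFilter) lists each path (its children, for a directory)
        // and applies the filter to the combined results.
        FileStatus[] statuses = fs.listStatus(inputs, txtOnly);
        for (FileStatus status : statuses) {
            System.out.println(status.getPath() + "\t" + status.getLen());
        }
    }
}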

Usage

From source file:com.netease.news.utils.SequenceFileDumper.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("substring", "b", "The number of chars to print out per value", false);
    addOption(buildOption("count", "c", "Report the count only", false, false, null));
    addOption("numItems", "n", "Output at most <n> key value pairs", false);
    addOption(
            buildOption("facets", "fa", "Output the counts per key.  Note, if there are a lot of unique keys, "
                    + "this can take up a fair amount of memory", false, false, null));
    addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null));

    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    Path input = getInputPath();
    FileSystem fs = input.getFileSystem(conf);
    if (fs.getFileStatus(input).isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, new OutputFilesFilter()));
    } else {
        pathArr = new Path[1];
        pathArr[0] = input;
    }

    Writer writer;
    boolean shouldClose;
    if (hasOption("output")) {
        shouldClose = true;
        writer = Files.newWriter(new File(getOption("output")), Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        for (Path path : pathArr) {
            if (!hasOption("quiet")) {
                writer.append("Input Path: ").append(String.valueOf(path)).append('\n');
            }

            int sub = Integer.MAX_VALUE;
            if (hasOption("substring")) {
                sub = Integer.parseInt(getOption("substring"));
            }
            boolean countOnly = hasOption("count");
            SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<Writable, Writable>(path, true,
                    conf);
            if (!hasOption("quiet")) {
                writer.append("Key class: ").append(iterator.getKeyClass().toString());
                writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n');
            }
            OpenObjectIntHashMap<String> facets = null;
            if (hasOption("facets")) {
                facets = new OpenObjectIntHashMap<String>();
            }
            long count = 0;
            if (countOnly) {
                while (iterator.hasNext()) {
                    Pair<?, ?> record = iterator.next();
                    String key = record.getFirst().toString();
                    if (facets != null) {
                        facets.adjustOrPutValue(key, 1, 1); //either insert or add 1
                    }
                    count++;
                }
                writer.append("Count: ").append(String.valueOf(count)).append('\n');
            } else {
                long numItems = Long.MAX_VALUE;
                if (hasOption("numItems")) {
                    numItems = Long.parseLong(getOption("numItems"));
                    if (!hasOption("quiet")) {
                        writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n");
                    }
                }
                while (iterator.hasNext() && count < numItems) {
                    Pair<?, ?> record = iterator.next();
                    String key = record.getFirst().toString();
                    writer.append("Key: ").append(key);
                    String str = record.getSecond().toString();
                    writer.append(": Value: ").append(str.length() > sub ? str.substring(0, sub) : str);
                    writer.write('\n');
                    if (facets != null) {
                        facets.adjustOrPutValue(key, 1, 1); //either insert or add 1
                    }
                    count++;
                }
                if (!hasOption("quiet")) {
                    writer.append("Count: ").append(String.valueOf(count)).append('\n');
                }
            }
            if (facets != null) {
                List<String> keyList = Lists.newArrayListWithCapacity(facets.size());

                IntArrayList valueList = new IntArrayList(facets.size());
                facets.pairsSortedByKey(keyList, valueList);
                writer.append("-----Facets---\n");
                writer.append("Key\t\tCount\n");
                int i = 0;
                for (String key : keyList) {
                    writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n');
                }
            }
        }
        writer.flush();

    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }

    return 0;
}

From source file:com.netease.news.utils.SplitInput.java

License:Apache License

public void splitDirectory(Configuration conf, Path inputDir)
        throws IOException, ClassNotFoundException, InterruptedException {
    FileSystem fs = inputDir.getFileSystem(conf);
    if (fs.getFileStatus(inputDir) == null) {
        throw new IOException(inputDir + " does not exist");
    }
    if (!fs.getFileStatus(inputDir).isDir()) {
        throw new IOException(inputDir + " is not a directory");
    }

    if (useMapRed) {
        SplitInputJob.run(conf, inputDir, mapRedOutputDirectory, keepPct, testRandomSelectionPct);
    } else {
        // input dir contains one file per category.
        FileStatus[] fileStats = fs.listStatus(inputDir, PathFilters.logsCRCFilter());
        for (FileStatus inputFile : fileStats) {
            if (!inputFile.isDir()) {
                splitFile(inputFile.getPath());
            }
        }
    }
}

From source file:com.netflix.bdp.s3.S3MultipartOutputCommitter.java

License:Apache License

/**
 * Lists the output of a task under the task attempt path. Subclasses can
 * override this method to change how output files are identified.
 * <p>
 * This implementation lists the files that are direct children of the output
 * path and filters hidden files (file names starting with '.' or '_').
 * <p>
 * The task attempt path is provided by
 * {@link #getTaskAttemptPath(TaskAttemptContext)}
 *
 * @param context this task's {@link TaskAttemptContext}
 * @return the output files produced by this task in the task attempt path
 * @throws IOException
 */
protected Iterable<FileStatus> getTaskOutput(TaskAttemptContext context) throws IOException {
    // get files on the local FS in the attempt path
    Path attemptPath = getTaskAttemptPath(context);
    FileSystem attemptFS = attemptPath.getFileSystem(context.getConfiguration());
    FileStatus[] stats = attemptFS.listStatus(attemptPath, HiddenPathFilter.get());
    return Arrays.asList(stats);
}

From source file:com.netflix.bdp.s3.S3MultipartOutputCommitter.java

License:Apache License

private List<S3Util.PendingUpload> getPendingUploads(JobContext context, boolean suppressExceptions)
        throws IOException {
    Path jobAttemptPath = wrappedCommitter.getJobAttemptPath(context);
    final FileSystem attemptFS = jobAttemptPath.getFileSystem(context.getConfiguration());
    FileStatus[] pendingCommitFiles = attemptFS.listStatus(jobAttemptPath, HiddenPathFilter.get());

    final List<S3Util.PendingUpload> pending = Lists.newArrayList();

    // try to read every pending file and add all results to pending.
    // in the case of a failure to read the file, exceptions are held until all
    // reads have been attempted.
    Tasks.foreach(pendingCommitFiles).throwFailureWhenFinished(!suppressExceptions)
            .executeWith(getThreadPool(context)).run(new Task<FileStatus, IOException>() {
                @Override
                public void run(FileStatus pendingCommitFile) throws IOException {
                    pending.addAll(S3Util.readPendingCommits(attemptFS, pendingCommitFile.getPath()));
                }
            });

    return pending;
}

From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java

License:Apache License

/**
 * List input directories.
 *
 * @param job the job to list input paths for
 * @return a list of FileStatus objects for the matching input files
 * @throws IOException if no input paths are specified, or if an input path does not exist or matches no files
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // Get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    List<IOException> errors = new ArrayList<IOException>();
    for (Path p : dirs) {
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        final SmilePathFilter filter = new SmilePathFilter();
        FileStatus[] matches = fs.globStatus(p, filter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    Collections.addAll(result, fs.listStatus(globStat.getPath(), filter));
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }

    return result;
}

From source file:com.peer2gear.nutch.xquery.ParseResult.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("Usage: %s [generic options] (<segment> ... | -dir <segments>) <output>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Job job = new Job(getConf());
    for (int i = 0; i < args.length - 1; i++) {
        if ("-dir".equals(args[i])) {
            Path dir = new Path(args[++i]);
            FileSystem fs = dir.getFileSystem(getConf());
            FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
            Path[] segments = HadoopFSUtil.getPaths(fstats);
            for (Path segment : segments) {
                FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
            }
        } else {
            FileInputFormat.addInputPath(job, new Path(args[i], ParseData.DIR_NAME));
        }
    }
    FileOutputFormat.setOutputPath(job, new Path(args[args.length - 1]));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(GetResultMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.redsqirl.workflow.server.datatype.MapRedDir.java

License:Open Source License

public String isPathValid(String path, List<String> shouldNotHaveExt, List<String> shouldHaveExt,
        boolean fileExtension) throws RemoteException {
    String error = null;
    HdfsFileChecker hCh = new HdfsFileChecker(path);
    if (shouldHaveExt != null && !shouldHaveExt.isEmpty()) {
        boolean found = false;
        for (String extCur : shouldHaveExt) {
            found |= path.endsWith(extCur);
        }
        if (!found) {
            error = LanguageManagerWF.getText("mapredtexttype.shouldhaveextcompresssile",
                    new Object[] { path, shouldHaveExt });

        }
    } else if (shouldNotHaveExt != null && !shouldNotHaveExt.isEmpty()) {
        boolean found = false;
        for (String extCur : shouldNotHaveExt) {
            found |= path.endsWith(extCur);
        }
        if (found) {
            error = LanguageManagerWF.getText("mapredtexttype.shouldnothaveextcompresssile",
                    new Object[] { path, shouldNotHaveExt });

        }
    }

    if (!hCh.isInitialized() || hCh.isFile()) {
        error = LanguageManagerWF.getText("mapredtexttype.dirisfile");
    } else if (isPathExist()) {
        FileSystem fs;
        try {
            fs = NameNodeVar.getFS();
            hCh.setPath(new Path(path).getParent());
            if (!hCh.isDirectory()) {
                error = LanguageManagerWF.getText("mapredtexttype.nodir",
                        new String[] { hCh.getPath().toString() });
            }

            FileStatus[] stat = null;
            if (error == null) {
                try {
                    stat = fs.listStatus(new Path(path), new PathFilter() {

                        @Override
                        public boolean accept(Path arg0) {
                            return !arg0.getName().startsWith("_") && !arg0.getName().startsWith(".");
                        }
                    });
                } catch (Exception e) {
                    stat = null;
                    error = LanguageManagerWF.getText("mapredtexttype.notmrdir", new Object[] { path });
                }
            }

            if (stat != null) {
                for (int i = 0; i < stat.length && error == null; ++i) {
                    if (stat[i].isDir()) {
                        error = LanguageManagerWF.getText("mapredtexttype.notmrdir", new Object[] { path });
                    } else {
                        if (fileExtension) {
                            if (shouldHaveExt != null && !shouldHaveExt.isEmpty()) {
                                boolean found = false;
                                for (String extCur : shouldHaveExt) {
                                    found |= stat[i].getPath().getName().endsWith(extCur);
                                }
                                if (!found) {
                                    error = LanguageManagerWF.getText(
                                            "mapredtexttype.shouldhaveextcompresssile",
                                            new Object[] { path, shouldHaveExt });

                                }
                            } else if (shouldNotHaveExt != null && !shouldNotHaveExt.isEmpty()) {
                                boolean found = false;
                                for (String extCur : shouldNotHaveExt) {
                                    found |= stat[i].getPath().getName().endsWith(extCur);
                                }
                                if (found) {
                                    error = LanguageManagerWF.getText(
                                            "mapredtexttype.shouldnothaveextcompresssile",
                                            new Object[] { path, shouldNotHaveExt });

                                }
                            }
                        }

                        try {
                            hdfsInt.select(stat[i].getPath().toString(), "", 1);
                        } catch (Exception e) {
                            error = LanguageManagerWF.getText("mapredtexttype.notmrdir");
                            logger.error(error, e);
                        }
                    }
                }
            }
        } catch (IOException e) {

            error = LanguageManagerWF.getText("unexpectedexception", new Object[] { e.getMessage() });

            logger.error(error, e);
        }

    }
    // hCh.close();
    return error;
}

From source file:com.redsqirl.workflow.server.datatype.MapRedDir.java

License:Open Source License

public List<String> selectLine(int maxToRead) throws RemoteException {
    List<String> ans = null;
    if (isPathExist()) {
        try {
            FileSystem fs = NameNodeVar.getFS();

            FileStatus[] stat = fs.listStatus(new Path(getPath()), new PathFilter() {

                @Override
                public boolean accept(Path arg0) {
                    return !arg0.getName().startsWith("_") && !arg0.getName().startsWith(".");
                }
            });

            if (stat != null && stat.length > 0) {
                ans = new ArrayList<String>(maxToRead);

                SortedSet<Map.Entry<FileStatus, Long>> filesSortedBySize = new TreeSet<Map.Entry<FileStatus, Long>>(
                        new Comparator<Map.Entry<FileStatus, Long>>() {
                            @Override
                            public int compare(Map.Entry<FileStatus, Long> e1, Map.Entry<FileStatus, Long> e2) {
                                return -e1.getValue().compareTo(e2.getValue());
                            }
                        });
                // Sort the files by size; at most NB_FILE_TO_READ_MAX of them are read below.
                for (int k = 0; k < stat.length; ++k) {
                    filesSortedBySize
                            .add(new AbstractMap.SimpleEntry<FileStatus, Long>(stat[k], stat[k].getLen()));
                }

                //Read the biggest files first
                Iterator<Map.Entry<FileStatus, Long>> fileIt = filesSortedBySize.iterator();
                int k = 0;
                while (fileIt.hasNext() && ans.size() < maxToRead && k < NB_FILE_TO_READ_MAX) {
                    Map.Entry<FileStatus, Long> cur = fileIt.next();
                    FileStatus file = cur.getKey();
                    logger.debug("Number of line already read: " + ans.size());
                    ans.addAll(hdfsInt.select(file.getPath().toString(), ",", maxToRead - ans.size()));
                    ++k;
                }

                logger.debug("Number of line read in " + getPath() + ": " + ans.size());
            }
        } catch (IOException e) {
            String error = "Unexpected error: " + e.getMessage();
            logger.error(error, e);
            ans = null;
        } catch (Exception e) {
            logger.error(e, e);
            ans = null;
        }
    }

    return ans;
}

From source file:com.redsqirl.workflow.server.OozieManager.java

License:Open Source License

/**
 * Clean the directory where the job details are stored.
 *
 * @param nameWf
 * @throws RemoteException
 */
public void cleanJobDirectory(final String nameWf) throws RemoteException {
    Path hdfsWfPath = new Path(WorkflowPrefManager.getHDFSPathJobs());
    FileSystem fs = null;
    int numberToKeep = WorkflowPrefManager.getNbOozieDirToKeep();
    try {
        fs = NameNodeVar.getFS();
        FileStatus[] children = fs.listStatus(hdfsWfPath, new PathFilter() {

            @Override
            public boolean accept(Path arg0) {
                return arg0.getName().startsWith(nameWf + "_");
            }
        });
        Arrays.sort(children, 0, children.length, new Comparator<FileStatus>() {

            @Override
            public int compare(FileStatus arg0, FileStatus arg1) {
                return (int) ((arg0.getModificationTime() - arg1.getModificationTime()) / 10000);
            }
        });
        for (int i = 0; i < children.length - numberToKeep; ++i) {
            fs.delete(children[i].getPath(), true);
        }
    } catch (Exception e1) {
        logger.error(e1);
    }
}

From source file:com.redsqirl.workflow.server.OozieManager.java

License:Open Source License

/**
 * Get a name for a directory in which to store all the job files and configuration.
 *
 * @param df
 * @return the name of a directory in which to store all the job files and configuration
 * @throws RemoteException
 */
protected String buildFileName(DataFlow df) throws RemoteException {
    final String nameWf = df.getName();
    if (nameWf == null) {
        logger.warn("The workflow to run has no name");
        df.setName(RandomString.getRandomName(8));
    }
    String ans = null;
    Path hdfsWfPath = new Path(WorkflowPrefManager.getHDFSPathJobs());
    FileSystem fs = null;
    int number = -1;
    try {
        fs = NameNodeVar.getFS();
        FileStatus[] children = fs.listStatus(hdfsWfPath, new PathFilter() {

            @Override
            public boolean accept(Path arg0) {
                if (arg0.getName().startsWith(nameWf)) {
                    try {
                        @SuppressWarnings("unused")
                        int i = Integer.valueOf(arg0.getName().substring(nameWf.length() + 1));
                        return true;
                    } catch (Exception e) {
                        // The suffix after the workflow name is not a number, so this is not one of our directories.
                    }
                }
                return false;
            }
        });

        if (children != null && children.length > 0) {
            for (FileStatus child : children) {
                number = Math.max(number,
                        Integer.valueOf(child.getPath().getName().substring(nameWf.length() + 1)));
            }
        }
    } catch (Exception e) {
        logger.error(e, e);
    }
    ans = nameWf + "_" + (number + 1);

    return ans;
}