Example usage for org.apache.hadoop.fs FileSystem listStatus

List of usage examples for org.apache.hadoop.fs FileSystem listStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileSystem listStatus.

Prototype

public FileStatus[] listStatus(Path[] files) throws FileNotFoundException, IOException 

Document

Filter files/directories in the given list of paths using default path filter.
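
A minimal sketch of calling this overload (the Configuration, FileSystem, and directory paths below are illustrative assumptions, not part of the project examples that follow):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListStatusSketch {
    public static void main(String[] args) throws Exception {
        // Obtain the default FileSystem from the configuration (illustrative setup).
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // List the immediate children of two directories in a single call,
        // using the Path[] overload with the default path filter.
        Path[] dirs = new Path[] { new Path("/tmp/input"), new Path("/tmp/output") };
        FileStatus[] statuses = fs.listStatus(dirs);
        for (FileStatus status : statuses) {
            System.out.println(status.getPath());
        }
    }
}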

Usage

From source file:com.datatorrent.stram.util.FSUtil.java

License:Apache License

public static void setPermission(FileSystem fs, Path dst, FsPermission permission) throws IOException {
    FileStatus contents[] = fs.listStatus(dst);
    for (int i = 0; i < contents.length; i++) {
        fs.setPermission(contents[i].getPath(), permission);
    }
    fs.setPermission(dst, permission);
}

From source file:com.davidgildeh.hadoop.utils.FileUtils.java

License:Apache License

/**
 * Merges a list of input files in a directory to a single file under the 
 * outputpath with a specified filename
 * 
 * @param inputPath         The input directory containing all the input files. E.g. /input/dir/on/hdfs/
 * @param outputPath        The output path to output the file. E.g. /output/dir/on/hdfs/filename
 * @throws IOException
 */
public static void mergeFiles(String inputPath, String outputPath) throws IOException {

    Path inputDir = new Path(inputPath);
    Path outputFile = new Path(outputPath);
    FileSystem fileSystem = getFileSystem(outputFile);
    checkFileExists(fileSystem, inputDir);

    // Check the input path is a directory
    if (!fileSystem.getFileStatus(inputDir).isDir()) {
        LOG.error("Path '" + inputDir.toString() + "' is not a directory.");
        throw new IOException("Path '" + inputDir.toString() + "' is not a directory.");
    }

    // Create Output File
    OutputStream out = fileSystem.create(outputFile);

    try {

        FileStatus contents[] = fileSystem.listStatus(inputDir);

        // Loop through all files in directory and merge them into one file
        for (int i = 0; i < contents.length; i++) {

            if (!contents[i].isDir()) {

                InputStream in = fileSystem.open(contents[i].getPath());
                try {
                    IOUtils.copyBytes(in, out, fileSystem.getConf(), false);
                } finally {
                    in.close();
                }
            }
        }

    } finally {
        out.close();
        fileSystem.close();
        LOG.info("Merged input files from '" + inputPath + "' to '" + outputPath + "'");
    }
}
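
A brief usage sketch of the helper above, reusing the example paths from its Javadoc (the file name "merged.txt" is an illustrative assumption):

// Merge every file in the input directory into a single HDFS file (illustrative call).
FileUtils.mergeFiles("/input/dir/on/hdfs/", "/output/dir/on/hdfs/merged.txt");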

From source file:com.digitalpebble.behemoth.gate.GATECorpusGenerator.java

License:Apache License

private void generateXMLdocs(String inputf, String outputf) throws IOException {
    Path input = new Path(inputf);

    File output = new File(outputf);
    if (output.exists() && output.isFile()) {
        System.err.println("Output " + outputf + " already exists");
        return;
    }
    if (output.exists() == false)
        output.mkdirs();

    FileSystem fs = input.getFileSystem(getConf());
    FileStatus[] statuses = fs.listStatus(input);
    int count[] = { 0 };
    for (int i = 0; i < statuses.length; i++) {
        FileStatus status = statuses[i];
        Path suPath = status.getPath();
        if (suPath.getName().equals("_SUCCESS"))
            continue;
        generateXMLdocs(suPath, output, count);
    }
}

From source file:com.digitalpebble.behemoth.mahout.util.Mahout2LibSVM.java

License:Apache License

public int run(String[] args) throws Exception {

    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("v", "vector", true, "input vector sequencefile");
    options.addOption("l", "label", true, "input vector sequencefile");
    options.addOption("o", "output", true, "output Behemoth corpus");

    // parse the command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
        if (line.hasOption("help")) {
            formatter.printHelp("CorpusGenerator", options);
            return 0;
        }
        if (!line.hasOption("v") | !line.hasOption("o") | !line.hasOption("l")) {
            formatter.printHelp("CorpusGenerator", options);
            return -1;
        }
    } catch (ParseException e) {
        formatter.printHelp("CorpusGenerator", options);
    }

    Path vectorPath = new Path(line.getOptionValue("v"));
    Path labelPath = new Path(line.getOptionValue("l"));
    String output = line.getOptionValue("o");

    Path tempOutput = new Path(vectorPath.getParent(), "temp-" + System.currentTimeMillis());

    // extracts the string representations from the vectors
    int retVal = vectorToString(vectorPath, tempOutput);
    if (retVal != 0) {
        HadoopUtil.delete(getConf(), tempOutput);
        return retVal;
    }

    Path tempOutput2 = new Path(vectorPath.getParent(), "temp-" + System.currentTimeMillis());

    retVal = convert(tempOutput, labelPath, tempOutput2);

    // delete the temp output
    HadoopUtil.delete(getConf(), tempOutput);

    if (retVal != 0) {
        HadoopUtil.delete(getConf(), tempOutput2);
        return retVal;
    }

    // convert tempOutput to standard file
    BufferedWriter bow = new BufferedWriter(new FileWriter(new File(output)));

    // the label dictionary is not dumped to text
    int labelMaxIndex = 0;
    Map<String, Integer> labelIndex = new HashMap<String, Integer>();

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] fss = fs.listStatus(tempOutput2);
    try {
        for (FileStatus status : fss) {
            Path path = status.getPath();
            // skips the _log or _SUCCESS files
            if (!path.getName().startsWith("part-") && !path.getName().equals(tempOutput2.getName()))
                continue;
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
            // read the key + values in that file
            Text key = new Text();
            Text value = new Text();
            while (reader.next(key, value)) {
                String label = key.toString();
                // replace the label by its index
                Integer indexLabel = labelIndex.get(label);
                if (indexLabel == null) {
                    indexLabel = new Integer(labelMaxIndex);
                    labelIndex.put(label, indexLabel);
                    labelMaxIndex++;
                }
                String val = value.toString();
                bow.append(indexLabel.toString()).append(val).append("\n");
            }
            reader.close();
        }
        bow.flush();
    } catch (Exception e) {
        e.printStackTrace();
        return -1;
    } finally {
        bow.close();
        fs.delete(tempOutput2, true);
    }
    return 0;
}

From source file:com.digitalpebble.behemoth.util.ContentExtractor.java

License:Apache License

private int generateDocs(String inputf, String outputf) throws IOException, ArchiveException {

    Path input = new Path(inputf);
    Path dirPath = new Path(outputf);

    FileSystem fsout = FileSystem.get(dirPath.toUri(), getConf());

    if (fsout.exists(dirPath) == false)
        fsout.mkdirs(dirPath);
    else {
        System.err.println("Output " + outputf + " already exists");
        return -1;
    }

    // index file
    Path indexPath = new Path(dirPath, "index");
    if (fsout.exists(indexPath) == false) {
        fsout.createNewFile(indexPath);
    }

    maxNumEntriesInArchive = getConf().getInt(numEntriesPerArchiveParamName, 10000);

    index = fsout.create(indexPath);

    createArchive(dirPath);

    FileSystem fs = input.getFileSystem(getConf());
    FileStatus[] statuses = fs.listStatus(input);
    int count[] = { 0 };
    for (int i = 0; i < statuses.length; i++) {
        FileStatus status = statuses[i];
        Path suPath = status.getPath();
        if (suPath.getName().equals("_SUCCESS"))
            continue;
        generateDocs(suPath, dirPath, count);
    }

    if (index != null)
        index.close();

    if (currentArchive != null) {
        currentArchive.finish();
        currentArchive.close();
    }

    return 0;
}

From source file:com.digitalpebble.behemoth.util.CorpusReader.java

License:Apache License

public int run(String[] args) throws Exception {

    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("c", "displayContent", false, "display binary content in output");
    options.addOption("t", "displayText", false, "display text in output");
    options.addOption("a", "displayAnnotations", false, "display annotations in output");
    options.addOption("m", "displayMetadata", false, "display metadata in output");

    // parse the command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
        String input = line.getOptionValue("i");
        if (line.hasOption("help")) {
            formatter.printHelp("CorpusReader", options);
            return 0;
        }
        if (input == null) {
            formatter.printHelp("CorpusReader", options);
            return -1;
        }
    } catch (ParseException e) {
        formatter.printHelp("CorpusReader", options);
        return -1;
    }

    boolean showBinaryContent = line.hasOption("displayContent");
    boolean showText = line.hasOption("displayText");
    boolean showAnnotations = line.hasOption("displayAnnotations");
    boolean showMD = line.hasOption("displayMetadata");

    Path inputPath = new Path(line.getOptionValue("i"));

    Configuration conf = getConf();
    FileSystem fs = inputPath.getFileSystem(conf);

    // filter input
    DocumentFilter filters = DocumentFilter.getFilters(conf);
    boolean doFilter = DocumentFilter.isRequired(conf);

    FileStatus[] fss = fs.listStatus(inputPath);
    for (FileStatus status : fss) {
        Path path = status.getPath();
        // skips the _log or _SUCCESS files
        if (!path.getName().startsWith("part-") && !path.getName().equals(inputPath.getName()))
            continue;
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        Text key = new Text();
        BehemothDocument value = new BehemothDocument();
        while (reader.next(key, value)) {
            // skip this document?
            if (doFilter && filters.keep(value) == false)
                continue;

            System.out.println(value.toString(showBinaryContent, showAnnotations, showText, showMD));
        }
        reader.close();
    }

    return 0;
}

From source file:com.facebook.hiveio.common.FileSystems.java

License:Apache License

/**
 * Move a file or directory from source to destination, recursively copying
 * subdirectories.
 *
 * @param fs FileSystem
 * @param file path to copy (file or directory)
 * @param src path to source directory
 * @param dest path to destination directory
 * @throws IOException I/O problems
 */
public static void move(FileSystem fs, Path file, Path src, Path dest) throws IOException {
    Path destFilePath = pathInDestination(file, src, dest);
    if (fs.isFile(file)) {
        if (fs.exists(destFilePath)) {
            if (!fs.delete(destFilePath, true)) {
                throw new IllegalArgumentException("Could not remove existing file " + destFilePath);
            }
        }
        if (!fs.rename(file, destFilePath)) {
            throw new IllegalArgumentException("Could not move " + file + " to " + destFilePath);
        }
    } else if (fs.getFileStatus(file).isDir()) {
        FileStatus[] statuses = fs.listStatus(file);
        fs.mkdirs(destFilePath);
        if (statuses != null) {
            for (FileStatus status : statuses) {
                move(fs, status.getPath(), src, dest);
            }
        }
    }
}
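
An illustrative call of the helper above (the FileSystem setup and paths are assumptions; each entry under /data/src is moved to the matching location under /data/dest):

// Recursively move the contents of one directory tree into another (illustrative paths).
FileSystem fs = FileSystem.get(new Configuration());
FileSystems.move(fs, new Path("/data/src"), new Path("/data/src"), new Path("/data/dest"));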

From source file:com.facebook.presto.hive.AbstractTestHiveClient.java

License:Apache License

protected Set<String> listAllDataFiles(Path path) throws IOException {
    Set<String> result = new HashSet<>();
    FileSystem fileSystem = hdfsEnvironment.getFileSystem("user", path);
    if (fileSystem.exists(path)) {
        for (FileStatus fileStatus : fileSystem.listStatus(path)) {
            if (HadoopFileStatus.isFile(fileStatus)) {
                result.add(fileStatus.getPath().toString());
            } else if (HadoopFileStatus.isDirectory(fileStatus)) {
                result.addAll(listAllDataFiles(fileStatus.getPath()));
            }
        }
    }
    return result;
}

From source file:com.facebook.presto.hive.AbstractTestHiveClient.java

License:Apache License

private List<String> listDirectory(String user, Path path) throws IOException {
    FileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path);
    ImmutableList.Builder<String> result = ImmutableList.builder();
    for (FileStatus fileStatus : fileSystem.listStatus(path)) {
        result.add(fileStatus.getPath().getName());
    }
    return result.build();
}

From source file:com.facebook.presto.hive.HiveSplitSourceProvider.java

License:Apache License

private static FileStatus[] listStatus(FileSystem fs, Path path) {
    try {
        return fs.listStatus(path);
    } catch (IOException e) {
        throw Throwables.propagate(e);
    }
}