Example usage for org.apache.hadoop.fs FileSystem listStatus

Introduction

On this page you can find usage examples for org.apache.hadoop.fs FileSystem listStatus.

Prototype

public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException 

Document

Filter files/directories in the given list of paths using a user-supplied path filter.
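
The following is a minimal, self-contained sketch of calling listStatus with a user-supplied PathFilter. It is not taken from any of the source files below; the paths ("/tmp", "/user") and the hidden-file filter are illustrative placeholders only.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListStatusWithFilter {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Skip Hadoop "hidden" outputs such as _SUCCESS, _logs and dot-files.
        PathFilter hiddenFileFilter = new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith("_") && !name.startsWith(".");
            }
        };

        // Single-path overload: filter the children of one directory.
        FileStatus[] children = fs.listStatus(new Path("/tmp"), hiddenFileFilter);
        for (FileStatus status : children) {
            System.out.println(status.getPath() + "\t" + status.getLen());
        }

        // Path[] overload (the prototype above): filter several paths in one call.
        Path[] inputs = { new Path("/tmp"), new Path("/user") };
        FileStatus[] combined = fs.listStatus(inputs, hiddenFileFilter);
        System.out.println("Entries across both inputs: " + combined.length);
    }
}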

Usage

From source file:de.rwth.i9.palm.analytics.algorithm.lda.CustomVectorDumper.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    /**
     * Option seqOpt =
     * obuilder.withLongName("seqFile").withRequired(false).withArgument(
     * abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).
     * withDescription(
     * "The Sequence File containing the Vectors").withShortName
     * ("s").create(); Option dirOpt =
     * obuilder.withLongName("seqDirectory").
     * withRequired(false).withArgument(
     * abuilder.withName("seqDirectory").withMinimum
     * (1).withMaximum(1).create()) .withDescription(
     * "The directory containing Sequence File of Vectors")
     * .withShortName("d").create();
     */
    addInputOption();
    addOutputOption();
    addOption("useKey", "u", "If the Key is a vector than dump that instead");
    addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
    addOption("dictionary", "d", "The dictionary file.", false);
    addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
    addOption("csv", "c",
            "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries");
    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
            + "(if the vector is one) printing out the name");
    addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
    addOption("sortVectors", "sort",
            "Sort output key/value pairs of the vector entries in abs magnitude " + "descending order");
    addOption("quiet", "q", "Print only file contents");
    addOption("sizeOnly", "sz", "Dump only the size of the vector");
    addOption("numItems", "ni", "Output at most <n> vecors", false);
    addOption("vectorSize", "vs",
            "Truncate vectors to <vs> length when dumping (most useful when in" + " conjunction with -sort",
            false);
    addOption(buildOption("filter", "fi",
            "Only dump out those vectors whose name matches the filter."
                    + "  Multiple items may be specified by repeating the argument.",
            true, 1, Integer.MAX_VALUE, false, null));

    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path input = getInputPath();
    FileStatus fileStatus = fs.getFileStatus(input);
    if (fileStatus.isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, new OutputFilesFilter()));
    } else {
        FileStatus[] inputPaths = fs.globStatus(input);
        pathArr = new Path[inputPaths.length];
        int i = 0;
        for (FileStatus fstatus : inputPaths) {
            pathArr[i++] = fstatus.getPath();
        }
    }

    String dictionaryType = getOption("dictionaryType", "text");

    boolean sortVectors = hasOption("sortVectors");
    boolean quiet = hasOption("quiet");
    if (!quiet) {
        log.info("Sort? {}", sortVectors);
    }

    String[] dictionary = null;
    if (hasOption("dictionary")) {
        String dictFile = getOption("dictionary");
        if ("text".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
        } else if ("sequencefile".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
        } else {
            // TODO: support Lucene's FST as a dictionary type
            throw new IOException("Invalid dictionary type: " + dictionaryType);
        }
    }

    Set<String> filters;
    if (hasOption("filter")) {
        filters = Sets.newHashSet(getOptions("filter"));
    } else {
        filters = null;
    }

    boolean useCSV = hasOption("csv");

    boolean sizeOnly = hasOption("sizeOnly");
    boolean nameOnly = hasOption("nameOnly");
    boolean namesAsComments = hasOption("namesAsComments");
    boolean transposeKeyValue = hasOption("vectorAsKey");
    Writer writer;
    boolean shouldClose;
    File output = getOutputFile();
    if (output != null) {
        shouldClose = true;
        log.info("Output file: {}", output);
        Files.createParentDirs(output);
        writer = Files.newWriter(output, Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        boolean printKey = hasOption("printKey");
        if (useCSV && dictionary != null) {
            writer.write("#");
            for (int j = 0; j < dictionary.length; j++) {
                writer.write(dictionary[j]);
                if (j < dictionary.length - 1) {
                    writer.write(',');
                }
            }
            writer.write('\n');
        }
        Long numItems = null;
        if (hasOption("numItems")) {
            numItems = Long.parseLong(getOption("numItems"));
            if (!quiet) {
                writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
            }
        }
        int maxIndexesPerVector = hasOption("vectorSize") ? Integer.parseInt(getOption("vectorSize"))
                : Integer.MAX_VALUE;
        long itemCount = 0;
        int fileCount = 0;
        for (Path path : pathArr) {
            if (numItems != null && numItems <= itemCount) {
                break;
            }
            if (!quiet) {
                log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
            }
            SequenceFileIterable<Writable, Writable> iterable = new SequenceFileIterable<Writable, Writable>(
                    path, true, conf);
            Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
            long i = 0;
            while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
                Pair<Writable, Writable> record = iterator.next();
                Writable keyWritable = record.getFirst();
                Writable valueWritable = record.getSecond();
                if (printKey) {
                    Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
                    writer.write(notTheVectorWritable.toString());
                    writer.write('\t');
                }
                Vector vector;
                try {
                    vector = ((VectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).get();
                } catch (ClassCastException e) {
                    if ((transposeKeyValue ? keyWritable
                            : valueWritable) instanceof WeightedPropertyVectorWritable) {
                        vector = ((WeightedPropertyVectorWritable) (transposeKeyValue ? keyWritable
                                : valueWritable)).getVector();
                    } else {
                        throw e;
                    }
                }
                if (filters != null && vector instanceof NamedVector
                        && !filters.contains(((NamedVector) vector).getName())) {
                    // we are filtering out this item, skip
                    continue;
                }
                if (sizeOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write(":");
                    } else {
                        writer.write(String.valueOf(i++));
                        writer.write(":");
                    }
                    writer.write(String.valueOf(vector.size()));
                    writer.write('\n');
                } else if (nameOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write('\n');
                    }
                } else {
                    String fmtStr;
                    if (useCSV) {
                        fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
                    } else {
                        fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
                                sortVectors);
                    }
                    writer.write(fmtStr);
                    writer.write('\n');
                }
                itemCount++;
            }
        }
        writer.flush();
    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }

    return 0;
}

From source file:dz.lab.hdfs.LsWithPathFilter.java

/**
 * @param args
 * @throws IOException 
 */
public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    Path path = new Path("/");
    // restrict result of listStatus() by supplying PathFilter    
    FileStatus[] files = fs.listStatus(path, new PathFilter() {

        @Override
        public boolean accept(Path path) {
            // do not list the path whose name equals "user"
            if (path.getName().equals("user")) {
                return false;
            }
            return true;
        }
    });

    for (FileStatus file : files) {
        System.out.println(file.getPath().getName());
    }
}

From source file:edu.ecnu.idse.TrajStore.util.FileUtil.java

License:Open Source License

/**
 * Get the actual size of all data under the given path. If the input is
 * a single file, its size is returned immediately. If the input is a
 * directory, the total size of all data in that directory is returned.
 * If there is a global index, the size is retrieved from that global index.
 * Otherwise, the sizes of the individual files are added up.
 * @param fs - the file system that contains the path
 * @param path - the path that contains the data
 * @return the total size, in bytes, of the data under the given path
 * @throws IOException 
 */
public static long getPathSize(FileSystem fs, Path path) throws IOException {
    FileStatus fileStatus = fs.getFileStatus(path);
    // 1- Check if the path points to a file
    if (!fileStatus.isDir())
        return fileStatus.getLen();
    // 2- Check if the input is indexed and get the cached size
    GlobalIndex<Partition> gIndex = SpatialTemporalSite.getGlobalIndex(fs, path);
    if (gIndex != null) {
        long totalSize = 0;
        for (Partition partition : gIndex)
            totalSize += partition.size;
        return totalSize;
    }
    // 3- Get the total size of all non-hidden files
    long totalSize = 0;
    FileStatus[] allFiles = fs.listStatus(path, SpatialTemporalSite.NonHiddenFileFilter);
    for (FileStatus subFile : allFiles) {
        if (!subFile.isDir())
            totalSize += subFile.getLen();
    }
    return totalSize;
}

From source file:edu.indiana.d2i.htrc.DataCopyJobTest.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();

    String outputPath = args[0]; // result

    HTRCDataAPIClient client = Utilities.creatDataAPIClient(conf);

    FileSystem fs = FileSystem.get(conf);

    FileStatus[] status = fs.listStatus(new Path(outputPath), Utilities.HIDDEN_FILE_FILTER);
    Text key = new Text();
    Text value = new Text();
    for (int i = 0; i < status.length; i++) {
        SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[i].getPath(), conf);
        while (seqReader.next(key, value)) {
            Iterable<Entry<String, String>> content = client.getID2Content(key.toString());
            Iterator<Entry<String, String>> iterator = content.iterator();
            Entry<String, String> entry = iterator.next();
            Assert.assertEquals(entry.getValue(), value.toString());
        }
    }

    System.out.println("Finish validation.");

    //      FileStatus[] status = fs.listStatus(new Path(outputPath), Utilities.HIDDEN_FILE_FILTER);
    //      for (int i = 0; i < status.length; i++) {
    //         System.out.println(status[i].getPath().getName());
    //      }
    //      System.out.println("==========================================");
    //      FileStatus[] globStatus = fs.globStatus(new Path(outputPath));
    //      for (int i = 0; i < globStatus.length; i++) {
    //         System.out.println(globStatus[i].getPath().getName());
    //      }
    return 0;
}

From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java

License:Apache License

private static void clusterDataSeq(Configuration conf, Path input, Path clustersIn, Path output,
        DistanceMeasure measure) throws IOException {

    KMeansClusterer clusterer = new KMeansClusterer(measure);
    Collection<Cluster> clusters = Lists.newArrayList();
    MemKMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters);
    if (clusters.isEmpty()) {
        throw new IllegalStateException("Clusters is empty!");
    }
    FileSystem fs = FileSystem.get(input.toUri(), conf);
    FileStatus[] status = fs.listStatus(input, PathFilters.logsCRCFilter());
    int part = 0;
    for (FileStatus s : status) {
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(output, "part-m-" + part),
                IntWritable.class, WeightedVectorWritable.class);
        try {
            for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(s.getPath(), conf)) {
                clusterer.emitPointToNearestCluster(value.get(), clusters, writer);
            }
        } finally {
            Closeables.closeQuietly(writer);
        }
    }

}

From source file:edu.indiana.d2i.htrc.skmeans.StreamingKMeansDriver.java

License:Apache License

private void StreamingKMeansConfigHelper(Configuration conf, String input, int maxCluster) throws IOException {
    // get samples to calculate scale factor
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(input), Utilities.HIDDEN_FILE_FILTER);
    int index = 0 + (int) (Math.random() * (status.length));
    SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[index].getPath(), conf);

    int count = 0;
    Text key = new Text();
    VectorWritable value = new VectorWritable();
    List<MatrixSlice> slices = new ArrayList<MatrixSlice>();
    while (seqReader.next(key, value) && count < samplesNum) {
        MatrixSlice slice = new MatrixSlice(value.get().clone(), count);
        slices.add(slice);
        count++;
    }

    // set cutoff
    float cutoff = (float) StreamingKmeans.estimateCutoff(slices, samplesNum);
    conf.setFloat(StreamingKMeansConfigKeys.CUTOFF, cutoff);
    logger.info("Scale factor (cutoff) is: " + cutoff);

    // set vector dimension
    int dim = value.get().size();
    conf.setInt(StreamingKMeansConfigKeys.VECTOR_DIMENSION, dim);
    logger.info("Dimemsion of a vector is: " + dim);

    // set maximum #cluster
    conf.setInt(StreamingKMeansConfigKeys.MAXCLUSTER, maxCluster);

    // set distance measurement
    conf.set(StreamingKMeansConfigKeys.DIST_MEASUREMENT, EuclideanDistanceMeasure.class.getName());
}

From source file:edu.indiana.d2i.htrc.util.ClusterInspection.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    String input = args[0]; // cluster path
    String output = args[1];

    int numVector = 0;

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(input), Utilities.HIDDEN_FILE_FILTER);
    Text key = new Text();
    LongWritable value = new LongWritable();
    BufferedWriter writer = new BufferedWriter(new FileWriter(output));
    for (int i = 0; i < status.length; i++) {
        SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[i].getPath(), conf);
        while (seqReader.next(key, value)) {
            numVector++;
            writer.write(key.toString() + "\n");
        }
    }
    writer.close();

    logger.info("#vector: " + numVector);

    return 0;
}

From source file:edu.indiana.d2i.htrc.util.DataCopyValidation.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    String outputPath = args[0]; // result
    String dataAPIConfClassName = args[1];
    int maxIdsPerReq = Integer.valueOf(args[2]);

    logger.info("DataValidation ");
    logger.info(" - output: " + outputPath);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    Utilities.setDataAPIConf(conf, dataAPIConfClassName, maxIdsPerReq);

    //      HTRCDataAPIClient client = Utilities.creatDataAPIClient(conf);
    String dataEPR = conf.get(HTRCConstants.HOSTS_SEPARATEDBY_COMMA,
            "https://129-79-49-119.dhcp-bl.indiana.edu:25443/data-api");
    String delimitor = conf.get(HTRCConstants.DATA_API_URL_DELIMITOR, "|");
    String clientID = conf.get(HTRCConstants.DATA_API_CLIENTID, "yim");
    String clientSecrete = conf.get(HTRCConstants.DATA_API_CLIENTSECRETE, "yim");
    String tokenLoc = conf.get(HTRCConstants.DATA_API_TOKENLOC,
            "https://129-79-49-119.dhcp-bl.indiana.edu:25443/oauth2/token?grant_type=client_credentials");
    boolean selfsigned = conf.getBoolean(HTRCConstants.DATA_API_SELFSIGNED, true);
    HTRCDataAPIClient client = new HTRCDataAPIClient.Builder(dataEPR, delimitor).authentication(true)
            .selfsigned(selfsigned).clientID(clientID).clientSecrete(clientSecrete).tokenLocation(tokenLoc)
            .build();

    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(outputPath), Utilities.HIDDEN_FILE_FILTER);
    Text key = new Text();
    Text value = new Text();
    for (int i = 0; i < status.length; i++) {
        SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[i].getPath(), conf);
        while (seqReader.next(key, value)) {
            //            logger.info(key.toString());
            Iterable<Entry<String, String>> content = client.getID2Content(key.toString());
            Iterator<Entry<String, String>> iterator = content.iterator();
            Entry<String, String> entry = iterator.next();
            if (!entry.getValue().equals(value.toString())) {
                logger.error("Book : " + key.toString() + " corrupts!");
            }
        }
    }

    logger.info("Finish validation.");

    return 0;
}

From source file:edu.indiana.d2i.htrc.util.TokensInspection.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    String input = args[0]; // word count path
    String output = args[1];

    int numVector = 0;

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(input), Utilities.HIDDEN_FILE_FILTER);
    Text key = new Text();
    LongWritable value = new LongWritable();
    BufferedWriter writer = new BufferedWriter(new FileWriter(output));
    for (int i = 0; i < status.length; i++) {
        SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[i].getPath(), conf);
        while (seqReader.next(key, value)) {
            numVector++;
            writer.write(key.toString() + "\n");
        }
    }
    writer.close();

    logger.info("#vector: " + numVector);

    return 0;
}

From source file:edu.indiana.d2i.htrc.util.VectorInspection.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    String input = args[0];
    String output = args[1];

    int numVector = 0;
    Set<Integer> dimLst = new HashSet<Integer>();

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(input), Utilities.HIDDEN_FILE_FILTER);
    Text key = new Text();
    VectorWritable value = new VectorWritable();
    BufferedWriter writer = new BufferedWriter(new FileWriter(output));
    for (int i = 0; i < status.length; i++) {
        SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[i].getPath(), conf);
        while (seqReader.next(key, value)) {
            numVector++;
            dimLst.add(value.get().size());
            writer.write(value.toString() + "\n");
        }
    }

    logger.info("#vector: " + numVector);
    logger.info("number of different dimensions: " + dimLst.size());
    StringBuilder builder = new StringBuilder();
    for (Integer dim : dimLst)
        builder.append(dim + " ");
    logger.info("" + builder.toString());

    writer.close();

    return 0;
}