Example usage for org.apache.hadoop.fs FileSystem listStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileSystem listStatus.

Prototype

public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException

Source Link

Document

Filter files/directories in the given list of paths using a user-supplied path filter.

Usage

From source file:de.rwth.i9.palm.analytics.algorithm.lda.CustomVectorDumper.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    /**/*w  w w.ja  va2  s.  co m*/
     * Option seqOpt =
     * obuilder.withLongName("seqFile").withRequired(false).withArgument(
     * abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).
     * withDescription(
     * "The Sequence File containing the Vectors").withShortName
     * ("s").create(); Option dirOpt =
     * obuilder.withLongName("seqDirectory").
     * withRequired(false).withArgument(
     * abuilder.withName("seqDirectory").withMinimum
     * (1).withMaximum(1).create()) .withDescription(
     * "The directory containing Sequence File of Vectors")
     * .withShortName("d").create();
     */
    addInputOption();
    addOutputOption();
    addOption("useKey", "u", "If the Key is a vector than dump that instead");
    addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
    addOption("dictionary", "d", "The dictionary file.", false);
    addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
    addOption("csv", "c",
            "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries");
    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
            + "(if the vector is one) printing out the name");
    addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
    addOption("sortVectors", "sort",
            "Sort output key/value pairs of the vector entries in abs magnitude " + "descending order");
    addOption("quiet", "q", "Print only file contents");
    addOption("sizeOnly", "sz", "Dump only the size of the vector");
    addOption("numItems", "ni", "Output at most <n> vecors", false);
    addOption("vectorSize", "vs",
            "Truncate vectors to <vs> length when dumping (most useful when in" + " conjunction with -sort",
            false);
    addOption(buildOption("filter", "fi",
            "Only dump out those vectors whose name matches the filter."
                    + "  Multiple items may be specified by repeating the argument.",
            true, 1, Integer.MAX_VALUE, false, null));

    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path input = getInputPath();
    FileStatus fileStatus = fs.getFileStatus(input);
    if (fileStatus.isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, new OutputFilesFilter()));
    } else {
        FileStatus[] inputPaths = fs.globStatus(input);
        pathArr = new Path[inputPaths.length];
        int i = 0;
        for (FileStatus fstatus : inputPaths) {
            pathArr[i++] = fstatus.getPath();
        }
    }

    String dictionaryType = getOption("dictionaryType", "text");

    boolean sortVectors = hasOption("sortVectors");
    boolean quiet = hasOption("quiet");
    if (!quiet) {
        log.info("Sort? {}", sortVectors);
    }

    String[] dictionary = null;
    if (hasOption("dictionary")) {
        String dictFile = getOption("dictionary");
        if ("text".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
        } else if ("sequencefile".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
        } else {
            // TODO: support Lucene's FST as a dictionary type
            throw new IOException("Invalid dictionary type: " + dictionaryType);
        }
    }

    Set<String> filters;
    if (hasOption("filter")) {
        filters = Sets.newHashSet(getOptions("filter"));
    } else {
        filters = null;
    }

    boolean useCSV = hasOption("csv");

    boolean sizeOnly = hasOption("sizeOnly");
    boolean nameOnly = hasOption("nameOnly");
    boolean namesAsComments = hasOption("namesAsComments");
    boolean transposeKeyValue = hasOption("vectorAsKey");
    Writer writer;
    boolean shouldClose;
    File output = getOutputFile();
    if (output != null) {
        shouldClose = true;
        log.info("Output file: {}", output);
        Files.createParentDirs(output);
        writer = Files.newWriter(output, Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        boolean printKey = hasOption("printKey");
        if (useCSV && dictionary != null) {
            writer.write("#");
            for (int j = 0; j < dictionary.length; j++) {
                writer.write(dictionary[j]);
                if (j < dictionary.length - 1) {
                    writer.write(',');
                }
            }
            writer.write('\n');
        }
        Long numItems = null;
        if (hasOption("numItems")) {
            numItems = Long.parseLong(getOption("numItems"));
            if (quiet) {
                writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
            }
        }
        int maxIndexesPerVector = hasOption("vectorSize") ? Integer.parseInt(getOption("vectorSize"))
                : Integer.MAX_VALUE;
        long itemCount = 0;
        int fileCount = 0;
        for (Path path : pathArr) {
            if (numItems != null && numItems <= itemCount) {
                break;
            }
            if (quiet) {
                log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
            }
            SequenceFileIterable<Writable, Writable> iterable = new SequenceFileIterable<Writable, Writable>(
                    path, true, conf);
            Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
            long i = 0;
            while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
                Pair<Writable, Writable> record = iterator.next();
                Writable keyWritable = record.getFirst();
                Writable valueWritable = record.getSecond();
                if (printKey) {
                    Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
                    writer.write(notTheVectorWritable.toString());
                    writer.write('\t');
                }
                Vector vector;
                try {
                    vector = ((VectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).get();
                } catch (ClassCastException e) {
                    if ((transposeKeyValue ? keyWritable
                            : valueWritable) instanceof WeightedPropertyVectorWritable) {
                        vector = ((WeightedPropertyVectorWritable) (transposeKeyValue ? keyWritable
                                : valueWritable)).getVector();
                    } else {
                        throw e;
                    }
                }
                if (filters != null && vector instanceof NamedVector
                        && !filters.contains(((NamedVector) vector).getName())) {
                    // we are filtering out this item, skip
                    continue;
                }
                if (sizeOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write(":");
                    } else {
                        writer.write(String.valueOf(i++));
                        writer.write(":");
                    }
                    writer.write(String.valueOf(vector.size()));
                    writer.write('\n');
                } else if (nameOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write('\n');
                    }
                } else {
                    String fmtStr;
                    if (useCSV) {
                        fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
                    } else {
                        fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
                                sortVectors);
                    }
                    writer.write(fmtStr);
                    writer.write('\n');
                }
                itemCount++;
            }
        }
        writer.flush();
    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }

    return 0;
}

From source file:dz.lab.hdfs.LsWithPathFilter.java

/**
 * @param args//from w  w w .  j a  va  2  s . c  om
 * @throws IOException 
 */
public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    Path path = new Path("/");
    // restrict result of listStatus() by supplying PathFilter    
    FileStatus[] files = fs.listStatus(path, new PathFilter() {

        @Override
        public boolean accept(Path path) {
            // do not show path whose name equals to user
            if (path.getName().equals("user")) {
                return false;
            }
            return true;
        }
    });

    for (FileStatus file : files) {
        System.out.println(file.getPath().getName());
    }
}

From source file:edu.ecnu.idse.TrajStore.util.FileUtil.java

License:Open Source License

/**
 * Get the actual size of all data in the given directory. If the input is
 * a single file, its size is returned immediately. If the input is a
 * directory, we returns the total size of all data in that directory.
 * If there is a global index, the size is retrieved from that global index.
 * Otherwise, we add up all the sizes of single files.
 * @param fs - the file system that contains the path
 * @param path - the path that contains the data
 * @return//  w ww.  j  a  v  a 2 s .  co m
 * @throws IOException 
 */
public static long getPathSize(FileSystem fs, Path path) throws IOException {
    FileStatus fileStatus = fs.getFileStatus(path);
    // 1- Check if the path points to a file
    if (!fileStatus.isDir())
        return fileStatus.getLen();
    // 2- Check if the input is indexed and get the cached size
    GlobalIndex<Partition> gIndex = SpatialTemporalSite.getGlobalIndex(fs, path);
    if (gIndex != null) {
        long totalSize = 0;
        for (Partition partition : gIndex)
            totalSize += partition.size;
        return totalSize;
    }
    // 3- Get the total size of all non-hidden files
    long totalSize = 0;
    FileStatus[] allFiles = fs.listStatus(path, SpatialTemporalSite.NonHiddenFileFilter);
    for (FileStatus subFile : allFiles) {
        if (!subFile.isDir())
            totalSize += subFile.getLen();
    }
    return totalSize;
}

From source file:edu.indiana.d2i.htrc.DataCopyJobTest.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();

    String outputPath = args[0]; // result

    HTRCDataAPIClient client = Utilities.creatDataAPIClient(conf);

    FileSystem fs = FileSystem.get(conf);

    FileStatus[] status = fs.listStatus(new Path(outputPath), Utilities.HIDDEN_FILE_FILTER);
    Text key = new Text();
    Text value = new Text();
    for (int i = 0; i < status.length; i++) {
        SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[i].getPath(), conf);
        while (seqReader.next(key, value)) {
            Iterable<Entry<String, String>> content = client.getID2Content(key.toString());
            Iterator<Entry<String, String>> iterator = content.iterator();
            Entry<String, String> entry = iterator.next();
            Assert.assertEquals(entry.getValue(), value.toString());
        }//from ww  w .  j av  a 2 s.c  o m
    }

    System.out.println("Finish validation.");

    //      FileStatus[] status = fs.listStatus(new Path(outputPath), Utilities.HIDDEN_FILE_FILTER);
    //      for (int i = 0; i < status.length; i++) {
    //         System.out.println(status[i].getPath().getName());
    //      }
    //      System.out.println("==========================================");
    //      FileStatus[] globStatus = fs.globStatus(new Path(outputPath));
    //      for (int i = 0; i < globStatus.length; i++) {
    //         System.out.println(globStatus[i].getPath().getName());
    //      }
    return 0;
}

From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java

License:Apache License

private static void clusterDataSeq(Configuration conf, Path input, Path clustersIn, Path output,
        DistanceMeasure measure) throws IOException {

    KMeansClusterer clusterer = new KMeansClusterer(measure);
    Collection<Cluster> clusters = Lists.newArrayList();
    MemKMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters);
    if (clusters.isEmpty()) {
        throw new IllegalStateException("Clusters is empty!");
    }/*from w w  w.  j a va 2 s .  c om*/
    FileSystem fs = FileSystem.get(input.toUri(), conf);
    FileStatus[] status = fs.listStatus(input, PathFilters.logsCRCFilter());
    int part = 0;
    for (FileStatus s : status) {
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(output, "part-m-" + part),
                IntWritable.class, WeightedVectorWritable.class);
        try {
            for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(s.getPath(), conf)) {
                clusterer.emitPointToNearestCluster(value.get(), clusters, writer);
            }
        } finally {
            Closeables.closeQuietly(writer);
        }
    }

}

From source file:edu.indiana.d2i.htrc.skmeans.StreamingKMeansDriver.java

License:Apache License

private void StreamingKMeansConfigHelper(Configuration conf, String input, int maxCluster) throws IOException {
    // get samples to calculate scale factor
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(input), Utilities.HIDDEN_FILE_FILTER);
    int index = 0 + (int) (Math.random() * (status.length));
    SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[index].getPath(), conf);

    int count = 0;
    Text key = new Text();
    VectorWritable value = new VectorWritable();
    List<MatrixSlice> slices = new ArrayList<MatrixSlice>();
    while (seqReader.next(key, value) && count < samplesNum) {
        MatrixSlice slice = new MatrixSlice(value.get().clone(), count);
        slices.add(slice);/*from w  w w .j  av a  2s.  c  om*/
        count++;
    }

    // set cutoff
    float cutoff = (float) StreamingKmeans.estimateCutoff(slices, samplesNum);
    conf.setFloat(StreamingKMeansConfigKeys.CUTOFF, cutoff);
    logger.info("Scale factor (cutoff) is: " + cutoff);

    // set vector dimension
    int dim = value.get().size();
    conf.setInt(StreamingKMeansConfigKeys.VECTOR_DIMENSION, dim);
    logger.info("Dimemsion of a vector is: " + dim);

    // set maximum #cluster
    conf.setInt(StreamingKMeansConfigKeys.MAXCLUSTER, maxCluster);

    // set distance measurement
    conf.set(StreamingKMeansConfigKeys.DIST_MEASUREMENT, EuclideanDistanceMeasure.class.getName());
}

From source file:edu.indiana.d2i.htrc.util.ClusterInspection.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    String input = args[0]; // cluster path
    String output = args[1];/*from w  w w. j av  a 2  s. co m*/

    int numVector = 0;

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(input), Utilities.HIDDEN_FILE_FILTER);
    Text key = new Text();
    LongWritable value = new LongWritable();
    BufferedWriter writer = new BufferedWriter(new FileWriter(output));
    for (int i = 0; i < status.length; i++) {
        SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[i].getPath(), conf);
        while (seqReader.next(key, value)) {
            numVector++;
            writer.write(key.toString() + "\n");
        }
    }
    writer.close();

    logger.info("#vector: " + numVector);

    return 0;
}

From source file:edu.indiana.d2i.htrc.util.DataCopyValidation.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    String outputPath = args[0]; // result
    String dataAPIConfClassName = args[1];
    int maxIdsPerReq = Integer.valueOf(args[2]);

    logger.info("DataValidation ");
    logger.info(" - output: " + outputPath);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    Utilities.setDataAPIConf(conf, dataAPIConfClassName, maxIdsPerReq);

    //      HTRCDataAPIClient client = Utilities.creatDataAPIClient(conf);
    String dataEPR = conf.get(HTRCConstants.HOSTS_SEPARATEDBY_COMMA,
            "https://129-79-49-119.dhcp-bl.indiana.edu:25443/data-api");
    String delimitor = conf.get(HTRCConstants.DATA_API_URL_DELIMITOR, "|");
    String clientID = conf.get(HTRCConstants.DATA_API_CLIENTID, "yim");
    String clientSecrete = conf.get(HTRCConstants.DATA_API_CLIENTSECRETE, "yim");
    String tokenLoc = conf.get(HTRCConstants.DATA_API_TOKENLOC,
            "https://129-79-49-119.dhcp-bl.indiana.edu:25443/oauth2/token?grant_type=client_credentials");
    boolean selfsigned = conf.getBoolean(HTRCConstants.DATA_API_SELFSIGNED, true);
    HTRCDataAPIClient client = new HTRCDataAPIClient.Builder(dataEPR, delimitor).authentication(true)
            .selfsigned(selfsigned).clientID(clientID).clientSecrete(clientSecrete).tokenLocation(tokenLoc)
            .build();/*from w  w w . ja  v  a  2 s  .c  om*/

    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(outputPath), Utilities.HIDDEN_FILE_FILTER);
    Text key = new Text();
    Text value = new Text();
    for (int i = 0; i < status.length; i++) {
        SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[i].getPath(), conf);
        while (seqReader.next(key, value)) {
            //            logger.info(key.toString());
            Iterable<Entry<String, String>> content = client.getID2Content(key.toString());
            Iterator<Entry<String, String>> iterator = content.iterator();
            Entry<String, String> entry = iterator.next();
            if (!entry.getValue().equals(value.toString())) {
                logger.error("Book : " + key.toString() + " corrupts!");
            }
        }
    }

    logger.info("Finish validation.");

    return 0;
}

From source file:edu.indiana.d2i.htrc.util.TokensInspection.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    String input = args[0]; // word count path
    String output = args[1];//w w w.j  av  a 2s  . c o  m

    int numVector = 0;

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(input), Utilities.HIDDEN_FILE_FILTER);
    Text key = new Text();
    LongWritable value = new LongWritable();
    BufferedWriter writer = new BufferedWriter(new FileWriter(output));
    for (int i = 0; i < status.length; i++) {
        SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[i].getPath(), conf);
        while (seqReader.next(key, value)) {
            numVector++;
            writer.write(key.toString() + "\n");
        }
    }
    writer.close();

    logger.info("#vector: " + numVector);

    return 0;
}

From source file:edu.indiana.d2i.htrc.util.VectorInspection.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    String input = args[0];//  ww w .j  a v a2s  .  c  o  m
    String output = args[1];

    int numVector = 0;
    Set<Integer> dimLst = new HashSet<Integer>();

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(input), Utilities.HIDDEN_FILE_FILTER);
    Text key = new Text();
    VectorWritable value = new VectorWritable();
    BufferedWriter writer = new BufferedWriter(new FileWriter(output));
    for (int i = 0; i < status.length; i++) {
        SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[i].getPath(), conf);
        while (seqReader.next(key, value)) {
            numVector++;
            dimLst.add(value.get().size());
            writer.write(value.toString() + "\n");
        }
    }

    logger.info("#vector: " + numVector);
    logger.info("number of different dimensions: " + dimLst.size());
    StringBuilder builder = new StringBuilder();
    for (Integer dim : dimLst)
        builder.append(dim + " ");
    logger.info("" + builder.toString());

    writer.close();

    return 0;
}