List of usage examples for org.apache.hadoop.fs FileSystem listStatus
public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException
From source file:de.rwth.i9.palm.analytics.algorithm.lda.CustomVectorDumper.java
License:Apache License
@Override public int run(String[] args) throws Exception { /**/*w w w.ja va2 s. co m*/ * Option seqOpt = * obuilder.withLongName("seqFile").withRequired(false).withArgument( * abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()). * withDescription( * "The Sequence File containing the Vectors").withShortName * ("s").create(); Option dirOpt = * obuilder.withLongName("seqDirectory"). * withRequired(false).withArgument( * abuilder.withName("seqDirectory").withMinimum * (1).withMaximum(1).create()) .withDescription( * "The directory containing Sequence File of Vectors") * .withShortName("d").create(); */ addInputOption(); addOutputOption(); addOption("useKey", "u", "If the Key is a vector than dump that instead"); addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true"); addOption("dictionary", "d", "The dictionary file.", false); addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false); addOption("csv", "c", "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries"); addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector " + "(if the vector is one) printing out the name"); addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)"); addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude " + "descending order"); addOption("quiet", "q", "Print only file contents"); addOption("sizeOnly", "sz", "Dump only the size of the vector"); addOption("numItems", "ni", "Output at most <n> vecors", false); addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in" + " conjunction with -sort", false); addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter." 
+ " Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false, null)); if (parseArguments(args, false, true) == null) { return -1; } Path[] pathArr; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); Path input = getInputPath(); FileStatus fileStatus = fs.getFileStatus(input); if (fileStatus.isDir()) { pathArr = FileUtil.stat2Paths(fs.listStatus(input, new OutputFilesFilter())); } else { FileStatus[] inputPaths = fs.globStatus(input); pathArr = new Path[inputPaths.length]; int i = 0; for (FileStatus fstatus : inputPaths) { pathArr[i++] = fstatus.getPath(); } } String dictionaryType = getOption("dictionaryType", "text"); boolean sortVectors = hasOption("sortVectors"); boolean quiet = hasOption("quiet"); if (!quiet) { log.info("Sort? {}", sortVectors); } String[] dictionary = null; if (hasOption("dictionary")) { String dictFile = getOption("dictionary"); if ("text".equals(dictionaryType)) { dictionary = VectorHelper.loadTermDictionary(new File(dictFile)); } else if ("sequencefile".equals(dictionaryType)) { dictionary = VectorHelper.loadTermDictionary(conf, dictFile); } else { // TODO: support Lucene's FST as a dictionary type throw new IOException("Invalid dictionary type: " + dictionaryType); } } Set<String> filters; if (hasOption("filter")) { filters = Sets.newHashSet(getOptions("filter")); } else { filters = null; } boolean useCSV = hasOption("csv"); boolean sizeOnly = hasOption("sizeOnly"); boolean nameOnly = hasOption("nameOnly"); boolean namesAsComments = hasOption("namesAsComments"); boolean transposeKeyValue = hasOption("vectorAsKey"); Writer writer; boolean shouldClose; File output = getOutputFile(); if (output != null) { shouldClose = true; log.info("Output file: {}", output); Files.createParentDirs(output); writer = Files.newWriter(output, Charsets.UTF_8); } else { shouldClose = false; writer = new OutputStreamWriter(System.out, Charsets.UTF_8); } try { boolean printKey = 
hasOption("printKey"); if (useCSV && dictionary != null) { writer.write("#"); for (int j = 0; j < dictionary.length; j++) { writer.write(dictionary[j]); if (j < dictionary.length - 1) { writer.write(','); } } writer.write('\n'); } Long numItems = null; if (hasOption("numItems")) { numItems = Long.parseLong(getOption("numItems")); if (quiet) { writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n'); } } int maxIndexesPerVector = hasOption("vectorSize") ? Integer.parseInt(getOption("vectorSize")) : Integer.MAX_VALUE; long itemCount = 0; int fileCount = 0; for (Path path : pathArr) { if (numItems != null && numItems <= itemCount) { break; } if (quiet) { log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length); } SequenceFileIterable<Writable, Writable> iterable = new SequenceFileIterable<Writable, Writable>( path, true, conf); Iterator<Pair<Writable, Writable>> iterator = iterable.iterator(); long i = 0; while (iterator.hasNext() && (numItems == null || itemCount < numItems)) { Pair<Writable, Writable> record = iterator.next(); Writable keyWritable = record.getFirst(); Writable valueWritable = record.getSecond(); if (printKey) { Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable; writer.write(notTheVectorWritable.toString()); writer.write('\t'); } Vector vector; try { vector = ((VectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).get(); } catch (ClassCastException e) { if ((transposeKeyValue ? keyWritable : valueWritable) instanceof WeightedPropertyVectorWritable) { vector = ((WeightedPropertyVectorWritable) (transposeKeyValue ? 
keyWritable : valueWritable)).getVector(); } else { throw e; } } if (filters != null && vector instanceof NamedVector && !filters.contains(((NamedVector) vector).getName())) { // we are filtering out this item, skip continue; } if (sizeOnly) { if (vector instanceof NamedVector) { writer.write(((NamedVector) vector).getName()); writer.write(":"); } else { writer.write(String.valueOf(i++)); writer.write(":"); } writer.write(String.valueOf(vector.size())); writer.write('\n'); } else if (nameOnly) { if (vector instanceof NamedVector) { writer.write(((NamedVector) vector).getName()); writer.write('\n'); } } else { String fmtStr; if (useCSV) { fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments); } else { fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector, sortVectors); } writer.write(fmtStr); writer.write('\n'); } itemCount++; } } writer.flush(); } finally { if (shouldClose) { Closeables.close(writer, false); } } return 0; }
From source file:dz.lab.hdfs.LsWithPathFilter.java
/** * @param args//from w w w . j a va 2 s . c om * @throws IOException */ public static void main(String[] args) throws IOException { Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); Path path = new Path("/"); // restrict result of listStatus() by supplying PathFilter FileStatus[] files = fs.listStatus(path, new PathFilter() { @Override public boolean accept(Path path) { // do not show path whose name equals to user if (path.getName().equals("user")) { return false; } return true; } }); for (FileStatus file : files) { System.out.println(file.getPath().getName()); } }
From source file:edu.ecnu.idse.TrajStore.util.FileUtil.java
License:Open Source License
/** * Get the actual size of all data in the given directory. If the input is * a single file, its size is returned immediately. If the input is a * directory, we returns the total size of all data in that directory. * If there is a global index, the size is retrieved from that global index. * Otherwise, we add up all the sizes of single files. * @param fs - the file system that contains the path * @param path - the path that contains the data * @return// w ww. j a v a 2 s . co m * @throws IOException */ public static long getPathSize(FileSystem fs, Path path) throws IOException { FileStatus fileStatus = fs.getFileStatus(path); // 1- Check if the path points to a file if (!fileStatus.isDir()) return fileStatus.getLen(); // 2- Check if the input is indexed and get the cached size GlobalIndex<Partition> gIndex = SpatialTemporalSite.getGlobalIndex(fs, path); if (gIndex != null) { long totalSize = 0; for (Partition partition : gIndex) totalSize += partition.size; return totalSize; } // 3- Get the total size of all non-hidden files long totalSize = 0; FileStatus[] allFiles = fs.listStatus(path, SpatialTemporalSite.NonHiddenFileFilter); for (FileStatus subFile : allFiles) { if (!subFile.isDir()) totalSize += subFile.getLen(); } return totalSize; }
From source file:edu.indiana.d2i.htrc.DataCopyJobTest.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = new Configuration(); String outputPath = args[0]; // result HTRCDataAPIClient client = Utilities.creatDataAPIClient(conf); FileSystem fs = FileSystem.get(conf); FileStatus[] status = fs.listStatus(new Path(outputPath), Utilities.HIDDEN_FILE_FILTER); Text key = new Text(); Text value = new Text(); for (int i = 0; i < status.length; i++) { SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[i].getPath(), conf); while (seqReader.next(key, value)) { Iterable<Entry<String, String>> content = client.getID2Content(key.toString()); Iterator<Entry<String, String>> iterator = content.iterator(); Entry<String, String> entry = iterator.next(); Assert.assertEquals(entry.getValue(), value.toString()); }//from ww w . j av a 2 s.c o m } System.out.println("Finish validation."); // FileStatus[] status = fs.listStatus(new Path(outputPath), Utilities.HIDDEN_FILE_FILTER); // for (int i = 0; i < status.length; i++) { // System.out.println(status[i].getPath().getName()); // } // System.out.println("=========================================="); // FileStatus[] globStatus = fs.globStatus(new Path(outputPath)); // for (int i = 0; i < globStatus.length; i++) { // System.out.println(globStatus[i].getPath().getName()); // } return 0; }
From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java
License:Apache License
/**
 * Assigns every input vector to its nearest cluster, sequentially in this process.
 *
 * <p>Each input file accepted by {@code PathFilters.logsCRCFilter()} is written to its own
 * output file named {@code part-m-<n>} under {@code output}, where {@code n} counts up
 * from 0.
 *
 * @param conf the configuration used to load the clusters and to access the file system
 * @param input directory holding the input vectors
 * @param clustersIn location of the cluster definitions
 * @param output directory that receives the {@code part-m-*} files
 * @param measure the distance measure handed to the clusterer
 * @throws IOException if reading the input or writing the output fails
 * @throws IllegalStateException if no clusters could be loaded
 */
private static void clusterDataSeq(Configuration conf, Path input, Path clustersIn, Path output,
    DistanceMeasure measure) throws IOException {

  KMeansClusterer clusterer = new KMeansClusterer(measure);
  Collection<Cluster> clusters = Lists.newArrayList();
  MemKMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters);
  if (clusters.isEmpty()) {
    throw new IllegalStateException("Clusters is empty!");
  }

  FileSystem fs = FileSystem.get(input.toUri(), conf);
  FileStatus[] status = fs.listStatus(input, PathFilters.logsCRCFilter());
  int part = 0;
  for (FileStatus s : status) {
    // Advance the part number per input file; reusing one name would make each file
    // overwrite the output of the previous one.
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(output, "part-m-" + part++),
        IntWritable.class, WeightedVectorWritable.class);
    try {
      for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(s.getPath(), conf)) {
        clusterer.emitPointToNearestCluster(value.get(), clusters, writer);
      }
    } finally {
      Closeables.closeQuietly(writer);
    }
  }
}
From source file:edu.indiana.d2i.htrc.skmeans.StreamingKMeansDriver.java
License:Apache License
private void StreamingKMeansConfigHelper(Configuration conf, String input, int maxCluster) throws IOException { // get samples to calculate scale factor FileSystem fs = FileSystem.get(conf); FileStatus[] status = fs.listStatus(new Path(input), Utilities.HIDDEN_FILE_FILTER); int index = 0 + (int) (Math.random() * (status.length)); SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[index].getPath(), conf); int count = 0; Text key = new Text(); VectorWritable value = new VectorWritable(); List<MatrixSlice> slices = new ArrayList<MatrixSlice>(); while (seqReader.next(key, value) && count < samplesNum) { MatrixSlice slice = new MatrixSlice(value.get().clone(), count); slices.add(slice);/*from w w w .j av a 2s. c om*/ count++; } // set cutoff float cutoff = (float) StreamingKmeans.estimateCutoff(slices, samplesNum); conf.setFloat(StreamingKMeansConfigKeys.CUTOFF, cutoff); logger.info("Scale factor (cutoff) is: " + cutoff); // set vector dimension int dim = value.get().size(); conf.setInt(StreamingKMeansConfigKeys.VECTOR_DIMENSION, dim); logger.info("Dimemsion of a vector is: " + dim); // set maximum #cluster conf.setInt(StreamingKMeansConfigKeys.MAXCLUSTER, maxCluster); // set distance measurement conf.set(StreamingKMeansConfigKeys.DIST_MEASUREMENT, EuclideanDistanceMeasure.class.getName()); }
From source file:edu.indiana.d2i.htrc.util.ClusterInspection.java
License:Apache License
@Override public int run(String[] args) throws Exception { String input = args[0]; // cluster path String output = args[1];/*from w w w. j av a 2 s. co m*/ int numVector = 0; Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); FileStatus[] status = fs.listStatus(new Path(input), Utilities.HIDDEN_FILE_FILTER); Text key = new Text(); LongWritable value = new LongWritable(); BufferedWriter writer = new BufferedWriter(new FileWriter(output)); for (int i = 0; i < status.length; i++) { SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[i].getPath(), conf); while (seqReader.next(key, value)) { numVector++; writer.write(key.toString() + "\n"); } } writer.close(); logger.info("#vector: " + numVector); return 0; }
From source file:edu.indiana.d2i.htrc.util.DataCopyValidation.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); String outputPath = args[0]; // result String dataAPIConfClassName = args[1]; int maxIdsPerReq = Integer.valueOf(args[2]); logger.info("DataValidation "); logger.info(" - output: " + outputPath); logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName); logger.info(" - maxIdsPerReq: " + maxIdsPerReq); Utilities.setDataAPIConf(conf, dataAPIConfClassName, maxIdsPerReq); // HTRCDataAPIClient client = Utilities.creatDataAPIClient(conf); String dataEPR = conf.get(HTRCConstants.HOSTS_SEPARATEDBY_COMMA, "https://129-79-49-119.dhcp-bl.indiana.edu:25443/data-api"); String delimitor = conf.get(HTRCConstants.DATA_API_URL_DELIMITOR, "|"); String clientID = conf.get(HTRCConstants.DATA_API_CLIENTID, "yim"); String clientSecrete = conf.get(HTRCConstants.DATA_API_CLIENTSECRETE, "yim"); String tokenLoc = conf.get(HTRCConstants.DATA_API_TOKENLOC, "https://129-79-49-119.dhcp-bl.indiana.edu:25443/oauth2/token?grant_type=client_credentials"); boolean selfsigned = conf.getBoolean(HTRCConstants.DATA_API_SELFSIGNED, true); HTRCDataAPIClient client = new HTRCDataAPIClient.Builder(dataEPR, delimitor).authentication(true) .selfsigned(selfsigned).clientID(clientID).clientSecrete(clientSecrete).tokenLocation(tokenLoc) .build();/*from w w w . 
ja v a 2 s .c om*/ FileSystem fs = FileSystem.get(conf); FileStatus[] status = fs.listStatus(new Path(outputPath), Utilities.HIDDEN_FILE_FILTER); Text key = new Text(); Text value = new Text(); for (int i = 0; i < status.length; i++) { SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[i].getPath(), conf); while (seqReader.next(key, value)) { // logger.info(key.toString()); Iterable<Entry<String, String>> content = client.getID2Content(key.toString()); Iterator<Entry<String, String>> iterator = content.iterator(); Entry<String, String> entry = iterator.next(); if (!entry.getValue().equals(value.toString())) { logger.error("Book : " + key.toString() + " corrupts!"); } } } logger.info("Finish validation."); return 0; }
From source file:edu.indiana.d2i.htrc.util.TokensInspection.java
License:Apache License
@Override public int run(String[] args) throws Exception { String input = args[0]; // word count path String output = args[1];//w w w.j av a 2s . c o m int numVector = 0; Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); FileStatus[] status = fs.listStatus(new Path(input), Utilities.HIDDEN_FILE_FILTER); Text key = new Text(); LongWritable value = new LongWritable(); BufferedWriter writer = new BufferedWriter(new FileWriter(output)); for (int i = 0; i < status.length; i++) { SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[i].getPath(), conf); while (seqReader.next(key, value)) { numVector++; writer.write(key.toString() + "\n"); } } writer.close(); logger.info("#vector: " + numVector); return 0; }
From source file:edu.indiana.d2i.htrc.util.VectorInspection.java
License:Apache License
@Override public int run(String[] args) throws Exception { String input = args[0];// ww w .j a v a2s . c o m String output = args[1]; int numVector = 0; Set<Integer> dimLst = new HashSet<Integer>(); Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); FileStatus[] status = fs.listStatus(new Path(input), Utilities.HIDDEN_FILE_FILTER); Text key = new Text(); VectorWritable value = new VectorWritable(); BufferedWriter writer = new BufferedWriter(new FileWriter(output)); for (int i = 0; i < status.length; i++) { SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[i].getPath(), conf); while (seqReader.next(key, value)) { numVector++; dimLst.add(value.get().size()); writer.write(value.toString() + "\n"); } } logger.info("#vector: " + numVector); logger.info("number of different dimensions: " + dimLst.size()); StringBuilder builder = new StringBuilder(); for (Integer dim : dimLst) builder.append(dim + " "); logger.info("" + builder.toString()); writer.close(); return 0; }