List of usage examples for org.apache.hadoop.fs FileSystem isDirectory
@Deprecated public boolean isDirectory(Path f) throws IOException
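This method is deprecated; the Hadoop javadoc points callers to getFileStatus(Path) instead. Below is a minimal sketch of the non-deprecated check, assuming a FileSystem obtained from the path's configuration; the path "/tmp/data" is only illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path p = new Path("/tmp/data"); // illustrative path
        FileSystem fs = p.getFileSystem(conf);
        // Preferred replacement for fs.isDirectory(p): ask for the FileStatus and test it.
        if (fs.exists(p) && fs.getFileStatus(p).isDirectory()) {
            System.out.println(p + " is a directory");
        }
    }
}

The usage examples below keep the deprecated fs.isDirectory(Path) call as written in their source files.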
From source file:edu.ucsb.cs.lsh.statistics.LshStat.java
License:Apache License
public static void lshProjectionStat(String[] args) throws IOException {
    boolean produceMax = false;
    if (args.length == 3)
        produceMax = true;
    else if (args.length != 2)
        printUsage(3);
    Path inputPath = new Path(args[1]);
    Configuration conf = new Configuration();
    FileSystem fs = inputPath.getFileSystem(conf);
    FileStatus[] files = fs.listStatus(inputPath);
    long i = 0, bucketCount = 0, avgBucketSize = 0, maxBucket = 0, minBucket = Long.MAX_VALUE;
    ArrayList<Integer> bucketSizes = new ArrayList<Integer>();
    for (FileStatus file : files) {
        // Skip directories and housekeeping files whose names start with "_" (e.g. _SUCCESS).
        if ((fs.isDirectory(file.getPath())) || file.getPath().getName().startsWith("_"))
            continue;
        Reader reader = new SequenceFile.Reader(fs, file.getPath(), conf);
        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
        while (reader.next(key, value)) {
            if (key.get() == 0) {
                bucketCount++;
                avgBucketSize += i;
                if (maxBucket < i) {
                    maxBucket = i;
                    maxBucketID = (bucketCount - 1);
                }
                if (i != 0 && minBucket > i)
                    minBucket = i;
                i = 0;
            } else {
                i++;
            }
        }
        avgBucketSize += i;
        bucketSizes.add((int) i);
    }
    System.out.println("Number of buckets:" + bucketCount);
    System.out.println("Max. bucket size:" + maxBucket + " with ID:" + maxBucketID);
    System.out.println("Min. bucket size:" + minBucket);
    System.out.println("Avg. bucket size:" + (avgBucketSize / (float) bucketCount));
    System.out.println(
            "R-std. among bucket sizes:" + getRStd((avgBucketSize / (float) bucketCount), bucketSizes));
    System.out.println("Total comparisons done within buckets:" + getSumCombin(bucketSizes));
    if (produceMax)
        produceMaxBucket(args);
    // getRepatedPairs(files, fs, conf);
}
From source file:edu.ucsb.cs.lsh.statistics.LshStat.java
License:Apache License
public static void getRepatedPairs(FileStatus[] files, FileSystem fs, Configuration conf) throws IOException {
    NumByteList bucket = null;
    long i = 0, bucketCount = 0;
    ArrayList<NumByteList> buckets = new ArrayList<NumByteList>();
    for (FileStatus file : files) {
        if ((fs.isDirectory(file.getPath())) || file.getPath().getName().startsWith("_"))
            continue;
        Reader reader = new SequenceFile.Reader(fs, file.getPath(), conf);
        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
        while (reader.next(key, value)) {
            if (key.get() == 0) {
                if (bucketCount != 0)
                    buckets.add(bucket);
                bucketCount++;
                bucket = new NumByteList(bucketCount);
                i = 0;
            } else {
                i++;
                bucket.addDoc(key.get());
            }
        }
    }
    System.out.println("Number of repeated docs across buckets: " + getRepetedPairsCount(buckets));
}
From source file:edu.ucsb.cs.lsh.statistics.LshStat.java
License:Apache License
public static void produceMaxBucket(String args[]) throws IOException {
    if (args.length == 3)
        maxBucketID = Integer.parseInt(args[2]);
    else if (args.length != 2)
        printUsage(4);
    Path inputPath = new Path(args[1]);
    Path outPath = new Path("maxBucket");
    Configuration conf = new Configuration();
    FileSystem fs = inputPath.getFileSystem(conf);
    if (fs.exists(outPath))
        fs.delete(outPath);
    FileStatus[] files = fs.listStatus(inputPath);
    SequenceFile.Writer writer = null;
    int bucketCount = 0;
    for (FileStatus file : files) {
        if ((fs.isDirectory(file.getPath())) || file.getPath().getName().startsWith("_"))
            continue;
        Reader reader = new SequenceFile.Reader(fs, file.getPath(), conf);
        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
        while (reader.next(key, value))
            if (key.get() == 0) {
                bucketCount++;
                if (bucketCount == maxBucketID) {
                    writer = SequenceFile.createWriter(fs, conf, outPath, LongWritable.class,
                            FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);
                    while (reader.next(key, value) && (key.get() != 0))
                        writer.append(key, value);
                    writer.close();
                    return;
                }
            }
    }
}
From source file:edu.ucsb.cs.partitioning.cosine.Organizer.java
License:Apache License
public static void readCombineCopy(Path input, String output, JobConf job) throws IOException {
    boolean printDist = job.getBoolean(Config.PRINT_DISTRIBUTION_PROPERTY, Config.PRINT_DISTRIBUTION_VALUE);
    BufferedWriter distout = null;
    SequenceFile.Writer out = null;
    if (printDist)
        distout = new BufferedWriter(new FileWriter("p-norm-distribution" + output));
    int pc = 0, pr = 0;
    float pChoice = job.getFloat(NormSortMain.P_NORM_PROPERTY, NormSortMain.P_NORM_VALUE);
    FileSystem hdfs = input.getFileSystem(new JobConf());
    FileStatus[] files = Partitioner.setFiles(hdfs, input);
    ArrayList<String> partitions = arrangeNames(files);
    for (int i = 0; i < partitions.size(); i++) {
        Path inputPath = new Path(input.toString() + "/" + partitions.get(i));
        if (hdfs.isDirectory(inputPath))
            continue;
        SequenceFile.Reader in = new SequenceFile.Reader(hdfs, inputPath, job);
        if (!isCombined(pr, pc, getRow(inputPath.getName()), getCol(inputPath.getName()), partitions)) {
            if (out != null)
                out.close();
            pr = getRow(inputPath.getName());
            pc = getCol(inputPath.getName());
            out = SequenceFile.createWriter(hdfs, job, new Path(output + "/" + inputPath.getName()),
                    LongWritable.class, FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);
        }
        while (in.next(unused, document)) {
            out.append(new LongWritable(document.id),
                    new FeatureWeightArrayWritable(document.vectorSize, document.vector));
            if (printDist)
                distout.write(document.getPNorm(pChoice) + " \n");
        }
        in.close();
    }
    if (out != null)
        out.close();
}
From source file:edu.ucsb.cs.partitioning.cosine.Partitioner.java
License:Apache License
/**
 * @param job
 * @param inputDir
 * @param interDir
 * @param maxDir
 * @param nPartitions
 * @param norm_weight_all
 * @return number of partitions actually produced
 */
public static int produceStaticParitions(JobConf job, String inputDir, String interDir, String maxDir,
        int nPartitions, int norm_weight_all) {
    SequenceFile.Writer partOut = null;
    float maxn = 0, maxw = 0, pChoice = job.getFloat(NormSortMain.P_NORM_PROPERTY, NormSortMain.P_NORM_VALUE);
    int maxs = 0;
    LongWritable prevK = null, key = new LongWritable();
    FeatureWeightArrayWritable prevV = null, value = new FeatureWeightArrayWritable();
    try {
        Path inputPath = new Path(inputDir);
        FileSystem hdfs = inputPath.getFileSystem(new Configuration());
        Path interDirectory = new Path(interDir);
        Path maxPath = new Path(maxDir);
        clearPath(hdfs, maxPath);
        clearPath(hdfs, interDirectory);
        long nDocuments = Collector.countDirVectors(hdfs, inputPath, job);
        if (nDocuments == 0)
            return 0;
        double partitionSize;
        uniformPartitions = job.getBoolean(Config.UNIFORM_PARTITIONING_PROPERTY,
                Config.UNIFORM_PARTITIONING_VALUE);
        if (uniformPartitions)
            partitionSize = Math.ceil(nDocuments / (double) nPartitions);
        else
            partitionSize = Math.ceil(nDocuments / (double) (GijComparisons.choose(nPartitions + 1, 2)));
        if (partitionSize == 1)
            System.err.println("WARN: Number of partitions = number of documents!!");
        FileStatus[] files = setFiles(hdfs, inputPath);
        FSDataOutputStream maxOut = hdfs.create(maxPath);
        int documentNo = 0, partitionNo = 1; // partition numbering starts at 1
        for (int i = 0; i < files.length; i++) {
            inputPath = files[i].getPath();
            if ((hdfs.isDirectory(inputPath) || inputPath.getName().startsWith("_")))
                continue;
            Reader in = new SequenceFile.Reader(hdfs, inputPath, job);
            while (in.next(key, value)) { // id, vector
                documentNo++;
                prevK = key;
                prevV = value;
                if (isFirstDocument(partOut)) {
                    maxn = value.getPNorm(pChoice);
                    maxw = value.getMaxWeight();
                    maxs = value.vectorSize;
                    partOut = openFile(hdfs, job, interDirectory, partitionNo);
                }
                partOut.append(key, value);
                maxw = (value.getMaxWeight() > maxw) ? value.getMaxWeight() : maxw;
                maxs = (value.vectorSize > maxs) ? value.vectorSize : maxs;
                maxn = (value.getPNorm(pChoice) > maxn) ? value.getPNorm(pChoice) : maxn;
                if (isLastDocument(documentNo, partitionNo, partitionSize, uniformPartitions)) {
                    partOut = writeMax(norm_weight_all, partOut, maxOut, maxn, maxw, maxs);
                    documentNo = 0;
                    partitionNo++;
                }
                prevK = key;
                prevV = value;
            }
            in.close();
        }
        if (partOut != null)
            partOut = writeMax(norm_weight_all, partOut, maxOut, maxn, maxw, maxs);
        nPartitions = partitionNo - 1;
        maxOut.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return (nPartitions);
}
From source file:edu.ucsb.cs.partitioning.statistics.Collector.java
License:Apache License
/**
 * @param inputPath: path of all the input files.
 * @param fs: file system.
 * @return file paths sorted by file name.
 */
public static Iterator<Path> getSortedFiles(Path inputPath, FileSystem fs) throws IOException {
    TreeSet<Path> paths = new TreeSet<Path>();
    FileStatus[] files = getFiles(inputPath, fs);
    for (int i = 0; i < files.length; i++)
        if (!fs.isDirectory(files[i].getPath()))
            paths.add(files[i].getPath());
    return paths.iterator();
}
From source file:edu.ucsb.cs.partitioning.statistics.Collector.java
License:Apache License
public static String getNSkipCosineVecPairs(FileSystem fs, Path inputPath, JobConf job) throws IOException {
    long nSkipVecPair = 0, nVectors = 0, nSkipPartEdges = 0, nPartitions = 0;
    FileStatus[] files = getFiles(inputPath, fs);
    if (files == null)
        return null;
    for (int i = 0; i < files.length; i++) {
        inputPath = files[i].getPath();
        if (fs.isDirectory(inputPath))
            continue;
        nPartitions++;
        long n = countFileVectors(fs, files[i].getPath(), job);
        nVectors += n;
        for (int j = i; j < files.length; j++) {
            inputPath = files[j].getPath();
            if (fs.isDirectory(inputPath))
                continue;
            long m = countFileVectors(fs, files[j].getPath(), job);
            if (skipCosinePartitions(files[i].getPath().getName(), files[j].getPath().getName())) {
                nSkipVecPair += (n * m);
                nSkipPartEdges++;
            }
        }
    }
    return (nSkipVecPair + ",(" + nVectors + "C2)," + nSkipPartEdges + "," + ",(" + nPartitions + "C2),");
}
From source file:edu.ucsb.cs.partitioning.statistics.Collector.java
License:Apache License
/**
 * Not sure about the calculations here anymore.
 * @param fs
 * @param inputPath
 * @param job
 * @return
 * @throws IOException
 */
public static String getNSkipJaccardDocPairs(FileSystem fs, Path inputPath, JobConf job) throws IOException {
    long nSkipVecPair = 0, nVecPairs = 0, nSkipPartEdges = 0, nPartitions = 0;
    FileStatus[] files = getFiles(inputPath, fs);
    if (files == null)
        return null;
    for (int i = 0; i < files.length; i++) {
        inputPath = files[i].getPath();
        if (fs.isDirectory(inputPath))
            continue;
        nPartitions++;
        for (int j = 0; j < files.length; j++) {
            inputPath = files[j].getPath();
            if (fs.isDirectory(inputPath))
                continue;
            long n = countFileVectors(fs, files[i].getPath(), job);
            long m = countFileVectors(fs, files[j].getPath(), job);
            if (skip1dCoarseJaccardPartitions(files[i].getPath().getName(), files[j].getPath().getName())) {
                nSkipVecPair += (n * m);
                nSkipPartEdges++;
            }
            nVecPairs += (n * m);
        }
    }
    return (nSkipVecPair / 2 + "," + nVecPairs / 2 + "," + nSkipPartEdges / 2 + "," + ",(" + nPartitions
            + "C2),");
}
From source file:edu.ucsb.cs.partitioning.statistics.Collector.java
License:Apache License
public static long countFileVectors(FileSystem fs, Path inputFile, JobConf job) throws IOException {
    long nDocuments = 0;
    LongWritable key = new LongWritable();
    FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
    if ((fs.isDirectory(inputFile)) || inputFile.getName().startsWith("_"))
        return 0;
    SequenceFile.Reader in = new SequenceFile.Reader(fs, inputFile, job);
    while (in.next(key, value))
        nDocuments++;
    in.close();
    return nDocuments;
}
From source file:edu.ucsb.cs.partitioning.statistics.CollectorBaraglia.java
License:Apache License
/**
 * @param inputPath: path of all the input files.
 * @param fs: file system.
 * @return file paths sorted by file name.
 */
public static Iterator<Path> getSortedFiles(Path inputPath, FileSystem fs) throws IOException {
    TreeSet<Path> paths = new TreeSet<Path>();
    FileStatus[] files = getFiles(inputPath, fs);
    for (int i = 0; i < files.length; i++)
        if (!fs.isDirectory(files[i].getPath()))
            paths.add(files[i].getPath());
    return paths.iterator();
}