Example usage for org.apache.hadoop.fs FileSystem isDirectory

List of usage examples for org.apache.hadoop.fs FileSystem isDirectory

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileSystem isDirectory.

Prototype

@Deprecated
public boolean isDirectory(Path f) throws IOException 

Document

True iff the named path is a directory.
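
Before the full examples, here is a minimal, self-contained sketch of the call in isolation; the path "/tmp/example" and the class name IsDirectoryExample are illustrative assumptions, not taken from the examples below. Since the method is marked deprecated, the sketch also shows the FileStatus-based check via getFileStatus(Path) that newer Hadoop code generally prefers; note that isDirectory(Path) returns false for a missing path, while getFileStatus(Path) throws FileNotFoundException.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path(args.length > 0 ? args[0] : "/tmp/example"); // illustrative default path
        FileSystem fs = path.getFileSystem(conf);

        // Deprecated convenience call: returns false if the path does not exist.
        System.out.println(path + " is a directory: " + fs.isDirectory(path));

        // Non-deprecated equivalent: getFileStatus(...) throws FileNotFoundException
        // for a missing path, so existence is checked first here.
        if (fs.exists(path)) {
            FileStatus status = fs.getFileStatus(path);
            System.out.println(path + " is a directory (via FileStatus): " + status.isDirectory());
        }
    }
}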

Usage

From source file:edu.ucsb.cs.lsh.statistics.LshStat.java

License:Apache License

public static void lshProjectionStat(String[] args) throws IOException {
    boolean produceMax = false;
    if (args.length == 3)
        produceMax = true;
    else if (args.length != 2)
        printUsage(3);

    Path inputPath = new Path(args[1]);
    Configuration conf = new Configuration();
    FileSystem fs = inputPath.getFileSystem(conf);
    FileStatus[] files = fs.listStatus(inputPath);
    long i = 0, bucketCount = 0, avgBucketSize = 0, maxBucket = 0, minBucket = Long.MAX_VALUE;
    ArrayList<Integer> bucketSizes = new ArrayList<Integer>();

    for (FileStatus file : files) {
        if ((fs.isDirectory(file.getPath())) || file.getPath().getName().startsWith("_"))
            continue;

        Reader reader = new SequenceFile.Reader(fs, file.getPath(), conf);
        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();

        while (reader.next(key, value)) {
            if (key.get() == 0) {
                bucketCount++;
                avgBucketSize += i;
                if (maxBucket < i) {
                    maxBucket = i;
                    maxBucketID = (bucketCount - 1);
                }
                if (i != 0 && minBucket > i)
                    minBucket = i;
                i = 0;
            } else {
                i++;
            }
        }
        avgBucketSize += i;
        bucketSizes.add((int) i);
    }
    System.out.println("Number of buckets:" + bucketCount);
    System.out.println("Max. bucket size:" + maxBucket + " with ID:" + maxBucketID);
    System.out.println("Min. bucket size:" + minBucket);
    System.out.println("Avg. buckets size:" + (avgBucketSize / (float) bucketCount));
    System.out.println(
            "R-std. among bucket sizes:" + getRStd((avgBucketSize / (float) bucketCount), bucketSizes));
    System.out.println("Total comparison done within buckets:" + getSumCombin(bucketSizes));
    if (produceMax)
        produceMaxBucket(args);
    // getRepatedPairs(files, fs, conf);
}

From source file:edu.ucsb.cs.lsh.statistics.LshStat.java

License:Apache License

public static void getRepatedPairs(FileStatus[] files, FileSystem fs, Configuration conf) throws IOException {
    NumByteList bucket = null;
    long i = 0, bucketCount = 0;
    ArrayList<NumByteList> buckets = new ArrayList<NumByteList>();

    for (FileStatus file : files) {
        if ((fs.isDirectory(file.getPath())) || file.getPath().getName().startsWith("_"))
            continue;

        Reader reader = new SequenceFile.Reader(fs, file.getPath(), conf);
        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();

        while (reader.next(key, value)) {
            if (key.get() == 0) {
                if (bucketCount != 0)
                    buckets.add(bucket);
                bucketCount++;
                bucket = new NumByteList(bucketCount);
                i = 0;
            } else {
                i++;
                bucket.addDoc(key.get());
            }
        }
    }
    System.out.println("Number of repeated docs across buckets: " + getRepetedPairsCount(buckets));
}

From source file:edu.ucsb.cs.lsh.statistics.LshStat.java

License:Apache License

public static void produceMaxBucket(String args[]) throws IOException {
    if (args.length == 3)
        maxBucketID = Integer.parseInt(args[2]);
    else if (args.length != 2)
        printUsage(4);

    Path inputPath = new Path(args[1]);
    Path outPath = new Path("maxBucket");
    Configuration conf = new Configuration();
    FileSystem fs = inputPath.getFileSystem(conf);
    if (fs.exists(outPath))
        fs.delete(outPath);
    FileStatus[] files = fs.listStatus(inputPath);
    SequenceFile.Writer writer = null;
    int bucketCount = 0;

    for (FileStatus file : files) {
        if ((fs.isDirectory(file.getPath())) || file.getPath().getName().startsWith("_"))
            continue;

        Reader reader = new SequenceFile.Reader(fs, file.getPath(), conf);
        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();

        while (reader.next(key, value))
            if (key.get() == 0) {
                bucketCount++;
                if (bucketCount == maxBucketID) {
                    writer = SequenceFile.createWriter(fs, conf, outPath, LongWritable.class,
                            FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);
                    while (reader.next(key, value) && (key.get() != 0))
                        writer.append(key, value);
                    writer.close();
                    return;
                }
            }
    }
}

From source file:edu.ucsb.cs.partitioning.cosine.Organizer.java

License:Apache License

public static void readCombineCopy(Path input, String output, JobConf job) throws IOException {
    boolean printDist = job.getBoolean(Config.PRINT_DISTRIBUTION_PROPERTY, Config.PRINT_DISTRIBUTION_VALUE);
    BufferedWriter distout = null;
    SequenceFile.Writer out = null;
    if (printDist)
        distout = new BufferedWriter(new FileWriter("p-norm-distribution" + output));

    int pc = 0, pr = 0;
    float pChoice = job.getFloat(NormSortMain.P_NORM_PROPERTY, NormSortMain.P_NORM_VALUE);
    FileSystem hdfs = input.getFileSystem(new JobConf());
    FileStatus[] files = Partitioner.setFiles(hdfs, input);
    ArrayList<String> partitions = arrangeNames(files);

    for (int i = 0; i < partitions.size(); i++) {
        Path inputPath = new Path(input.toString() + "/" + partitions.get(i));
        if (hdfs.isDirectory(inputPath))
            continue;

        SequenceFile.Reader in = new SequenceFile.Reader(hdfs, inputPath, job);
        if (!isCombined(pr, pc, getRow(inputPath.getName()), getCol(inputPath.getName()), partitions)) {
            if (out != null)
                out.close();
            pr = getRow(inputPath.getName());
            pc = getCol(inputPath.getName());
            out = SequenceFile.createWriter(hdfs, job, new Path(output + "/" + inputPath.getName()),
                    LongWritable.class, FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);
        }
        while (in.next(unused, document)) {
            out.append(new LongWritable(document.id),
                    new FeatureWeightArrayWritable(document.vectorSize, document.vector));
            if (printDist)
                distout.write(document.getPNorm(pChoice) + " \n");
        }
        in.close();
    }
    if (out != null)
        out.close();
}

From source file:edu.ucsb.cs.partitioning.cosine.Partitioner.java

License:Apache License

/**
 * @param job
 * @param inputDir
 * @param interDir
 * @param maxDir
 * @param nPartitions
 * @param norm_weight_all
 * @return number of partitions actually produced
 */
public static int produceStaticParitions(JobConf job, String inputDir, String interDir, String maxDir,
        int nPartitions, int norm_weight_all) {
    SequenceFile.Writer partOut = null;
    float maxn = 0, maxw = 0, pChoice = job.getFloat(NormSortMain.P_NORM_PROPERTY, NormSortMain.P_NORM_VALUE);
    int maxs = 0;
    LongWritable prevK = null, key = new LongWritable();
    FeatureWeightArrayWritable prevV = null, value = new FeatureWeightArrayWritable();

    try {
        Path inputPath = new Path(inputDir);
        FileSystem hdfs = inputPath.getFileSystem(new Configuration());
        Path interDirectory = new Path(interDir);
        Path maxPath = new Path(maxDir);

        clearPath(hdfs, maxPath);
        clearPath(hdfs, interDirectory);

        long nDocuments = Collector.countDirVectors(hdfs, inputPath, job);
        if (nDocuments == 0)
            return 0;

        double partitionSize;
        uniformPartitions = job.getBoolean(Config.UNIFORM_PARTITIONING_PROPERTY,
                Config.UNIFORM_PARTITIONING_VALUE);
        if (uniformPartitions)
            partitionSize = Math.ceil(nDocuments / (double) nPartitions);
        else
            partitionSize = Math.ceil(nDocuments / (double) (GijComparisons.choose(nPartitions + 1, 2)));

        if (partitionSize == 1)
            System.err.println("WARN: Number of partitions = number of documents!!");

        FileStatus[] files = setFiles(hdfs, inputPath);
        FSDataOutputStream maxOut = hdfs.create(maxPath);

        int documentNo = 0, partitionNo = 1; // partition naming start at 1
        for (int i = 0; i < files.length; i++) {
            inputPath = files[i].getPath();
            if ((hdfs.isDirectory(inputPath) || inputPath.getName().startsWith("_")))
                continue;
            Reader in = new SequenceFile.Reader(hdfs, inputPath, job);

            while (in.next(key, value)) { // id,vector
                documentNo++;
                prevK = key;
                prevV = value;

                if (isFirstDocument(partOut)) {
                    maxn = value.getPNorm(pChoice);
                    maxw = value.getMaxWeight();
                    maxs = value.vectorSize;
                    partOut = openFile(hdfs, job, interDirectory, partitionNo);
                }
                partOut.append(key, value);
                maxw = (value.getMaxWeight() > maxw) ? value.getMaxWeight() : maxw;
                maxs = (value.vectorSize > maxs) ? value.vectorSize : maxs;
                maxn = (value.getPNorm(pChoice) > maxn) ? value.getPNorm(pChoice) : maxn;

                if (isLastDocument(documentNo, partitionNo, partitionSize, uniformPartitions)) {
                    partOut = writeMax(norm_weight_all, partOut, maxOut, maxn, maxw, maxs);
                    documentNo = 0;
                    partitionNo++;
                }
                prevK = key;
                prevV = value;
            }
            in.close();
        }
        if (partOut != null)
            partOut = writeMax(norm_weight_all, partOut, maxOut, maxn, maxw, maxs);
        nPartitions = partitionNo - 1;
        maxOut.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return (nPartitions);
}

From source file:edu.ucsb.cs.partitioning.statistics.Collector.java

License:Apache License

/**
 * @param inputPath: path of all the input files.
 * @param fs: file system.
 * @return file paths sorted by file name.
 */
public static Iterator<Path> getSortedFiles(Path inputPath, FileSystem fs) throws IOException {
    TreeSet<Path> paths = new TreeSet<Path>();
    FileStatus[] files = getFiles(inputPath, fs);
    for (int i = 0; i < files.length; i++)
        if (!fs.isDirectory(files[i].getPath()))
            paths.add(files[i].getPath());
    return paths.iterator();
}

From source file:edu.ucsb.cs.partitioning.statistics.Collector.java

License:Apache License

public static String getNSkipCosineVecPairs(FileSystem fs, Path inputPath, JobConf job) throws IOException {

    long nSkipVecPair = 0, nVectors = 0, nSkipPartEdges = 0, nPartitions = 0;
    FileStatus[] files = getFiles(inputPath, fs);
    if (files == null)
        return null;

    for (int i = 0; i < files.length; i++) {
        inputPath = files[i].getPath();
        if (fs.isDirectory(inputPath))
            continue;
        nPartitions++;
        long n = countFileVectors(fs, files[i].getPath(), job);
        nVectors += n;
        for (int j = i; j < files.length; j++) {
            inputPath = files[j].getPath();
            if (fs.isDirectory(inputPath))
                continue;
            long m = countFileVectors(fs, files[j].getPath(), job);
            if (skipCosinePartitions(files[i].getPath().getName(), files[j].getPath().getName())) {
                nSkipVecPair += (n * m);
                nSkipPartEdges++;
            }
        }
    }
    return (nSkipVecPair + ",(" + nVectors + "C2)," + nSkipPartEdges + "," + ",(" + nPartitions + "C2),");
}

From source file:edu.ucsb.cs.partitioning.statistics.Collector.java

License:Apache License

/**
 * Not sure about calculations here anymore ..
 * @param fs
 * @param inputPath
 * @param job
 * @return
 * @throws IOException
 */
public static String getNSkipJaccardDocPairs(FileSystem fs, Path inputPath, JobConf job) throws IOException {

    long nSkipVecPair = 0, nVecPairs = 0, nSkipPartEdges = 0, nPartitions = 0;
    FileStatus[] files = getFiles(inputPath, fs);
    if (files == null)
        return null;

    for (int i = 0; i < files.length; i++) {
        inputPath = files[i].getPath();
        if (fs.isDirectory(inputPath))
            continue;
        nPartitions++;
        for (int j = 0; j < files.length; j++) {
            inputPath = files[j].getPath();
            if (fs.isDirectory(inputPath))
                continue;
            long n = countFileVectors(fs, files[i].getPath(), job);
            long m = countFileVectors(fs, files[j].getPath(), job);

            if (skip1dCoarseJaccardPartitions(files[i].getPath().getName(), files[j].getPath().getName())) {
                nSkipVecPair += (n * m);
                nSkipPartEdges++;
            }
            nVecPairs += (n * m);
        }
    }
    return (nSkipVecPair / 2 + "," + nVecPairs / 2 + "," + nSkipPartEdges / 2 + "," + ",(" + nPartitions
            + "C2),");
}

From source file:edu.ucsb.cs.partitioning.statistics.Collector.java

License:Apache License

public static long countFileVectors(FileSystem fs, Path inputFile, JobConf job) throws IOException {
    long nDocuments = 0;
    LongWritable key = new LongWritable();
    FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();

    if ((fs.isDirectory(inputFile)) || inputFile.getName().startsWith("_"))
        return 0;
    SequenceFile.Reader in = new SequenceFile.Reader(fs, inputFile, job);
    while (in.next(key, value))
        nDocuments++;
    in.close();
    return nDocuments;
}

From source file:edu.ucsb.cs.partitioning.statistics.CollectorBaraglia.java

License:Apache License

/**
 * @param inputPath: path of all the input files.
 * @param fs: file system.
 * @return file paths sorted by file name.
 */
public static Iterator<Path> getSortedFiles(Path inputPath, FileSystem fs) throws IOException {
    TreeSet<Path> paths = new TreeSet<Path>();
    FileStatus[] files = getFiles(inputPath, fs);
    for (int i = 0; i < files.length; i++)
        if (!fs.isDirectory(files[i].getPath()))
            paths.add(files[i].getPath());

    return paths.iterator();
}