edu.ucsb.cs.lsh.statistics.LshStat.java Source code

Introduction

Here is the source code for edu.ucsb.cs.lsh.statistics.LshStat.java
Source

/**
 * Copyright 2012-2013 The Regents of the University of California
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations under
 * the License.
 * 
 * Author: maha alabduljalil <maha (at) cs.ucsb.edu>
 * @Since Sep 3, 2012
 */

package edu.ucsb.cs.lsh.statistics;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader;

import edu.ucsb.cs.types.DocDocWritable;
import edu.ucsb.cs.types.FeatureWeightArrayWritable;

/**
 * Command-line utility offering the following options:<br>
 * 1) Computes the four LSH confusion-matrix values. It takes a validation
 * sequence file holding the document pairs that exceed the threshold and a
 * test sequence file whose errors are measured against the validation one.
 * Input records are of the format
 * <code>[DocDocWritable, FloatWritable]</code>. Output values:<br>
 * tp: pairs present in both the test and the validation input.<br>
 * fp: pairs present in the test input but not in the validation input.<br>
 * fn: pairs present in the validation input but missing from the test input.<br>
 * tn: all remaining distinct document pairs.<br>
 * 
 * 2) Converts text input to binary DocDocWritable,FloatWritable.<br>
 * 
 * 3) Prints bucket statistics (count, max, min, average, relative standard
 * deviation, number of in-bucket comparisons) of an LSH output directory.<br>
 * 
 * 4) Writes one bucket (by default the largest one) to the path "maxBucket".
 * 
 * <p>
 * In options 3 and 4 a record whose key is 0 is treated as the start of a new
 * bucket, and the end of a file ends the bucket that is currently open.
 * 
 * @author Maha <br>
 * */
public class LshStat {

    /**
     * Accumulates per-bucket statistics while LSH output files are scanned.
     */
    private static final class BucketStats {
        /** Number of bucket markers seen so far (buckets are numbered from 1). */
        long bucketCount;
        /** Sum of the sizes of all closed buckets. */
        long totalSize;
        /** Size of the largest closed bucket. */
        long maxSize;
        /** Size of the smallest non-empty closed bucket. */
        long minSize = Long.MAX_VALUE;
        /** 1-based number of the largest bucket, 0 when none was found. */
        long maxId;
        /** Size of every closed bucket, in scan order. */
        final ArrayList<Integer> sizes = new ArrayList<Integer>();

        /**
         * Records the size of the bucket that was opened last. Must be called
         * before {@link #bucketCount} is advanced for the next bucket so that
         * it still identifies the bucket being closed.
         */
        void closeBucket(long size) {
            totalSize += size;
            sizes.add((int) size);
            if (size > maxSize) {
                maxSize = size;
                maxId = bucketCount;
            }
            if (size != 0 && size < minSize)
                minSize = size;
        }
    }

    /**
     * Prints the usage line of the given option and terminates the JVM.
     * 
     * @param choice option number (1-4); any other value prints nothing
     */
    public static void printUsage(int choice) {
        switch (choice) {
        case 1:
            System.out.println("Usage: 1 <ValidDir> <TestDir> <numberDocuments not pairs>");
            break;
        case 2:
            System.out.println("Usage: 2 <Local text inputDir> <outputDir>");
            break;
        case 3:
            System.out.println("Usage: 3 <hdfs lsh directory> [yes for maxBucket]");
            break;
        case 4:
            System.out.println("Usage: 4 <hdfs lsh directory> [<bucketNo>]");
            break;
        default:
            break;
        }
        System.exit(0);
    }

    /** 1-based number of the bucket written by {@link #produceMaxBucket(String[])}; 0 means "not known yet". */
    private static long maxBucketID = 0;

    /**
     * Entry point: dispatches on the option number given as first argument.
     * Prints the option menu and exits when the option is missing, is not a
     * plain non-negative integer, or is not one of the known options.
     * 
     * @param args option number followed by the option's own arguments
     * @throws IOException if reading or writing any input/output fails
     */
    public static void main(String[] args) throws IOException {
        switch (parseChoice(args)) {
        case 1:
            computeValues(args);
            break;
        case 2:
            convertInput(args);
            break;
        case 3:
            lshProjectionStat(args);
            break;
        case 4:
            produceMaxBucket(args);
            break;
        default:
            printMenu();
            System.exit(0);
        }
    }

    /**
     * Parses the option number from the first argument.
     * 
     * @return the option number, or -1 when it is missing or not a valid integer
     */
    private static int parseChoice(String[] args) {
        // Digits only: a pattern that also admits '.' lets "1.5" through and
        // then fails inside Integer.parseInt.
        if ((args.length < 1) || !args[0].matches("\\d+"))
            return -1;
        try {
            return Integer.parseInt(args[0]);
        } catch (NumberFormatException tooLarge) {
            // All digits but outside the int range: not a known option.
            return -1;
        }
    }

    /** Prints the list of available options. */
    private static void printMenu() {
        System.out.println("Enter number:\n"
                + "(1) - compute LSH four values from binary input <DocDoc,Float>.\n"
                + "(2) - convert text input to <DocDoc,FloatWritable>.\n"
                + "(3) - lsh statistics Max/Min/Avg/buckets/repetition from binary input <DocDoc,Float>.\n"
                + "(4) - produce maximum bucket.\n");
    }

    /**
     * Computes tp, fp, fn and tn of a test pair file against a validation
     * pair file and prints them together with their share of all distinct
     * document pairs.
     * 
     * @param args <code>{"1", validPath, testPath, numberOfDocuments}</code>
     * @throws IOException if one of the sequence files cannot be read
     * @throws NumberFormatException if the number of documents is not an integer
     */
    public static void computeValues(String[] args) throws IOException {
        if (args.length != 4)
            printUsage(1);

        Path validPath = new Path(args[1]);
        Path testPath = new Path(args[2]);
        int n = Integer.parseInt(args[3]);

        Configuration conf = new Configuration();
        HashSet<DocDocWritable> validPairs = readPairs(validPath, conf);
        HashSet<DocDocWritable> testPairs = readPairs(testPath, conf);

        // Widen before multiplying: n * (n - 1) overflows int for n > 46341.
        long allDistinctPairs = ((long) n * (n - 1)) / 2;

        // NOTE(review): membership relies on DocDocWritable implementing
        // equals/hashCode by value - confirm in edu.ucsb.cs.types.
        long tp = 0, fp = 0;
        for (DocDocWritable pair : testPairs) {
            if (validPairs.contains(pair))
                tp++;
            else
                fp++;
        }
        // Every validation pair that was not matched by a test pair is missing.
        long fn = validPairs.size() - tp;
        long tn = allDistinctPairs - (tp + fn + fp);

        System.out.println("Number of documents: " + n + "# tp: " + tp + " (" + percent(tp, allDistinctPairs)
                + " %)" + "# fp: " + fp + " (" + percent(fp, allDistinctPairs) + " %)" + "# tn: " + tn + " ("
                + percent(tn, allDistinctPairs) + " %)" + "# fn: " + fn + " (" + percent(fn, allDistinctPairs)
                + " %)");
    }

    /**
     * Reads all keys of a <code>[DocDocWritable, FloatWritable]</code>
     * sequence file into a set.
     */
    private static HashSet<DocDocWritable> readPairs(Path path, Configuration conf) throws IOException {
        HashSet<DocDocWritable> pairs = new HashSet<DocDocWritable>();
        FileSystem fs = path.getFileSystem(conf);
        Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            FloatWritable value = new FloatWritable();
            DocDocWritable key = new DocDocWritable();
            while (reader.next(key, value)) {
                assert (key.doc1 < key.doc2);
                pairs.add(key);
                // The reader fills the object it is handed, so a stored key
                // must not be reused for the next record.
                key = new DocDocWritable();
            }
        } finally {
            reader.close();
        }
        return pairs;
    }

    /** Formats <code>count</code> as a percentage of <code>total</code>; 0 when total is not positive. */
    private static String percent(long count, long total) {
        double share = (total <= 0) ? 0.0 : (100.0 * count) / total;
        return String.format("%.2f", share);
    }

    /**
     * Writes a <code>[DocDocWritable, FloatWritable]</code> sequence file
     * from the regular files of a local directory. If the input is not a
     * directory, an empty sequence file is written.
     * 
     * @param args <code>{"2", localInputDir, outputPath}</code>
     * @throws IOException if reading the input or writing the output fails
     */
    public static void convertInput(String[] args) throws IOException {
        if (args.length != 3)
            printUsage(2);

        File input = new File(args[1]);
        Path outPath = new Path(args[2]);
        Configuration conf = new Configuration();
        FileSystem fs = outPath.getFileSystem(conf);
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outPath, DocDocWritable.class,
                FloatWritable.class, SequenceFile.CompressionType.NONE);
        try {
            // listFiles() returns null when the directory cannot be listed.
            File[] inputFiles = input.isDirectory() ? input.listFiles() : null;
            if (inputFiles == null) {
                System.err.println("Warning: " + input + " is not a readable directory; no records written.");
            } else {
                for (File inputFile : inputFiles) {
                    if (inputFile.isFile())
                        appendPlaceholderRecords(inputFile, writer);
                }
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Appends one record per line of <code>inputFile</code>.
     * 
     * NOTE(review): the line content is ignored and the fixed record
     * (0, 3) -> 1 is appended for every line, exactly as before. The text
     * format is not defined in this file, so real parsing still has to be
     * implemented.
     */
    private static void appendPlaceholderRecords(File inputFile, SequenceFile.Writer writer) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile)));
        try {
            while (br.readLine() != null)
                writer.append(new DocDocWritable(0, 3), new FloatWritable(1));
        } finally {
            br.close();
        }
    }

    /**
     * Prints bucket statistics of an LSH output directory and remembers the
     * number of the largest bucket. When a third argument is present the
     * largest bucket is also written to "maxBucket".
     * 
     * @param args <code>{"3", lshDirectory}</code>, optionally followed by
     *            any third argument to request the largest bucket
     * @throws IOException if the directory or one of its files cannot be read
     */
    public static void lshProjectionStat(String[] args) throws IOException {
        boolean produceMax = false;
        if (args.length == 3)
            produceMax = true;
        else if (args.length != 2)
            printUsage(3);

        Path inputPath = new Path(args[1]);
        Configuration conf = new Configuration();
        FileSystem fs = inputPath.getFileSystem(conf);

        BucketStats stats = scanBuckets(fs, fs.listStatus(inputPath), conf);
        maxBucketID = stats.maxId;

        float avgBucketSize = (stats.bucketCount == 0) ? 0f : stats.totalSize / (float) stats.bucketCount;
        long minBucket = (stats.minSize == Long.MAX_VALUE) ? 0 : stats.minSize;

        System.out.println("Number of buckets:" + stats.bucketCount);
        System.out.println("Max. bucket size:" + stats.maxSize + " with ID:" + maxBucketID);
        System.out.println("Min. bucket size:" + minBucket);
        System.out.println("Avg. buckets size:" + avgBucketSize);
        System.out.println("R-std. among bucket sizes:" + getRStd(avgBucketSize, stats.sizes));
        System.out.println("Total comparison done within buckets:" + getSumCombin(stats.sizes));
        if (produceMax) {
            // Write the bucket directly: the third argument here is a flag
            // ("yes"), not a bucket number that produceMaxBucket could parse.
            writeBucket(fs, inputPath, conf, maxBucketID);
        }
        // getRepatedPairs(files, fs, conf);
    }

    /** Tells whether a directory entry is a sub-directory or a "_"-prefixed file and must be skipped. */
    private static boolean isSkipped(FileSystem fs, FileStatus file) throws IOException {
        return fs.isDirectory(file.getPath()) || file.getPath().getName().startsWith("_");
    }

    /**
     * Scans all bucket files and collects the size of every bucket. Records
     * that precede the first bucket marker of a file belong to no bucket and
     * are ignored.
     */
    private static BucketStats scanBuckets(FileSystem fs, FileStatus[] files, Configuration conf)
            throws IOException {
        BucketStats stats = new BucketStats();
        if (files == null)
            return stats;

        for (FileStatus file : files) {
            if (isSkipped(fs, file))
                continue;

            Reader reader = new SequenceFile.Reader(fs, file.getPath(), conf);
            try {
                LongWritable key = new LongWritable();
                FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
                boolean open = false;
                long size = 0;

                while (reader.next(key, value)) {
                    if (key.get() == 0) {
                        if (open)
                            stats.closeBucket(size);
                        stats.bucketCount++;
                        open = true;
                        size = 0;
                    } else if (open) {
                        size++;
                    }
                }
                // The end of the file ends the bucket that is still open.
                if (open)
                    stats.closeBucket(size);
            } finally {
                reader.close();
            }
        }
        return stats;
    }

    /**
     * Computes the relative standard deviation (sample standard deviation
     * divided by the average) and prints the standard deviation.
     * 
     * @param avg average of the values in <code>array</code>
     * @param array the sample values
     * @return standard deviation divided by <code>avg</code>; 0 when there are
     *         fewer than two values or <code>avg</code> is 0
     */
    public static float getRStd(float avg, ArrayList<Integer> array) {
        double sd = 0;
        if (array.size() > 1) {
            double sum = 0;
            for (int i = 0; i < array.size(); i++)
                sum += Math.pow(array.get(i) - avg, 2);
            sd = Math.sqrt(sum / (array.size() - 1));
        }
        System.out.println("Standard deviation across buckets: " + sd);
        return (avg == 0) ? 0f : (float) (sd / avg);
    }

    /**
     * Sums, over all values n in <code>array</code>, the number of unordered
     * pairs n * (n - 1) / 2.
     * 
     * @param array bucket sizes
     * @return total number of pairwise comparisons within the buckets
     */
    public static long getSumCombin(ArrayList<Integer> array) {
        long sum = 0;
        for (int i = 0; i < array.size(); i++) {
            long n = array.get(i);
            if (n >= 2)
                sum += n * (n - 1) / 2; // exact long arithmetic instead of double
        }
        return sum;
    }

    /**
     * Loads every bucket as a bit list of its record keys and prints the
     * number of pairs shared between buckets. Records that precede the first
     * bucket marker of a file are ignored.
     * 
     * Keeps all buckets in memory at once (the original author noted
     * "java heap space" on this method).
     * 
     * @param files entries of the LSH output directory
     * @param fs file system holding the files
     * @param conf Hadoop configuration used to open the files
     * @throws IOException if one of the files cannot be read
     */
    public static void getRepatedPairs(FileStatus[] files, FileSystem fs, Configuration conf) throws IOException {
        long bucketCount = 0;
        ArrayList<NumByteList> buckets = new ArrayList<NumByteList>();

        for (FileStatus file : files) {
            if (isSkipped(fs, file))
                continue;

            Reader reader = new SequenceFile.Reader(fs, file.getPath(), conf);
            try {
                LongWritable key = new LongWritable();
                FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
                NumByteList bucket = null;

                while (reader.next(key, value)) {
                    if (key.get() == 0) {
                        if (bucket != null)
                            buckets.add(bucket);
                        bucketCount++;
                        bucket = new NumByteList(bucketCount);
                    } else if (bucket != null) {
                        bucket.addDoc(key.get());
                    }
                }
                // Do not lose the bucket that was still open at end of file.
                if (bucket != null)
                    buckets.add(bucket);
            } finally {
                reader.close();
            }
        }
        System.out.println("Number of repeated docs across buckets: " + getRepetedPairsCount(buckets));
    }

    /**
     * Placeholder that is not implemented yet.
     * 
     * @param doc ignored
     * @return always 0
     */
    public static int getBitId(Long doc) {
        return 0;
    }

    /**
     * Writes one bucket to the path "maxBucket". The bucket number is taken
     * from the optional third argument; without it the number remembered from
     * a previous {@link #lshProjectionStat(String[])} run is used, and if
     * there is none the input is scanned for the largest bucket first.
     * 
     * @param args <code>{"4", lshDirectory}</code>, optionally followed by
     *            the 1-based bucket number
     * @throws IOException if reading the input or writing the output fails
     * @throws NumberFormatException if the bucket number is not an integer
     */
    public static void produceMaxBucket(String args[]) throws IOException {
        if (args.length != 2 && args.length != 3)
            printUsage(4);

        Path inputPath = new Path(args[1]);
        Configuration conf = new Configuration();
        FileSystem fs = inputPath.getFileSystem(conf);

        if (args.length == 3) {
            maxBucketID = Long.parseLong(args[2]);
        } else if (maxBucketID == 0) {
            // No bucket chosen yet: bucket numbers start at 1, so 0 would
            // never match and nothing would be written.
            maxBucketID = scanBuckets(fs, fs.listStatus(inputPath), conf).maxId;
        }
        writeBucket(fs, inputPath, conf, maxBucketID);
    }

    /**
     * Replaces the path "maxBucket" with the records of the bucket that has
     * the given 1-based number. An existing "maxBucket" is deleted even when
     * the bucket is not found.
     */
    private static void writeBucket(FileSystem fs, Path inputPath, Configuration conf, long bucketId)
            throws IOException {
        Path outPath = new Path("maxBucket");
        if (fs.exists(outPath))
            fs.delete(outPath, true);

        FileStatus[] files = fs.listStatus(inputPath);
        long bucketCount = 0;

        if (files != null) {
            for (FileStatus file : files) {
                if (isSkipped(fs, file))
                    continue;

                Reader reader = new SequenceFile.Reader(fs, file.getPath(), conf);
                try {
                    LongWritable key = new LongWritable();
                    FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();

                    while (reader.next(key, value)) {
                        if (key.get() != 0)
                            continue;
                        bucketCount++;
                        if (bucketCount != bucketId)
                            continue;

                        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outPath,
                                LongWritable.class, FeatureWeightArrayWritable.class,
                                SequenceFile.CompressionType.NONE);
                        try {
                            // Copy until the next marker or the end of the file.
                            while (reader.next(key, value) && (key.get() != 0))
                                writer.append(key, value);
                        } finally {
                            writer.close();
                        }
                        return;
                    }
                } finally {
                    reader.close();
                }
            }
        }
        System.err.println("Bucket " + bucketId + " not found under " + inputPath);
    }

    /**
     * Counts, over every pair of buckets, the number of unordered pairs that
     * can be formed from the entries both buckets have in common.
     * 
     * @param buckets bit lists of the buckets
     * @return sum over all bucket pairs of C(shared, 2)
     */
    public static Long getRepetedPairsCount(ArrayList<NumByteList> buckets) {
        long pairCount = 0;
        for (int i = 0; i < buckets.size() - 1; i++) {
            NumByteList bucketi = buckets.get(i);
            for (int j = i + 1; j < buckets.size(); j++) {
                NumByteList bucketj = buckets.get(j);
                int commonBytes = Math.min(bucketi.getNumBytes(), bucketj.getNumBytes());

                // Count the shared entries over the whole bit list first:
                // applying C(., 2) per byte would miss pairs whose two
                // members fall into different bytes.
                long shared = 0;
                for (int k = 0; k < commonBytes; k++)
                    shared += intersectCount(bucketi.getByte(k), bucketj.getByte(k));
                pairCount += shared * (shared - 1) / 2;
            }
        }
        return pairCount;
    }

    /**
     * Counts the bit positions that are set in both bytes.
     * 
     * @return number of common 1-bits, between 0 and 8
     */
    public static int intersectCount(byte a, byte b) {
        // Mask to the low 8 bits: a negative byte is sign-extended to int.
        return Integer.bitCount((a & b) & 0xFF);
    }

    /** Tells whether bit number <code>bit</code> (0 = lowest) of <code>b</code> is set. */
    private static boolean isBitSet(byte b, int bit) {
        return (b & (1 << bit)) != 0;
    }

    /**
     * Computes the binomial coefficient "x choose y" in floating point.
     * 
     * @return C(x, y), or 0 when <code>y</code> is negative or greater than
     *         <code>x</code>
     */
    public static double choose(int x, int y) {
        if (y < 0 || y > x)
            return 0;
        // C(x, y) == C(x, x - y); iterate over the smaller of the two.
        if (y > x / 2) {
            y = x - y;
        }

        double denominator = 1.0, numerator = 1.0;
        for (int i = 1; i <= y; i++) {
            denominator *= i;
            numerator *= (x + 1 - i);
        }
        return numerator / denominator;
    }
}

/**
 * A numbered, growable bit list: document id <code>d</code> is represented
 * by bit <code>d % 8</code> of byte <code>d / 8</code>.
 */
class NumByteList {
    /** Number given to this list at construction. */
    long num;
    /** Backing bytes; grows on demand so that the highest added id fits. */
    ArrayList<Byte> bytes = new ArrayList<Byte>(0);

    /**
     * @param n number identifying this list
     */
    public NumByteList(long n) {
        num = n;
    }

    /**
     * Marks a document id as present by setting its bit.
     * 
     * @param docId non-negative document id
     * @throws IllegalArgumentException if <code>docId</code> is negative or
     *             too large to be indexed
     * @throws NullPointerException if <code>docId</code> is null
     */
    public void addDoc(Long docId) {
        long id = docId.longValue();
        if (id < 0)
            throw new IllegalArgumentException("docId must be non-negative: " + id);
        long byteIndex = id / 8;
        if (byteIndex >= Integer.MAX_VALUE)
            throw new IllegalArgumentException("docId too large: " + id);

        int index = (int) byteIndex;
        // Grow until position 'index' itself exists (size must exceed index).
        while (bytes.size() <= index)
            bytes.add((byte) 0);
        // Store the updated byte back; computing it alone changes nothing.
        bytes.set(index, changeBit(bytes.get(index), (int) (id % 8)));
    }

    /**
     * 
     * @param tochange byte to start from
     * @param i bit position, 0 being the lowest bit
     * @return a byte with bit-i set to 1.
     */
    public byte changeBit(byte tochange, int i) {
        tochange = (byte) (tochange | (1 << i));
        return tochange;
    }

    /**
     * @return number of bytes currently held
     */
    public int getNumBytes() {
        return bytes.size();
    }

    /**
     * @param i byte position
     * @return the byte at position i
     * @throws IndexOutOfBoundsException if i is not below {@link #getNumBytes()}
     */
    public byte getByte(int i) {
        return bytes.get(i);
    }
}