spark_example.matrix_decomposition.LocalitySensitiveHash.java Source code

Java tutorial

Introduction

Here is the source code for spark_example.matrix_decomposition.LocalitySensitiveHash.java

Source

/*
 * Copyright (c) 2015, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */

package spark_example.matrix_decomposition;

import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.util.CombinatoricsUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Sign-of-dot-product locality sensitive hash. A vector is assigned to one of
 * {@code 2^numHashes} partitions: bit {@code i} of its partition index is set exactly when
 * its dot product with the {@code i}-th hash vector is strictly positive.
 *
 * <p>The number of hash vectors, and how many bits a candidate partition's index may differ
 * from the query's index, are derived in the constructor from the requested sample rate and
 * the number of cores. After construction, no method of this class writes to its fields or
 * to the arrays they reference.
 */
final class LocalitySensitiveHash {

    /** Upper bound on the number of hash vectors, hence at most {@code 2^16} partitions. */
    static final int MAX_HASHES = 16;
    private static final Logger log = LoggerFactory.getLogger(LocalitySensitiveHash.class);

    /** One vector per hash bit. */
    private final double[][] hashVectors;
    /** Largest number of index bits in which a candidate partition may differ from the query's. */
    private final int maxBitsDiffering;
    /** Number of partition indices differing from any given index in at most maxBitsDiffering bits. */
    private final int numCandidates;
    /** All integers in [0, 2^numHashes), ordered by ascending number of set bits. */
    private final int[] candidateIndicesPrototype;
    /** All integers in [0, 2^numHashes), in ascending order. */
    private final int[] allIndices;

    /**
     * Builds a hash sized for the number of processors reported by the runtime.
     *
     * @param sampleRate target fraction of partitions to examine per query
     * @param numFeatures length passed to the random vector generator for each hash vector
     */
    LocalitySensitiveHash(double sampleRate, int numFeatures) {
        this(sampleRate, numFeatures, Runtime.getRuntime().availableProcessors());
    }

    /**
     * This separate constructor exists for unit testing only.
     *
     * @param sampleRate target fraction of partitions to examine per query
     * @param numFeatures length passed to the random vector generator for each hash vector
     * @param numCores number of cores to keep busy; bounds how many partitions a query examines
     */
    LocalitySensitiveHash(double sampleRate, int numFeatures, int numCores) {

        // How many hash functions to use? Use as few as possible that still achieve the desired
        // sample rate or less, approximately.
        int numHashes = 0;
        int bitsDiffering = 0;
        for (; numHashes < MAX_HASHES; numHashes++) {
            bitsDiffering = chooseBitsDiffering(numHashes, numCores);
            long numPartitionsToTry = countPartitionsWithin(numHashes, bitsDiffering);

            if (bitsDiffering == numHashes && numPartitionsToTry < numCores) {
                // Even querying every partition can't keep the cores busy; keep going
                continue;
            }

            // Consider what fraction of all 2^n partitions is then examined, as a proxy for the
            // sample rate. Stop as soon as it's <= target sample rate.
            if (numPartitionsToTry <= sampleRate * (1L << numHashes)) {
                break;
            }
        }
        if (numHashes == MAX_HASHES) {
            // The loop only breaks while numHashes < MAX_HASHES, so reaching MAX_HASHES means it
            // ran out of options. bitsDiffering then still describes MAX_HASHES - 1 hashes;
            // recompute it for the number of hashes that will actually be used.
            bitsDiffering = chooseBitsDiffering(numHashes, numCores);
        }

        log.info("LSH with {} hashes, querying partitions with up to {} bits differing", numHashes, bitsDiffering);
        this.maxBitsDiffering = bitsDiffering;
        // Invariant per instance, so computed once here instead of on every query. Fits an int:
        // it is at most 2^MAX_HASHES.
        this.numCandidates = (int) countPartitionsWithin(numHashes, bitsDiffering);

        hashVectors = new double[numHashes][];
        RandomGenerator random = RandomManager.getRandom();
        for (int i = 0; i < numHashes; i++) {
            hashVectors[i] = pickMostOrthogonal(hashVectors, i, numFeatures, random);
        }
        log.info("Chose {} random hash vectors", hashVectors.length);

        // Contains all 2^numHashes integers from 0. The first element has 0 bits set. The next
        // numHashes elements are all such integers with 1 bit set. Then 2 bits, and so on. This is
        // used as a "mask" on top of an initial candidate index in order to construct results in
        // getCandidateIndices()
        candidateIndicesPrototype = new int[1 << numHashes];
        // offsetPerBitsActive[b] is the next free slot for an integer with b bits set; it starts
        // at the count of integers having fewer than b bits set.
        int[] offsetPerBitsActive = new int[numHashes + 1];
        for (int i = 1; i <= numHashes; i++) {
            offsetPerBitsActive[i] = offsetPerBitsActive[i - 1]
                    + (int) CombinatoricsUtils.binomialCoefficient(numHashes, i - 1);
        }
        for (int i = 0; i < candidateIndicesPrototype.length; i++) {
            candidateIndicesPrototype[offsetPerBitsActive[Integer.bitCount(i)]++] = i;
        }

        // Contains all 2^numHashes integers from 0
        allIndices = new int[1 << numHashes];
        for (int i = 0; i < allIndices.length; i++) {
            allIndices[i] = i;
        }
    }

    /**
     * @return number of hash vectors in use, which is the number of bits in a partition index
     */
    int getNumHashes() {
        return hashVectors.length;
    }

    /**
     * @return total number of partitions, {@code 2^getNumHashes()}
     */
    int getNumPartitions() {
        return 1 << getNumHashes();
    }

    /**
     * @return maximum number of bits by which a candidate partition's index may differ from
     *  the index of the partition that the query vector hashes to
     */
    int getMaxBitsDiffering() {
        return maxBitsDiffering;
    }

    /**
     * @param vector vector to hash
     * @return index of partition into which it hashes
     */
    int getIndexFor(float[] vector) {
        int index = 0;
        for (int i = 0; i < hashVectors.length; i++) {
            // Bit i records which side of the i-th hash vector's hyperplane the vector lies on
            if (VectorMath.dot(hashVectors[i], vector) > 0.0) {
                index |= 1 << i;
            }
        }
        return index;
    }

    /**
     * @param vector vector to hash
     * @return index of partition into which it hashes
     */
    int getIndexFor(double[] vector) {
        int index = 0;
        for (int i = 0; i < hashVectors.length; i++) {
            // Bit i records which side of the i-th hash vector's hyperplane the vector lies on
            if (VectorMath.dot(hashVectors[i], vector) > 0.0) {
                index |= 1 << i;
            }
        }
        return index;
    }

    /**
     * @param vector vector whose dot product with hashed vectors is to be maximized
     * @return indices of partitions containing candidates to check; always a newly allocated
     *  array that the caller is free to modify
     */
    int[] getCandidateIndices(double[] vector) {
        if (maxBitsDiffering == hashVectors.length) {
            // Every partition is a candidate, whatever the vector hashes to. Hand out a copy so
            // that a caller modifying the result cannot corrupt later queries.
            return allIndices.clone();
        }
        int mainIndex = getIndexFor(vector);
        if (maxBitsDiffering == 0) {
            return new int[] { mainIndex };
        }
        // The first numCandidates prototype entries are exactly the masks with at most
        // maxBitsDiffering bits set; XOR-ing each onto the main index flips those bits.
        int[] result = new int[numCandidates];
        for (int i = 0; i < numCandidates; i++) {
            result[i] = candidateIndicesPrototype[i] ^ mainIndex;
        }
        return result;
    }

    /**
     * Picks how many bits may differ for the given number of hashes: as many as possible
     * while the number of partitions examined stays below {@code numCores}, allowing the
     * count to overshoot {@code numCores} by one step.
     *
     * @param numHashes number of hash bits
     * @param numCores number of cores to keep busy
     * @return chosen number of differing bits, between 0 and {@code numHashes}
     */
    private static int chooseBitsDiffering(int numHashes, int numCores) {
        int bitsDiffering = 0;
        // Number of different partitions examined when allowing bitsDiffering bits to differ
        long numPartitionsToTry = 1;
        while (bitsDiffering < numHashes && numPartitionsToTry < numCores) {
            // There are numHashes-choose-bitsDiffering ways for numHashes bits to differ in
            // exactly bitsDiffering bits
            bitsDiffering++;
            numPartitionsToTry += CombinatoricsUtils.binomialCoefficient(numHashes, bitsDiffering);
        }
        return bitsDiffering;
    }

    /**
     * Counts the partition indices that differ from a given index in at most
     * {@code bitsDiffering} of {@code numHashes} bits, including the index itself.
     *
     * @param numHashes number of hash bits
     * @param bitsDiffering maximum number of differing bits, at most {@code numHashes}
     * @return sum of {@code numHashes}-choose-{@code k} for {@code k} from 0 to {@code bitsDiffering}
     */
    private static long countPartitionsWithin(int numHashes, int bitsDiffering) {
        long total = 0;
        for (int k = 0; k <= bitsDiffering; k++) {
            total += CombinatoricsUtils.binomialCoefficient(numHashes, k);
        }
        return total;
    }

    /**
     * Draws random candidate vectors and keeps the one most orthogonal to those already
     * chosen, as scored by {@link #totalAbsCos}. Gives up after 1000 consecutive candidates
     * fail to improve on the best seen, or at once when a score of exactly 0 is found.
     *
     * @param existingVectors vectors chosen so far; only the first {@code numExisting} are read
     * @param numExisting number of vectors already chosen
     * @param numFeatures length passed to the random vector generator
     * @param random source of randomness
     * @return best candidate seen
     */
    private static double[] pickMostOrthogonal(double[][] existingVectors,
                                               int numExisting,
                                               int numFeatures,
                                               RandomGenerator random) {
        double bestTotalDot = Double.POSITIVE_INFINITY;
        double[] nextBest = null;
        // Try, like, lots of them
        int candidatesSinceBest = 0;
        while (candidatesSinceBest < 1000) {
            double[] candidate = VectorMath.randomVectorD(numFeatures, random);
            // measure by total (absolute) cosine with the vectors already chosen
            double score = totalAbsCos(existingVectors, numExisting, candidate);
            if (score < bestTotalDot) {
                nextBest = candidate;
                // Stop if best possible score; always the case for the very first vector,
                // which has nothing to be compared against
                if (score == 0.0) {
                    break;
                }
                bestTotalDot = score;
                candidatesSinceBest = 0;
            } else {
                candidatesSinceBest++;
            }
        }
        return nextBest;
    }

    /**
     * @param existingVectors vectors to compare against; only the first {@code numExisting} are read
     * @param numExisting number of vectors to compare against
     * @param newVector vector to score
     * @return sum, over the existing vectors, of the absolute value of the dot product with
     *  {@code newVector} divided by both vectors' norms; 0 if {@code numExisting} is 0
     */
    private static double totalAbsCos(double[][] existingVectors, int numExisting, double[] newVector) {
        double newNorm = VectorMath.norm(newVector);
        double sum = 0.0;
        for (int i = 0; i < numExisting; i++) {
            sum += Math.abs(VectorMath.dot(existingVectors[i], newVector)) / VectorMath.norm(existingVectors[i])
                    / newNorm;
        }
        return sum;
    }

}