// Self-organizing map (Kohonen network) implementation.
/*
 * Copyright Myrrix Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package net.myrrix.online.som;

import java.util.Collections;
import java.util.Comparator;

import com.google.common.base.Preconditions;
import org.apache.commons.math3.distribution.IntegerDistribution;
import org.apache.commons.math3.distribution.PascalDistribution;
import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.util.FastMath;
import org.apache.commons.math3.util.Pair;
import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import net.myrrix.common.LangUtils;
import net.myrrix.common.collection.FastByIDMap;
import net.myrrix.common.math.SimpleVectorMath;
import net.myrrix.common.random.RandomManager;
import net.myrrix.common.random.RandomUtils;

/**
 * <p>This class implements a basic version of
 * <a href="http://en.wikipedia.org/wiki/Self-organizing_map">self-organizing maps</a>, or
 * <a href="http://www.scholarpedia.org/article/Kohonen_network">Kohonen network</a>. Self-organizing maps
 * bear some similarity to clustering techniques like
 * <a href="http://en.wikipedia.org/wiki/K-means_clustering">k-means</a>, in that they both try to discover
 * the centers of relatively close or similar groups of points in the input.</p>
 *
 * <p>K-means and other pure clustering algorithms try to find the centers which best reflect the
 * input's structure. Self-organizing maps have a different priority; the centers it is fitting are
 * connected together as part of a two-dimensional grid, and influence each other as they move. The
 * result is like fitting an elastic 2D grid of points to the input. This constraint results in less
 * faithful clustering -- it is not even primarily a clustering. But it does result in a projection
 * of points onto a 2D surface that keeps similar things near to each other -- a sort of randomized
 * ad-hoc 2D map of the space.</p>
 *
 * @author Sean Owen
 * @since 1.0
 */
public final class SelfOrganizingMaps {

  private static final Logger log = LoggerFactory.getLogger(SelfOrganizingMaps.class);

  public static final double DEFAULT_MIN_DECAY = 0.00001;
  public static final double DEFAULT_INIT_LEARNING_RATE = 0.5;

  private final double minDecay;
  private final double initLearningRate;

  public SelfOrganizingMaps() {
    this(DEFAULT_MIN_DECAY, DEFAULT_INIT_LEARNING_RATE);
  }

  /**
   * @param minDecay learning rate decays over iterations; when the decay factor drops below this, stop iteration
   *  as further updates will do little.
   * @param initLearningRate initial learning rate, decaying over time, which controls how much a newly assigned
   *  vector will move vector centers.
   * @throws IllegalArgumentException if {@code minDecay} is not positive or {@code initLearningRate}
   *  is not in (0,1]
   */
  public SelfOrganizingMaps(double minDecay, double initLearningRate) {
    // Guava Preconditions message templates use %s placeholders, not SLF4J-style {};
    // with {} the offending value would never appear in the exception message.
    Preconditions.checkArgument(minDecay > 0.0, "Min decay must be positive: %s", minDecay);
    Preconditions.checkArgument(initLearningRate > 0.0 && initLearningRate <= 1.0,
                                "Learning rate should be in (0,1]: %s", initLearningRate);
    this.minDecay = minDecay;
    this.initLearningRate = initLearningRate;
  }

  /**
   * Builds a map with an automatically chosen sampling rate (about one assignment per node on average).
   *
   * @see #buildSelfOrganizedMap(FastByIDMap, int, double)
   */
  public Node[][] buildSelfOrganizedMap(FastByIDMap<float[]> vectors, int maxMapSize) {
    return buildSelfOrganizedMap(vectors, maxMapSize, Double.NaN);
  }

  /**
   * @param vectors user-feature or item-feature matrix from current computation generation
   * @param maxMapSize maximum desired dimension of the (square) 2D map
   * @param samplingRate fraction of input to consider when creating the map
   *  size overall, nodes will be pruned to remove least-matching assignments, and not all vectors in the
   *  input will be assigned.
   * @return a square, 2D array of {@link Node} representing the map, with dimension {@code mapSize}
   */
  public Node[][] buildSelfOrganizedMap(FastByIDMap<float[]> vectors, int maxMapSize, double samplingRate) {

    Preconditions.checkNotNull(vectors);
    Preconditions.checkArgument(!vectors.isEmpty());
    Preconditions.checkArgument(maxMapSize > 0);
    Preconditions.checkArgument(Double.isNaN(samplingRate) || (samplingRate > 0.0 && samplingRate <= 1.0));

    if (Double.isNaN(samplingRate)) {
      // Compute a sampling rate that shoots for 1 assignment per node on average
      double expectedNodeSize = (double) vectors.size() / (maxMapSize * maxMapSize);
      samplingRate = expectedNodeSize > 1.0 ? 1.0 / expectedNodeSize : 1.0;
    }
    log.debug("Sampling rate: {}", samplingRate);

    // Clamp to at least 1: for small inputs or low sampling rates the sqrt term truncates to 0,
    // which would produce an empty map and an invalid PascalDistribution parameter downstream.
    int mapSize = FastMath.max(1,
        FastMath.min(maxMapSize, (int) FastMath.sqrt(vectors.size() * samplingRate)));
    Node[][] map = buildInitialMap(vectors, mapSize);

    // First pass: move node centers towards sampled input (the "sketch")
    sketchMapParallel(vectors, samplingRate, map);

    // Discard tentative assignments made while sketching; final assignment happens next
    for (Node[] mapRow : map) {
      for (Node node : mapRow) {
        node.clearAssignedIDs();
      }
    }

    // Second pass: assign (a sample of) vectors to their best-matching node
    assignVectorsParallel(vectors, samplingRate, map);
    sortMembers(map);

    int numFeatures = vectors.entrySet().iterator().next().getValue().length;
    buildProjections(numFeatures, map);

    return map;
  }

  /**
   * Sketches the map by iterating over input vectors and nudging node centers towards each one,
   * with an exponentially decaying learning rate. Iteration stops early once the decay factor
   * falls below {@link #minDecay}, since further updates would have negligible effect.
   */
  private void sketchMapParallel(FastByIDMap<float[]> vectors, double samplingRate, Node[][] map) {
    int mapSize = map.length;
    // FastMath.log for consistency with the rest of this class
    double sigma = (vectors.size() * samplingRate) / FastMath.log(mapSize);
    int t = 0;
    for (FastByIDMap.MapEntry<float[]> entry : vectors.entrySet()) {
      float[] V = entry.getValue();
      double decayFactor = FastMath.exp(-t / sigma);
      t++;
      if (decayFactor < minDecay) {
        break;
      }
      int[] bmuCoordinates = findBestMatchingUnit(V, map);
      if (bmuCoordinates != null) {
        updateNeighborhood(map, V, bmuCoordinates[0], bmuCoordinates[1], decayFactor);
      }
    }
  }

  /**
   * Assigns a random sample of input vectors to their best-matching node, recording each
   * assignment together with its cosine-similarity score to the node center.
   */
  private static void assignVectorsParallel(FastByIDMap<float[]> vectors, double samplingRate, Node[][] map) {
    boolean doSample = samplingRate < 1.0;
    RandomGenerator random = RandomManager.getRandom();
    for (FastByIDMap.MapEntry<float[]> entry : vectors.entrySet()) {
      if (doSample && random.nextDouble() > samplingRate) {
        continue;
      }
      float[] V = entry.getValue();
      int[] bmuCoordinates = findBestMatchingUnit(V, map);
      if (bmuCoordinates != null) {
        Node node = map[bmuCoordinates[0]][bmuCoordinates[1]];
        float[] center = node.getCenter();
        // Cosine similarity between the vector and the node center
        double currentScore =
            SimpleVectorMath.dot(V, center) / (SimpleVectorMath.norm(center) * SimpleVectorMath.norm(V));
        Pair<Double,Long> newAssignedID = new Pair<Double,Long>(currentScore, entry.getKey());
        node.addAssignedID(newAssignedID);
      }
    }
  }

  /**
   * @return map of initialized {@link Node}s, where each node is empty and initialized to a randomly chosen
   *  input vector normalized to unit length
   */
  private static Node[][] buildInitialMap(FastByIDMap<float[]> vectors, int mapSize) {

    double p = ((double) mapSize * mapSize) / vectors.size(); // Choose mapSize^2 out of # vectors
    IntegerDistribution pascalDistribution;
    if (p >= 1.0) {
      // No sampling at all, we can't fill the map with one pass even
      pascalDistribution = null;
    } else {
      // Number of un-selected elements to skip between selections is geometrically distributed with
      // parameter p; this is the same as a negative binomial / Pascal distribution with r=1:
      pascalDistribution = new PascalDistribution(RandomManager.getRandom(), 1, p);
    }

    LongPrimitiveIterator keyIterator = vectors.keySetIterator();
    Node[][] map = new Node[mapSize][mapSize];
    for (Node[] mapRow : map) {
      for (int j = 0; j < mapSize; j++) {
        if (pascalDistribution != null) {
          keyIterator.skip(pascalDistribution.sample());
        }
        while (!keyIterator.hasNext()) {
          keyIterator = vectors.keySetIterator(); // Start over, a little imprecise but affects it not much
          Preconditions.checkState(keyIterator.hasNext());
          if (pascalDistribution != null) {
            keyIterator.skip(pascalDistribution.sample());
          }
        }
        float[] sampledVector = vectors.get(keyIterator.nextLong());
        mapRow[j] = new Node(sampledVector);
      }
    }
    return map;
  }

  /**
   * @return coordinates of {@link Node} in map whose center is "closest" to the given vector. Here closeness
   *  is defined as smallest angle between the vectors
   */
  private static int[] findBestMatchingUnit(float[] vector, Node[][] map) {
    int mapSize = map.length;
    double vectorNorm = SimpleVectorMath.norm(vector);
    double bestScore = Double.NEGATIVE_INFINITY;
    int bestI = -1;
    int bestJ = -1;
    for (int i = 0; i < mapSize; i++) {
      Node[] mapRow = map[i];
      for (int j = 0; j < mapSize; j++) {
        float[] center = mapRow[j].getCenter();
        double currentScore =
            SimpleVectorMath.dot(vector, center) / (SimpleVectorMath.norm(center) * vectorNorm);
        // Skip NaN/infinite scores (e.g. zero-norm vectors) so they never win
        if (LangUtils.isFinite(currentScore) && currentScore > bestScore) {
          bestScore = currentScore;
          bestI = i;
          bestJ = j;
        }
      }
    }
    return bestI == -1 || bestJ == -1 ? null : new int[] {bestI, bestJ};
  }

  /**
   * Completes the update step after assigning an input vector tentatively to a {@link Node}. The assignment
   * causes nearby nodes (including the assigned one) to move their centers towards the vector.
   */
  private void updateNeighborhood(Node[][] map, float[] V, int bmuI, int bmuJ, double decayFactor) {
    int mapSize = map.length;
    double neighborhoodRadius = mapSize * decayFactor;

    int minI = FastMath.max(0, (int) FastMath.floor(bmuI - neighborhoodRadius));
    int maxI = FastMath.min(mapSize, (int) FastMath.ceil(bmuI + neighborhoodRadius));
    int minJ = FastMath.max(0, (int) FastMath.floor(bmuJ - neighborhoodRadius));
    int maxJ = FastMath.min(mapSize, (int) FastMath.ceil(bmuJ + neighborhoodRadius));

    for (int i = minI; i < maxI; i++) {
      Node[] mapRow = map[i];
      for (int j = minJ; j < maxJ; j++) {
        double learningRate = initLearningRate * decayFactor;
        double currentDistance = distance(i, j, bmuI, bmuJ);
        // Gaussian neighborhood function: full effect at the BMU, falling off with grid distance
        double theta = FastMath.exp(
            -(currentDistance * currentDistance) / (2.0 * neighborhoodRadius * neighborhoodRadius));
        double learningTheta = learningRate * theta;
        float[] center = mapRow[j].getCenter();
        int length = center.length;
        // Don't synchronize, for performance. Colliding updates once in a while does little.
        for (int k = 0; k < length; k++) {
          center[k] += (float) (learningTheta * (V[k] - center[k]));
        }
      }
    }
  }

  /**
   * Sorts each node's assigned IDs by score, descending (best matches first).
   */
  private static void sortMembers(Node[][] map) {
    for (Node[] mapRow : map) {
      for (Node node : mapRow) {
        Collections.sort(node.getAssignedIDs(), new Comparator<Pair<Double,Long>>() {
          @Override
          public int compare(Pair<Double,Long> a, Pair<Double,Long> b) {
            // Double.compare imposes a total order (including NaN), satisfying the
            // Comparator contract; the previous hand-rolled comparison returned 0
            // for any NaN operand, which violates transitivity.
            return Double.compare(b.getFirst(), a.getFirst());
          }
        });
      }
    }
  }

  /**
   * Computes a 3D "color" projection for each node by projecting its mean-centered center vector
   * onto three random unit basis vectors, scaled into [0,1] per component.
   */
  private static void buildProjections(int numFeatures, Node[][] map) {
    int mapSize = map.length;
    float[] mean = new float[numFeatures];
    for (Node[] mapRow : map) {
      for (int j = 0; j < mapSize; j++) {
        add(mapRow[j].getCenter(), mean);
      }
    }
    divide(mean, mapSize * mapSize);

    RandomGenerator random = RandomManager.getRandom();
    float[] rBasis = RandomUtils.randomUnitVector(numFeatures, random);
    float[] gBasis = RandomUtils.randomUnitVector(numFeatures, random);
    float[] bBasis = RandomUtils.randomUnitVector(numFeatures, random);

    for (Node[] mapRow : map) {
      for (int j = 0; j < mapSize; j++) {
        float[] W = mapRow[j].getCenter().clone();
        subtract(mean, W);
        double norm = SimpleVectorMath.norm(W);
        float[] projection3D = mapRow[j].getProjection3D();
        // Each dot/norm is a cosine in [-1,1]; map linearly into [0,1]
        projection3D[0] = (float) ((1.0 + SimpleVectorMath.dot(W, rBasis) / norm) / 2.0);
        projection3D[1] = (float) ((1.0 + SimpleVectorMath.dot(W, gBasis) / norm) / 2.0);
        projection3D[2] = (float) ((1.0 + SimpleVectorMath.dot(W, bBasis) / norm) / 2.0);
      }
    }
  }

  /** Element-wise {@code to += from}. */
  private static void add(float[] from, float[] to) {
    int length = from.length;
    for (int i = 0; i < length; i++) {
      to[i] += from[i];
    }
  }

  /** Element-wise {@code from -= toSubtract}. */
  private static void subtract(float[] toSubtract, float[] from) {
    int length = toSubtract.length;
    for (int i = 0; i < length; i++) {
      from[i] -= toSubtract[i];
    }
  }

  /** Element-wise {@code x /= by}. */
  private static void divide(float[] x, float by) {
    int length = x.length;
    for (int i = 0; i < length; i++) {
      x[i] /= by;
    }
  }

  /** @return Euclidean distance between grid coordinates (i1,j1) and (i2,j2) */
  private static double distance(int i1, int j1, int i2, int j2) {
    int diff1 = i1 - i2;
    int diff2 = j1 - j2;
    return FastMath.sqrt(diff1 * diff1 + diff2 * diff2);
  }

}