com.cloudera.science.ml.kmeans.parallel.KMeansParallel.java Source code

Introduction

Here is the source code for com.cloudera.science.ml.kmeans.parallel.KMeansParallel.java
Source

/**
 * Copyright (c) 2012, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.science.ml.kmeans.parallel;

import static org.apache.crunch.types.avro.Avros.doubles;
import static org.apache.crunch.types.avro.Avros.ints;
import static org.apache.crunch.types.avro.Avros.pairs;
import static org.apache.crunch.types.avro.Avros.tableOf;

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Random;

import com.cloudera.science.ml.avro.MLClusterCovariance;
import com.cloudera.science.ml.avro.MLMatrixEntry;
import com.cloudera.science.ml.parallel.covariance.CoMoment;
import com.cloudera.science.ml.parallel.covariance.Covariance;
import com.cloudera.science.ml.parallel.covariance.Index;
import com.cloudera.science.ml.parallel.covariance.MahalanobisDistance;
import com.cloudera.science.ml.parallel.types.MLAvros;
import com.google.common.collect.Maps;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.math.distribution.FDistribution;
import org.apache.commons.math.distribution.FDistributionImpl;
import org.apache.crunch.Aggregator;
import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.MapFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.PObject;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;
import org.apache.crunch.fn.Aggregators;
import org.apache.crunch.materialize.pobject.PObjectImpl;
import org.apache.crunch.types.PTableType;
import org.apache.crunch.types.PType;
import org.apache.crunch.types.PTypeFamily;
import org.apache.crunch.types.avro.Avros;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;

import com.cloudera.science.ml.avro.MLVector;
import com.cloudera.science.ml.core.records.Record;
import com.cloudera.science.ml.core.records.RecordSpec;
import com.cloudera.science.ml.core.records.SimpleRecord;
import com.cloudera.science.ml.core.records.Spec;
import com.cloudera.science.ml.core.vectors.Centers;
import com.cloudera.science.ml.core.vectors.VectorConvert;
import com.cloudera.science.ml.core.vectors.Weighted;
import com.cloudera.science.ml.kmeans.parallel.CentersIndex.Distances;
import com.cloudera.science.ml.parallel.crossfold.Crossfold;
import com.cloudera.science.ml.parallel.fn.SumVectorsAggregator;
import com.cloudera.science.ml.parallel.pobject.ListOfListsPObject;
import com.cloudera.science.ml.parallel.pobject.ListPObject;
import com.cloudera.science.ml.parallel.records.Records;
import com.cloudera.science.ml.parallel.sample.ReservoirSampling;
import com.cloudera.science.ml.parallel.types.MLRecords;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

/**
 * <p>An implementation of the k-means|| algorithm, as described in
 * <a href="http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf">Bahmani et al. (2012)</a>.
 * 
 * <p>The candidate-selection stage is executed by the
 * {@link #initialization(PCollection, int, int, List)} method, which takes a list of
 * initial points and runs over a given dataset of points for a fixed number
 * of iterations in order to find a candidate set of points to stream into the client and
 * cluster using the in-memory k-means algorithms defined in the {@code kmeans} package.
 * Once centers are available, {@link #computeClusterAssignments(PCollection, List)} assigns
 * each point of a dataset to its closest center.
 */
public class KMeansParallel {

    /** Logger for the progress messages written by the driver-side methods of this class. */
    private static final Log LOG = LogFactory.getLog(KMeansParallel.class);

    /** Record layout emitted by {@code computeClusterAssignments}. */
    public static final Spec ASSIGNMENT_SPEC = RecordSpec.builder()
            .addString("vector_id")
            .addInt("cluster_id")
            .addInt("closest_center_id")
            .addDouble("distance")
            .build();

    /** Record layout emitted by {@code computeOutliers}. */
    public static final Spec OUTLIER_SPEC = RecordSpec.builder()
            .addString("vector_id")
            .addInt("cluster_id")
            .addInt("closest_center_id")
            .addDouble("outlier_distance")
            .build();

    /** Forwarded unchanged to every {@code CentersIndex} this instance constructs. */
    private final int projectionBits;

    /** Forwarded unchanged to every {@code CentersIndex} this instance constructs. */
    private final int projectionSamples;

    /** Seed chosen once at construction time and used by {@code createIndex}. */
    private final long seed;

    /** Caller-supplied generator; {@code null} when none was given to the constructor. */
    private final Random random;

    /**
     * Creates an instance without a caller-supplied {@code Random}, using 128 projection
     * bits and 32 projection samples.
     */
    public KMeansParallel() {
        this(null, 128, 32);
    }

    /**
     * Main constructor that includes the option to use a fixed {@code Random} instance
     * for running the k-means algorithm for testing purposes.
     *
     * @param random source of randomness, or {@code null}; when {@code null} the seed is taken
     *        from {@code System.currentTimeMillis()}, otherwise from {@code random.nextLong()}
     * @param projectionBits value handed to each {@code CentersIndex} built by this instance
     * @param projectionSamples value handed to each {@code CentersIndex} built by this instance
     */
    public KMeansParallel(Random random, int projectionBits, int projectionSamples) {
        this.random = random;
        this.seed = (random != null) ? random.nextLong() : System.currentTimeMillis();
        this.projectionBits = projectionBits;
        this.projectionSamples = projectionSamples;
    }

    /**
     * Computes the <i>cost</i> of every given {@code Centers} instance over the dataset. Bahmani
     * et al. define the cost as the sum of squared distances from each point of the dataset to
     * the {@code Centers}.
     * 
     * @param vecs The dataset
     * @param centers The candidate centers; must not be empty
     * @param approx Whether or not to use approximate assignments to speed up computations
     * @return A reference to the Crunch job that calculates the cost for each centers instance
     * @throws IllegalArgumentException if {@code centers} is empty
     */
    public <V extends Vector> PObject<List<Double>> getCosts(PCollection<V> vecs, List<Centers> centers,
            boolean approx) {
        boolean hasCenters = !centers.isEmpty();
        Preconditions.checkArgument(hasCenters, "No centers specified");
        CentersIndex index = createIndex(centers);
        return getCosts(vecs, index, approx);
    }

    /**
     * Wraps the given centers in a {@code CentersIndex} configured with this instance's
     * projection bits, projection samples and seed.
     */
    private CentersIndex createIndex(List<Centers> centers) {
        CentersIndex index = new CentersIndex(centers, projectionBits, projectionSamples, seed);
        return index;
    }

    /**
     * Builds the Crunch pipeline behind the public {@code getCosts}: each task emits its
     * partial cost per centers index, and the partial costs are summed per key.
     */
    private static <V extends Vector> PObject<List<Double>> getCosts(PCollection<V> vecs, CentersIndex centers,
            boolean approx) {
        CenterCostFn<V> costFn = new CenterCostFn<V>(centers, approx);
        PTable<Integer, Double> partialCosts = vecs.parallelDo("center-costs", costFn,
                tableOf(ints(), doubles()));
        PTable<Integer, Double> summedCosts = partialCosts.groupByKey(1)
                .combineValues(Aggregators.SUM_DOUBLES());
        return new ListPObject<Double>(summedCosts);
    }

    /**
     * Determines, for every {@code NamedVector} in the given {@code PCollection}, the cluster of
     * each {@code Centers} instance it is assigned (i.e., closest) to, together with its distance
     * from that closest center. Clusters are identified by their position in the given list.
     * 
     * @param vecs The named vectors, with the name used as a unique identifier
     * @param centers The centers of the clusters
     * @return A {@code Records} instance containing the cluster assignment info for each point
     */
    public <V extends NamedVector> Records computeClusterAssignments(PCollection<V> vecs, List<Centers> centers) {
        // No explicit IDs: the three-argument overload then falls back to list positions.
        List<Integer> noExplicitIds = null;
        return computeClusterAssignments(vecs, centers, noExplicitIds);
    }

    /**
     * Determines, for every {@code NamedVector} in the given {@code PCollection}, the cluster of
     * each {@code Centers} instance it is assigned (i.e., closest) to, together with its distance
     * from that closest center. Clusters are identified by the given list of cluster IDs.
     * 
     * @param vecs The named vectors, with the name used as a unique identifier
     * @param centers The centers of the clusters
     * @param clusterIds Integer identifiers to use for the clusters; when {@code null} or empty,
     *        the position of each {@code Centers} in the list is used instead
     * @return A {@code Records} instance containing the cluster assignment info for each point
     * @throws IllegalArgumentException if IDs are supplied but their count differs from the
     *         number of centers
     */
    public <V extends NamedVector> Records computeClusterAssignments(PCollection<V> vecs, List<Centers> centers,
            List<Integer> clusterIds) {
        boolean idsSupplied = clusterIds != null && !clusterIds.isEmpty();
        if (idsSupplied) {
            Preconditions.checkArgument(centers.size() == clusterIds.size(),
                    "Num centers and num clusters must be equal");
        }
        AssignedCenterFn<V> assignFn = new AssignedCenterFn<V>(createIndex(centers), clusterIds);
        PCollection<Record> assignments = vecs.parallelDo("assignments", assignFn,
                MLRecords.record(ASSIGNMENT_SPEC));
        return new Records(assignments, ASSIGNMENT_SPEC);
    }

    /**
     * Builds the covariance matrix and associated metadata for each of the centers
     * of the given clusters.
     *
     * @param vecs The input vectors
     * @param centers A list of clusters
     * @param approx Whether or not to use approximate cluster assignment (faster, but less accurate)
     * @param clusterIds An optional list of integer IDs to use for the clusters; when {@code null}
     *        or empty, list positions are used
     * @return A list of the covariance data for each cluster center.
     * @throws IllegalArgumentException if IDs are supplied but there are fewer of them than centers
     */
    public PObject<List<MLClusterCovariance>> computeClusterCovarianceMatrix(PCollection<Vector> vecs,
            List<Centers> centers, boolean approx, List<Integer> clusterIds) {
        if (clusterIds != null && !clusterIds.isEmpty()) {
            // Fail on the client instead of with an IndexOutOfBoundsException inside the job.
            // Surplus IDs were tolerated before, so only a list that is too short is rejected.
            Preconditions.checkArgument(clusterIds.size() >= centers.size(),
                    "Need a cluster ID for each of the %s centers, but only %s were given",
                    centers.size(), clusterIds.size());
        }
        CentersIndex index = createIndex(centers);
        // Key every vector by (cluster id, closest center id) for each Centers instance.
        PTable<ClusterKey, Vector> assignedCenters = vecs.parallelDo("assign",
                new CovarianceCentersFn<Vector>(index, clusterIds, approx),
                Avros.tableOf(Avros.reflects(ClusterKey.class), MLAvros.vector()));
        return new ClusterCovariancePObject(Covariance.cov(assignedCenters));
    }

    /**
     * Assigns every named vector to its closest center in each {@code Centers} instance and
     * scores it with the {@code MahalanobisDistance} registered for that (cluster, center) key,
     * producing one {@link #OUTLIER_SPEC} record per vector and {@code Centers} instance.
     *
     * @param vecs The named vectors, with the name used as the {@code vector_id}
     * @param centers A list of clusters
     * @param approx Whether or not to use approximate cluster assignment
     * @param clusterIds An optional list of integer IDs to use for the clusters; when {@code null}
     *        or empty, list positions are used
     * @param distances The distance calculator to apply for each cluster key; must not be null
     * @return A {@code Records} instance containing the outlier distance for each point
     * @throws NullPointerException if {@code distances} is null
     * @throws IllegalArgumentException if IDs are supplied but there are fewer of them than centers
     */
    public Records computeOutliers(PCollection<NamedVector> vecs, List<Centers> centers, boolean approx,
            List<Integer> clusterIds, Map<ClusterKey, MahalanobisDistance> distances) {
        // Both checks surface on the client what would otherwise only fail inside a task.
        Preconditions.checkNotNull(distances, "No Mahalanobis distances specified");
        if (clusterIds != null && !clusterIds.isEmpty()) {
            // Surplus IDs were tolerated before, so only a list that is too short is rejected.
            Preconditions.checkArgument(clusterIds.size() >= centers.size(),
                    "Need a cluster ID for each of the %s centers, but only %s were given",
                    centers.size(), clusterIds.size());
        }
        CentersIndex index = createIndex(centers);
        PTable<ClusterKey, NamedVector> assignedCenters = vecs.parallelDo("assign",
                new CovarianceCentersFn<NamedVector>(index, clusterIds, approx),
                Avros.tableOf(Avros.reflects(ClusterKey.class), MLAvros.namedVector()));
        PCollection<Record> records = assignedCenters.parallelDo("scoreOutliers", new OutlierScoreFn(distances),
                MLRecords.record(OUTLIER_SPEC));
        return new Records(records, OUTLIER_SPEC);
    }

    /**
     * Counts, for each point of each given {@code Centers}, how many points of the dataset
     * are closer to that point than to any other point of the same {@code Centers} instance.
     * 
     * @param vecs The dataset
     * @param centers The collection of {@code Centers} to do the calculations on; must not be empty
     * @return A reference to the output file that contains the calculation for each of the centers
     * @throws IllegalArgumentException if {@code centers} is empty
     */
    public <V extends Vector> PObject<List<List<Long>>> getCountsOfClosest(PCollection<V> vecs,
            List<Centers> centers) {
        Preconditions.checkArgument(!centers.isEmpty(), "No centers specified");
        // A Crossfold built with 1 tags each vector with a fold key, as the private overload expects.
        PCollection<Pair<Integer, V>> folded = new Crossfold(1).apply(vecs);
        CentersIndex index = createIndex(centers);
        return getCountsOfClosest(folded, index);
    }

    /**
     * Maps every (fold, vector) pair to (fold, closest point id), counts the occurrences of each
     * pair, and exposes the counts as a list of lists with {@code 0L} as the fill value.
     */
    private static <V extends Vector> PObject<List<List<Long>>> getCountsOfClosest(
            PCollection<Pair<Integer, V>> vecs, CentersIndex centers) {
        PCollection<Pair<Integer, Integer>> closest = vecs.parallelDo("closest-center",
                new ClosestCenterFn<V>(centers), pairs(ints(), ints()));
        PTable<Pair<Integer, Integer>, Long> tallies = closest.count();
        return new ListOfListsPObject<Long>(tallies, centers.getPointsPerCluster(), 0L);
    }

    /**
     * Runs the k-means|| initialization with a {@code Crossfold} built with 1 and returns the
     * candidate {@code Weighted<Vector>} instances of the first (index 0) resulting list.
     *
     * @param vecs The dataset
     * @param numIterations The number of sampling rounds to run
     * @param samplesPerIteration The sample size requested from the sampler in each round
     * @param initialPoints The points the candidate set is seeded with
     * @return The weighted candidate points
     */
    public <V extends Vector> List<Weighted<Vector>> initialization(PCollection<V> vecs, int numIterations,
            int samplesPerIteration, List<Vector> initialPoints) {
        Crossfold singleFold = new Crossfold(1);
        List<List<Weighted<Vector>>> perFold = initialization(vecs, numIterations, samplesPerIteration,
                initialPoints, singleFold);
        return perFold.get(0);
    }

    /**
     * Performs the k-means|| initialization to generate, for every fold of the given
     * {@code Crossfold}, a set of candidate {@code Weighted<Vector>} instances seeded with the
     * given {@code Vector} initial points.
     *
     * @param vecs The dataset
     * @param numIterations The number of sampling rounds; every round scores the dataset against
     *        the current candidates, samples new points, and adds them to the candidates
     * @param samplesPerIteration The sample size requested from the sampler in each round
     * @param initialPoints The points every fold is seeded with; must not be empty, and the
     *        size of the first one is handed to the {@code CentersIndex}
     * @param crossfold Determines the number of folds and assigns each vector to a fold
     * @return The weighted candidate points, one list per fold
     * @throws IllegalArgumentException if {@code initialPoints} is empty
     */
    public <V extends Vector> List<List<Weighted<Vector>>> initialization(PCollection<V> vecs, int numIterations,
            int samplesPerIteration, List<Vector> initialPoints, Crossfold crossfold) {
        // Without this, initialPoints.get(0) below fails with a bare IndexOutOfBoundsException.
        Preconditions.checkArgument(!initialPoints.isEmpty(), "No initial points specified");

        CentersIndex centers = new CentersIndex(crossfold.getNumFolds(), initialPoints.get(0).size(),
                projectionBits, projectionSamples, random == null ? System.currentTimeMillis() : random.nextLong());

        // Every fold starts from the same initial points.
        for (Vector initialPoint : initialPoints) {
            for (int j = 0; j < crossfold.getNumFolds(); j++) {
                centers.add(initialPoint, j);
            }
        }

        PType<V> ptype = vecs.getPType();
        PTypeFamily ptf = ptype.getFamily();
        PTableType<Integer, Pair<V, Double>> ptt = ptf.tableOf(ptf.ints(), ptf.pairs(ptype, ptf.doubles()));
        PCollection<Pair<Integer, V>> folds = crossfold.apply(vecs);
        for (int i = 0; i < numIterations; i++) {
            LOG.info(String.format("Running iteration %d of k-means|| initialization procedure", i + 1));
            ScoringFn<V> scoringFn = new ScoringFn<V>(centers);
            PTable<Integer, Pair<V, Double>> scores = folds.parallelDo("computeDistances", scoringFn, ptt);
            // NOTE(review): random is null here when no Random was given to the constructor;
            // confirm the sampler accepts a null generator.
            PTable<Integer, V> sample = ReservoirSampling.groupedWeightedSample(scores, samplesPerIteration,
                    random);
            // Materializing pulls the sample to the client so the next round scores against it.
            updateCenters(sample.materialize(), centers);
        }
        return getWeightedVectors(folds, centers);
    }

    /**
     * Runs Lloyd's algorithm on the given points for a given number of iterations, returning the
     * final centers that result.
     * 
     * @param points The data points to cluster
     * @param centers The list of initial centers
     * @param numIterations The number of iterations to run, with each iteration corresponding to a MapReduce job
     * @param approx Whether to use random projection for assigning points to centers
     * @return The centers produced by the last iteration, or the given {@code centers} unchanged
     *         when {@code numIterations} is not positive
     */
    public <V extends Vector> List<Centers> lloydsAlgorithm(PCollection<V> points, List<Centers> centers,
            int numIterations, boolean approx) {
        PTypeFamily family = points.getTypeFamily();
        PTableType<Pair<Integer, Integer>, Pair<V, Long>> tableType = family.tableOf(
                family.pairs(family.ints(), family.ints()),
                family.pairs(points.getPType(), family.longs()));
        Aggregator<Pair<V, Long>> vectorSum = new SumVectorsAggregator<V>();

        List<Centers> current = centers;
        int iteration = 0;
        while (iteration < numIterations) {
            LloydsMapFn<V> mapFn = new LloydsMapFn<V>(createIndex(current), approx);
            PTable<Pair<Integer, Integer>, Pair<V, Long>> sums = points
                    .parallelDo("lloyds-" + iteration, mapFn, tableType)
                    .groupByKey()
                    .combineValues(vectorSum);
            // getValue() yields the recomputed centers that feed the next iteration.
            current = new LloydsCenters<V>(sums, current.size()).getValue();
            iteration++;
        }
        return current;
    }

    /**
     * Counts how many vectors of each fold are closest to each candidate and asks the index to
     * turn those counts into weighted vectors.
     */
    private static <V extends Vector> List<List<Weighted<Vector>>> getWeightedVectors(
            PCollection<Pair<Integer, V>> folds, CentersIndex centers) {
        LOG.info("Computing the weight of each candidate center");
        PObject<List<List<Long>>> closestCounts = getCountsOfClosest(folds, centers);
        return centers.getWeightedVectors(closestCounts.getValue());
    }

    /**
     * Adds every sampled vector to the index, passing the pair's first element as the second
     * argument of {@code CentersIndex.add}.
     */
    private static <V extends Vector> void updateCenters(Iterable<Pair<Integer, V>> vecs, CentersIndex centers) {
        for (Pair<Integer, V> sampled : vecs) {
            V point = sampled.second();
            Integer foldId = sampled.first();
            centers.add(point, foldId);
        }
    }

    /**
     * Map-side step of Lloyd's algorithm: for each vector and each entry of the computed
     * {@code closestPoints}, emits the key (entry index, closest point id) with the vector
     * and a count of one as the value.
     */
    private static class LloydsMapFn<V extends Vector>
            extends DoFn<V, Pair<Pair<Integer, Integer>, Pair<V, Long>>> {
        private final CentersIndex centers;
        private final boolean approx;

        private LloydsMapFn(CentersIndex centers, boolean approx) {
            this.approx = approx;
            this.centers = centers;
        }

        @Override
        public void process(V vec, Emitter<Pair<Pair<Integer, Integer>, Pair<V, Long>>> emitFn) {
            Distances nearest = centers.getDistances(vec, approx);
            // One shared (vector, 1) payload serves every key emitted for this vector.
            Pair<V, Long> unitContribution = Pair.of(vec, 1L);
            int centersIdx = 0;
            while (centersIdx < nearest.closestPoints.length) {
                // TODO: cache
                Pair<Integer, Integer> key = Pair.of(centersIdx, nearest.closestPoints[centersIdx]);
                emitFn.emit(Pair.of(key, unitContribution));
                centersIdx++;
            }
        }
    }

    /**
     * Turns the aggregated (vector sum, count) pairs of one Lloyd's iteration into a list of
     * {@code Centers}: each sum is divided by its count and appended to the {@code Centers}
     * whose position is given by the first element of the key.
     */
    private static class LloydsCenters<V extends Vector>
            extends PObjectImpl<Pair<Pair<Integer, Integer>, Pair<V, Long>>, List<Centers>> {

        private final int numCenters;

        LloydsCenters(PTable<Pair<Integer, Integer>, Pair<V, Long>> collect, int numCenters) {
            super(collect);
            this.numCenters = numCenters;
        }

        @Override
        protected List<Centers> process(Iterable<Pair<Pair<Integer, Integer>, Pair<V, Long>>> values) {
            // Start from numCenters empty Centers so every position can be extended below.
            List<Centers> rebuilt = Lists.newArrayListWithExpectedSize(numCenters);
            for (int slot = 0; slot < numCenters; slot++) {
                rebuilt.add(new Centers());
            }
            for (Pair<Pair<Integer, Integer>, Pair<V, Long>> aggregated : values) {
                Pair<Integer, Integer> key = aggregated.first();
                Pair<V, Long> sumAndCount = aggregated.second();
                int centersIdx = key.first();
                Vector mean = sumAndCount.first().divide(sumAndCount.second());
                Centers extended = rebuilt.get(centersIdx).extendWith(mean);
                rebuilt.set(centersIdx, extended);
            }
            return rebuilt;
        }
    }

    /**
     * Scores each (fold, vector) pair with the distance reported for the vector's own fold and
     * emits (fold, (vector, distance)). Vectors whose distance is not greater than zero are
     * dropped. Always passes {@code true} as the second argument of {@code getDistances}.
     */
    private static class ScoringFn<V extends Vector>
            extends DoFn<Pair<Integer, V>, Pair<Integer, Pair<V, Double>>> {
        private final CentersIndex centers;

        private ScoringFn(CentersIndex centers) {
            this.centers = centers;
        }

        @Override
        public void process(Pair<Integer, V> in, Emitter<Pair<Integer, Pair<V, Double>>> emitter) {
            Distances scored = centers.getDistances(in.second(), true);
            double foldDistance = scored.clusterDistances[in.first()];
            // Written as a negation so that NaN is skipped exactly like non-positive values.
            if (!(foldDistance > 0.0)) {
                return;
            }
            emitter.emit(Pair.of(in.first(), Pair.of(in.second(), foldDistance)));
        }
    }

    /**
     * Maps each (fold, vector) pair to (fold, id of the closest point reported for that fold).
     * Always passes {@code true} as the second argument of {@code getDistances}.
     */
    private static class ClosestCenterFn<V extends Vector> extends DoFn<Pair<Integer, V>, Pair<Integer, Integer>> {
        private final CentersIndex centers;

        private ClosestCenterFn(CentersIndex centers) {
            this.centers = centers;
        }

        @Override
        public void process(Pair<Integer, V> in, Emitter<Pair<Integer, Integer>> emitter) {
            Distances nearest = centers.getDistances(in.second(), true);
            Pair<Integer, Integer> foldAndClosest = Pair.of(in.first(), nearest.closestPoints[in.first()]);
            emitter.emit(foldAndClosest);
        }
    }

    /**
     * Emits one {@code ASSIGNMENT_SPEC} record per named vector and entry of the computed
     * {@code closestPoints}, holding the vector id, the cluster id, the closest center id and
     * the distance. Always passes {@code false} as the second argument of {@code getDistances}.
     */
    private static class AssignedCenterFn<V extends NamedVector> extends DoFn<V, Record> {
        private final CentersIndex centers;
        private final List<Integer> clusterIds;

        private AssignedCenterFn(CentersIndex centers, List<Integer> clusterIds) {
            this.clusterIds = clusterIds;
            this.centers = centers;
        }

        @Override
        public void process(V vec, Emitter<Record> emitter) {
            MLVector converted = VectorConvert.fromVector(vec);
            Distances nearest = centers.getDistances(vec, false);
            int total = nearest.closestPoints.length;
            for (int pos = 0; pos < total; pos++) {
                String vectorId = converted.getId().toString();
                Integer clusterId = getClusterId(pos, clusterIds);
                Record assignment = new SimpleRecord(ASSIGNMENT_SPEC);
                assignment.set("vector_id", vectorId)
                        .set("cluster_id", clusterId)
                        .set("closest_center_id", nearest.closestPoints[pos])
                        .set("distance", nearest.clusterDistances[pos]);
                emitter.emit(assignment);
            }
        }

    }

    /**
     * Resolves the identifier to report for the cluster at the given position.
     *
     * @param index position of the cluster
     * @param clusterIds explicit identifiers, or {@code null}/empty to use the position itself
     * @return {@code clusterIds.get(index)} when identifiers were supplied, otherwise {@code index}
     */
    private static Integer getClusterId(int index, List<Integer> clusterIds) {
        boolean idsSupplied = clusterIds != null && !clusterIds.isEmpty();
        if (idsSupplied) {
            return clusterIds.get(index);
        }
        return index;
    }

    /**
     * Accumulates, per entry of {@code clusterDistances}, the sum of the distances of all
     * vectors seen by this function instance, and emits the (index, sum) pairs only in
     * {@code cleanup}. Nothing is emitted from {@code process}.
     */
    private static class CenterCostFn<V extends Vector> extends DoFn<V, Pair<Integer, Double>> {
        private final CentersIndex centers;
        private final double[] currentCosts;
        private final boolean approx;

        private CenterCostFn(CentersIndex centers, boolean approx) {
            this.approx = approx;
            this.centers = centers;
            this.currentCosts = new double[centers.getNumCenters()];
        }

        @Override
        public void initialize() {
            // Reset the running sums before any input is processed.
            Arrays.fill(currentCosts, 0.0);
        }

        @Override
        public void process(V vec, Emitter<Pair<Integer, Double>> emitter) {
            Distances scored = centers.getDistances(vec, approx);
            int slots = currentCosts.length;
            for (int slot = 0; slot < slots; slot++) {
                currentCosts[slot] += scored.clusterDistances[slot];
            }
        }

        @Override
        public void cleanup(Emitter<Pair<Integer, Double>> emitter) {
            int slots = currentCosts.length;
            for (int slot = 0; slot < slots; slot++) {
                Pair<Integer, Double> partial = Pair.of(slot, currentCosts[slot]);
                emitter.emit(partial);
            }
        }
    }

    /**
     * Keys each vector by a {@code ClusterKey} made of the resolved cluster id and the id of
     * the closest point, emitting one pair per entry of the computed {@code closestPoints}.
     */
    private static class CovarianceCentersFn<V extends Vector> extends DoFn<V, Pair<ClusterKey, V>> {
        private final CentersIndex centers;
        private final List<Integer> clusterIds;
        private final boolean approx;

        public CovarianceCentersFn(CentersIndex centers, List<Integer> clusterIds, boolean approx) {
            this.approx = approx;
            this.clusterIds = clusterIds;
            this.centers = centers;
        }

        @Override
        public void process(V vec, Emitter<Pair<ClusterKey, V>> emitter) {
            Distances nearest = centers.getDistances(vec, approx);
            int pos = 0;
            while (pos < nearest.closestPoints.length) {
                ClusterKey assigned = new ClusterKey(getClusterId(pos, clusterIds), nearest.closestPoints[pos]);
                emitter.emit(Pair.of(assigned, vec));
                pos++;
            }
        }
    }

    /**
     * Mutable accumulator for the covariance data of a single cluster key: the largest
     * observation count seen, the per-row means, and the non-zero covariance entries.
     */
    private static class ClusteringData {
        // Starts below every possible count so the first update always replaces it.
        private long size = Long.MIN_VALUE;
        private List<Double> means;
        private List<MLMatrixEntry> entries;

        public ClusteringData() {
            this.means = Lists.newArrayList();
            this.entries = Lists.newArrayList();
        }

        /**
         * Folds one co-moment into this accumulator.
         *
         * @param index the matrix cell the co-moment belongs to
         * @param cm the co-moment for that cell
         */
        public void update(Index index, CoMoment cm) {
            size = Math.max(size, cm.getN());

            // Only non-zero covariances are stored; the value is read once and reused.
            double cov = cm.getCovariance();
            if (cov != 0.0) {
                MLMatrixEntry entry = MLMatrixEntry.newBuilder().setRow(index.row).setColumn(index.column)
                        .setValue(cov).build();
                entries.add(entry);
            }

            if (index.isDiagonal()) {
                recordMean(index.row, cm.getMeanX());
            }
        }

        /** Stores the mean at the given row, padding any skipped rows with 0.0. */
        private void recordMean(int row, double mean) {
            if (row < means.size()) {
                means.set(row, mean);
                return;
            }
            while (means.size() < row) {
                means.add(0.0);
            }
            means.add(mean);
        }

        /**
         * Builds the Avro record for the given key from the accumulated state.
         *
         * @param key supplies the clustering id and center id of the record
         * @return the covariance record holding the means, count and entries collected so far
         */
        public MLClusterCovariance create(ClusterKey key) {
            return MLClusterCovariance.newBuilder().setClusteringId(key.getClusterId())
                    .setCenterId(key.getCenterId()).setMeans(means).setCount(size).setCov(entries).build();
        }
    }

    /**
     * Groups the per-cell co-moments by cluster key and converts each group into one
     * {@code MLClusterCovariance} record.
     */
    private static class ClusterCovariancePObject
            extends PObjectImpl<Pair<Pair<ClusterKey, Index>, CoMoment>, List<MLClusterCovariance>> {
        public ClusterCovariancePObject(PTable<Pair<ClusterKey, Index>, CoMoment> cov) {
            super(cov);
        }

        @Override
        protected List<MLClusterCovariance> process(Iterable<Pair<Pair<ClusterKey, Index>, CoMoment>> pairs) {
            Map<ClusterKey, ClusteringData> byCluster = Maps.newHashMap();
            for (Pair<Pair<ClusterKey, Index>, CoMoment> cell : pairs) {
                Pair<ClusterKey, Index> cellKey = cell.first();
                ClusterKey clusterKey = cellKey.first();

                // Create the accumulator for a cluster key the first time the key is seen.
                ClusteringData accumulator = byCluster.get(clusterKey);
                if (accumulator == null) {
                    accumulator = new ClusteringData();
                    byCluster.put(clusterKey, accumulator);
                }
                accumulator.update(cellKey.second(), cell.second());
            }

            List<MLClusterCovariance> result = Lists.newArrayList();
            for (Map.Entry<ClusterKey, ClusteringData> grouped : byCluster.entrySet()) {
                ClusterKey clusterKey = grouped.getKey();
                result.add(grouped.getValue().create(clusterKey));
            }
            return result;
        }
    }

    /**
     * Converts each (cluster key, named vector) pair into an {@code OUTLIER_SPEC} record whose
     * {@code outlier_distance} is computed by the {@code MahalanobisDistance} registered for
     * the key.
     */
    private static class OutlierScoreFn extends MapFn<Pair<ClusterKey, NamedVector>, Record> {
        private final Map<ClusterKey, MahalanobisDistance> distances;

        public OutlierScoreFn(Map<ClusterKey, MahalanobisDistance> distances) {
            this.distances = distances;
        }

        @Override
        public void initialize() {
            // Every registered distance is initialized before the first record is mapped.
            for (MahalanobisDistance d : distances.values()) {
                d.initialize();
            }
        }

        /**
         * @param input the cluster key and the vector assigned to it
         * @return the outlier record for the vector
         * @throws IllegalStateException if no distance is registered for the input's key
         */
        @Override
        public Record map(Pair<ClusterKey, NamedVector> input) {
            ClusterKey key = input.first();
            NamedVector mlvec = input.second();
            MahalanobisDistance scorer = distances.get(key);
            if (scorer == null) {
                // Name the offending key instead of failing with a bare NullPointerException.
                throw new IllegalStateException("No Mahalanobis distance registered for clustering "
                        + key.getClusterId() + ", center " + key.getCenterId());
            }
            double dist = scorer.distance(mlvec);
            Record r = new SimpleRecord(OUTLIER_SPEC);
            r.set("vector_id", mlvec.getName()).set("cluster_id", key.getClusterId())
                    .set("closest_center_id", key.getCenterId()).set("outlier_distance", dist);
            return r;
        }
    }
}