org.apache.mahout.classifier.AbstractVectorClassifier.java Source code

Introduction

Here is the source code for org.apache.mahout.classifier.AbstractVectorClassifier.java. Short usage sketches follow the listing.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.classifier;

import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.Vector;

import com.google.common.base.Preconditions;

/**
 * Defines the interface for classifiers that take a vector as input. This is
 * implemented as an abstract class so that it can implement a number of handy
 * convenience methods related to classification of vectors.
 *
 * <p>
 * A classifier takes an input vector and calculates the scores (usually
 * probabilities) that the input vector belongs to one of {@code n}
 * categories. In {@code AbstractVectorClassifier} each category is denoted
 * by an integer {@code c} between {@code 0} and {@code n-1}
 * (inclusive).
 *
 * <p>
 * New users should start by looking at {@link #classifyFull} (not {@link #classify}).
 *
 */
public abstract class AbstractVectorClassifier {

    /** Minimum allowable log likelihood value. */
    public static final double MIN_LOG_LIKELIHOOD = -100.0;

    /**
     * Returns the number of categories that a target variable can be assigned to.
     * A vector classifier will encode its output as an integer from
     * {@code 0} to {@code numCategories()-1} (inclusive).
     *
     * @return The number of categories.
     */
    public abstract int numCategories();

    /**
     * Compute and return a vector containing {@code n-1} scores, where
     * {@code n} is equal to {@code numCategories()}, given an input
     * vector {@code instance}. Higher scores indicate that the input vector
     * is more likely to belong to that category. The categories are denoted by
     * the integers {@code 0} through {@code n-1} (inclusive), and the
     * scores in the returned vector correspond to categories 1 through
     * {@code n-1} (leaving out category 0). It is assumed that the score for
     * category 0 is one minus the sum of the scores in the returned vector.
     *
     * @param instance  A feature vector to be classified.
     * @return A vector of probabilities in 1 of {@code n-1} encoding.
     */
    public abstract Vector classify(Vector instance);

    /**
     * Compute and return a vector of scores before applying the inverse link
     * function. For logistic regression and other generalized linear models, this
     * is just the linear part of the classification.
     * 
     * <p>
     * The implementation of this method provided by {@code AbstractVectorClassifier} throws an
     * {@link UnsupportedOperationException}. Your subclass must explicitly override this method to support
     * this operation.
     * 
     * @param features  A feature vector to be classified.
     * @return A vector of scores. If transformed by the link function, these will become probabilities.
     */
    public Vector classifyNoLink(Vector features) {
        throw new UnsupportedOperationException(
                this.getClass().getName() + " doesn't support classification without a link");
    }

    /**
     * Classifies a vector in the special case of a binary classifier where
     * {@link #classify(Vector)} would return a vector with only one element. As
     * such, using this method can avoid the allocation of a vector.
     * 
     * @param instance The feature vector to be classified.
     * @return The score for category 1.
     * 
     * @see #classify(Vector)
     */
    public abstract double classifyScalar(Vector instance);

    /**
     * Computes and returns a vector containing {@code n} scores, where
     * {@code n} is {@code numCategories()}, given an input vector
     * {@code instance}. Higher scores indicate that the input vector is more
     * likely to belong to the corresponding category. The categories are denoted
     * by the integers {@code 0} through {@code n-1} (inclusive).
     *
     * <p>
     * Using this method it is possible to classify an input vector, for example,
     * by selecting the category with the largest score. If
     * {@code classifier} is an instance of
     * {@code AbstractVectorClassifier} and {@code input} is a
     * {@code Vector} of features describing an element to be classified,
     * then the following code could be used to classify {@code input}:
     * <pre>{@code
     * Vector scores = classifier.classifyFull(input);
     * int assignedCategory = scores.maxValueIndex();
     * }</pre>
     * Here {@code assignedCategory} is the index of the category
     * with the maximum score.
     *
     * <p>
     * If an {@code n-1} encoding is acceptable, and allocation performance
     * is an issue, then the {@link #classify(Vector)} method is probably better
     * to use.
     *
     * @see #classify(Vector)
     * @see #classifyFull(Vector r, Vector instance)
     *
     * @param instance A vector of features to be classified.
     * @return A vector of probabilities, one for each category.
     */
    public Vector classifyFull(Vector instance) {
        return classifyFull(new DenseVector(numCategories()), instance);
    }

    /**
     * Computes and returns a vector containing {@code n} scores, where
     * {@code n} is {@code numCategories()}, given an input vector
     * {@code instance}. Higher scores indicate that the input vector is more
     * likely to belong to the corresponding category. The categories are denoted
     * by the integers {@code 0} through {@code n-1} (inclusive). The
     * main difference between this method and {@link #classifyFull(Vector)} is
     * that this method allows a user to provide a previously allocated
     * {@code Vector r} to store the returned scores.
     *
     * <p>
     * Using this method it is possible to classify an input vector, for example,
     * by selecting the category with the largest score. If
     * {@code classifier} is an instance of
     * {@code AbstractVectorClassifier}, {@code result} is a non-null
     * {@code Vector}, and {@code input} is a {@code Vector} of
     * features describing an element to be classified, then the following code
     * could be used to classify {@code input}:
     * <pre>{@code
     * Vector scores = classifier.classifyFull(result, input); // Notice that scores == result
     * int assignedCategory = scores.maxValueIndex();
     * }</pre>
     * Here {@code assignedCategory} is the index of the category
     * with the maximum score.
     *
     * @param r Where to put the results.
     * @param instance  A vector of features to be classified.
     * @return A vector of scores/probabilities, one for each category.
     */
    public Vector classifyFull(Vector r, Vector instance) {
        r.viewPart(1, numCategories() - 1).assign(classify(instance));
        r.setQuick(0, 1.0 - r.zSum());
        return r;
    }

    /**
     * Returns {@code n-1} probabilities, one for each of categories 1 through
     * {@code n-1}, for each row of a matrix, where {@code n} is equal
     * to {@code numCategories()}. The probability of the missing 0-th
     * category is 1 - rowSum(this result).
     *
     * @param data  The matrix whose rows are the input vectors to classify
     * @return A matrix of scores, one row per row of the input matrix, one column for each but the last category.
     */
    public Matrix classify(Matrix data) {
        Matrix r = new DenseMatrix(data.numRows(), numCategories() - 1);
        for (int row = 0; row < data.numRows(); row++) {
            r.assignRow(row, classify(data.viewRow(row)));
        }
        return r;
    }

    /**
     * Returns a matrix whose rows each contain {@code n} probabilities, one for each category.
     *
     * @param data  The matrix whose rows are the input vectors to classify
     * @return A matrix of scores, one row per row of the input matrix, one column per category.
     */
    public Matrix classifyFull(Matrix data) {
        Matrix r = new DenseMatrix(data.numRows(), numCategories());
        for (int row = 0; row < data.numRows(); row++) {
            classifyFull(r.viewRow(row), data.viewRow(row));
        }
        return r;
    }

    /**
     * Returns a vector of probabilities of category 1, one for each row
     * of a matrix. This only makes sense when there are exactly two categories, and
     * calling this method in that case can save a number of vector allocations.
     * 
     * @param data  The matrix whose rows are vectors to classify
     * @return A vector of scores, with one value per row of the input matrix.
     */
    public Vector classifyScalar(Matrix data) {
        Preconditions.checkArgument(numCategories() == 2, "Can only call classifyScalar with two categories");

        Vector r = new DenseVector(data.numRows());
        for (int row = 0; row < data.numRows(); row++) {
            r.set(row, classifyScalar(data.viewRow(row)));
        }
        return r;
    }

    /**
     * Returns a measure of how good the classification for a particular example
     * actually is.
     * 
     * @param actual  The correct category for the example.
     * @param data  The vector to be classified.
     * @return The log likelihood of the correct answer as estimated by the current model. This will always be
     *  less than or equal to 0, and values closer to 0 indicate better accuracy. In order to simplify code
     *  that maintains running averages, we bound this value below at -100 ({@code MIN_LOG_LIKELIHOOD}).
     */
    public double logLikelihood(int actual, Vector data) {
        if (numCategories() == 2) {
            double p = classifyScalar(data);
            if (actual > 0) {
                return Math.max(MIN_LOG_LIKELIHOOD, Math.log(p));
            } else {
                return Math.max(MIN_LOG_LIKELIHOOD, Math.log1p(-p));
            }
        } else {
            Vector p = classify(data);
            if (actual > 0) {
                return Math.max(MIN_LOG_LIKELIHOOD, Math.log(p.get(actual - 1)));
            } else {
                return Math.max(MIN_LOG_LIKELIHOOD, Math.log1p(-p.zSum()));
            }
        }
    }
}
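
Example

The sketch below is not part of the Mahout source above; it is an illustrative usage example. It defines a hypothetical ConstantClassifier subclass that returns fixed scores, purely to show how the inherited convenience methods behave. Only APIs that appear in the listing, plus the DenseVector(double[]) constructor, are used.

import org.apache.mahout.classifier.AbstractVectorClassifier;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class ToyClassifierExample {

    /**
     * Toy three-category classifier with hard-coded scores. A real classifier
     * would learn these scores from training data.
     */
    static class ConstantClassifier extends AbstractVectorClassifier {

        @Override
        public int numCategories() {
            return 3;
        }

        // Scores for categories 1..n-1 only; category 0 is implied as
        // 1 minus the sum of these scores (the n-1 encoding described above).
        @Override
        public Vector classify(Vector instance) {
            return new DenseVector(new double[] {0.3, 0.5});
        }

        // Only meaningful for binary classifiers; present to satisfy the contract.
        @Override
        public double classifyScalar(Vector instance) {
            throw new UnsupportedOperationException("not a binary classifier");
        }
    }

    public static void main(String[] args) {
        AbstractVectorClassifier classifier = new ConstantClassifier();
        Vector input = new DenseVector(new double[] {1.0, 2.0, 3.0});

        // classifyFull expands the n-1 scores to all n categories: here
        // [0.2, 0.3, 0.5], since category 0 gets 1 - (0.3 + 0.5).
        Vector scores = classifier.classifyFull(input);
        int assignedCategory = scores.maxValueIndex(); // 2 for this toy classifier

        // logLikelihood looks up the score of the correct category and takes
        // its log, bounded below by MIN_LOG_LIKELIHOOD.
        double ll = classifier.logLikelihood(2, input);

        System.out.println("scores = " + scores);
        System.out.println("assigned category = " + assignedCategory);
        System.out.println("log likelihood of category 2 = " + ll);
    }
}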
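
For the two-category case, classifyScalar avoids allocating a Vector per call. A minimal sketch with another hypothetical subclass that always reports a fixed probability for category 1:

import org.apache.mahout.classifier.AbstractVectorClassifier;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class BinaryClassifyExample {

    static class ConstantBinaryClassifier extends AbstractVectorClassifier {

        @Override
        public int numCategories() {
            return 2;
        }

        // In the binary case, classify returns a single-element vector holding
        // the score of category 1.
        @Override
        public Vector classify(Vector instance) {
            return new DenseVector(new double[] {classifyScalar(instance)});
        }

        // The score (probability) of category 1, with no Vector allocation.
        @Override
        public double classifyScalar(Vector instance) {
            return 0.8;
        }
    }

    public static void main(String[] args) {
        AbstractVectorClassifier classifier = new ConstantBinaryClassifier();
        Vector input = new DenseVector(new double[] {1.0, 2.0});

        System.out.println("p(category 1) = " + classifier.classifyScalar(input)); // 0.8
        // logLikelihood takes the two-category branch: log(p) when the actual
        // category is 1, log1p(-p) when it is 0.
        System.out.println("actual = 1: " + classifier.logLikelihood(1, input)); // ~ -0.22
        System.out.println("actual = 0: " + classifier.logLikelihood(0, input)); // ~ -1.61
    }
}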
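
The matrix overloads follow the same pattern, producing one row of scores per row of input. A brief sketch, assuming the ConstantClassifier from the first example is compiled in the same package:

import org.apache.mahout.classifier.AbstractVectorClassifier;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;

public class BatchClassifyExample {

    public static void main(String[] args) {
        AbstractVectorClassifier classifier = new ToyClassifierExample.ConstantClassifier();

        // Three input rows, each with three features.
        Matrix data = new DenseMatrix(new double[][] {
            {1.0, 0.0, 0.0},
            {0.0, 1.0, 0.0},
            {0.0, 0.0, 1.0}
        });

        // classifyFull(Matrix) returns one row of n category scores per input row;
        // classify(Matrix) would return n-1 columns instead, omitting category 0.
        Matrix scores = classifier.classifyFull(data);
        for (int row = 0; row < scores.numRows(); row++) {
            System.out.println("row " + row + " -> category "
                    + scores.viewRow(row).maxValueIndex());
        }
    }
}

Because the batch methods classify(Matrix), classifyFull(Matrix) and classifyScalar(Matrix) are built entirely on the abstract per-vector methods, a subclass only has to implement numCategories(), classify(Vector) and classifyScalar(Vector) to get all of the batch and full-encoding variants for free.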