de.dfki.madm.anomalydetection.operator.statistical_based.RobustPCAOperator.java Source code

Java tutorial

Introduction

Here is the source code for de.dfki.madm.anomalydetection.operator.statistical_based.RobustPCAOperator.java

Source

/*
 *  RapidMiner Anomaly Detection Extension
 *
 *  Copyright (C) 2009-2014 by Markus Goldstein or its licensors, as applicable.
 *
 *  This is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this software. If not, see <http://www.gnu.org/licenses/>.
 *
 * Author: Markus Goldstein
 * Responsible: Markus Goldstein (goldstein@ait.kyushu-u.ac.jp)
 *
 * URL: http://madm.dfki.de/rapidminer/anomalydetection
 */
package de.dfki.madm.anomalydetection.operator.statistical_based;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.math.MathException;
import org.apache.commons.math.distribution.ChiSquaredDistributionImpl;

import Jama.EigenvalueDecomposition;
import Jama.Matrix;

import com.rapidminer.example.*;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.features.transformation.PCAModel;
import com.rapidminer.operator.ports.InputPort;
import com.rapidminer.operator.ports.OutputPort;
import com.rapidminer.operator.ports.metadata.ExampleSetPrecondition;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.conditions.EqualTypeCondition;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.math.matrix.CovarianceMatrix;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.set.MappedExampleSet;
import com.rapidminer.example.table.AttributeFactory;

/**
 * Computes a robust PCA-based anomaly score. For robustness, trimming of the
 * original data set based on the Mahalanobis distance is performed first. Then,
 * PCA is computed and a score is determined based on the top upper and/or lower PCs.
 * This operator follows the papers "A Novel Anomaly Detection Scheme Based on
 * Principal Component Classifier" by Shyu et al (2003) and "Robust Methods for
 * Unsupervised PCA-based Anomaly Detection" by Kwitt et al. (2006). In contrast to
 * the original publications, this operator computes a normalized score instead of
 * classifying into normal/anomalous instances.
 * 
 * Based on RM's PCA class.
 * 
 * @author Markus Goldstein
 * 
 */
public class RobustPCAOperator extends Operator {
    public static final String PARAMETER_REDUCTION_TYPE = "component_usage";
    public static final String PARAMETER_REDUCTION_TYPE_DESCRIPTION = "Select which principal components should be used for anomaly score computation. Major PCs are typically preferred in literature.";
    public static final String[] PCS_METHODS = new String[] { "use all components", "only use major components",
            "only use minor components", "use major and minor components" };

    public static final int PCS_ALL = 0;
    public static final int PCS_TOP = 1;
    public static final int PCS_LOWER = 2;
    public static final int PCS_BOTH = 3;

    public static final String PARAMETER_TOP_METHODS = "major_components";
    public static final String PARAMETER_TOP_METHODS_DESCRIPTION = "Select method for major principal components to use.";
    public static final String[] PCS_TOP_METHODS = new String[] { "use variance threshold",
            "use fixed number of components", };

    public static final int PCS_TOP_VAR = 0;
    public static final int PCS_TOP_FIX = 1;

    public static final String PARAMETER_LOW_METHODS = "minor_components";
    public static final String PARAMETER_LOW_METHODS_DESCRIPTION = "Select method for minor principal components to use.";
    public static final String[] PCS_LOW_METHODS = new String[] { "use max eigenvalue",
            "use fixed number of components", };

    public static final int PCS_LOW_VAL = 0;
    public static final int PCS_LOW_FIX = 1;

    /**
     * The normal class probability; used for robustness and chi**2 dist normalization.
     */
    public static final String PARAMETER_OUTLIER_PROBABILITY = "probability_for_normal_class";
    public static final String PARAMETER_OUTLIER_PROBABILITY_DESCRIPTION = "This is the expected probability of normal data instances. Usually it should be between 0.95 and 1.0.";

    public static final String PARAMETER_VARIANCE_THRESHOLD = "cumulative_variance";
    public static final String PARAMETER_VARIANCE_THRESHOLD_DESCRIPTION = "Cumulative variance threshold for selecting major components.";

    public static final String PARAMETER_NUMBER_OF_COMPONENTS_TOP = "number_of_major_pcs";
    public static final String PARAMETER_NUMBER_OF_COMPONENTS_TOP_DESCRIPTION = "Number of major components to keep.";

    public static final String PARAMETER_NUMBER_OF_COMPONENTS_LOW = "number_of_minor_pcs";
    public static final String PARAMETER_NUMBER_OF_COMPONENTS_LOW_DESCRIPTION = "Number of minor components to keep.";

    public static final String PARAMETER_VALUE_THRESHOLD = "eigenvalue_threshold_max";
    public static final String PARAMETER_VALUE_THRESHOLD_DESCRIPTION = "The maximum allowed eigenvalue for minor components taken into account.";

    private InputPort exampleSetInput = getInputPorts().createPort("example set input");

    private OutputPort exampleSetOutput = getOutputPorts().createPort("example set output");

    public RobustPCAOperator(OperatorDescription description) {
        super(description);
        // Operator only accepts example sets whose regular attributes are all numerical.
        exampleSetInput.addPrecondition(new ExampleSetPrecondition(exampleSetInput, Ontology.NUMERICAL));
    }

    @Override
    public void doWork() throws OperatorException {
        ExampleSet exampleSet = exampleSetInput.getData(ExampleSet.class);

        // PCA requires complete, numerical data.
        Tools.onlyNonMissingValues(exampleSet, "PCA");
        Tools.onlyNumericalAttributes(exampleSet, "PCA");

        // Number of instances to trim away as presumed outliers before the robust PCA.
        double normProb = getParameterAsDouble(PARAMETER_OUTLIER_PROBABILITY);
        int olInst = exampleSet.size() - (int) Math.floor(exampleSet.size() * normProb);
        log("Ignoring " + olInst + " anomalous instances for robustness.");

        // The robust estimate is based on removing top outliers first based on Mahalanobis distance (MD).
        // Since MD is the same as the outlier score when using all PCs, the PCA is done twice:
        // First with all examples, second with top-outliers removed (robust).
        ExampleSet robustExampleSet = trimTopOutliers(exampleSet, olInst);

        // Second PCA (robust): estimate on the trimmed set, but apply to the original set.
        Matrix covarianceMatrix = CovarianceMatrix.getCovarianceMatrix(robustExampleSet);
        EigenvalueDecomposition eigenvalueDecomposition = covarianceMatrix.eig();
        double[] eigenvalues = eigenvalueDecomposition.getRealEigenvalues();
        double[][] eigenvectors = eigenvalueDecomposition.getV().getArray();

        PCAModel model = new PCAModel(exampleSet, eigenvalues, eigenvectors);
        ExampleSet res = model.apply((ExampleSet) exampleSet.clone());

        // Sort eigenvalues descending so index i corresponds to the i-th principal component.
        Arrays.sort(eigenvalues);
        ArrayUtils.reverse(eigenvalues);

        // Determine which PCs contribute to the score, depending on the reduction parameters.
        int[] opcs = selectComponents(exampleSet.getAttributes().size(), model, eigenvalues);

        if (opcs.length == 0) {
            throw new UserError(this,
                    "Parameters thresholds are selected such that they did not match any principal component. Lower variance or increase eigenvalue threshold.");
        }
        if (opcs.length == exampleSet.getAttributes().size()) {
            log("Using all PCs for score.");
        } else {
            log("Using following PCs for score: " + Arrays.toString(opcs));
        }

        // Normalize the score by the chi-squared quantile with d degrees of freedom,
        // d being the number of PCs actually used (cf. Shyu et al. 2003).
        double scoreNormalizer = 1.0;
        ChiSquaredDistributionImpl chi = new ChiSquaredDistributionImpl(opcs.length);
        try {
            scoreNormalizer = chi.inverseCumulativeProbability(normProb);
        } catch (MathException e) {
            // Best effort: fall back to an unnormalized score instead of failing the process.
            logWarning("Could not compute chi-squared normalizer, using 1.0: " + e);
        }
        log("Normalizing score with chi cumulative propability: " + scoreNormalizer);

        attachScores(exampleSet, res, model, opcs, scoreNormalizer);
        exampleSetOutput.deliver(exampleSet);
    }

    /**
     * Performs the first (non-robust) PCA, ranks all examples by their Mahalanobis
     * distance in PC space and returns a view on {@code exampleSet} with the
     * {@code olInst} most distant examples removed.
     *
     * @param exampleSet the original data
     * @param olInst     number of top outliers to trim
     * @return a {@link MappedExampleSet} without the top outliers
     * @throws OperatorException if the PCA model cannot be applied
     */
    private ExampleSet trimTopOutliers(ExampleSet exampleSet, int olInst) throws OperatorException {
        Matrix covarianceMatrix = CovarianceMatrix.getCovarianceMatrix(exampleSet);
        EigenvalueDecomposition eigenvalueDecomposition = covarianceMatrix.eig();
        double[] eigenvalues = eigenvalueDecomposition.getRealEigenvalues();
        double[][] eigenvectors = eigenvalueDecomposition.getV().getArray();

        PCAModel model = new PCAModel(exampleSet, eigenvalues, eigenvectors);
        ExampleSet res = model.apply((ExampleSet) exampleSet.clone());

        // Collect (MD, example index) pairs; MD in PC space is sum of pcscore^2 / eigenvalue.
        List<double[]> mdList = new LinkedList<double[]>();
        int exampleIndex = 0;
        for (Example example : res) {
            double md = 0.0;
            int aNr = 0;
            for (Attribute attr : example.getAttributes()) {
                double pcscore = example.getValue(attr);
                md += (pcscore * pcscore) / model.getEigenvalue(aNr);
                aNr++;
            }
            mdList.add(new double[] { md, exampleIndex });
            exampleIndex++;
        }
        // Sort descending by MD so the head of the list holds the strongest outliers.
        Collections.sort(mdList, new Comparator<double[]>() {
            public int compare(double[] first, double[] second) {
                return Double.compare(second[0], first[0]);
            }
        });

        // Indexes of the olInst top outliers, sorted ascending for the merge below.
        Iterator<double[]> iter = mdList.iterator();
        int[] olMapping = new int[olInst];
        for (int i = 0; i < olInst; i++) {
            olMapping[i] = (int) iter.next()[1];
        }
        Arrays.sort(olMapping);

        // Mapping of all remaining ("good") example indexes.
        int[] mapping = new int[exampleSet.size() - olInst];
        int olc = 0;
        int ctr = 0;
        for (int i = 0; i < exampleSet.size(); i++) {
            if (olc < olInst && olMapping[olc] == i) {
                olc++; // skip outlier
            } else {
                mapping[ctr++] = i;
            }
        }
        return new MappedExampleSet(exampleSet, mapping);
    }

    /**
     * Selects the principal-component indexes used for scoring according to the
     * reduction-type parameters. The result is sorted ascending and free of
     * duplicates; counts are clamped to the dimensionality {@code dim}.
     *
     * @param dim         number of regular attributes (= number of PCs)
     * @param model       robust PCA model (for cumulative variance lookups)
     * @param eigenvalues eigenvalues sorted descending
     * @return sorted, duplicate-free PC indexes (may be empty)
     * @throws OperatorException if a required parameter is undefined
     */
    private int[] selectComponents(int dim, PCAModel model, double[] eigenvalues) throws OperatorException {
        int reductionType = getParameterAsInt(PARAMETER_REDUCTION_TYPE);
        List<Integer> pcList = new ArrayList<Integer>();
        if (reductionType == PCS_ALL) {
            for (int i = 0; i < dim; i++) {
                pcList.add(i);
            }
        }
        if (reductionType == PCS_TOP || reductionType == PCS_BOTH) {
            switch (getParameterAsInt(PARAMETER_TOP_METHODS)) {
            case PCS_TOP_FIX:
                // Clamp to dim: more major PCs than dimensions would distort the chi^2 df.
                int nTop = Math.min(getParameterAsInt(PARAMETER_NUMBER_OF_COMPONENTS_TOP), dim);
                for (int i = 0; i < nTop; i++) {
                    pcList.add(i);
                }
                break;
            case PCS_TOP_VAR:
                double var = getParameterAsDouble(PARAMETER_VARIANCE_THRESHOLD);
                // Cumulative variance is non-decreasing: take PCs up to and including the
                // first one meeting the threshold (minimum requirement).
                for (int i = 0; i < dim; i++) {
                    pcList.add(i);
                    if (model.getCumulativeVariance(i) >= var) {
                        break;
                    }
                }
                break;
            }
        }
        if (reductionType == PCS_LOWER || reductionType == PCS_BOTH) {
            switch (getParameterAsInt(PARAMETER_LOW_METHODS)) {
            case PCS_LOW_FIX:
                // Clamp to 0 and skip indexes already selected as major PCs to avoid
                // duplicates (duplicates would break the sorted merge during scoring).
                int start = Math.max(0, dim - getParameterAsInt(PARAMETER_NUMBER_OF_COMPONENTS_LOW));
                for (int i = start; i < dim; i++) {
                    if (pcList.isEmpty() || pcList.get(pcList.size() - 1).intValue() < i) {
                        pcList.add(i);
                    }
                }
                break;
            case PCS_LOW_VAL:
                double val = getParameterAsDouble(PARAMETER_VALUE_THRESHOLD);
                for (int i = 0; i < eigenvalues.length; i++) {
                    if (eigenvalues[i] <= val
                            && (pcList.isEmpty() || pcList.get(pcList.size() - 1).intValue() < i)) {
                        pcList.add(i);
                    }
                }
                break;
            }
        }
        return ArrayUtils.toPrimitive(pcList.toArray(new Integer[pcList.size()]));
    }

    /**
     * Adds the "outlier" attribute and fills in the normalized score for every
     * example, using only the dimensions listed in {@code opcs}.
     *
     * @param exampleSet      original data, receives the score attribute
     * @param res             PCA-transformed data, aligned with exampleSet by index
     * @param model           robust PCA model (eigenvalue lookups)
     * @param opcs            sorted PC indexes contributing to the score
     * @param scoreNormalizer chi-squared quantile used as divisor
     */
    private void attachScores(ExampleSet exampleSet, ExampleSet res, PCAModel model, int[] opcs,
            double scoreNormalizer) {
        Attribute scoreAttr = AttributeFactory.createAttribute("outlier", Ontology.REAL);
        exampleSet.getExampleTable().addAttribute(scoreAttr);
        exampleSet.getAttributes().setOutlier(scoreAttr);
        for (int exNr = 0; exNr < exampleSet.size(); exNr++) {
            Example orig = exampleSet.getExample(exNr);
            Example pc = res.getExample(exNr);
            double oscore = 0.0;
            int aNr = 0;
            int ctr = 0;
            for (Attribute attr : pc.getAttributes()) {
                // Skip any dimension not listed in opcs. Note: once all selected PCs are
                // consumed (ctr == opcs.length) the remaining dimensions must be skipped,
                // too; the old condition (ctr < opcs.length && ...) wrongly included them.
                if (ctr >= opcs.length || opcs[ctr] != aNr) {
                    aNr++;
                    continue;
                }
                double pcscore = pc.getValue(attr);
                oscore += (pcscore * pcscore) / model.getEigenvalue(aNr);
                aNr++;
                ctr++;
            }
            orig.setValue(scoreAttr, oscore / scoreNormalizer);
        }
    }

    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> list = super.getParameterTypes();

        list.add(new ParameterTypeDouble(PARAMETER_OUTLIER_PROBABILITY, PARAMETER_OUTLIER_PROBABILITY_DESCRIPTION,
                0, 1.0, 0.975, false));

        ParameterType type = new ParameterTypeCategory(PARAMETER_REDUCTION_TYPE,
                PARAMETER_REDUCTION_TYPE_DESCRIPTION, PCS_METHODS, PCS_ALL);
        type.setExpert(false);
        list.add(type);

        // Major-PC selection method; only shown for "major"/"both" reduction types.
        type = new ParameterTypeCategory(PARAMETER_TOP_METHODS, PARAMETER_TOP_METHODS_DESCRIPTION, PCS_TOP_METHODS,
                PCS_TOP_VAR);
        type.setExpert(false);
        type.registerDependencyCondition(
                new EqualTypeCondition(this, PARAMETER_REDUCTION_TYPE, PCS_METHODS, false, PCS_TOP, PCS_BOTH));
        list.add(type);

        type = new ParameterTypeDouble(PARAMETER_VARIANCE_THRESHOLD, PARAMETER_VARIANCE_THRESHOLD_DESCRIPTION,
                Double.MIN_VALUE, 1, 0.50);
        type.setExpert(false);
        type.registerDependencyCondition(
                new EqualTypeCondition(this, PARAMETER_REDUCTION_TYPE, PCS_METHODS, true, PCS_TOP, PCS_BOTH));
        type.registerDependencyCondition(
                new EqualTypeCondition(this, PARAMETER_TOP_METHODS, PCS_TOP_METHODS, true, PCS_TOP_VAR));
        list.add(type);

        type = new ParameterTypeInt(PARAMETER_NUMBER_OF_COMPONENTS_TOP,
                PARAMETER_NUMBER_OF_COMPONENTS_TOP_DESCRIPTION, 1, Integer.MAX_VALUE, 1);
        type.setExpert(false);
        type.registerDependencyCondition(
                new EqualTypeCondition(this, PARAMETER_REDUCTION_TYPE, PCS_METHODS, true, PCS_TOP, PCS_BOTH));
        type.registerDependencyCondition(
                new EqualTypeCondition(this, PARAMETER_TOP_METHODS, PCS_TOP_METHODS, true, PCS_TOP_FIX));
        list.add(type);

        // Minor-PC selection method; only shown for "minor"/"both" reduction types.
        type = new ParameterTypeCategory(PARAMETER_LOW_METHODS, PARAMETER_LOW_METHODS_DESCRIPTION, PCS_LOW_METHODS,
                PCS_LOW_VAL);
        type.setExpert(false);
        type.registerDependencyCondition(
                new EqualTypeCondition(this, PARAMETER_REDUCTION_TYPE, PCS_METHODS, false, PCS_LOWER, PCS_BOTH));
        list.add(type);

        type = new ParameterTypeDouble(PARAMETER_VALUE_THRESHOLD, PARAMETER_VALUE_THRESHOLD_DESCRIPTION, 0,
                Double.MAX_VALUE, 0.20);
        type.setExpert(false);
        type.registerDependencyCondition(
                new EqualTypeCondition(this, PARAMETER_REDUCTION_TYPE, PCS_METHODS, true, PCS_LOWER, PCS_BOTH));
        type.registerDependencyCondition(
                new EqualTypeCondition(this, PARAMETER_LOW_METHODS, PCS_LOW_METHODS, true, PCS_LOW_VAL));
        list.add(type);

        type = new ParameterTypeInt(PARAMETER_NUMBER_OF_COMPONENTS_LOW,
                PARAMETER_NUMBER_OF_COMPONENTS_LOW_DESCRIPTION, 1, Integer.MAX_VALUE, 1);
        type.setExpert(false);
        type.registerDependencyCondition(
                new EqualTypeCondition(this, PARAMETER_REDUCTION_TYPE, PCS_METHODS, true, PCS_LOWER, PCS_BOTH));
        type.registerDependencyCondition(
                new EqualTypeCondition(this, PARAMETER_LOW_METHODS, PCS_LOW_METHODS, true, PCS_LOW_FIX));
        list.add(type);

        return list;
    }
}