jsat.distributions.MyDistributionSearch.java Source code

Java tutorial

Introduction

Here is the source code for jsat.distributions.MyDistributionSearch.java

Source

/**
 * This file is part of SADL, a library for learning all sorts of (timed) automata and performing sequence-based anomaly detection.
 * Copyright (C) 2013-2016  the original author or authors.
 *
 * SADL is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 *
 * SADL is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with SADL.  If not, see <http://www.gnu.org/licenses/>.
 */
package jsat.distributions;

import java.util.Arrays;

import jsat.distributions.empirical.MyKernelDensityEstimator;
import jsat.linear.Vec;
import jsat.testing.goodnessoffit.KSTest;
import jsat.utils.Pair;

import org.apache.commons.math3.util.Precision;

/**
 * Provides methods for selecting the distribution that best fits a given data set.
 * 
 * @author Edward Raff
 * @author Timo Klerx
 */
public class MyDistributionSearch extends DistributionSearch {
    /**
     * Default candidate distributions tried by the overloads that do not take an explicit
     * candidate list. The instances act as prototypes only: every search clones the
     * candidates before fitting them, so these objects are never fitted themselves.
     */
    private static final ContinuousDistribution[] possibleDistributionSet = new ContinuousDistribution[] {
            new Normal(), new LogNormal(), new Exponential(), new Gamma(2, 1), new FisherSendor(10, 10),
            new Weibull(2, 1), new Uniform(0, 1), new Logistic(3, 2), new MaxwellBoltzmann(), new Pareto(),
            new Rayleigh(2) };

    /**
     * Searches the default set of distributions for a possible fit, and returns
     * what appears to be the best fit. The KDE fallback is not used by this overload
     * (it delegates with a cut off of {@code 0.0}).
     *
     * @param v all the values from a sample
     * @return the distribution that provides the best fit to the data that this method could find;
     *         see {@link #getBestDistribution(Vec, double, ContinuousDistribution...)} for the
     *         special cases (including a possible {@code null} result)
     * @throws ArithmeticException if {@code v} is empty
     */
    public static ContinuousDistribution getBestDistribution(Vec v) {
        return getBestDistribution(v, possibleDistributionSet);
    }

    /**
     * Searches the default set of distributions for a possible fit, and returns
     * what appears to be the best fit. If no suitable fit can be found, a
     * {@link MyKernelDensityEstimator} is fit to the data.
     *
     * @param v all the values from a sample
     * @param KDECutOff the cut off value used for using the KDE. Should be in
     * the range (0, 1). Values less than zero means the KDE will never be used,
     * and greater than 1 means the KDE will always be used.
     * @return the distribution that provides the best fit to the data that this method could find;
     *         see {@link #getBestDistribution(Vec, double, ContinuousDistribution...)} for the
     *         special cases (including a possible {@code null} result)
     * @throws ArithmeticException if {@code v} is empty
     */
    public static ContinuousDistribution getBestDistribution(Vec v, double KDECutOff) {
        return getBestDistribution(v, KDECutOff, possibleDistributionSet);
    }

    /**
     * Searches the distributions that are given for a possible fit, and returns
     * what appears to be the best fit. The KDE fallback is not used by this overload
     * (it delegates with a cut off of {@code 0.0}).
     *
     * @param v all the values from a sample
     * @param possibleDistributions the array of distribution to try and fit to the data
     * @return the distribution that provides the best fit to the data that this method could find;
     *         see {@link #getBestDistribution(Vec, double, ContinuousDistribution...)} for the
     *         special cases (including a possible {@code null} result)
     * @throws ArithmeticException if {@code v} is empty
     */
    public static ContinuousDistribution getBestDistribution(Vec v,
            ContinuousDistribution... possibleDistributions) {
        return getBestDistribution(v, 0.0, possibleDistributions);
    }

    /**
     * Searches the distributions that are given for a possible fit, and returns
     * what appears to be the best fit. If no suitable fit can be found, a
     * {@link MyKernelDensityEstimator} is fit to the data.
     * <p>
     * Special cases, in the order they are checked:
     * <ul>
     * <li>an empty sample is rejected with an {@link ArithmeticException};</li>
     * <li>a sample whose values are all equal (see {@link #checkForDifferentValues(Vec)})
     * yields a {@link SingleValueDistribution} for that value;</li>
     * <li>if no candidate achieves a score above zero and the cut off still permits it,
     * a {@link Normal} built from the sample mean and standard deviation is returned.</li>
     * </ul>
     *
     * @param v all the values from a sample
     * @param KDECutOff the cut off value used for using the KDE. Should be in
     * the range (0, 1). Values less than zero means the KDE will never be used,
     * and greater than 1 means the KDE will always be used.
     * @param possibleDistributions the array of distribution to try and fit to the data
     * @return the distribution that provides the best fit to the data that this method could find,
     *         or {@code null} if building the result failed and the sample's standard deviation is zero
     * @throws ArithmeticException if {@code v} is empty, or if building the result failed
     *         for a sample with non-zero standard deviation
     */
    public static ContinuousDistribution getBestDistribution(Vec v, double KDECutOff,
            ContinuousDistribution... possibleDistributions) {
        final ContinuousDistribution trivialFit = fitDegenerateSample(v);
        if (trivialFit != null) {
            return trivialFit;
        }
        return searchCandidates(v, KDECutOff, possibleDistributions);
    }

    /**
     * Checks whether every value in the vector equals the first one, using
     * {@link Precision#equals(double, double)} for the comparison.
     * <p>
     * Element 0 is read unconditionally, so callers must not pass an empty vector.
     *
     * @param v the sample to inspect; must contain at least one element
     * @return a pair whose first item is {@code true} iff all values are considered equal.
     *         The second item is the common value in that case, and the placeholder
     *         {@code -1.0} otherwise.
     */
    public static Pair<Boolean, Double> checkForDifferentValues(Vec v) {
        final double value = v.get(0);
        for (int i = 1; i < v.length(); i++) {
            if (!Precision.equals(v.get(i), value)) {
                return new Pair<>(false, -1.0);
            }
        }
        return new Pair<>(true, value);
    }

    /**
     * Searches the default set of distributions and, optionally, a
     * {@link MyKernelDensityEstimator} as one more ordinary candidate. Unlike the
     * {@code KDECutOff} overloads, the best score is not compared to a cut off here:
     * the KDE simply competes with the other candidates.
     * <p>
     * Empty and single-valued samples are handled before the KDE is constructed, so they
     * are treated exactly as in the other overloads.
     *
     * @param v all the values from a sample
     * @param includeKDE whether a KDE built from {@code v} is added to the candidates
     * @return the distribution that provides the best fit to the data that this method could find;
     *         see {@link #getBestDistribution(Vec, double, ContinuousDistribution...)} for the
     *         special cases (including a possible {@code null} result)
     * @throws ArithmeticException if {@code v} is empty
     */
    public static Distribution getBestDistribution(Vec v, boolean includeKDE) {
        if (!includeKDE) {
            return getBestDistribution(v);
        }
        // Run the degenerate-input checks first so the KDE is never built from an empty or
        // constant sample.
        final ContinuousDistribution trivialFit = fitDegenerateSample(v);
        if (trivialFit != null) {
            return trivialFit;
        }
        final ContinuousDistribution[] possibleDists = Arrays.copyOf(possibleDistributionSet,
                possibleDistributionSet.length + 1);
        possibleDists[possibleDists.length - 1] = new MyKernelDensityEstimator(v);
        // Cut off 0.0 matches what getBestDistribution(Vec, ContinuousDistribution...) passes on.
        return searchCandidates(v, 0.0, possibleDists);
    }

    /**
     * Handles samples for which no search is needed: rejects an empty sample and maps a
     * sample of identical values to a {@link SingleValueDistribution}.
     *
     * @return the distribution for a constant sample, or {@code null} if a real search is required
     * @throws ArithmeticException if {@code v} is empty
     */
    private static ContinuousDistribution fitDegenerateSample(Vec v) {
        if (v.length() == 0) {
            throw new ArithmeticException("Can not fit a distribution to an empty set");
        }
        final Pair<Boolean, Double> result = checkForDifferentValues(v);
        if (result.getFirstItem()) {
            return new SingleValueDistribution(result.getSecondItem());
        }
        return null;
    }

    /**
     * Fits a clone of each candidate to the sample, scores it with {@link KSTest#testDist}
     * and returns the highest scoring candidate, subject to the KDE cut off.
     */
    private static ContinuousDistribution searchCandidates(Vec v, double KDECutOff,
            ContinuousDistribution[] possibleDistributions) {
        //Thread Safety, clone the possible distributions
        final ContinuousDistribution[] possDistCopy = new ContinuousDistribution[possibleDistributions.length];
        for (int i = 0; i < possibleDistributions.length; i++) {
            possDistCopy[i] = possibleDistributions[i].clone();
        }

        final KSTest ksTest = new KSTest(v);

        ContinuousDistribution bestDist = null;
        double bestProb = 0;

        for (final ContinuousDistribution cd : possDistCopy) {
            try {
                cd.setUsingData(v);
                final double prob = ksTest.testDist(cd);

                // Strictly greater: a candidate scoring exactly 0 is never selected.
                if (prob > bestProb) {
                    bestDist = cd;
                    bestProb = prob;
                }
            } catch (final Exception ignored) {
                // Deliberate best effort: a candidate that cannot be fitted to or tested
                // against this sample is simply skipped, the remaining ones are still tried.
            }
        }

        // Return the best distribution, or if somehow everything went wrong, a normal distribution
        try {
            if (bestProb >= KDECutOff) {
                return bestDist == null ? new Normal(v.mean(), v.standardDeviation()) : bestDist.clone();
            } else {
                return new MyKernelDensityEstimator(v);
            }
        } catch (final RuntimeException ex) { // Most likely occurs if all values are all zero
            if (v.standardDeviation() == 0) {
                return null;
            }
            // ArithmeticException has no cause-taking constructor, so attach the cause explicitly.
            final ArithmeticException failure = new ArithmeticException(
                    "Catistrophic faulure getting a distribution");
            failure.initCause(ex);
            throw failure;
        }
    }
}