Java tutorial
/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.toolbox.statistics; import java.util.Arrays; import java.util.List; import java.util.ArrayList; import org.fhcrc.cpl.toolbox.datastructure.Pair; /** * A class with generic methods for basic stats. Add methods as needed. * A lot of things are written twice, once for floats and once for doubles. That's annoying. * But it would be wasteful of space and time to do conversions */ public class BasicStatistics { /** * Calculate the standard deviation of the inputs * @param inputs * @return */ public static float standardDeviation(float[] inputs) { if (inputs.length < 2) return 0; float variance = variance(inputs); return (float) Math.sqrt(variance); } /** * Calculate the standard deviation of the inputs * @param inputs * @return */ public static double standardDeviation(double[] inputs) { if (inputs.length < 2) return 0; double variance = variance(inputs); return Math.sqrt(variance); } public static double standardDeviation(List<? extends Number> inputs) { double[] inputsAsArray = new double[inputs.size()]; for (int i = 0; i < inputs.size(); i++) inputsAsArray[i] = inputs.get(i).doubleValue(); return standardDeviation(inputsAsArray); } public static float standardDeviationFloatList(List<? extends Number> inputs) { double[] inputsAsArray = new double[inputs.size()]; for (int i = 0; i < inputs.size(); i++) inputsAsArray[i] = inputs.get(i).doubleValue(); return (float) standardDeviation(inputsAsArray); } /** * Calculate the variance of the inputs * @param inputs * @return */ public static float variance(float[] inputs) { if (inputs.length < 2) return 0; float mean = mean(inputs); float varianceNumerator = 0; for (int i = 0; i < inputs.length; i++) { float diffFromMean = inputs[i] - mean; varianceNumerator += (diffFromMean * diffFromMean); } return varianceNumerator / (inputs.length - 1); } /** * Calculate the variance of the inputs * @param inputs * @return */ public static double variance(double[] inputs) { if (inputs.length < 2) return 0; double mean = mean(inputs); double varianceNumerator = 0; for (int i = 0; i < inputs.length; i++) { double diffFromMean = inputs[i] - mean; varianceNumerator += (diffFromMean * diffFromMean); } return varianceNumerator / (inputs.length - 1); } /** * Calculate the mean of the inputs * @param inputs * @return */ public static double mean(int[] inputs) { float sum = 0; for (int i = 0; i < inputs.length; i++) { sum += inputs[i]; } float mean = sum / inputs.length; return mean; } /** * Calculate the mean of the inputs * @param inputs * @return */ public static float mean(float[] inputs) { float sum = 0; for (int i = 0; i < inputs.length; i++) { sum += inputs[i]; } float mean = sum / inputs.length; return mean; } /** * Calculate the mean of the inputs * @param inputs * @return */ public static double mean(List<? extends Number> inputs) { return sum(inputs) / (double) inputs.size(); } /** * Calculate the mean of the inputs * @param inputs * @return */ public static double mean(double[] inputs) { double sum = 0; for (int i = 0; i < inputs.length; i++) { sum += inputs[i]; } double mean = sum / inputs.length; return mean; } public static double median(List<? extends Number> inputs) { double[] inputsArray = new double[inputs.size()]; for (int i = 0; i < inputs.size(); i++) inputsArray[i] = inputs.get(i).doubleValue(); return median(inputsArray); } /** * Calculate the median of the inputs. * This is median as defined in R -- average of the middle two points. NOT necessarily * one of the inputs. * Clones before sorting, so as not to disrupt input * @param inputs * @return */ public static double median(double[] inputs) { //clone before sorting double[] inputsClone = inputs.clone(); Arrays.sort(inputsClone); if (inputs.length == 0) return 0; //if odd, return the middle element if (inputsClone.length % 2 == 1) return inputsClone[(inputsClone.length - 1) / 2]; //if got here, then even. Average the middle two elements return (inputsClone[(inputsClone.length) / 2 - 1] + inputsClone[(inputsClone.length) / 2]) / 2; } /** * Calculate the interquartile range. I think this is right. * * Calculate the median, then calculate the median of everything under the * median and everything over the median. * @param inputs * @return */ public static Pair<Double, Double> interQuartileRange(double[] inputs) { double median = median(inputs); List<Double> underMedian = new ArrayList<Double>(); List<Double> overMedian = new ArrayList<Double>(); for (Double input : inputs) { if (input <= median) underMedian.add(input); else overMedian.add(input); } double q1 = median(underMedian); double q3 = median(overMedian); return new Pair<Double, Double>(q1, q3); } public static Pair<Double, Double> interQuartileRange(List<? extends Number> inputs) { double[] inputsArray = new double[inputs.size()]; for (int i = 0; i < inputs.size(); i++) inputsArray[i] = inputs.get(i).doubleValue(); return interQuartileRange(inputsArray); } public static double min(double[] values) { double minValue = Double.MAX_VALUE; for (double value : values) if (value < minValue) minValue = value; return minValue; } public static float min(float[] values) { float minValue = Float.MAX_VALUE; for (float value : values) if (value < minValue) minValue = value; return minValue; } public static double min(List<? extends Number> values) { double minValue = Float.MAX_VALUE; for (Number value : values) if (value.doubleValue() < minValue) minValue = value.doubleValue(); return minValue; } public static double max(double[] values) { double maxValue = Double.MIN_VALUE; for (double value : values) if (value > maxValue) maxValue = value; return maxValue; } public static float max(float[] values) { float maxValue = Float.MIN_VALUE; for (float value : values) if (value > maxValue) maxValue = value; return maxValue; } public static double max(List<? extends Number> values) { double maxValue = Float.MIN_VALUE; for (Number value : values) if (value.doubleValue() > maxValue) maxValue = value.doubleValue(); return maxValue; } /** * calculate percentile p of values in list * @param values * @param p * @return */ public static double percentile(List<? extends Number> values, double p) { double[] inputsArray = new double[values.size()]; for (int i = 0; i < values.size(); i++) inputsArray[i] = values.get(i).doubleValue(); return percentile(inputsArray, p); } /** * Returns an estimate of the <code>p</code>th percentile of the values * in the <code>values</code> array, starting with the element in (0-based) * position <code>begin</code> in the array and including <code>length</code> * values. * <p> * Calls to this method do not modify the array * * Copied from org.apache.commons.math.stat * * todo: figure out for sure whether having median() call this is OK * * @param values array of input values * @param p the percentile to compute * @return the percentile value * @throws IllegalArgumentException if the parameters are not valid or the * input array is null */ public static double percentile(double[] values, double p) { int length = values.length; if ((p > 100) || (p <= 0)) { throw new IllegalArgumentException("invalid quantile value: " + p); } double n = (double) length; if (n == 0) { return Double.NaN; } if (n == 1) { return values[0]; // always return single value for n = 1 } double pos = p * (n + 1) / 100; double fpos = Math.floor(pos); int intPos = (int) fpos; double dif = pos - fpos; double[] sorted = new double[length]; System.arraycopy(values, 0, sorted, 0, length); Arrays.sort(sorted); if (pos < 1) { return sorted[0]; } if (pos >= n) { return sorted[length - 1]; } double lower = sorted[intPos - 1]; double upper = sorted[intPos]; return lower + dif * (upper - lower); } /** * Calculate the median of the inputs. * This is median as defined in R -- average of the middle two points. NOT necessarily * one of the inputs. * Clones before sorting, so as not to disrupt input * @param inputs * @return */ public static float median(float[] inputs) { //clone before sorting float[] inputsClone = inputs.clone(); Arrays.sort(inputsClone); if (inputsClone.length == 0) return 0; //if odd, return the middle element if (inputsClone.length % 2 == 1) return inputsClone[(inputsClone.length - 1) / 2]; //if got here, then even. Average the middle two elements return (inputsClone[(inputsClone.length) / 2 - 1] + inputsClone[(inputsClone.length) / 2]) / 2; } /** * Compute the leverage of each input: * (x-xbar)^2/((n-1)sigmax^2)) * @param inputs * @return */ public static double[] leverages(double[] inputs) { int n = inputs.length; double sigmaInputsSquared = variance(inputs); double meanInput = BasicStatistics.mean(inputs); double leverageDenomenator = (n - 1) * sigmaInputsSquared; double[] leverages = new double[n]; for (int i = 0; i < n; i++) { leverages[i] = (1.0 / (double) n) + (Math.pow(inputs[i] - meanInput, 2) / leverageDenomenator); } return leverages; } /** * Compute the leverage of each input: * (x-xbar)^2/((n-1)sigmax^2)) * @param inputs * @return */ public static float[] leverages(float[] inputs) { int n = inputs.length; double sigmaInputsSquared = variance(inputs); double meanInput = BasicStatistics.mean(inputs); double leverageDenomenator = (n - 1) * sigmaInputsSquared; float[] leverages = new float[n]; for (int i = 0; i < n; i++) { leverages[i] = (float) (Math.pow(inputs[i] - meanInput, 2) / leverageDenomenator); } return leverages; } /** * Convenience method * @param xValues * @param residuals * @return */ public static double[] studentizedResiduals(double[] xValues, double[] residuals) { double[] leverages = leverages(xValues); return studentizedResiduals(xValues, residuals, leverages); } /** * Calculate the studentized residuals of arrays of somethingorother * represented here by their X values, leverages (based on X values), * and residuals. * @param xValues * @param residuals * @param leverages * @return */ public static double[] studentizedResiduals(double[] xValues, double[] residuals, double[] leverages) { int n = xValues.length; if (leverages == null) leverages = leverages(xValues); //sigmaError requires a denomenator of n-2, so can't use //BasicStatistics.standardDeviation() double sumSquaredResiduals = 0; for (double residual : residuals) sumSquaredResiduals += Math.pow(residual, 2); double sigmaError = Math.sqrt(sumSquaredResiduals / (n - 2)); //1 + 1/n. Same for all inputs, compute once double onePlusOneOverN = 1 + 1 / n; double[] studentizedResiduals = new double[n]; //Find the "studentized residual" of every feature: for (int i = 0; i < n; i++) { double scaledResidual = residuals[i] / sigmaError; studentizedResiduals[i] = scaledResidual / Math.sqrt(onePlusOneOverN + leverages[i]); } return studentizedResiduals; } public static double sum(double[] values) { double result = 0; for (double value : values) result += value; return result; } public static double sum(int[] values) { double result = 0; for (double value : values) result += value; return result; } public static double sum(List<? extends Number> values) { double result = 0; for (Number value : values) { double doubleValue = value.doubleValue(); if (doubleValue != Double.NaN) result += doubleValue; } return result; } public static float sum(float[] values) { float result = 0; for (float value : values) result += value; return result; } public static double coefficientOfVariation(double[] input) { return standardDeviation(input) / mean(input); } public static double coefficientOfVariation(List<? extends Number> input) { return standardDeviation(input) / mean(input); } /** * * Covariance = (sum(x*y) - ((sumx)(sumy)/n)) / (n-1) * @param xvalues * @param yvalues * @return */ public static double covariance(double[] xvalues, double[] yvalues) { int n = xvalues.length; double[] xtimesy = new double[n]; for (int i = 0; i < n; i++) xtimesy[i] = xvalues[i] * yvalues[i]; double numerator = sum(xtimesy) - (((sum(xvalues) * sum(yvalues))) / n); return numerator / (double) (n - 1); } /** * * Covariance = (sum(x*y) - ((sumx)(sumy)/n)) / (n-1) * @param xvalues * @param yvalues * @return */ public static float covariance(float[] xvalues, float[] yvalues) { int n = xvalues.length; float[] xtimesy = new float[n]; for (int i = 0; i < n; i++) xtimesy[i] = xvalues[i] * yvalues[i]; float numerator = sum(xtimesy) - (((sum(xvalues) * sum(yvalues))) / n); return numerator / (float) (n - 1); } public static double covariance(List<? extends Number> xvaluesList, List<? extends Number> yvaluesList) { int n = xvaluesList.size(); double[] xtimesy = new double[n]; for (int i = 0; i < n; i++) xtimesy[i] = xvaluesList.get(i).doubleValue() * yvaluesList.get(i).doubleValue(); double numerator = sum(xtimesy) - (((BasicStatistics.sum(xvaluesList) * BasicStatistics.sum(yvaluesList))) / n); return numerator / (float) (n - 1); } /** * covariance(xy) / (sx * sy) */ public static double correlationCoefficient(List<? extends Number> xvaluesList, List<? extends Number> yvaluesList) { return covariance(xvaluesList, yvaluesList) / (standardDeviation(xvaluesList) * standardDeviation(yvaluesList)); } /** * covariance(xy) / (sx * sy) */ public static double correlationCoefficient(double[] xvalues, double[] yvalues) { return covariance(xvalues, yvalues) / (standardDeviation(xvalues) * standardDeviation(yvalues)); } /** * covariance(xy) / (sx * sy) */ public static float correlationCoefficient(float[] xvalues, float[] yvalues) { return covariance(xvalues, yvalues) / (standardDeviation(xvalues) * standardDeviation(yvalues)); } /** * Geometric mean is defined as ((x1)(x2)(x3)...(xk)) ^ 1/k * @param values * @return */ public static double geometricMean(double[] values) { double result = 1.0; for (double value : values) result *= value; return Math.pow(result, 1.0 / (double) values.length); } public static double geometricMean(List<? extends Number> inputs) { double result = 1.0; for (Number value : inputs) result *= value.doubleValue(); return Math.pow(result, 1.0 / (double) inputs.size()); } public static float geometricMean(float[] values) { float result = 1.0f; for (float value : values) result *= value; return (float) Math.pow(result, 1.0f / (float) values.length); } public static double geometricMean(double value1, double value2) { double[] values = new double[2]; values[0] = value1; values[1] = value2; return geometricMean(values); } /** * Calculate the weighted geometric mean. Exponentiated weighted mean of logs. * @param inputs * @param weights * @return */ public static double weightedGeometricMean(List<? extends Number> inputs, List<? extends Number> weights) { List<Double> logInputs = new ArrayList<Double>(); for (Number input : inputs) logInputs.add(Math.log(input.doubleValue())); return Math.exp(weightedMean(logInputs, weights)); } /** * Calculate the weighted mean * @param inputs * @param weights * @return */ public static double weightedMean(List<? extends Number> inputs, List<? extends Number> weights) { double weightedSum = 0; double sumOfWeights = 0; for (int i = 0; i < inputs.size(); i++) { weightedSum += (inputs.get(i).doubleValue() * weights.get(i).doubleValue()); sumOfWeights += weights.get(i).doubleValue(); } return weightedSum / sumOfWeights; } /** * Calculate a standard Gaussian with mean 0, variance 1 * @param x * @return */ public static double standardGaussian(double x) { //sqrt(2pi) = 2.50662827463 return (1.0 / (2.50662827463)) * Math.exp(-Math.pow(x, 2) / 2); } /** * Calculate the density of the normal distribution function with parameters mu and sigma, * at point x * @param mu * @param sigma * @param x * @return */ public static double calcNormalDistDensity(double mu, double sigma, double x) { //sqrt(2pi) = 2.50662827463 return Math.pow(Math.E, -.5 * Math.pow((x - mu) / sigma, 2)) / (sigma * 2.50662827463); } /** * Error function for standard normal distribution * @param z * @return */ public static double erf(double z) { double t = 1.0 / (1.0 + 0.5 * Math.abs(z)); // use Horner's method double ans = 1 - t * Math.exp(-z * z - 1.26551223 + t * (1.00002368 + t * (0.37409196 + t * (0.09678418 + t * (-0.18628806 + t * (0.27886807 + t * (-1.13520398 + t * (1.48851587 + t * (-0.82215223 + t * (0.17087277)))))))))); if (z >= 0) return ans; else return -ans; } /** * Calculate the <i>cumulative</i> density of the standard normal distribution at point x * @param z * @return */ public static double calcStandardNormalCumDensity(double z) { return 0.5 * (1.0 + erf(z / (Math.sqrt(2.0)))); } /** * Calculate the <i>cumulative</i> density of the normal distribution function with parameters mu and sigma, * at point x * @param mu * @param sigma * @param x * @return */ public static double calcNormalCumDensity(double mu, double sigma, double x) { return calcStandardNormalCumDensity((x - mu) / sigma); } }