org.caleydo.view.tourguide.impl.Statistics.java Source code

Java tutorial

Introduction

Here is the source code for org.caleydo.view.tourguide.impl.Statistics.java

Source

/*******************************************************************************
 * Caleydo - Visualization for Molecular Biology - http://caleydo.org
 * Copyright (c) The Caleydo Team. All rights reserved.
 * Licensed under the new BSD license, available at http://caleydo.org/license
 ******************************************************************************/
package org.caleydo.view.tourguide.impl;

import java.lang.ref.SoftReference;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;

import org.apache.commons.math.MathException;
import org.apache.commons.math.distribution.ChiSquaredDistribution;
import org.apache.commons.math.distribution.ChiSquaredDistributionImpl;
import org.caleydo.core.util.logging.Logger;

import com.google.common.collect.ImmutableSortedSet;
import com.google.common.collect.Sets;
import com.google.common.primitives.Doubles;
import com.google.common.primitives.Ints;

/**
 * util set of statistics functions
 *
 * @author Samuel Gratzl
 *
 */
public final class Statistics {
    private static final Logger log = Logger.create(Statistics.class);

    private Statistics() {

    }

    /**
     * computes the rand index, see <a href="http://en.wikipedia.org/wiki/Rand_index">Rand_Index</a> in Wikipedia
     *
     * @param x
     *            the collection of groups of the first stratification
     * @param y
     *            the collection of groups of the second stratification
     * @return its rand index
     */
    public static float randIndex(Collection<? extends Collection<Integer>> as,
            Collection<? extends Collection<Integer>> bs) {
        List<Integer> alist = new ArrayList<>();
        List<Integer> blist = new ArrayList<>();
        int ai = 0;
        for (Collection<Integer> ag : as) {
            for (Integer id : ag) {
                // search in which this id is in the other stratification
                int bi = 0;
                for (Collection<Integer> bg : bs) {
                    if (bg.contains(id)) {
                        alist.add(ai);
                        blist.add(bi);
                        break;
                    }
                    bi++;
                }
            }
            ai++;
        }
        final int[] x = Ints.toArray(alist);
        final int[] y = Ints.toArray(blist);
        final int nn = x.length;

        // a, the number of pairs of elements in S that are in the same set in X and in the same set in Y
        int a = 0;
        // b, the number of pairs of elements in S that are in different sets in X and in different sets in Y+
        int b = 0;
        // c, the number of pairs of elements in S that are in the same set in X and in different sets in Y
        int c = 0;
        // d, the number of pairs of elements in S that are in different sets in X and in the same set in Y
        int d = 0;

        for (int t = 0; t < nn; t++) {
            for (int r = t + 1; r < nn; r++) {
                final boolean sameX = x[t] == x[r];
                final boolean sameY = y[t] == y[r];
                if (sameX) {
                    if (sameY)
                        a++;
                    else
                        c++;
                } else {
                    if (sameY)
                        d++;
                    else
                        b++;
                }
            }
        }
        // R = \frac{a+b}{a+b+c+d} = \frac{a+b}{{n \choose 2 }}
        float r = (a + b) / (float) (a + b + c + d);

        return r;
    }

    /**
     * computes the adjusted rand, see <a href="http://en.wikipedia.org/wiki/Rand_index">Rand_Index</a> in Wikipedia
     *
     * @param x
     *            the collection of groups of the first stratification
     * @param y
     *            the collection of groups of the second stratification
     * @return its adjusted rand
     */
    public static float adjustedRand(Collection<Set<Integer>> x, Collection<Set<Integer>> y) {
        // http://en.wikipedia.org/wiki/Rand_index
        // ARI = \frac{ \sum_{ij} \binom{n_{ij}}{2} - [\sum_i \binom{a_i}{2} \sum_j \binom{b_j}{2}] / \binom{n}{2} }{
        // \frac{1}{2} [\sum_i \binom{a_i}{2} + \sum_j \binom{b_j}{2}] - [\sum_i \binom{a_i}{2} \sum_j \binom{b_j}{2}] /
        // \binom{n}{2} }
        // where n_{ij}, a_i, b_j are values from the contingency table.

        int n = 0;
        int ais = 0;
        int bjs = 0;
        int nijs = 0;

        int bj[] = new int[y.size()];
        Arrays.fill(bj, 0);

        for (Set<Integer> xi : x) {
            int ai = 0;
            int j = 0;
            for (Set<Integer> yi : y) {
                int nij = Sets.intersection(xi, yi).size();
                ai += nij;
                nijs += binom2(nij);
                bj[j] += nij;
                j++;
            }
            ais += binom2(ai);
            n += xi.size();
        }
        for (int j = 0; j < bj.length; j++)
            bjs += binom2(bj[j]);

        float n2 = 1.f / binom2(n);

        float ari = (nijs - (ais * bjs) * n2) / (0.5f * (ais + bjs) - (ais * bjs) * n2);

        return ari;
    }

    /**
     * computes the jaccard index between the two sets
     *
     * @param a
     *            the ids of the first set
     * @param b
     *            the ids of the second set
     * @return its jaccard index
     */
    public static double jaccardIndex(Collection<Integer> a, Collection<Integer> b) {
        int intersection = 0;
        for (Integer ai : a) {
            if (b.contains(ai))
                intersection++;
        }
        int union = b.size() + a.size() - intersection;
        // intersect(a,b) / union(a,b)
        double score = union == 0 ? 0. : (double) intersection / union;
        return score;
    }

    // cache for most used one
    private static transient SoftReference<ChiSquaredDistribution> chiSquare1 = null;

    public static double chiSquaredProbability(double x, int df) {
        // return weka.core.Statistics.chiSquaredProbability(x, df);
        ChiSquaredDistribution d;
        if (df == 1) {
            d = chiSquare1 != null ? chiSquare1.get() : null;
            if (d == null) {
                d = new ChiSquaredDistributionImpl(1);
                chiSquare1 = new SoftReference<>(d);
            }
        } else {
            d = new ChiSquaredDistributionImpl(df);
        }
        try {
            return 1.0 - d.cumulativeProbability(x);
        } catch (MathException e) {
            log.error("can't compute chiSquaredProbability of " + x + " with df: " + df, e);
        }
        return Float.NaN;
    }

    // public static double tTest(double mu, double[] observed) {
    // try {
    // return TestUtils.tTest(mu, observed);
    // } catch (IllegalArgumentException | MathException e) {
    // log.error("can't compute tTest of " + Arrays.toString(observed) + " with mu: " + mu, e);
    // }
    // return Float.NaN;
    // }
    //
    // public static double tTest(double[] a, double[] b) {
    // try {
    // return TestUtils.tTest(a, b);
    // } catch (IllegalArgumentException | MathException e) {
    // log.error("can't compute tTest of " + Arrays.toString(a) + " and " + Arrays.toString(b), e);
    // }
    // return Float.NaN;
    // }

    public static float foldChange(float a, float b) {
        if (a > b)
            return a / b;
        else
            return -b / a;
    }

    /**
     * computes the log rank score between the two given survival curves
     *
     * based on <a href="http://www.ncbi.nlm.nih.gov/pmc/articles/PMC403858/">Logrank_test</a>
     *
     * @param as
     *            the survival of specific samples of a
     * @param asurvived
     *            the number of survived samples after the experiment
     * @param bs
     *            the survival of specific samples of b
     * @param bsurvived
     *            the number of survived samples in b after the experiment
     * @return
     */
    public static double logRank(List<Double> as_l, int asurvived, List<Double> bs_l, int bsurvived) {
        final double[] as = sorted(as_l);
        final double[] bs = sorted(bs_l);

        // distinct events
        final SortedSet<Double> distinct = ImmutableSortedSet.<Double>naturalOrder().addAll(as_l).addAll(bs_l)
                .build();
        int a_i = 0, b_i = 0;

        double a_deaths_expected_acc = 0;
        double a_deaths_variance_acc = 0;
        int a_deaths_observed_acc = 0;

        double b_deaths_expected_acc = 0;
        double b_deaths_variance_acc = 0;
        int b_deaths_observed_acc = 0;

        for (double event : distinct) {

            int a_alive = asurvived + as.length - a_i; // alive before the event
            int a_deaths = deathsAt(as, a_i, event); // find deaths
            a_i += a_deaths; // shift event consumption

            int b_alive = bsurvived + bs.length - b_i;
            int b_deaths = deathsAt(bs, b_i, event);
            b_i += b_deaths; // shift

            double deaths = a_deaths + b_deaths;
            double alive = a_alive + b_alive;

            double a_deaths_expected = expected(a_alive, deaths, alive);
            a_deaths_expected_acc += a_deaths_expected;
            a_deaths_observed_acc += a_deaths;

            double b_deaths_expected = expected(b_alive, deaths, alive);
            b_deaths_expected_acc += b_deaths_expected;
            b_deaths_observed_acc += b_deaths;

            double a_deaths_variance = var(a_alive, deaths, alive);
            a_deaths_variance_acc += a_deaths_variance;

            double b_deaths_variance = var(b_alive, deaths, alive);
            b_deaths_variance_acc += b_deaths_variance;
        }

        // double z_article = Math.pow(a_deaths_observed_acc - a_deaths_expected_acc, 2) / a_deaths_expected_acc +
        // Math.pow(b_deaths_observed_acc - b_deaths_expected_acc, 2) / b_deaths_expected_acc;
        // //http://www.ncbi.nlm.nih.gov/pmc/articles/PMC403858/
        // double z_wiki = (b_deaths_observed_acc - b_deaths_expected_acc) / Math.sqrt(b_deaths_variance_acc);
        // //wikipedia

        double z_r = Math.pow(b_deaths_observed_acc - b_deaths_expected_acc, 2) / b_deaths_variance_acc; // r survival
        // double z_r2 = Math.pow(a_deaths_observed_acc - a_deaths_expected_acc, 2) / a_deaths_variance_acc; // r
        // survival
        // // package
        return z_r;
    }

    private static double expected(int a_alive, double deaths, double alive) {
        return a_alive == 0 ? 0 : (deaths / alive) * a_alive;
    }

    private static double var(int a_alive, double deaths, double alive) {
        return a_alive == 0 || alive == 1 ? 0
                : (deaths * (a_alive / alive) * (1 - a_alive / alive) * (alive - deaths)) / (alive - 1);
    }

    private static int deathsAt(final double[] as, int start, double event) {
        int deaths = 0;
        for (int i = start; i < as.length && as[i] == event; ++i) { // event hits
            deaths++; // find act
        }
        return deaths;
    }

    private static double[] sorted(List<Double> l) {
        final double[] as = Doubles.toArray(l);
        Arrays.sort(as);
        return as;
    }

    private static int binom2(int n) {
        return (n - 1) * n / 2;
    }

    public static void main(String[] args) {
        {
            List<Double> a = Doubles.asList(41, 54, 59, 65, 67, 73, 92, 137, 139, 164, 220, 224, 245, 311, 329, 330,
                    333, 336, 342, 362, 431, 454, 459, 477, 479, 480, 485, 510, 551, 563, 572, 573, 586, 637, 678,
                    683, 709, 722, 768, 769, 782, 822, 827, 877, 884, 927, 945, 951, 992, 1092, 1097, 1121, 1173,
                    1190, 1316, 1337, 1370, 1417, 1463, 1587, 1590, 1597, 1625, 1661, 1723, 1912, 1913, 1964, 1979,
                    2145, 2190, 2227, 2298, 2751, 2763);
            int asurvived = 217;
            List<Double> b = Doubles.asList(50);
            int bsurvived = 0;

            double z = logRank(a, asurvived, b, bsurvived);
            double p = chiSquaredProbability(z, 1);
            System.out.println("test1: " + z + " " + p);
        }

        {
            //          a =59  115  156  431  448  477  638  803  855 1040 1106  268  329);
            //          as = 1 1 1 1 0 0 1 0 0 0 0 1 1;
            //          b = 421  464  475  563  744  769  770 1129 1206 1227  353  365  377);
            //          bs = 0 1 1 1 0 0 0 0 0 0 1 1 0;
            List<Double> a2 = Doubles.asList(59, 115, 156, 431, 638, 268, 329);
            Collections.sort(a2);
            int asurv = 6;
            List<Double> b2 = Doubles.asList(464, 475, 563, 353, 365);
            int bsurv = 8;
            Collections.sort(b2);
            double z = logRank(a2, asurv, b2, bsurv);
            double p = chiSquaredProbability(z, 1);
            System.out.println("ovarian: " + z + " " + p);
        }

    }
}