Java tutorial
/* * BBiCat is a toolbox that combines different Bi-Clustering and clustering techniques in it, which * can be applied on a given dataset. This software is the modified version of the original BiCat * Toolbox implemented at ETH Zurich by Simon Barkow, Eckart Zitzler, Stefan Bleuler, Amela * Prelic and Don Frick. * * DOI for citing the release : 10.5281/zenodo.33117 * * Copyright (c) 2015 Taghi Aliyev, Marco Manca, Alberto Di Meglio * * This file is part of BBiCat. * * BBiCat is a free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * BBiCat is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with BBiCat. If not, see <http://www.gnu.org/licenses/>. * * You can contact the author at the following email address: * taghi.aliyev@cern.ch * * PLEASE NOTE THAT ORIGINAL GROUP AT ETH ZURICH HAS BEEN DISSOLVED AND SO, * CONTACTING TAGHI ALIYEV MIGHT BE MORE BENEFITIAL FOR ANY QUESTIONS. * * IN NO EVENT SHALL THE AUTHORS AND MAINTAINERS OF THIS SOFTWARE BE LIABLE TO * ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE * AUTHORS AND MAINTAINERS HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * AUTHORS AND THE MAINTAINERS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" * BASIS, AND THE AUTHORS AND THE MAINTAINERS HAS NO OBLIGATION TO PROVIDE * MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS * * COPYRIGHT NOTICE FROM THE ORIGINAL SOFTWARE: * * Copyright notice : Copyright (c) 2005 Swiss Federal Institute of Technology, Computer * Engineering and Networks Laboratory. All rights reserved. * BicAT - A Biclustering Analysis Toolbox * IN NO EVENT SHALL THE SWISS FEDERAL INSTITUTE OF TECHNOLOGY, COMPUTER * ENGINEERING AND NETWORKS LABORATORY BE LIABLE TO ANY PARTY FOR * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING * OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE * SWISS FEDERAL INSTITUTE OF TECHNOLOGY, COMPUTER ENGINEERING AND * NETWORKS LABORATORY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * * THE SWISS FEDERAL INSTITUTE OF TECHNOLOGY, COMPUTER ENGINEERING AND * NETWORKS LABORATORY, SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS * ON AN "AS IS" BASIS, AND THE SWISS FEDERAL INSTITUTE OF TECHNOLOGY, * COMPUTER ENGINEERING AND NETWORKS LABORATORY HAS NO OBLIGATION * TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR * MODIFICATIONS. */ package MetaFramework; import MetaFramework.AbstractPathwayUtils.AbstractPathwayAnalysis; import MetaFramework.AbstractPathwayUtils.Interaction; import MetaFramework.AbstractPathwayUtils.Molecule; import MetaFramework.AbstractPathwayUtils.Pathway; import MetaFramework.NCI.PathwayAnalysisMixing; import bicat.biclustering.Bicluster; import bicat.biclustering.Dataset; import org.apache.commons.math3.stat.inference.TTest; import javax.swing.*; import java.io.PrintWriter; import java.nio.file.Path; import java.util.*; /** * Class that will contain the bayesian statistics computations * Method at : http://www.biomedcentral.com/content/pdf/1471-2105-7-86.pdf * <p> * * @author Taghi Aliyev, email : taghi.aliyev@cern.ch */ public class Bayesian<E extends Molecule, T extends Interaction<E>, G extends Pathway<E, T>> { private int nSim; // number of simulations private PrintWriter outFile; // Output file private Class<E> molClass; private Class<T> interClass; private Class<G> pathClass; public Bayesian(Class<E> molClass, Class<T> interClass, Class<G> pathClass) throws Exception { outFile = new PrintWriter("PostAnalysis.out"); nSim = 500; // Default is 500 this.molClass = molClass; this.interClass = interClass; this.pathClass = pathClass; } public Bayesian(int nSim, Class<E> molClass, Class<T> interClass, Class<G> pathClass) throws Exception { outFile = new PrintWriter("PostAnalysis.out"); this.nSim = nSim; this.molClass = molClass; this.interClass = interClass; this.pathClass = pathClass; } /** * Computes the bayesian statistics for enrichment for all the relevant pathways and it prints the results to a * file. * * @param bicluster Bicluster for which the enrichment analysis will be done * @param engine Engine/Parser with the geneToPathway and pathToGene hashsets. * @param dataset Dataset from which biclusters are computed * @param colChoice Column on which differentiation will be done. -1 Means diff expressed genes will be computed over columns */ public void compute(Bicluster bicluster, AbstractPathwayAnalysis<E, T, G> engine, Dataset dataset, int colChoice) throws Exception { // Idea is to compute Bayesian statistics for the given gene list/cluster and term ArrayList<String> geneNames = new ArrayList<String>(); // System.out.println(bicluster.getGenes().length); for (int i = 0; i < bicluster.getGenes().length; i++) { geneNames.add(dataset.getGeneName(bicluster.getGenes()[i])); } // System.out.println(geneNames.size()); Set<String> allPathways = getAllPath(geneNames, engine); ArrayList<String> diffGenes = new ArrayList<String>(); if (colChoice == -1) { // Diff expression diffGenes = calculateDiff(bicluster, colChoice, dataset, -1.0); // Threshold does not matter here // System.out.println("Number of differentially expressed genes : " + diffGenes.size()); } else { // Column based differentiation // Make sure it is binary (0/1 or false/true) // And 1 basically means differentiated String threshold = Double.toString(Double.MAX_VALUE); if (!checkForBinary(dataset, colChoice)) { // If not binary, then ask for a threshold (smaller -> notDiff, higher -> Diff) threshold = JOptionPane.showInputDialog(null, "Column values are not binary! Please input a threshold for differentiation:"); } // Now that we know the threshold, we should be fine diffGenes = calculateDiff(bicluster, colChoice, dataset, Double.parseDouble(threshold)); } // System.out.println("Some debug results : "); // System.out.println("Number of diff genes : " + diffGenes.size()); // for (String dTm : diffGenes) // System.out.println("Diff gene : " + dTm); Set<String> notDiffGenes = MatrixFunctions.setDifference(geneNames, new HashSet<String>(diffGenes)); // System.out.println("Amount of pathways : " + allPathways); // if (diffGenes.size() != 0) // { // System.out.println(notDiffGenes.size()); // } for (String tmp : allPathways) { G pathTmp = pathClass.newInstance(); pathTmp.setInteractions(null); pathTmp.setId(0); pathTmp.setMolList(null); pathTmp.setName(tmp); ArrayList<E> molecules = engine.getPathToGene().get(pathTmp); ArrayList<String> genesInPathway = new ArrayList<String>(); for (E mol : molecules) genesInPathway.add(mol.getName()); int nMin = (int) (1); // At least 20 percent of input genes should be part of the pathway int xMin = 1; // Let's say at least 1 gene should be diff expressed Set<String> genesInOther = new HashSet<String>(); // Getting all the genes from other pathways. for (String tmp2 : allPathways) { if (!tmp.equalsIgnoreCase(tmp2)) { G pathTmp2 = pathClass.newInstance(); pathTmp2.setInteractions(null); pathTmp2.setId(0); pathTmp2.setMolList(null); pathTmp2.setName(tmp2); ArrayList<E> mols = engine.getPathToGene().get(pathTmp2); ArrayList<String> molNames = new ArrayList<String>(); for (Molecule molT : mols) molNames.add(molT.getName()); genesInOther.addAll(molNames); } } Set<String> genesOnlyPathway = MatrixFunctions.setDifference(genesInPathway, genesInOther); Set<String> genesNotPathway = MatrixFunctions.setDifference(genesInOther, genesInPathway); // Computation of relevant terms int x1OnlyPathway = MatrixFunctions.intersection(genesOnlyPathway, diffGenes).size(); // x1OnlyPathway = 0; int x0OnlyPathway = MatrixFunctions.intersection(genesOnlyPathway, notDiffGenes).size(); // x0OnlyPathway = 0; int nOnlyPath = genesOnlyPathway.size(); // nOnlyPath = 0; int x1PathAnd = MatrixFunctions.intersection(genesInPathway, diffGenes).size() - x1OnlyPathway; // x1PathAnd = 4; int x0PathAnd = MatrixFunctions.intersection(genesInPathway, notDiffGenes).size() - x0OnlyPathway; // x0PathAnd = 2; int nAnd = MatrixFunctions.setDifference(genesInPathway, genesOnlyPathway).size(); // nAnd = 6; int x1NoPathway = MatrixFunctions.intersection(genesNotPathway, diffGenes).size(); // x1NoPathway = 380; int x0NoPathway = MatrixFunctions.intersection(genesNotPathway, notDiffGenes).size(); // x0NoPathway = 3590; int nNoPath = genesNotPathway.size(); // nNoPath = 380 + 3590; double gHat = GScore(x1OnlyPathway, x1PathAnd, x1NoPathway, x0OnlyPathway, x0PathAnd, x0NoPathway); // if (gHat != 0) // System.out.println("Ghat is : " + gHat); int x1Path = x1OnlyPathway + x1PathAnd; // System.out.println("Genes in Pathways : " + genesInPathway.size() + " , x1Path : " + x1Path); if (!(genesInPathway.size() < nMin) && !(x1Path < xMin) && !(gHat <= 0)) { // System.out.println("Not skipping"); double[] gObs = new double[nSim]; // Do not skip the loop, still continue if (genesInPathway.size() > (x1OnlyPathway + x0OnlyPathway + x1PathAnd + x0PathAnd)) { ArrayList<Double> X1onlyPath = RPosterior(nSim, x1OnlyPathway, x1OnlyPathway + x0OnlyPathway, nOnlyPath); double[] X0onlyPath = MatrixFunctions.constantMinusVector(nOnlyPath, X1onlyPath); ArrayList<Double> X1andPath = RPosterior(nSim, x1PathAnd, x1PathAnd + x0PathAnd, nAnd); double[] X0andPath = MatrixFunctions.constantMinusVector(nAnd, X1andPath); ArrayList<Double> X1noPath = RPosterior(nSim, x1NoPathway, x1NoPathway + x0NoPathway, nNoPath); double[] X0noPath = MatrixFunctions.constantMinusVector(nNoPath, X1noPath); // gObs will be used for the comparison vs simulation results gObs = GScoreMatrix(X1onlyPath, X1andPath, X1noPath, X0onlyPath, X0andPath, X0noPath); // System.out.println("For debug"); } else { double X1onlyPath = x1OnlyPathway; double X0onlyPath = nOnlyPath - X1onlyPath; double X1andPath = x1OnlyPathway; double X0andPath = nAnd - X1andPath; double X1noPath = x1NoPathway; double X0noPath = nNoPath - X1noPath; for (int k = 0; k < nSim; k++) gObs[k] = gHat; } double[] gRand = new double[nSim]; // Random simulations loop. // NOTE: As you will see in the comments in the loop, some of the variables actually should be replaced // in case there is an existence of knowledge about observability of certain genes for (int j = 0; j < nSim; j++) { // Simulation loop int n = Math.min(poisson(diffGenes.size()), geneNames.size()); // 2nd argument is actually number of observed genes Set<String> diffRandom = new HashSet<String>(); Random random = new Random(); for (int i = 0; i < n; i++) { diffRandom.add(geneNames.get(random.nextInt(geneNames.size()))); } Set<String> notDiffRandom = MatrixFunctions.setDifference(geneNames, diffRandom); int x1OnlyPathR = MatrixFunctions.intersection(genesOnlyPathway, diffRandom).size(); int x0OnlyPathR = MatrixFunctions.intersection(genesOnlyPathway, notDiffRandom).size(); int nOnlyPathR = genesOnlyPathway.size(); int x1AndPathR = MatrixFunctions.intersection(genesInPathway, diffRandom).size() - x1OnlyPathR; int x0AndPathR = MatrixFunctions.intersection(genesInPathway, notDiffRandom).size() - x0OnlyPathR; int nPathAndR = MatrixFunctions.setDifference(genesInPathway, genesOnlyPathway).size(); int x1NoPathR = MatrixFunctions.intersection(genesNotPathway, diffRandom).size(); int x0NoPathR = MatrixFunctions.intersection(genesNotPathway, notDiffRandom).size(); int nNoPathR = genesNotPathway.size(); ArrayList<Double> X1onlyPathR = RPosterior(1, x1OnlyPathR, x1OnlyPathR + x0OnlyPathR, nOnlyPathR); double[] X0onlyPathR = new double[X1onlyPathR.size()]; for (int k = 0; k < X0onlyPathR.length; k++) { X0onlyPathR[k] = nOnlyPathR - X1onlyPathR.get(k); } ArrayList<Double> X1andPathR = RPosterior(1, x1AndPathR, x1AndPathR + x0AndPathR, nPathAndR); double[] X0andPathR = new double[X1andPathR.size()]; for (int k = 0; k < X0andPathR.length; k++) { X0andPathR[k] = nPathAndR - X1andPathR.get(k); } ArrayList<Double> X1noPathR = RPosterior(1, x1NoPathR, x1NoPathR + x0NoPathR, nNoPathR); double[] X0noPathR = new double[X1noPathR.size()]; for (int k = 0; k < X1noPathR.size(); k++) { X0noPathR[k] = nNoPathR - X1noPathR.get(k); } gRand[j] = GScoreMatrix(X1onlyPathR, X1andPathR, X1noPathR, X0onlyPathR, X0andPathR, X0noPathR)[0]; } // Simulations are done, so let's start comparing the results int cnt = 0; for (int j = 0; j < nSim; j++) if (gRand[j] >= gObs[j]) cnt++; outFile.println("Results for : " + tmp); double result = (cnt + 0.0) / (nSim + 0.0); // 0.0 is needed to push for the double division if (result <= 0.05) outFile.println("Pathway with name : " + tmp + " has p-value of : " + result); // computing error bars double errorLeft = Math.min(quantile(gObs, 0.05), gHat); double errorRight = Math.max(quantile(gObs, 0.95), gHat); outFile.println("G hat : " + gHat); outFile.println("Errorbar : [" + errorLeft + ", " + errorRight + "]"); outFile.println("There were " + bicluster.getGenes().length + " genes. " + diffGenes.size() + " were diff expressed. Pathway has " + genesInPathway.size() + " genes"); outFile.println( "From the diff expressed genes, " + (x1PathAnd + x1OnlyPathway) + " were in the pathway"); } else { outFile.println("Results for : " + tmp); outFile.println("GHat : " + gHat); outFile.println("There were " + bicluster.getGenes().length + " genes. " + diffGenes.size() + " were diff expressed. Pathway has " + genesInPathway.size() + " genes"); outFile.println( "From the diff expressed genes, " + (x1PathAnd + x1OnlyPathway) + " were in the pathway"); } outFile.println("-------------------------------"); } } public void closeFile() { this.outFile.close(); } /** * Implements the quantile function (type 7 from R implementation) where m = 1 - p * * @param data * @param p * @return */ public static double quantile(double[] data, double p) { double m = 1 - p; int j = (int) Math.floor(data.length * p + m); double gamma = data.length * p + m - j; Arrays.sort(data); double res = (1 - gamma) * data[j - 1] + gamma * data[j]; return res; } /** * Draws a number from poisson distribution * * @param l * @return */ public int poisson(double l) { Random random = new Random(); double limit = Math.exp(-l); double prod = random.nextDouble(); int n = 0; for (n = 0; prod >= limit; n++) prod *= random.nextDouble(); return n; } /** * GScore function which takes vectors as input. Why these specifically? Needed it for Bayesian computations * Could have also used loop and call gscore repeatedly * * @param a * @param b * @param c * @param d * @param e * @param f * @return */ public double[] GScoreMatrix(ArrayList<Double> a, ArrayList<Double> b, ArrayList<Double> c, double[] d, double[] e, double[] f) { double[] res = new double[a.size()]; for (int i = 0; i < a.size(); i++) { double p = a.get(i) * (e[i] + f[i]) + b.get(i) * f[i]; double q = c.get(i) * (d[i] + e[i]) + b.get(i) * d[i]; res[i] = (p - q) / (p + q); if (Double.isNaN(res[i])) { // System.out.println("NaN found!"); res[i] = 0; } } return res; } /** * Gets all the pathways that are represented in the given list of gene names * * @param geneNames * @param engine * @return */ public Set<String> getAllPath(ArrayList<String> geneNames, AbstractPathwayAnalysis<E, T, G> engine) throws Exception { Set<String> allPath = new HashSet<String>(); for (String tmp : geneNames) { // TODO: Possible initialize to null and then change? Might be an option, still to be investigated E tmpMol = molClass.newInstance(); tmpMol.setId(0); tmpMol.setName(tmp); tmpMol.setIds(null); tmpMol.setType(false); if (engine.getGeneToPath().get(tmpMol) != null) { ArrayList<G> mols = engine.getGeneToPath().get(tmpMol); for (G mol : mols) allPath.add(mol.getName()); } } return allPath; } /** * Checks if a chosen column can be used as differentiation point (has to be binary) * Idea: For now, it might as well give user a choice to set a threshold for differentiation * * @param dataset * @param choice * @return */ public boolean checkForBinary(Dataset dataset, int choice) { Set<String> allValues = new HashSet<String>(); float[][] data = dataset.getData(); for (int i = 0; i < data.length; i++) { allValues.add(Float.toString(data[i][choice])); } System.out.println("Amount of different values : " + allValues.size()); return allValues.size() <= 2; // Means, we can easily distinguish if needed. // And we make sure to use original data, not the processed one } /** * Given a list of gene Names, gives back a list which is differentially expressed * * @param bicluster A bicluster for which the differential expressed genes should be computed * @param chosenColumn Number representing upon which column should the differention be performed * @param dataset Dataset containing the expression level of the genes * @param threshold Threshold value based on which, differentiation will be performed on a @param chosenColumn * @return */ private boolean done = false; private int[] firstColumns, secondColumns; public ArrayList<String> calculateDiff(Bicluster bicluster, int chosenColumn, Dataset dataset, double threshold) { ArrayList<String> res = new ArrayList<String>(); int[] genes = bicluster.getGenes(); if (chosenColumn == -1) { // Differential gene expression // Only need this one as the other option is still to be implemented // Let's do grouping/based on the conditions. However, we need that as being an input or something // There might be many groups // First ask for the groups/conditions if (!done) { String first = JOptionPane .showInputDialog("Enter the columns of first group/condition (comma separated)"); String second = JOptionPane .showInputDialog("Enter the columns of second group/condition (comma separated)"); String[] words = first.split(","); String[] words2 = second.split(","); ArrayList<Integer> tmp = new ArrayList<Integer>(); // firstColumns = new int[words.length]; // secondColumns = new int[words2.length]; done = true; for (int i = 0; i < words.length; i++) { String[] range = words[i].split("-"); if (range.length == 1) tmp.add(Integer.parseInt(words[i].replaceAll("\\s+", ""))); // Just a number else { int left = Integer.parseInt(range[0].replaceAll("\\s+", "")); int right = Integer.parseInt(range[1].replaceAll("\\s+", "")); for (int j = left; j <= right; j++) tmp.add(j); } //range } firstColumns = new int[tmp.size()]; for (int i = 0; i < tmp.size(); i++) firstColumns[i] = tmp.get(i); System.out.println("First columns size : " + firstColumns.length); tmp = new ArrayList<Integer>(); for (int i = 0; i < words2.length; i++) { String[] range = words2[i].split("-"); if (range.length == 1) tmp.add(Integer.parseInt(words2[i].replaceAll("\\s+", ""))); // Just a number else { int left = Integer.parseInt(range[0].replaceAll("\\s+", "")); int right = Integer.parseInt(range[1].replaceAll("\\s+", "")); for (int j = left; j <= right; j++) tmp.add(j); } //range } secondColumns = new int[tmp.size()]; for (int i = 0; i < tmp.size(); i++) { secondColumns[i] = tmp.get(i); } System.out.println("Second columns size : " + secondColumns.length); } float[][] data = dataset.getData(); // Let's compute the means first Double[] allP = new Double[genes.length]; for (int i = 0; i < genes.length; i++) { // double meanA = 0.0, meanB = 0.0, mean = 0.0; // // for (int j = 0; j < data[0].length; j++) // mean += data[genes[i]][j]; // // mean = mean / data[0].length; // // for (int j = 0; j < firstColumns.length; j++) // meanA += data[genes[i]][firstColumns[j]]; // for (int j = 0; j < secondColumns.length; j++) // meanB += data[genes[i]][secondColumns[j]]; // //// mean = (meanA + meanB) / (firstColumns.length + secondColumns.length); // meanA = meanA / firstColumns.length; // meanB = meanB / secondColumns.length; // double sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0; // for (int j = 0; j < firstColumns.length; j++) { // sum1 += Math.pow(data[genes[i]][firstColumns[j]] - meanA, 2); // sum3 += Math.pow(data[genes[i]][firstColumns[j]] - mean, 2); // } // for (int j = 0; j < secondColumns.length; j++) { // sum2 += Math.pow(data[genes[i]][secondColumns[j]] - meanB, 2); // sum4 += Math.pow(data[genes[i]][secondColumns[j]] - mean, 2); // } // double t = 2.0 * (sum1 + sum2) / (sum3 + sum4); // DecimalFormat formatter = new DecimalFormat("#0.00"); // t = Double.parseDouble(formatter.format(t)); // Bonferroni correction (multiply by sample size) // double pValue = computePValue(t, 1); // double toCheck = Math.min(1.0, pValue * genes.length); double[] aValues = new double[firstColumns.length]; double[] bValues = new double[secondColumns.length]; for (int j = 0; j < firstColumns.length; j++) { aValues[j] = data[genes[i]][firstColumns[j]]; // System.out.print(aValues[j] + ","); } // System.out.println(); for (int j = 0; j < secondColumns.length; j++) { bValues[j] = data[genes[i]][secondColumns[j]]; // System.out.print(bValues[j] + ","); } TTest tTest = new TTest(); // Using apache common math library. This seems to be more stable double apacheP = tTest.tTest(aValues, bValues); double toCheck = apacheP * genes.length; // System.out.println("Apache adapted P value : " + toCheck); if (toCheck <= 0.1) { // System.out.println("Diff expressed!"); res.add(dataset.getGeneName(genes[i])); } // Benjamini-Hochberg: // allP[i] = computePValue(t, 1); } // Benjamini - Hochberg: // TODO : CHECK WITH MATLAB T-TEST FUNCTION! // Arrays.sort(allP); // for (int i = 0; i < allP.length; i++) // System.out.print(allP[i] + " "); // double[] adjustedP = new double[allP.length]; // double min = 1.0; // for (int i = allP.length; i > 0; i--) // { // double tmp = (allP.length * allP[i - 1]) / (i + 0.0); // if (tmp < min) // min = tmp; // adjustedP[i - 1] = min; // System.out.println(adjustedP[i - 1]); // if (tmp <= 0.05) // { // res.add(dataset.getGeneName) // } // } } else { // Based on the column // This part is easy as it will be just about grouping genes into two values if (threshold == Double.MAX_VALUE) { // Means column was actually binary (so, 2 possible values) // This is the case for our test case int[][] discrData = dataset.getDiscrData(); System.out.println("For debugging!"); for (int i = 0; i < genes.length; i++) { if (discrData[genes[i]][chosenColumn] > 0.5) // 0.5 is used as in binary columns I expect 0/1 values res.add(dataset.getGeneName(genes[i])); // TODO : Change this to min/max } } else { // Based on the threshold. Higher than threshold is diff float[][] discrData = dataset.getData(); for (int i = 0; i < genes.length; i++) { if (discrData[genes[i]][chosenColumn] > threshold) res.add(dataset.getGeneName(genes[i])); } } } return res; } /** * Given a critical value t and degrees of freedom to account for, computes a p-value for a chi-square distribution * * @param t Critical value/test statistics * @param df Degrees of Freedom * @return Returns a p-value */ public double computePValue(double t, int df) { double result = 0.0; if (t < 0 || df < 1) { return 0.0; } double K = ((double) df) * 0.5; double X = t * 0.5; if (df == 2) { return Math.exp(-1.0 * X); } double PValue = igf(K, X); if (Double.isNaN(PValue) || Double.isInfinite(PValue) || PValue <= 1e-8) { return 1e-14; } PValue /= approx_gamma(K); // PValue /= gamma(K); //PValue /= tgamma(K); return (1.0 - PValue); } /** * Implementation of a incomplete Gamma function * * @param S * @param Z * @return */ public double igf(double S, double Z) { if (Z < 0.0) { return 0.0; } double Sc = (1.0 / S); Sc *= Math.pow(Z, S); Sc *= Math.exp(-Z); System.out.println("Sc is : " + Sc + " " + S + " " + Z); double Sum = 1.0; double Nom = 1.0; double Denom = 1.0; for (int I = 0; I < 200; I++) { Nom *= Z; S++; Denom *= S; Sum += (Nom / Denom); } return Sum * Sc; } public double approx_gamma(double Z) { double RECIP_E = 0.36787944117144232159552377016147; // RECIP_E = (E^-1) = (1.0 / E) double TWOPI = 6.283185307179586476925286766559; // TWOPI = 2.0 * PI double D = 1.0 / (10.0 * Z); D = 1.0 / ((12 * Z) - D); D = (D + Z) * RECIP_E; D = Math.pow(D, Z); D *= Math.sqrt(TWOPI / Z); return D; } /** * Implementation of a Gamma function (not the approximated, faster version) * * @param N * @return */ public double gamma(double N) { final double SQRT2PI = 2.5066282746310005024157652848110452530069867406099383; double A = 15.0; double Z = (double) N; double Sc = Math.pow((Z + A), (Z + 0.5)); Sc *= Math.exp(-1.0 * (Z + A)); Sc /= Z; double F = 1.0; double Ck; double Sum = SQRT2PI; for (int K = 1; K < A; K++) { Z++; Ck = Math.pow(A - K, K - 0.5); Ck *= Math.exp(A - K); Ck /= F; Sum += (Ck / Z); F *= (-1.0 * K); } return (double) (Sum * Sc); } /** * Implementation of the G score function * * @param a * @param b * @param c * @param d * @param e * @param f * @return */ public double GScore(int a, int b, int c, int d, int e, int f) { double result = 0.0; double P = a * (e + f) + b * f; double Q = c * (d + e) + b * d; result = (P - Q + 0.0) / (P + Q + 0.0); // 0.0 are there in order to make sure division is not an integer division if (Double.isNaN(result)) { // System.out.println("Here"); result = 0.0; } return result; } public ArrayList<Double> RPosterior(double k, int x, int n, int N) { ArrayList<Double> xVect = new ArrayList<Double>(); int[] xAr = new int[N + 1]; for (int i = 0; i <= N; i++) xAr[i] = i; for (int i = 0; i < k; i++) { double[] firstPart = MatrixFunctions.matrixConstantSum(lchooseVect(xAr, x), -lchoose(N, n)); double[] secondPart = lchooseVect( MatrixFunctions.matrixConstantSum(MatrixFunctions.vectorConstantMult(xAr, -1), N), n - x); Double[] P = null; if (firstPart.length != secondPart.length) { // well, this will be weird. One of them should be constant if (secondPart.length == 1) P = MatrixFunctions.expVect(MatrixFunctions.matrixConstantSum(firstPart, secondPart[0])); else if (firstPart.length == 1) P = MatrixFunctions.expVect(MatrixFunctions.matrixConstantSum(secondPart, firstPart[0])); else System.out.println("Strange"); } else P = MatrixFunctions.expVect(MatrixFunctions.matrixSum(firstPart, secondPart)); for (int j = 0; j < P.length; j++) if (Double.isNaN(P[j])) P[j] = 0.0; // Sorting both datasets and indices SpecialMatrix comparator = new SpecialMatrix(P); Integer[] indices = comparator.createIndexArray(); // Arrays.sort(P, Comparator.reverseOrder()); Arrays.sort(indices, comparator); // Indices are in correct order now xVect.add(randomSelect(xAr, P, indices)); } return xVect; } public double randomSelect(int[] xAr, Double[] P, Integer[] indices) { // Have to follow the indices correctly double totalSum = 0.0; for (int i = 0; i < P.length; i++) { totalSum += P[i]; } double rand = Math.random(); double sumSoFar = 0.0; for (int i = 0; i < indices.length; i++) { int actualIndex = indices[i]; sumSoFar += P[actualIndex] / totalSum; if (rand <= sumSoFar) return xAr[actualIndex]; } return 0; // Will not happen } public ArrayList<Double> RPosterior(int x, int n, int N) { return RPosterior(1, x, n, N); } /** * Method that returns log of binomial coefficients for given n and k values * * @param n * @param k * @return */ public double lchoose(int n, int k) { // Computes logarithm of combinations of choosing y from x if (k < 0) // If we try to choose more than we have, then there 0 ways of doing it return 0; else if (k == 0) return 1; else { if (n < k) return 0; // Formula is: n*(n-1)*...*(n-k+1)/k! // We only need the log versions, so, let's not compute any factorials ;) double num = 0.0, kFact = 0.0; for (int i = n - k + 1; i <= n; i++) { num += Math.log(i); } for (int i = 1; i <= k; i++) { kFact += Math.log(i); } return num - kFact; } } /** * Method that returns binomial coefficients for a vector of n values * * @param n * @param k * @return */ public double[] lchooseVect(int[] n, int k) { if (k < 0) // If we try to choose more than we have, then there 0 ways of doing it return new double[] { 0 }; else if (k == 0) return new double[] { 1 }; else { // Formula is: n*(n-1)*...*(n-k+1)/k! // We only need the log versions, so, let's not compute any factorials ;) double[] res = new double[n.length]; for (int j = 0; j < n.length; j++) { if (n[j] < k) res[j] = 0; else { double num = 0.0, kFact = 0.0; for (int i = n[j] - k + 1; i <= n[j]; i++) { num += Math.log(i); } for (int i = 1; i <= k; i++) { kFact += Math.log(i); } res[j] = num - kFact; } } return res; } } }