ubic.gemma.analysis.expression.coexpression.links.AbstractMatrixRowPairAnalysis.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.analysis.expression.coexpression.links.AbstractMatrixRowPairAnalysis.java

Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2007 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.analysis.expression.coexpression.links;

import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import ubic.basecode.dataStructure.Link;
import ubic.basecode.dataStructure.matrix.CompressedSparseDoubleMatrix;
import ubic.basecode.dataStructure.matrix.Matrix2D;
import ubic.gemma.datastructure.matrix.ExpressionDataDoubleMatrix;
import ubic.gemma.datastructure.matrix.ExpressionDataMatrixRowElement;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.genome.Gene;
import cern.colt.bitvector.BitMatrix;
import cern.colt.list.DoubleArrayList;
import cern.colt.list.ObjectArrayList;

/**
 * @author paul
 * @version $Id: AbstractMatrixRowPairAnalysis.java,v 1.16 2011/10/20 15:51:41 paul Exp $
 */
public abstract class AbstractMatrixRowPairAnalysis implements MatrixRowPairAnalysis {

    protected static final int NUM_BINS = 2048;
    protected static final int HALF_BIN = NUM_BINS / 2;
    protected static final Log log = LogFactory.getLog(MatrixRowPairPearsonAnalysis.class);

    protected ExpressionDataDoubleMatrix dataMatrix;
    protected int[] fastHistogram = new int[NUM_BINS];
    protected Map<Gene, Collection<CompositeSequence>> geneToProbeMap = null;
    protected double globalMean = 0.0; // mean of the entire distribution.
    protected double globalTotal = 0.0; // used to store the running total of the matrix values.
    protected boolean[] hasGenesCache;
    protected boolean[] hasMissing = null;
    protected boolean histogramIsFilled = false;
    protected ObjectArrayList keepers = null;
    protected double lowerTailThreshold = 0.0;

    /**
     * Absolute lower limit to minNumUsed. (This used to be 3!). It doesn't make much sense to set this higher than
     * ExpressionExperimentFilter.MIN_NUMBER_OF_SAMPLES_PRESENT
     */
    private static final int HARD_LIMIT_MIN_NUM_USED = 5;

    /**
     * If fewer than this number values are available, the correlation is rejected. This helps keep the correlation
     * distribution reasonable. This is primarily relevant when there are missing values in the data, but to be
     * consistent we check it for other cases as well.
     */
    protected int minNumUsed = HARD_LIMIT_MIN_NUM_USED;

    // store total number of missing values.
    protected int numMissing;

    protected int numVals = 0; // number of values actually stored in the matrix

    protected Map<CompositeSequence, Collection<Collection<Gene>>> probeToGeneMap = null;
    protected double pValueThreshold = 0.0;

    protected CompressedSparseDoubleMatrix<ExpressionDataMatrixRowElement, ExpressionDataMatrixRowElement> results = null;

    protected Map<ExpressionDataMatrixRowElement, CompositeSequence> rowMapCache = new HashMap<ExpressionDataMatrixRowElement, CompositeSequence>();

    protected double storageThresholdValue;

    protected double upperTailThreshold = 0.0;

    protected boolean useAbsoluteValue = false;

    protected BitMatrix used = null;

    protected boolean usePvalueThreshold = true;
    private int numUniqueGenes = 0;

    private boolean omitNegativeCorrelationLinks = false;
    private HashMap<CompositeSequence, Collection<Gene>> flatProbe2GeneMap;

    /**
     * Read back the histogram as a DoubleArrayList of counts.
     * 
     * @return cern.colt.list.DoubleArrayList
     * @todo - put this somewhere more generically useful!
     */
    @Override
    public DoubleArrayList getHistogramArrayList() {
        DoubleArrayList r = new DoubleArrayList(fastHistogram.length);
        for (int i = 0; i < fastHistogram.length; i++) {
            r.add(fastHistogram[i]);
        }
        return r;
    }

    /**
     * Identify the correlations that are above the set thresholds.
     * 
     * @return cern.colt.list.ObjectArrayList
     */
    @Override
    public ObjectArrayList getKeepers() {
        return keepers;
    }

    /**
     * @return baseCode.dataStructure.NamedMatrix
     */
    public Matrix2D<ExpressionDataMatrixRowElement, ExpressionDataMatrixRowElement, Double> getMatrix() {
        return results;
    }

    @Override
    public double getNumUniqueGenes() {
        return numUniqueGenes;
    }

    /**
     * @param rowEl
     * @return
     */
    @Override
    public CompositeSequence getProbeForRow(ExpressionDataMatrixRowElement rowEl) {
        return this.rowMapCache.get(rowEl);
    }

    @Override
    public double getScoreInBin(int i) {
        // bin 2048 = correlation of 1.0 2048/1024 - 1 = 1
        // bin 1024 = correlation of 0.0 1024/1024 - 1 = 0
        // bin 0 = correlation of -1.0 : 0/1024 - 1 = -1
        return ((double) i / HALF_BIN) - 1.0;
    }

    /**
     * @return the usePvalueThreshold
     */
    public boolean isUsePvalueThreshold() {
        return usePvalueThreshold;
    }

    /**
     * @return double
     */
    public double kurtosis() {
        if (!histogramIsFilled) {
            throw new IllegalStateException("Don't call kurtosis when histogram isn't filled!");
        }
        double sumsquare = 0.0;
        for (int i = 0, n = results.rows(); i < n; i++) {
            for (int j = i + 1, m = results.columns(); j < m; j++) {
                double r;
                if (results.get(i, j) != 0) {
                    r = results.get(i, j);
                } else {
                    r = 0;
                    /** @todo calculate the value */
                }
                double deviation = r - globalMean;
                sumsquare += deviation * deviation;
            }
        }
        return sumsquare * numVals * (numVals + 1.0) / (numVals - 1.0)
                - 3 * sumsquare * sumsquare / (numVals - 2) * (numVals - 3);
    }

    /**
     * Flag the correlation matrix as un-fillable. This means that when PearsonMatrix is called, only the histogram will
     * be filled in. Also trashes any values that might have been stored there.
     */
    public void nullMatrix() {
        results = null;
    }

    /**
     * @return The number of values stored in the correlation matrix.
     */
    @Override
    public int numCached() {
        return results.cardinality();
    }

    /*
     * (non-Javadoc)
     * 
     * @see ubic.gemma.analysis.expression.coexpression.links.MatrixRowPairAnalysis#size()
     */
    @Override
    public int size() {
        return this.dataMatrix.rows();
    }

    /**
     * 
     */
    @Override
    public void setDuplicateMap(Map<CompositeSequence, Collection<Collection<Gene>>> probeToGeneMap) {
        this.probeToGeneMap = probeToGeneMap;
        init();
    }

    /**
     * Set the threshold, below which correlations are kept (e.g., negative values)
     * 
     * @param k double
     */
    @Override
    public void setLowerTailThreshold(double k) {
        lowerTailThreshold = k;
    }

    /**
     * Set the number of mutually present values in a pairwise comparison that must be attained before the correlation
     * is stored. Note that you cannot set the value less than HARD_LIMIT_MIN_NUM_USED.
     */
    @Override
    public void setMinNumpresent(int minSamplesToKeepCorrelation) {
        if (minSamplesToKeepCorrelation > HARD_LIMIT_MIN_NUM_USED)
            this.minNumUsed = minSamplesToKeepCorrelation;
    }

    /**
     * @param omitNegativeCorrelationLinks the omitNegativeCorrelationLinks to set
     */
    @Override
    public void setOmitNegativeCorrelationLinks(boolean omitNegativeCorrelationLinks) {
        this.omitNegativeCorrelationLinks = omitNegativeCorrelationLinks;
    }

    /**
     * @param k double
     */
    @Override
    public void setPValueThreshold(double k) {
        if (k < 0.0 || k > 1.0) {
            throw new IllegalArgumentException(
                    "P value threshold must be greater or equal to zero than 0.0 and less than or equal to 1.0");
        }
        this.pValueThreshold = k;
    }

    /**
     * Set the threshold, above which correlations are kept.
     * 
     * @param k double
     */
    @Override
    public void setUpperTailThreshold(double k) {
        upperTailThreshold = k;
    }

    /**
     * If set to true, then the absolute value of the correlation is used for histograms and choosing correlations to
     * keep. The correlation matrix, if actually used to store all the values, maintains the actual number.
     * 
     * @param k boolean
     */
    @Override
    public void setUseAbsoluteValue(boolean k) {
        useAbsoluteValue = k;
    }

    /**
     * @param usePvalueThreshold the usePvalueThreshold to set
     */
    @Override
    public void setUsePvalueThreshold(boolean usePvalueThreshold) {
        this.usePvalueThreshold = usePvalueThreshold;
    }

    /**
     * @return java.lang.String
     */
    @Override
    public String toString() {
        return results.toString();
    }

    /**
     * Store information about whether data includes missing values.
     * 
     * @return int
     */
    protected int fillUsed() {

        int missingCount = 0;
        int numrows = this.dataMatrix.rows();
        int numcols = this.dataMatrix.columns();
        hasMissing = new boolean[numrows];

        if (used == null) {
            used = new BitMatrix(numrows, numcols);
        }

        for (int i = 0; i < numrows; i++) {
            int rowmissing = 0;
            for (int j = 0; j < numcols; j++) {
                if (Double.isNaN(this.dataMatrix.get(i, j))) {
                    missingCount++;
                    rowmissing++;
                    used.put(i, j, false);
                } else {
                    used.put(i, j, true);
                }
            }
            hasMissing[i] = (rowmissing > 0);
        }

        if (missingCount == 0) {
            log.info("No missing values");
        } else {
            log.info(missingCount + " missing values");
        }
        return missingCount;
    }

    /**
     *  
     */
    protected void finishMetrics() {
        this.histogramIsFilled = true;
        globalMean = globalTotal / numVals;
    }

    /**
     * @param j
     * @return
     */
    protected Collection<Collection<Gene>> getGenesForRow(int j) {
        return this.probeToGeneMap.get(getProbeForRow(dataMatrix.getRowElement(j)));
    }

    /**
     * Skip the probes without blat association
     */
    protected boolean hasGene(ExpressionDataMatrixRowElement rowEl) {
        return hasGenesCache[rowEl.getIndex()];
    }

    /**
     * Initialize caches.
     */
    protected void init() {

        initGeneToProbeMap();

        List<ExpressionDataMatrixRowElement> rowElements = this.dataMatrix.getRowElements();
        hasGenesCache = new boolean[rowElements.size()];

        for (ExpressionDataMatrixRowElement element : rowElements) {

            CompositeSequence de = element.getDesignElement();
            rowMapCache.put(element, de);

            Collection<Collection<Gene>> geneIdSet = this.probeToGeneMap.get(de);
            Integer i = element.getIndex();
            hasGenesCache[i] = geneIdSet != null && geneIdSet.size() > 0;

        }
        assert rowMapCache.size() > 0;
        log.info("Initialized caches for probe/gene information");
    }

    /**
     * Decide whether to keep the correlation. The correlation must be greater or equal to the set thresholds.
     * 
     * @param i int
     * @param j int
     * @param correl double
     * @param numused int
     * @return boolean
     */
    protected boolean keepCorrel(int i, int j, double correl, int numused) {

        if (keepers == null) {
            return false;
        }

        if (Double.isNaN(correl))
            return false;

        if (omitNegativeCorrelationLinks && correl < 0.0) {
            return false;
        }

        double acorrel = Math.abs(correl);

        if (acorrel < storageThresholdValue) {
            // return false;
        }

        double c;
        if (useAbsoluteValue) {
            c = acorrel;
        } else {
            c = correl;
        }

        if (upperTailThreshold != 0.0 && c >= upperTailThreshold
                && (!this.usePvalueThreshold || correctedPvalue(i, j, correl, numused) <= this.pValueThreshold)) {

            keepers.add(new Link(i, j, correl));
            return true;
        }

        else if (!useAbsoluteValue && lowerTailThreshold != 0.0 && c <= lowerTailThreshold
                && (!this.usePvalueThreshold || correctedPvalue(i, j, correl, numused) <= this.pValueThreshold)) {
            keepers.add(new Link(i, j, correl));
            return true;
        }

        return false;
    }

    /**
     * Tests whether the correlations still need to be calculated for final retrieval, or if they can just be retrieved.
     * This looks at the current settings and decides whether the value would already have been cached.
     * 
     * @return boolean
     */
    protected boolean needToCalculateMetrics() {

        /* are we on the first pass, or haven't stored any values? */
        if (!histogramIsFilled || results == null) {
            return true;
        }

        if (this.storageThresholdValue > 0.0) {

            if (this.useAbsoluteValue) {
                if (upperTailThreshold > storageThresholdValue) { // then we would have stored it already.
                    log.info("Second pass, good news, all values are cached");
                    return false;
                }
            } else {
                if (Math.abs(lowerTailThreshold) > storageThresholdValue
                        && upperTailThreshold > storageThresholdValue) { // then we would have stored it already.
                    log.info("Second pass, good news, all values are cached");
                    return false;
                }
            }
        }
        log.info("Second pass, have to recompute some values");
        return true;
    }

    /**
     * Determine if the probes at this location in the matrix assay any of the same gene(s)
     * 
     * @param i
     * @param j
     * @return true if the probes hit the same gene; false otherwise.
     */
    private boolean crossHybridizes(int i, int j) {
        if (this.dataMatrix == null)
            return false; // can happen in tests.
        ExpressionDataMatrixRowElement itemA = this.dataMatrix.getRowElement(i);
        ExpressionDataMatrixRowElement itemB = this.dataMatrix.getRowElement(j);

        Collection<Gene> genesA = this.flatProbe2GeneMap.get(itemA.getDesignElement());
        Collection<Gene> genesB = this.flatProbe2GeneMap.get(itemB.getDesignElement());

        return CollectionUtils.containsAny(genesA, genesB);

    }

    /**
     * Checks for valid values of correlation and encoding.
     * 
     * @param i int
     * @param j int
     * @param correl double
     * @param numused int
     */
    protected void setCorrel(int i, int j, double correl, int numused) {
        if (Double.isNaN(correl))
            return;

        if (correl < -1.00001 || correl > 1.00001) {
            throw new IllegalArgumentException("Correlation out of valid range: " + correl);
        }

        if (correl < -1.0) {
            correl = -1.0;
        } else if (correl > 1.0) {
            correl = 1.0;
        }

        double acorrel = Math.abs(correl);

        // it is possible, due to roundoff, to overflow the bins.
        int lastBinIndex = fastHistogram.length - 1;

        if (!histogramIsFilled && !crossHybridizes(i, j)) {

            if (useAbsoluteValue) {
                int bin = Math.min((int) ((1.0 + acorrel) * HALF_BIN), lastBinIndex);
                fastHistogram[bin]++;
                globalTotal += acorrel;
                // histogram.fill( acorrel ); // this is suprisingly slow due to zillions of calls to Math.floor.
            } else {
                globalTotal += correl;
                int bin = Math.min((int) ((1.0 + correl) * HALF_BIN), lastBinIndex);
                fastHistogram[bin]++;
                // histogram.fill( correl );
            }
            numVals++;
        }

        if (acorrel > storageThresholdValue && results != null) {
            results.set(i, j, correl);
        }

        keepCorrel(i, j, correl, numused);

    }

    /**
     * Set an (absolute value) correlation, below which values are not maintained in the correlation matrix. They are
     * still kept in the histogram. (In some implementations this can greatly reduce the memory requirements for the
     * correlation matrix).
     * 
     * @param k double
     */
    protected void setStorageThresholdValue(double k) {
        if (k < 0.0 || k > 1.0) {
            throw new IllegalArgumentException("Correlation must be given as between 0 and 1");
        }
        storageThresholdValue = k;
    }

    /**
     * populate geneToProbeMap and gather stats. Probes that do not map to any genes are still counted.
     */
    private void initGeneToProbeMap() {
        int[] stats = new int[20]; // how many genes per probe
        this.numUniqueGenes = 0;
        this.flatProbe2GeneMap = new HashMap<CompositeSequence, Collection<Gene>>();
        this.geneToProbeMap = new HashMap<Gene, Collection<CompositeSequence>>();
        for (CompositeSequence cs : probeToGeneMap.keySet()) {

            if (!this.flatProbe2GeneMap.containsKey(cs)) {
                this.flatProbe2GeneMap.put(cs, new HashSet<Gene>());
            }

            Collection<Collection<Gene>> genes = probeToGeneMap.get(cs);

            /*
             * Genes will be empty if the probe does not map to any genes.
             */
            if (genes == null)
                continue; // defensive.

            for (Collection<Gene> cluster : genes) {
                if (cluster.isEmpty())
                    continue;
                numUniqueGenes++;
                for (Gene g : cluster) {
                    if (!geneToProbeMap.containsKey(g)) {
                        geneToProbeMap.put(g, new HashSet<CompositeSequence>());
                    }
                    this.geneToProbeMap.get(g).add(cs);
                    this.flatProbe2GeneMap.get(cs).add(g);
                }

                if (cluster.size() >= stats.length) {
                    stats[stats.length - 1]++;
                } else {
                    stats[cluster.size()]++;
                }
            }
        }

        if (numUniqueGenes == 0) {
            log.warn("There are no genes for this data set, " + this.flatProbe2GeneMap.size() + " probes.");
        }

        log.info("Mapping Stats: " + numUniqueGenes + " unique genes; genes per probe distribution summary: "
                + ArrayUtils.toString(stats));
    }
}