ubic.gemma.analysis.expression.coexpression.links.LinkAnalysis.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.analysis.expression.coexpression.links.LinkAnalysis.java

Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2006 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.analysis.expression.coexpression.links;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import ubic.basecode.dataStructure.Link;
import ubic.basecode.math.CorrelationStats;
import ubic.basecode.math.Stats;
import ubic.basecode.math.distribution.Histogram;
import ubic.gemma.analysis.expression.coexpression.links.LinkAnalysisConfig.SingularThreshold;
import ubic.gemma.datastructure.matrix.ExpressionDataDoubleMatrix;
import ubic.gemma.model.analysis.expression.coexpression.ProbeCoexpressionAnalysis;
import ubic.gemma.model.common.quantitationtype.QuantitationType;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
import ubic.gemma.model.genome.Gene;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.util.ConfigUtils;
import cern.colt.list.DoubleArrayList;
import cern.colt.list.ObjectArrayList;

/**
 * Handles running a linkAnalysis. Results are made available at the end. See LinkAnalysisCli for more instructions.
 * 
 * @author xiangwan
 * @author paul (refactoring)
 * @version $Id: LinkAnalysis.java,v 1.21 2012/06/07 05:17:14 paul Exp $
 */
public class LinkAnalysis {

    protected static final Log log = LogFactory.getLog(LinkAnalysis.class);
    private MatrixRowPairAnalysis metricMatrix;
    private DoubleArrayList cdf;
    private ObjectArrayList keep; // links that are retained.
    private ExpressionDataDoubleMatrix dataMatrix = null;

    private Map<CompositeSequence, Collection<Collection<Gene>>> probeToGeneMap = null;
    private Taxon taxon = null;

    private NumberFormat form;

    private LinkAnalysisConfig config;
    private ExpressionExperiment expressionExperiment;

    private ProbeCoexpressionAnalysis analysis;

    private Map<Integer, Integer> probeDegreeMap = new HashMap<Integer, Integer>();

    /**
     * @param config
     */
    public LinkAnalysis(LinkAnalysisConfig config) {
        this.form = NumberFormat.getInstance();
        if (form instanceof DecimalFormat)
            ((DecimalFormat) form).applyPattern("0.###E0");
        this.config = config;
    }

    /**
     * Main entry point.
     */
    public void analyze() {
        assert this.dataMatrix != null;
        assert this.taxon != null;
        assert this.probeToGeneMap != null;

        log.debug("Taxon: " + this.taxon.getCommonName());

        if (this.probeToGeneMap.size() == 0) {
            log.warn("No genes found for this dataset. Do the associated platforms need processing?");
        }

        log.info("Current Options: \n" + this.config);
        this.calculateDistribution();

        if (Thread.currentThread().isInterrupted()) {
            log.info("Cancelled.");
            return;
        }

        if (metricMatrix.getNumUniqueGenes() == 0) {
            throw new UnsupportedOperationException(
                    "Link analysis not supported when there are no genes mapped to the data set.");
        }

        this.getLinks();

        if (expressionExperiment != null) {// input is not from expression data file
            this.writeCorrelationDistribution();
            this.writeProbeDegreeDistribution();
        }
    }

    /**
     * Writes two flies: one the actual probe degrees, and the other a histogram (dist)
     */
    private void writeProbeDegreeDistribution() {

        File outputDir = getOutputDir();

        if (outputDir == null)
            return;

        String distPath = outputDir + File.separator + expressionExperiment.getShortName() + ".degreeDist.txt";
        String path = outputDir + File.separator + expressionExperiment.getShortName() + ".degrees.txt";

        File outputFile = new File(path);
        if (outputFile.exists()) {
            outputFile.delete();
        }

        File outputDistFile = new File(distPath);
        if (outputDistFile.exists()) {
            outputDistFile.delete();
        }

        try {
            FileWriter out = new FileWriter(outputFile);

            out.write("# Probe degree statistics (before filtering by probeDegreeThreshold)\n");
            out.write("# date=" + (new Date()) + "\n");
            out.write("# exp=" + expressionExperiment + " " + expressionExperiment.getShortName() + "\n");
            out.write("ProbeID\tProbeName\tNumLinks\n");

            for (Integer i : probeDegreeMap.keySet()) {
                CompositeSequence probe = this.getProbe(i);
                out.write(probe.getId() + "\t" + probe.getName() + "\t" + probeDegreeMap.get(i) + "\n");
            }

            out.close();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        Histogram hist = new Histogram("foo", 100, 0, 1000);
        for (Integer i : probeDegreeMap.values()) {
            hist.fill(i.doubleValue());
        }

        double[] counts = hist.getArray();
        try {
            FileWriter out = new FileWriter(outputDistFile);

            int d = (int) hist.min();
            int step = (int) hist.stepSize();

            out.write("# Probe degree histogram (before filtering by probeDegreeThreshold)\n");
            out.write("# date=" + (new Date()) + "\n");
            out.write("# exp=" + expressionExperiment + " " + expressionExperiment.getShortName() + "\n");
            out.write("Bin\tCount\n");

            for (int i = 0; i < counts.length; i++) {
                Double v = counts[i];
                out.write(d + "\t" + v.intValue() + "\n");
                d += step;
            }

            out.close();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

    }

    /**
     * @return
     */
    private File getOutputDir() {
        File outputDir = null;

        if (this.config.isUseDb()) {
            outputDir = new File(ConfigUtils.getAnalysisStoragePath());
        } else {
            outputDir = new File(System.getProperty("user.home") + File.separator + "gemma.output");
            if (!outputDir.exists()) {
                boolean ok = outputDir.mkdirs();
                if (!ok) {
                    log.warn("Cannot create " + outputDir);
                    return null;
                }
            }
        }

        if (!outputDir.canWrite()) {
            log.warn("Cannot write to " + outputDir);
            return null;
        }
        return outputDir;
    }

    /**
     * Clear/null data so this object can be reused.
     */
    public void clear() {
        this.dataMatrix = null;
        this.probeToGeneMap = null;
        this.metricMatrix = null;
    }

    public void setDataMatrix(ExpressionDataDoubleMatrix paraDataMatrix) {
        this.dataMatrix = paraDataMatrix;
    }

    public void setTaxon(Taxon taxon) {
        this.taxon = taxon;
    }

    /**
     * Write histogram into file.
     * 
     * @throws IOException
     */
    public void writeCorrelationDistribution() {

        File outputDir = getOutputDir();

        if (outputDir == null)
            return;

        String path = outputDir + File.separator + expressionExperiment.getShortName() + ".correlDist.txt";

        File outputFile = new File(path);
        if (outputFile.exists()) {
            outputFile.delete();
        }

        DoubleArrayList histogramArrayList = this.metricMatrix.getHistogramArrayList();
        try {
            FileWriter out = new FileWriter(outputFile);

            double d = -1.0;
            double step = 2.0 / histogramArrayList.size();

            out.write("# Correlation distribution\n");
            out.write("# date=" + (new Date()) + "\n");
            out.write("# exp=" + expressionExperiment + " " + expressionExperiment.getShortName() + "\n");
            out.write("Bin\tCount\n");

            for (int i = 0; i < histogramArrayList.size(); i++) {
                double v = histogramArrayList.get(i);
                out.write(form.format(d) + "\t" + (int) v + "\n");
                d += step;
            }

            out.close();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Compute the distribution of similarity metrics for the entire matrix.
     */
    private void calculateDistribution() {
        if (config.getMetric().equals("pearson")) {
            log.info("Using Pearson linear correlation");
            metricMatrix = MatrixRowPairAnalysisFactory.pearson(this.dataMatrix,
                    config.getCorrelationCacheThreshold());
        } else if (config.getMetric().equals("spearman")) {
            log.info("Using Spearman rank correlation");
            metricMatrix = MatrixRowPairAnalysisFactory.spearmann(dataMatrix,
                    config.getCorrelationCacheThreshold());
        }

        metricMatrix.setOmitNegativeCorrelationLinks(config.isOmitNegLinks());
        metricMatrix.setDuplicateMap(probeToGeneMap); // populates numUniqueGenes
        metricMatrix.setUseAbsoluteValue(config.isAbsoluteValue());
        this.init();

        metricMatrix.calculateMetrics();
        log.info("Completed first pass over the data. Cached " + metricMatrix.numCached()
                + " values in the correlation matrix with values over " + config.getCorrelationCacheThreshold());
    }

    /**
     * Compute the thresholds needed to choose links for storage in the system.
     */
    private void chooseCutPoints() {
        cdf = Stats.cdf(metricMatrix.getHistogramArrayList());
        if (config.getCdfCut() <= 0.0) {
            config.setUpperTailCut(1.0);
            config.setLowerTailCut(-1.0);
            return;
        }

        if (config.getCdfCut() >= 1.0) {
            config.setUpperTailCut(0.0);
            config.setLowerTailCut(0.0);
            return;
        }

        double cdfTailCut = config.getCdfCut();
        double cdfUpperCutScore = 0.0;
        double cdfLowerCutScore = 0.0;

        // find the lower tail cutpoint, if we have to.
        if (!config.isAbsoluteValue()) {
            cdfTailCut /= 2.0;
            // find the lower cut point. Roundoff could be a problem...really
            // need two cdfs or do it directly from
            // histogram.
            for (int i = 0; i < cdf.size(); i++) {
                if (1.0 - cdf.get(i) >= cdfTailCut) {
                    cdfLowerCutScore = metricMatrix.getScoreInBin(i == cdf.size() ? i : i + 1);
                    break;
                }
            }
            log.info(form.format(cdfLowerCutScore) + " is the lower cdf cutpoint at " + cdfTailCut);
        }

        // find the upper cut point.
        for (int i = cdf.size() - 1; i >= 0; i--) {
            if (cdf.get(i) >= cdfTailCut) {
                cdfUpperCutScore = metricMatrix.getScoreInBin(i == cdf.size() ? i : i + 1);
                break;
            }
        }

        log.info(form.format(cdfUpperCutScore) + " is the upper cdf cutpoint at " + cdfTailCut);

        // get the cutpoint based on statistical signficance.
        double maxP = 1.0;
        double scoreAtP = 0.0;
        if (config.getFwe() != 0.0) {
            double numUniqueGenes = metricMatrix.getNumUniqueGenes();

            maxP = config.getFwe() / numUniqueGenes; // bonferroni.

            scoreAtP = CorrelationStats.correlationForPvalue(maxP, this.dataMatrix.columns());
            log.info("Minimum correlation to get " + form.format(maxP) + " is about " + form.format(scoreAtP)
                    + " for " + numUniqueGenes + " unique items (if all " + this.dataMatrix.columns()
                    + " items are present)");
            if (scoreAtP > 0.9) {
                log.warn("This data set has a very high threshold for statistical significance!");
            }
        }
        this.metricMatrix.setPValueThreshold(maxP); // this is the corrected
        // value.

        // choose cut points, with one independent criterion or the most stringent criteria
        if (config.getSingularThreshold().equals(SingularThreshold.none)) {
            config.setUpperTailCut(Math.max(scoreAtP, cdfUpperCutScore));
            if (config.getUpperTailCut() == scoreAtP) {
                config.setUpperCdfCutUsed(false);
            } else if (config.getUpperTailCut() == cdfUpperCutScore) {
                config.setUpperCdfCutUsed(true);
            }
            log.info("Final upper cut is " + form.format(config.getUpperTailCut()));
            if (!config.isAbsoluteValue()) {
                config.setLowerTailCut(Math.min(-scoreAtP, cdfLowerCutScore));
                log.info("Final lower cut is " + form.format(config.getLowerTailCut()));
            }
            if (config.getLowerTailCut() == scoreAtP) {
                config.setLowerCdfCutUsed(false);
            } else if (config.getLowerTailCut() == cdfLowerCutScore) {
                config.setLowerCdfCutUsed(true);
            }
        } else if (config.getSingularThreshold().equals(SingularThreshold.fwe)) {
            config.setUpperTailCut(scoreAtP);
            log.info("Final upper cut is " + form.format(config.getUpperTailCut()));
            if (!config.isAbsoluteValue()) {
                config.setLowerTailCut(-scoreAtP);
                log.info("Final lower cut is " + form.format(config.getLowerTailCut()));
            }
            config.setUpperCdfCutUsed(false);
            config.setLowerCdfCutUsed(false);
        } else if (config.getSingularThreshold().equals(SingularThreshold.cdfcut)) {
            config.setUpperTailCut(cdfUpperCutScore);
            log.info("Final upper cut is " + form.format(config.getUpperTailCut()));
            if (!config.isAbsoluteValue()) {
                config.setLowerTailCut(cdfLowerCutScore);
                log.info("Final lower cut is " + form.format(config.getLowerTailCut()));
            }
            metricMatrix.setUsePvalueThreshold(false);// use only cdfCut exclusively to keep links
            config.setUpperCdfCutUsed(true);
            config.setLowerCdfCutUsed(true);
        }

        metricMatrix.setUpperTailThreshold(config.getUpperTailCut());
        if (config.isAbsoluteValue()) {
            metricMatrix.setLowerTailThreshold(config.getUpperTailCut());
        } else {
            metricMatrix.setLowerTailThreshold(config.getLowerTailCut());
        }

    }

    /**
     * Does the main computation and link selection.
     */
    private void getLinks() {
        chooseCutPoints();
        metricMatrix.calculateMetrics();

        keep = metricMatrix.getKeepers();

        computeProbeDegrees();
    }

    /**
     * Populates a map of probe index (in matrix) -> how many links.
     */
    private void computeProbeDegrees() {

        this.probeDegreeMap = new HashMap<Integer, Integer>();

        for (Integer i = 0; i < metricMatrix.size(); i++) {
            probeDegreeMap.put(i, 0);
        }

        for (int i = 0; i < keep.size(); i++) {
            Link l = (Link) keep.get(i);
            Integer x = l.getx();
            Integer y = l.gety();

            probeDegreeMap.put(x, probeDegreeMap.get(x) + 1);
            probeDegreeMap.put(y, probeDegreeMap.get(y) + 1);

        }
    }

    public CompositeSequence getProbe(int index) {
        return getMetricMatrix().getProbeForRow(getDataMatrix().getRowElement(index));
    }

    /**
     * @param index row number in the metrixMatirx
     * @return how many Links that probe appears in, or null if the probeDegree has not been populated for that index
     *         (that is, either to early to check, or it was zero).
     */
    public Integer getProbeDegree(int index) {
        return this.probeDegreeMap.get(index);
    }

    /**
     * 
     *
     */
    private void init() {
        // estimate the correlation needed to reach significance.
        double scoreP = CorrelationStats.correlationForPvalue(
                config.getFwe() / this.metricMatrix.getNumUniqueGenes(), this.dataMatrix.columns()) - 0.001;
        if (scoreP > config.getCorrelationCacheThreshold())
            config.setCorrelationCacheThreshold(scoreP);
    }

    public ExpressionDataDoubleMatrix getDataMatrix() {
        return dataMatrix;
    }

    public ObjectArrayList getKeep() {
        return keep;
    }

    public MatrixRowPairAnalysis getMetricMatrix() {
        return metricMatrix;
    }

    /**
     * @return
     */
    public Taxon getTaxon() {
        return this.taxon;
    }

    public void setProbeToGeneMap(Map<CompositeSequence, Collection<Collection<Gene>>> probeToGeneMap) {
        this.probeToGeneMap = probeToGeneMap;
    }

    public ExpressionExperiment getExpressionExperiment() {
        return this.expressionExperiment;
    }

    public void setExpressionExperiment(ExpressionExperiment expressionExperiment) {
        this.expressionExperiment = expressionExperiment;
    }

    public QuantitationType getMetric() {
        return this.metricMatrix.getMetricType();
    }

    public LinkAnalysisConfig getConfig() {
        return config;
    }

    public Map<CompositeSequence, Collection<Collection<Gene>>> getProbeToGeneMap() {
        return probeToGeneMap;
    }

    public void setAnalysisObj(ProbeCoexpressionAnalysis analysis) {
        this.analysis = analysis;
    }

    /**
     * @return object containing the parameters used.
     */
    public ProbeCoexpressionAnalysis getAnalysisObj() {
        return analysis;
    }

}