ubic.gemma.core.analysis.expression.coexpression.links.LinkAnalysisServiceImpl.java Source code

Introduction

Here is the source code for ubic.gemma.core.analysis.expression.coexpression.links.LinkAnalysisServiceImpl.java
Source

/*
 * The Gemma project
 *
 * Copyright (c) 2008 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.analysis.expression.coexpression.links;

import cern.colt.list.ObjectArrayList;
import cern.colt.matrix.DoubleMatrix1D;
import cern.colt.matrix.impl.DenseDoubleMatrix1D;
import cern.jet.math.Functions;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import ubic.basecode.dataStructure.Link;
import ubic.basecode.io.ByteArrayConverter;
import ubic.gemma.core.analysis.preprocess.InsufficientProbesException;
import ubic.gemma.core.analysis.preprocess.OutlierDetails;
import ubic.gemma.core.analysis.preprocess.OutlierDetectionService;
import ubic.gemma.core.analysis.preprocess.batcheffects.BatchEffectDetails;
import ubic.gemma.core.analysis.preprocess.filter.FilterConfig;
import ubic.gemma.core.analysis.preprocess.filter.InsufficientSamplesException;
import ubic.gemma.core.analysis.preprocess.svd.ExpressionDataSVD;
import ubic.gemma.core.analysis.report.ExpressionExperimentReportService;
import ubic.gemma.core.analysis.service.ExpressionDataMatrixService;
import ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix;
import ubic.gemma.model.analysis.expression.coexpression.CoexpCorrelationDistribution;
import ubic.gemma.model.analysis.expression.coexpression.CoexpressionAnalysis;
import ubic.gemma.model.association.BioSequence2GeneProduct;
import ubic.gemma.model.common.auditAndSecurity.eventType.AuditEventType;
import ubic.gemma.model.common.auditAndSecurity.eventType.FailedLinkAnalysisEvent;
import ubic.gemma.model.common.auditAndSecurity.eventType.LinkAnalysisEvent;
import ubic.gemma.model.common.auditAndSecurity.eventType.TooSmallDatasetLinkAnalysisEvent;
import ubic.gemma.model.expression.bioAssayData.DesignElementDataVector;
import ubic.gemma.model.expression.bioAssayData.ProcessedExpressionDataVector;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
import ubic.gemma.model.genome.Gene;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.persistence.service.common.auditAndSecurity.AuditTrailService;
import ubic.gemma.persistence.service.expression.bioAssayData.ProcessedExpressionDataVectorService;
import ubic.gemma.persistence.service.expression.designElement.CompositeSequenceService;
import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.text.NumberFormat;
import java.util.*;

/**
 * Running link analyses through the spring context; will persist the results if the configuration says so. See
 * LinkAnalysisCli for more instructions.
 *
 * @author Paul
 */
@Component
public class LinkAnalysisServiceImpl implements LinkAnalysisService {

    private static final Log log = LogFactory.getLog(LinkAnalysisServiceImpl.class);

    @Autowired
    private AuditTrailService auditTrailService;

    @Autowired
    private CompositeSequenceService csService;

    @Autowired
    private ExpressionExperimentService eeService;

    @Autowired
    private ExpressionDataMatrixService expressionDataMatrixService;

    @Autowired
    private ExpressionExperimentReportService expressionExperimentReportService;

    @Autowired
    private OutlierDetectionService outlierDetectionService;

    @Autowired
    private LinkAnalysisPersister persister;

    @Autowired
    private ProcessedExpressionDataVectorService processedExpressionDataVectorService;

    @Override
    public LinkAnalysis process(ExpressionExperiment ee, FilterConfig filterConfig,
            LinkAnalysisConfig linkAnalysisConfig) {

        try {
            LinkAnalysis la = new LinkAnalysis(linkAnalysisConfig);
            la.clear();

            LinkAnalysisServiceImpl.log.info("Fetching expression data for " + ee);

            Collection<ProcessedExpressionDataVector> dataVectors = processedExpressionDataVectorService
                    .getProcessedDataVectors(ee);
            processedExpressionDataVectorService.thaw(dataVectors);

            LinkAnalysisServiceImpl.log.info("Starting analysis");
            this.analyze(ee, filterConfig, linkAnalysisConfig, la, dataVectors);

            LinkAnalysisServiceImpl.log.info("Done with analysis phase, starting persistence");
            this.saveResults(ee, la, linkAnalysisConfig, filterConfig);
            LinkAnalysisServiceImpl.log.info("Done with saving results for " + ee);
            return la;

        } catch (Exception e) {

            if (linkAnalysisConfig.isUseDb()) {
                this.logFailure(ee, e);
            }
            throw new RuntimeException(e);
        }

    }

    @Override
    public LinkAnalysis processVectors(Taxon t, Collection<ProcessedExpressionDataVector> dataVectors,
            FilterConfig filterConfig, LinkAnalysisConfig linkAnalysisConfig) {
        ExpressionDataDoubleMatrix datamatrix = expressionDataMatrixService
                .getFilteredMatrix(linkAnalysisConfig.getArrayName(), filterConfig, dataVectors);

        this.checkDatamatrix(datamatrix);
        LinkAnalysis la = new LinkAnalysis(linkAnalysisConfig);

        datamatrix = this.normalize(datamatrix, linkAnalysisConfig);

        this.setUpForAnalysis(t, la, dataVectors, datamatrix);

        la.analyze();

        try {
            this.writeLinks(la, filterConfig, new PrintWriter(System.out));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return la;

    }

    private void checkDatamatrix(ExpressionDataDoubleMatrix datamatrix) {
        if (datamatrix.rows() == 0) {
            LinkAnalysisServiceImpl.log.info("No rows left after filtering");
            throw new InsufficientProbesException("No rows left after filtering");
        } else if (datamatrix.rows() < FilterConfig.MINIMUM_ROWS_TO_BOTHER) {
            throw new InsufficientProbesException(
                    "To few rows (" + datamatrix.rows() + "), data sets are not analyzed unless they have at least "
                            + FilterConfig.MINIMUM_ROWS_TO_BOTHER + " rows");
        }
    }

    private void addAnalysisObj(ExpressionExperiment ee, FilterConfig filterConfig,
            LinkAnalysisConfig linkAnalysisConfig, LinkAnalysis la) {

        /*
         * Set up basics.
         */
        CoexpressionAnalysis analysis = linkAnalysisConfig.toAnalysis();

        analysis.setExperimentAnalyzed(ee);
        analysis.setName(ee.getShortName() + " link analysis");
        analysis.getProtocol().setDescription(
                analysis.getProtocol().getDescription() + "# FilterConfig:\n" + filterConfig.toString());

        la.setAnalysisObj(analysis);
    }

    private void analyze(ExpressionExperiment ee, FilterConfig filterConfig, LinkAnalysisConfig linkAnalysisConfig,
            LinkAnalysis la, Collection<ProcessedExpressionDataVector> dataVectors) {

        this.qcCheck(linkAnalysisConfig, ee);

        ExpressionDataDoubleMatrix datamatrix = expressionDataMatrixService.getFilteredMatrix(ee, filterConfig,
                dataVectors);

        this.setUpForAnalysis(ee, la, dataVectors, datamatrix);

        Map<CompositeSequence, Set<Gene>> probeToGeneMap = la.getProbeToGeneMap();

        assert !probeToGeneMap.isEmpty();

        /*
         * remove probes that have no gene mapped to them, not just those that have no sequence
         */
        datamatrix = this.filterUnmappedProbes(datamatrix, probeToGeneMap);

        this.checkDatamatrix(datamatrix);

        LinkAnalysisServiceImpl.log.info("Starting link analysis... " + ee);

        this.normalize(datamatrix, linkAnalysisConfig);

        /*
         * Link analysis section.
         */
        this.addAnalysisObj(ee, filterConfig, linkAnalysisConfig, la);
        la.analyze();

        CoexpCorrelationDistribution corrDist = la.getCorrelationDistribution();

        // another qc check.
        if (linkAnalysisConfig.isCheckCorrelationDistribution()) {
            this.diagnoseCorrelationDistribution(ee, corrDist);
        }
    }

    private void audit(ExpressionExperiment ee, String note, AuditEventType eventType) {
        expressionExperimentReportService.generateSummary(ee.getId());
        ee = this.eeService.thawLite(ee);
        auditTrailService.addUpdateEvent(ee, eventType, note);
    }

    private double binToCorrelation(int bin, int numBins) {
        return bin * 2.0 / numBins - 1.0;
    }

    /**
     * Check properties of the distribution
     */
    @SuppressWarnings("StatementWithEmptyBody") // Better readability
    private void diagnoseCorrelationDistribution(ExpressionExperiment ee, CoexpCorrelationDistribution corrDist)
            throws UnsuitableForAnalysisException {

        /*
         * Find the median, etc.
         */
        ByteArrayConverter bac = new ByteArrayConverter();
        double[] binCounts = bac.byteArrayToDoubles(corrDist.getBinCounts());
        int numBins = binCounts.length;
        DoubleMatrix1D histogram = new DenseDoubleMatrix1D(binCounts);

        // QC parameters; quantile, not correlation
        double lowerLimitofMiddle = 0.45;
        double upperLimitofMiddle = 0.55;
        double tailFraction = 0.1;

        // normalize
        histogram.assign(Functions.div(histogram.zSum()));

        double lowerTailDensity = 0.0;
        double upperTailDensity = 0.0;
        double median = 0.0;
        double s = 0.0; // cumulative
        double middleDensity = 0.0;
        for (int bin = 0; bin < histogram.size(); bin++) {

            // cumulate
            s += histogram.get(bin);

            /*
             * Perhaps these should be adjusted based on the sample size; for smaller data sets, more of the data is
             * going to be above 0.9 etc. But in practice this can't have a very big effect.
             */
            if (bin == (int) Math.floor(numBins * tailFraction)) {
                lowerTailDensity = s;
            } else if (bin == (int) Math.floor(numBins * (1.0 - tailFraction))) {
                upperTailDensity = 1.0 - s;
            } else if (bin > (int) Math.floor(lowerLimitofMiddle * numBins)
                    && bin < (int) Math.floor(upperLimitofMiddle * numBins)) {
                middleDensity += histogram.get(bin);
            }

            if (s >= 0.2) {
                // firstQuintile = binToCorrelation( i, numBins );
            } else if (s >= 0.5) {
                median = this.binToCorrelation(bin, numBins);
            } else if (s >= 0.8) {
                // lastQuintile = binToCorrelation( i, numBins );
            }
        }

        String message = "";
        boolean bad = false;

        if (median > 0.2 || median < -0.2) {
            bad = true;
            message = "Correlation distribution fails QC: median far from center (" + median + ")";
        } else if (lowerTailDensity + upperTailDensity > middleDensity) {

            bad = true;
            message = "Correlation distribution fails QC: tails too heavy";
        }

        if (bad) {
            throw new UnsuitableForAnalysisException(ee, message);
        }

    }

    /**
     * Remove rows corresponding to probes that don't map to genes. Row order may be changed.
     */
    private ExpressionDataDoubleMatrix filterUnmappedProbes(ExpressionDataDoubleMatrix dataMatrix,
            Map<CompositeSequence, Set<Gene>> probeToGeneMap) {
        return new ExpressionDataDoubleMatrix(dataMatrix, new ArrayList<>(probeToGeneMap.keySet()));
    }

    /**
     * Fills in the probe2gene map for the linkAnalysis. Note that the collection DOES NOT contain probes that have NO
     * genes mapped
     *
     * @param eeDoubleMatrix - used to make sure we don't use probes from vectors that are removed?
     */
    private void getProbe2GeneMap(LinkAnalysis la, Collection<ProcessedExpressionDataVector> dataVectors,
            ExpressionDataDoubleMatrix eeDoubleMatrix) {

        Collection<CompositeSequence> probesForVectors = new HashSet<>();
        for (DesignElementDataVector v : dataVectors) {
            CompositeSequence cs = v.getDesignElement();
            if (eeDoubleMatrix.getRow(cs) != null)
                probesForVectors.add(cs);
        }

        Map<CompositeSequence, Collection<BioSequence2GeneProduct>> specificityData = csService
                .getGenesWithSpecificity(probesForVectors);

        assert !specificityData.isEmpty();

        /*
         * Convert the specificity
         */
        Map<CompositeSequence, Set<Gene>> probeToGeneMap = new HashMap<>();
        for (CompositeSequence cs : specificityData.keySet()) {

            Collection<BioSequence2GeneProduct> bioSequenceToGeneProducts = specificityData.get(cs);

            if (!probeToGeneMap.containsKey(cs)) {
                probeToGeneMap.put(cs, new HashSet<Gene>());
            }

            for (BioSequence2GeneProduct bioSequence2GeneProduct : bioSequenceToGeneProducts) {
                Gene gene = bioSequence2GeneProduct.getGeneProduct().getGene();
                probeToGeneMap.get(cs).add(gene);
            }

        }

        /*
         * Remove the probes that have no mapping
         */
        int startingSize = probeToGeneMap.size();
        int numRemoved = 0;
        for (Iterator<CompositeSequence> it = probeToGeneMap.keySet().iterator(); it.hasNext();) {
            CompositeSequence cs = it.next();
            if (probeToGeneMap.get(cs).isEmpty()) {
                it.remove();
                numRemoved++;
            }
        }

        if (numRemoved > 0) {
            LinkAnalysisServiceImpl.log
                    .info(numRemoved + "/" + startingSize + " elements had no genes mapped and were removed.");
        }

        // assert !probeToGeneMap.isEmpty();
        if (probeToGeneMap.isEmpty()) {
            throw new IllegalStateException(
                    "No probes are mapped to genes; example=" + probeToGeneMap.keySet().iterator().next());
        }

        la.setProbeToGeneMap(probeToGeneMap);
    }

    private void logFailure(ExpressionExperiment expressionExperiment, Exception e) {

        if (e instanceof InsufficientSamplesException) {
            this.audit(expressionExperiment, e.getMessage(),
                    TooSmallDatasetLinkAnalysisEvent.Factory.newInstance());
        } else if (e instanceof InsufficientProbesException) {
            this.audit(expressionExperiment, e.getMessage(),
                    TooSmallDatasetLinkAnalysisEvent.Factory.newInstance());
        } else {
            LinkAnalysisServiceImpl.log.error("While processing " + expressionExperiment, e);
            this.audit(expressionExperiment, ExceptionUtils.getStackTrace(e),
                    FailedLinkAnalysisEvent.Factory.newInstance());
        }
    }

    /**
     * Normalize the data, as configured (possibly no normalization).
     */
    private ExpressionDataDoubleMatrix normalize(ExpressionDataDoubleMatrix datamatrix, LinkAnalysisConfig config) {

        ExpressionDataSVD svd;
        switch (config.getNormalizationMethod()) {
        case none:
            return datamatrix;
        case SVD:
            LinkAnalysisServiceImpl.log.info("SVD normalizing");
            svd = new ExpressionDataSVD(datamatrix, true);
            return svd.removeHighestComponents(1);
        case SPELL:
            LinkAnalysisServiceImpl.log.info("Computing U matrix via SVD");
            svd = new ExpressionDataSVD(datamatrix, true);
            return svd.uMatrixAsExpressionData();
        case BALANCE:
            LinkAnalysisServiceImpl.log.info("SVD-balanceing");
            svd = new ExpressionDataSVD(datamatrix, true);
            return svd.equalize();
        default:
            return null;
        }
    }

    /**
     * Reject if experiment has outliers or batch effects.
     */
    private void qcCheck(LinkAnalysisConfig config, ExpressionExperiment ee) throws UnsuitableForAnalysisException {

        if (config.isCheckForOutliers()) {
            Collection<OutlierDetails> outliers = outlierDetectionService.identifyOutliersByMedianCorrelation(ee);
            if (!outliers.isEmpty()) {
                throw new UnsuitableForAnalysisException(ee, "Potential outlier samples detected");
            }
        }

        if (config.isCheckForBatchEffect()) {
            BatchEffectDetails batchEffect = eeService.getBatchEffect(ee);

            if (batchEffect.getDataWasBatchCorrected()) {
                LinkAnalysisServiceImpl.log.info("Data are batch-corrected");
                return;
            }

            if (batchEffect.hasNoBatchInfo()) {
                // we may change this behaviour...
                throw new UnsuitableForAnalysisException(ee,
                        "No batch information available, out of an abundance of caution we are skipping");
            }

            if (batchEffect.getPvalue() < 0.001) {

                double componentVarianceProportion = batchEffect.getComponentVarianceProportion();
                Integer component = batchEffect.getComponent();
                // don't worry if it is a "minor" component. remember that is must be one of the first few to make it
                // this far.
                if (component > 2 && componentVarianceProportion < 0.1) {
                    return;
                }

                throw new UnsuitableForAnalysisException(ee,
                        String.format("Strong batch effect detected (%s)", batchEffect));
            }
        }
    }

    /**
     * Save the analysis data, either to DB or a file.
     */
    private void saveResults(ExpressionExperiment ee, LinkAnalysis la, LinkAnalysisConfig linkAnalysisConfig,
            FilterConfig filterConfig) {
        try {

            if (linkAnalysisConfig.isUseDb() && !linkAnalysisConfig.isTextOut()) {
                persister.saveLinksToDb(la);
                this.audit(ee, "", LinkAnalysisEvent.Factory.newInstance());
            } else if (linkAnalysisConfig.isTextOut()) {
                try {
                    PrintWriter w;

                    if (linkAnalysisConfig.getOutputFile() != null) {
                        w = new PrintWriter(linkAnalysisConfig.getOutputFile());
                    } else {
                        w = new PrintWriter(System.out);
                    }
                    this.writeLinks(la, filterConfig, w);
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }

            LinkAnalysisServiceImpl.log.info("Done with processing of " + ee);

        } catch (Exception e) {
            if (linkAnalysisConfig.isUseDb()) {
                this.logFailure(ee, e);
            }
            throw new RuntimeException(e);
        }

    }

    /**
     * Initializes the LinkAnalysis object; populates the probe2gene map.
     */
    private void setUpForAnalysis(ExpressionExperiment ee, LinkAnalysis la,
            Collection<ProcessedExpressionDataVector> dataVectors, ExpressionDataDoubleMatrix eeDoubleMatrix) {
        if (dataVectors == null || dataVectors.size() == 0)
            throw new IllegalArgumentException("No data vectors in " + ee);
        la.setDataMatrix(eeDoubleMatrix);

        if (ee != null) {
            la.setTaxon(eeService.getTaxon(ee));
            la.setExpressionExperiment(ee);
        }

        this.getProbe2GeneMap(la, dataVectors, eeDoubleMatrix);
    }

    /**
     * Initializes the LinkAnalysis object for data file input; populates the probe2gene map.
     */
    private void setUpForAnalysis(Taxon t, LinkAnalysis la, Collection<ProcessedExpressionDataVector> dataVectors,
            ExpressionDataDoubleMatrix eeDoubleMatrix) {

        la.setDataMatrix(eeDoubleMatrix);
        la.setTaxon(t);
        this.getProbe2GeneMap(la, dataVectors, eeDoubleMatrix);
    }

    /**
     * Write links as text.
     */
    private void writeLinks(final LinkAnalysis la, FilterConfig filterConfig, Writer wr) throws IOException {
        Map<CompositeSequence, Set<Gene>> probeToGeneMap = la.getProbeToGeneMap();
        ObjectArrayList links = la.getKeep();
        double subsetSize = la.getConfig().getSubsetSize();
        List<String> buf = new ArrayList<>();
        if (la.getConfig().isSubset() && links.size() > subsetSize) {
            la.getConfig().setSubsetUsed(true);
        }
        wr.write(la.getConfig().toString());
        wr.write(filterConfig.toString());
        NumberFormat nf = NumberFormat.getInstance();
        nf.setMaximumFractionDigits(4);

        Integer probeDegreeThreshold = la.getConfig().getProbeDegreeThreshold();

        int i = 0;
        int keptLinksCount = 0;
        Random generator = new Random();
        double rand;
        double fraction = subsetSize / links.size();
        int skippedDueToDegree = 0;
        for (int n = links.size(); i < n; i++) {

            Object val = links.getQuick(i);
            if (val == null)
                continue;
            Link m = (Link) val;
            Double w = m.getWeight();

            int x = m.getx();
            int y = m.gety();

            if (probeDegreeThreshold > 0 && (la.getProbeDegree(x) > probeDegreeThreshold
                    || la.getProbeDegree(y) > probeDegreeThreshold)) {
                skippedDueToDegree++;
                continue;
            }

            CompositeSequence p1 = la.getProbe(x);
            CompositeSequence p2 = la.getProbe(y);

            Set<Gene> g1 = probeToGeneMap.get(p1);
            Set<Gene> g2 = probeToGeneMap.get(p2);

            List<String> genes1 = new ArrayList<>();
            for (Gene cluster : g1) {
                String t = cluster.getOfficialSymbol();
                genes1.add(t);
            }

            List<String> genes2 = new ArrayList<>();
            for (Gene cluster : g2) {
                String t = cluster.getOfficialSymbol();
                genes2.add(t);
            }

            if (genes2.size() == 0 || genes1.size() == 0) {
                continue;
            }

            String gene1String = StringUtils.join(genes1.iterator(), "|");
            String gene2String = StringUtils.join(genes2.iterator(), "|");

            if (gene1String.equals(gene2String)) {
                continue;
            }

            if (++keptLinksCount % 50000 == 0) {
                LinkAnalysisServiceImpl.log.info(keptLinksCount + " links retained");
            }

            if (la.getConfig().isSubsetUsed()) {
                rand = generator.nextDouble();
                if (rand > fraction)
                    continue;
            }

            buf.add(p1.getId() + "\t" + p2.getId() + "\t" + gene1String + "\t" + gene2String + "\t" + nf.format(w)
                    + "\n");// save links
            // wr.write( p1.getId() + "\t" + p2.getId() + "\t" + gene1String + "\t" + gene2String + "\t" + nf.format( w
            // ) + "\n" );

        }

        wr.write("# totalLinks:" + keptLinksCount + "\n");
        wr.write("# printedLinks:" + buf.size() + "\n");
        wr.write("# skippedDueToHighNodeDegree:" + skippedDueToDegree + "\n");

        for (String line : buf) {// write links to file
            wr.write(line);
        }

        if (la.getConfig().isSubsetUsed()) {// subset option activated
            LinkAnalysisServiceImpl.log.info("Done, " + keptLinksCount + "/" + links.size() + " links kept, "
                    + buf.size() + " links printed");
            // wr.write("# Amount of links before subsetting/after subsetting: " + links.size() + "/" + numPrinted +
            // "\n" );
        } else {
            LinkAnalysisServiceImpl.log.info("Done, " + keptLinksCount + "/" + links.size()
                    + " links printed (some may have been filtered)");
        }
        wr.flush();

    }

}