ubic.gemma.analysis.expression.coexpression.ProbeLinkCoexpressionAnalyzerImpl.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.analysis.expression.coexpression.ProbeLinkCoexpressionAnalyzerImpl.java

Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2007-2011 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.analysis.expression.coexpression;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.lang.time.StopWatch;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import ubic.basecode.dataStructure.BitUtil;
import ubic.gemma.expression.experiment.service.ExpressionExperimentService;
import ubic.gemma.genome.gene.service.GeneService;
import ubic.gemma.model.analysis.expression.coexpression.CoexpressedGenePairValueObject;
import ubic.gemma.model.analysis.expression.coexpression.QueryGeneCoexpression;
import ubic.gemma.model.association.coexpression.Probe2ProbeCoexpressionService;
import ubic.gemma.model.expression.experiment.BioAssaySet;
import ubic.gemma.model.expression.experiment.ExpressionExperimentValueObject;
import ubic.gemma.model.genome.Gene;
import ubic.gemma.util.EntityUtils;

/**
 * Perform gene-to-gene coexpression link analysis ("TMM-style"), based on stored probe-probe coexpression.
 * 
 * @author paul
 * @version $Id: ProbeLinkCoexpressionAnalyzerImpl.java,v 1.17 2012/07/07 06:25:34 paul Exp $
 */
@Component
public class ProbeLinkCoexpressionAnalyzerImpl implements ProbeLinkCoexpressionAnalyzer {

    private static Log log = LogFactory.getLog(ProbeLinkCoexpressionAnalyzerImpl.class.getName());
    private static final int MAX_GENES_TO_COMPUTE_EESTESTEDIN = 200;

    /**
     * Provide a consistent mapping of experiments that can be used to index a vector. The actual ordering is not
     * guaranteed to be anything in particular, just that repeated calls to this method with the same experiments will
     * yield the same mapping.
     * 
     * @param experiments
     * @return Map of EE IDs to an index, where the index values are from 0 ... N-1 where N is the number of
     *         experiments.
     */
    public static Map<Long, Integer> getOrderingMap(Collection<? extends BioAssaySet> experiments) {
        List<Long> eeIds = new ArrayList<Long>();
        for (BioAssaySet ee : experiments) {
            eeIds.add(ee.getId());
        }
        Collections.sort(eeIds);
        Map<Long, Integer> result = new HashMap<Long, Integer>();
        int index = 0;
        for (Long id : eeIds) {
            result.put(id, index);
            index++;
        }
        assert result.size() == experiments.size();
        return result;
    }

    @Autowired
    private GeneService geneService;

    @Autowired
    private Probe2ProbeCoexpressionService probe2ProbeCoexpressionService;

    @Autowired
    private ExpressionExperimentService expressionExperimentService;

    /**
     * Gene->EEs tested in. This is a cache that _should_ be safe to use, as it is only used in batch processing, which
     * uses a 'short-lived' spring context build on the command line -- not the web application.
     */
    private Map<Long, List<Boolean>> genesTestedIn = new ConcurrentHashMap<Long, List<Boolean>>();

    /**
     * Gene->ees tested in, in a faster access format - preordered by the ees. As for the genesTestedIn, this should not
     * be used in webapps.
     */
    private Map<Long, byte[]> geneTestStatusByteCache = new ConcurrentHashMap<Long, byte[]>();

    /*
     * (non-Javadoc)
     * 
     * @see ubic.gemma.analysis.expression.coexpression.ProbeLinkCoexpressionAnalyzer#linkAnalysis(java.util.Collection,
     * java.util.Collection, int, boolean, int)
     */
    @Override
    public Map<Gene, QueryGeneCoexpression> linkAnalysis(Collection<Gene> genes,
            Collection<? extends BioAssaySet> ees, int stringency, boolean interGenesOnly, int limit) {

        /*
         * Start with raw results
         */
        Map<Gene, QueryGeneCoexpression> result = geneService.getCoexpressedGenes(genes, ees, stringency,
                interGenesOnly);

        /*
         * Perform postprocessing gene by gene. It is possible this could be sped up by doing batches.
         */
        for (Gene gene : genes) {

            QueryGeneCoexpression coexpressions = result.get(gene);

            /*
             * Identify data sets the query gene is expressed in - this is fast (?) and provides an upper bound for EEs
             * we need to search in the first place.
             */
            Collection<BioAssaySet> eesQueryTestedIn = probe2ProbeCoexpressionService
                    .getExpressionExperimentsLinkTestedIn(gene, ees, false);

            /*
             * Finish the postprocessing.
             */
            if (coexpressions.getAllGeneCoexpressionData(stringency).size() == 0) {
                continue;
            }

            // don't fill in the gene info etc if we're in batch mode.
            if (limit > 0) {
                filter(coexpressions, limit, stringency); // remove excess
                fillInEEInfo(coexpressions); // do first...
                fillInGeneInfo(stringency, coexpressions);
            }

            computeEesTestedIn(ees, coexpressions, eesQueryTestedIn, stringency, limit);
        }
        return result;
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.analysis.expression.coexpression.ProbeLinkCoexpressionAnalyzer#linkAnalysis(ubic.gemma.model.genome
     * .Gene, java.util.Collection, int, int)
     */
    @Override
    public QueryGeneCoexpression linkAnalysis(Gene gene, Collection<? extends BioAssaySet> ees, int inputStringency,
            int limit) {

        int stringency = inputStringency <= 0 ? 1 : inputStringency;

        if (log.isDebugEnabled())
            log.debug("Link query for " + gene.getName() + " stringency=" + stringency);

        /*
         * Identify data sets the query gene is expressed in - this is fast (?) and provides an upper bound for EEs we
         * need to search in the first place.
         */
        Collection<BioAssaySet> eesQueryTestedIn = probe2ProbeCoexpressionService
                .getExpressionExperimentsLinkTestedIn(gene, ees, false);

        if (eesQueryTestedIn.size() == 0) {
            QueryGeneCoexpression r = new QueryGeneCoexpression(gene.getId(), stringency);
            r.setErrorState("No experiments have coexpression data for  " + gene.getOfficialSymbol());
            return r;
        }

        /*
         * Perform the coexpression search, some postprocessing done. If eesQueryTestedIn is empty, this returns real
         * quick.
         */
        QueryGeneCoexpression coexpressions = geneService.getCoexpressedGenes(gene, eesQueryTestedIn, stringency);

        /*
         * Finish the postprocessing.
         */
        StopWatch timer = new StopWatch();
        timer.start();
        if (coexpressions.getAllGeneCoexpressionData(stringency).size() == 0) {
            return coexpressions;
        }

        // don't fill in the gene info etc if we're in batch mode.
        if (limit > 0) {
            filter(coexpressions, limit, stringency); // remove excess
            fillInEEInfo(coexpressions); // do first...
            fillInGeneInfo(stringency, coexpressions);
        }

        computeEesTestedIn(ees, coexpressions, eesQueryTestedIn, stringency, limit);

        timer.stop();
        if (timer.getTime() > 1000) {
            log.info("All Post-Postprocessing: " + timer.getTime() + "ms");
        }

        log.debug("Analysis completed");

        return coexpressions;
    }

    /**
     * For each experiment, get the genes it tested and populate a map. This is slow; but once we've seen a gene, we
     * don't have to repeat it. We used to cache the ee->gene relationship, but doing it the other way around yields
     * must faster code.
     * 
     * @param ees
     * @param eeIndexMap
     */
    private void cacheEesGeneTestedIn(Collection<? extends BioAssaySet> ees, Map<Long, Integer> eeIndexMap) {

        assert ees != null;
        assert eeIndexMap != null;
        assert !ees.isEmpty();
        assert !eeIndexMap.isEmpty();

        StopWatch timer = new StopWatch();
        timer.start();
        int count = 0;
        for (BioAssaySet ee : ees) {
            Collection<Long> genes = probe2ProbeCoexpressionService.getGenesTestedBy(ee, false);

            // inverted map of gene -> ees tested in.
            Integer indexOfEEInAr = eeIndexMap.get(ee.getId());

            assert indexOfEEInAr != null : "No index for EEID=" + ee.getId() + " in the eeIndexMap";

            for (Long geneId : genes) {
                if (!genesTestedIn.containsKey(geneId)) {

                    // initialize the boolean array for this gene.
                    genesTestedIn.put(geneId, new ArrayList<Boolean>());
                    for (int i = 0; i < ees.size(); i++) {
                        genesTestedIn.get(geneId).add(Boolean.FALSE);
                    }

                }

                // flip to true since the gene was tested in the ee
                genesTestedIn.get(geneId).set(indexOfEEInAr, Boolean.TRUE);
            }

            if (++count % 100 == 0) {
                log.info("Got EEs gene tested in map for " + count + " experiments ... " + timer.getTime()
                        + "ms elapsed");
            }

        }
    }

    /**
     * Fill in gene tested information for genes coexpressed with the query.
     * 
     * @param ees ExpressionExperiments, including all that were used at the start of the query (including those the
     *        query gene is NOT expressed in)
     * @param coexpressions
     * @param eesQueryTestedIn
     * @param stringency
     * @param limit if zero, they are all collected and a batch-mode cache is used. This is MUCH slower if you are
     *        analyzing a single CoexpressionCollectionValueObject, but faster (and more memory-intensive) if many are
     *        going to be looked at (as in the case of a bulk Gene2Gene analysis).
     */
    private void computeEesTestedIn(Collection<? extends BioAssaySet> ees, QueryGeneCoexpression coexpressions,
            Collection<? extends BioAssaySet> eesQueryTestedIn, int stringency, int limit) {

        List<CoexpressedGenePairValueObject> coexpressionData = coexpressions.getCoexpressionData(stringency);

        if (limit == 0) {
            // when we expect to be analyzing many query genes. Note that we pass in the full set of experiments, not
            // just the ones in which the query gene was tested in.
            computeEesTestedInBatch(ees, coexpressionData);
        } else {
            // for when we are looking at just one gene at a time
            computeEesTestedIn(eesQueryTestedIn, coexpressionData);
        }

        /*
         * We can add this analysis to the predicted and pars if we want. I'm leaving it out for now.
         */
    }

    /**
     * For the genes that the query is coexpressed with. This is limited to the top MAX_GENES_TO_COMPUTE_EESTESTEDIN.
     * This is not very fast if MAX_GENES_TO_COMPUTE_EESTESTEDIN is large. We use this version for on-line requests.
     * 
     * @param eesQueryTestedIn, limited to the ees that the query gene is tested in.
     * @param coexpressionData
     * @see ProbeLinkCoexpressionAnalyzerImpl.computeEesTestedInBatch for the version used when requests are going to be
     *      done for many genes, so cache is built first time.
     */
    private void computeEesTestedIn(Collection<? extends BioAssaySet> eesQueryTestedIn,
            List<CoexpressedGenePairValueObject> coexpressionData) {
        Collection<Long> coexGeneIds = new HashSet<Long>();

        StopWatch timer = new StopWatch();
        timer.start();
        int i = 0;
        Map<Long, CoexpressedGenePairValueObject> gmap = new HashMap<Long, CoexpressedGenePairValueObject>();

        for (CoexpressedGenePairValueObject o : coexpressionData) {
            coexGeneIds.add(o.getCoexpressedGeneId());
            gmap.put(o.getCoexpressedGeneId(), o);
            i++;
            if (i >= MAX_GENES_TO_COMPUTE_EESTESTEDIN)
                break;
        }

        log.debug("Computing EEs tested in for " + coexGeneIds.size() + " genes.");
        Map<Long, Collection<BioAssaySet>> eesTestedIn = probe2ProbeCoexpressionService
                .getExpressionExperimentsTestedIn(coexGeneIds, eesQueryTestedIn, false);
        for (Long g : eesTestedIn.keySet()) {
            CoexpressedGenePairValueObject cvo = gmap.get(g);
            assert cvo != null;
            assert eesTestedIn.get(g).size() <= eesQueryTestedIn.size();

            Collection<Long> ids = EntityUtils.getIds(eesTestedIn.get(g));
            cvo.setDatasetsTestedIn(ids);
        }
        timer.stop();
        if (timer.getTime() > 1000) {
            log.info("computeEesTestedIn: " + timer.getTime() + "ms");
        }
    }

    /**
     * For the genes that the query is coexpressed with; this retrieves the information for all the coexpressionData
     * passed in (no limit) - all of them have to be for the same query gene!
     * 
     * @param ees, including ALL experiments that were initially started with, NOT just the ones that the query gene was
     *        tested in.
     * @param coexpressionData to check, the query gene must be the same for each of them.
     */
    private void computeEesTestedInBatch(Collection<? extends BioAssaySet> ees,
            List<CoexpressedGenePairValueObject> coexpressionData) {

        if (coexpressionData.isEmpty())
            return;

        StopWatch timer = new StopWatch();
        timer.start();

        if (log.isDebugEnabled()) {
            log.debug("Computing EEs tested in for " + coexpressionData.size() + " genes coexpressed with query.");
        }

        /*
         * Note: we assume this is actually constant as we build a cache based on it. Small risk.
         */
        Map<Long, Integer> eeIndexMap = getOrderingMap(ees);
        assert eeIndexMap.size() == ees.size();

        /*
         * Save some computation in the inner loop by assuming the query gene is constant.
         */
        CoexpressedGenePairValueObject initializer = coexpressionData.iterator().next();
        Long queryGeneId = initializer.getQueryGene();

        byte[] queryGeneEETestStatusBytes;
        if (geneTestStatusByteCache.containsKey(queryGeneId)) {
            queryGeneEETestStatusBytes = geneTestStatusByteCache.get(queryGeneId);
        } else {
            queryGeneEETestStatusBytes = computeTestedDatasetVector(queryGeneId, ees, eeIndexMap);
            geneTestStatusByteCache.put(queryGeneId, queryGeneEETestStatusBytes);
        }

        /*
         * This is a potential bottleneck, because there are often >500 ees and >1000 cvos, and each EE tests >20000
         * genes. Therefore we try hard not to iterate over all the CVOEEs more than we need to. So this is coded very
         * carefully. Once the cache is warmed up, this still can take over 500ms to run (Feb 2010). The total number of
         * loops _easily_ exceeds a million. (June 2012: this is rarely taking long enough to trigger the logging, after
         * it gets warmed up)
         */
        int loopcount = 0; // for performance statistics
        for (CoexpressedGenePairValueObject cvo : coexpressionData) {

            Long coexGeneId = cvo.getCoexpressedGeneId();
            if (!queryGeneId.equals(cvo.getQueryGene())) {
                throw new IllegalArgumentException(
                        "All coexpression value objects must have the same query gene here");
            }

            /*
             * Get the target gene info, from the cache if possible.
             */
            byte[] targetGeneEETestStatus;
            if (geneTestStatusByteCache.containsKey(coexGeneId)) {
                targetGeneEETestStatus = geneTestStatusByteCache.get(coexGeneId);
            } else {
                targetGeneEETestStatus = computeTestedDatasetVector(coexGeneId, ees, eeIndexMap);
                geneTestStatusByteCache.put(coexGeneId, targetGeneEETestStatus);
            }

            assert targetGeneEETestStatus.length == queryGeneEETestStatusBytes.length;

            byte[] answer = new byte[targetGeneEETestStatus.length];

            for (int i = 0; i < answer.length; i++) {
                answer[i] = (byte) (targetGeneEETestStatus[i] & queryGeneEETestStatusBytes[i]);
                loopcount++;
            }
            cvo.setDatasetsTestedInBytes(answer);
        }

        if (timer.getTime() > 100) {
            // It usually doesn't take this long.
            log.info("Compute EEs tested in (batch ): " + timer.getTime() + "ms; " + loopcount + " loops  ");

            // provide a cache status update at the same time. Worried about memory. 1500 data sets * 50000 genes =
            // 75megabytes, so this
            // shouldn't be a big deal.
            log.info(geneTestStatusByteCache.size() + " gene test status stored in cache, approx "
                    + (geneTestStatusByteCache.get(queryGeneId).length * geneTestStatusByteCache.size())
                    + " bytes total");
        }

    }

    /**
     * @param datasetsTestedIn
     * @param eeIdOrder
     * @return
     */
    private byte[] computeTestedDatasetVector(Long geneId, Collection<? extends BioAssaySet> ees,
            Map<Long, Integer> eeIdOrder) {
        /*
         * This condition is, pretty much only true once in practice. That's because the first time through populates
         * genesTestedIn for all the genes tested in any of the data sets.
         */
        if (!genesTestedIn.containsKey(geneId)) {
            cacheEesGeneTestedIn(ees, eeIdOrder);
        }

        assert eeIdOrder.size() == ees.size();

        assert genesTestedIn.containsKey(geneId);

        List<Boolean> eesTestingGene = genesTestedIn.get(geneId);

        assert eesTestingGene.size() == ees.size();

        // initialize.
        byte[] result = new byte[(int) Math.ceil(eeIdOrder.size() / (double) Byte.SIZE)];
        for (int i = 0, j = result.length; i < j; i++) {
            result[i] = 0x0;
        }

        for (BioAssaySet ee : ees) {
            Long eeid = ee.getId();
            Integer index = eeIdOrder.get(eeid);
            if (eesTestingGene.get(index)) {
                BitUtil.set(result, index);
            }
        }
        return result;
    }

    /**
     * @param coexpressions
     */
    private void fillInEEInfo(QueryGeneCoexpression coexpressions) {
        Collection<Long> eeIds = new HashSet<Long>();
        log.debug("Filling in EE info");

        fillInEEInfo(coexpressions, eeIds);

    }

    /**
     * @param coexpressions
     * @param eeIds
     * @param coexps
     */
    private void fillInEEInfo(QueryGeneCoexpression coexpressions, Collection<Long> eeIds) {

        eeIds.addAll(coexpressions.getDataSetsWithCoexpressionForQueryGene());
        Collection<ExpressionExperimentValueObject> ees = expressionExperimentService.loadValueObjects(eeIds,
                false);
        Map<Long, ExpressionExperimentValueObject> em = new HashMap<Long, ExpressionExperimentValueObject>();
        for (ExpressionExperimentValueObject evo : ees) {
            em.put(evo.getId(), evo);
        }

    }

    /**
     * @param stringency
     * @param coexpressions
     * @param coexp
     */
    private void fillInGeneInfo(int stringency, QueryGeneCoexpression coexpressions) {
        StopWatch timer = new StopWatch();
        timer.start();
        List<CoexpressedGenePairValueObject> coexpressionData = coexpressions.getCoexpressionData(stringency);
        Collection<Long> geneIds = new HashSet<Long>();
        for (CoexpressedGenePairValueObject cod : coexpressionData) {
            geneIds.add(cod.getCoexpressedGeneId());
        }

        Collection<Gene> genes = geneService.loadMultiple(geneIds); // this can be slow if there are a lot.
        Map<Long, Gene> gm = new HashMap<Long, Gene>();
        for (Gene g : genes) {
            gm.put(g.getId(), g);
        }

        for (CoexpressedGenePairValueObject cod : coexpressionData) {
            coexpressions.add(cod);
        }
        timer.stop();
        if (timer.getTime() > 1000) {
            log.info("Filled in gene info: " + timer.getTime() + "ms");
        }
    }

    /**
     * @param coexpressions
     * @param limit
     * @param stringency
     */
    private void filter(QueryGeneCoexpression coexpressions, int limit, int stringency) {
        coexpressions.filter(limit, stringency);
    }

}