Java tutorial
/* * The Gemma project * * Copyright (c) 2007 University of British Columbia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package ubic.gemma.analysis.expression.coexpression; import hep.aida.bin.QuantileBin1D; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Date; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.Predicate; import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.time.StopWatch; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import ubic.basecode.dataStructure.BitUtil; import ubic.basecode.dataStructure.matrix.MatrixUtil; import ubic.basecode.math.DescriptiveWithMissing; import ubic.basecode.math.Rank; import ubic.basecode.math.distribution.Histogram; import ubic.basecode.math.metaanalysis.MetaAnalysis; import ubic.gemma.expression.experiment.service.ExpressionExperimentService; import ubic.gemma.expression.experiment.service.ExpressionExperimentSetService; import ubic.gemma.genome.gene.service.GeneService; import ubic.gemma.model.analysis.Analysis; import ubic.gemma.model.analysis.expression.ExpressionExperimentSet; import ubic.gemma.model.analysis.expression.coexpression.CoexpressedGenePairValueObject; import ubic.gemma.model.analysis.expression.coexpression.GeneCoexpressionAnalysis; import ubic.gemma.model.analysis.expression.coexpression.GeneCoexpressionAnalysisImpl; import ubic.gemma.model.analysis.expression.coexpression.GeneCoexpressionAnalysisService; import ubic.gemma.model.analysis.expression.coexpression.QueryGeneCoexpression; import ubic.gemma.model.association.coexpression.Gene2GeneCoexpression; import ubic.gemma.model.association.coexpression.Gene2GeneCoexpressionService; import ubic.gemma.model.association.coexpression.GeneCoexpressionNodeDegree; import ubic.gemma.model.association.coexpression.GeneCoexpressionNodeDegreeService; import ubic.gemma.model.association.coexpression.HumanGeneCoExpression; import ubic.gemma.model.association.coexpression.MouseGeneCoExpression; import ubic.gemma.model.association.coexpression.OtherGeneCoExpression; import ubic.gemma.model.association.coexpression.RatGeneCoExpression; import ubic.gemma.model.common.protocol.Protocol; import ubic.gemma.model.common.protocol.ProtocolService; import ubic.gemma.model.expression.experiment.BioAssaySet; import ubic.gemma.model.expression.experiment.ExpressionExperiment; import ubic.gemma.model.genome.Gene; import ubic.gemma.model.genome.Taxon; import ubic.gemma.persistence.Persister; import ubic.gemma.security.SecurityService; import ubic.gemma.util.ConfigUtils; import ubic.gemma.util.EntityUtils; import cern.colt.list.DoubleArrayList; import cern.jet.random.engine.DRand; import cern.jet.random.engine.RandomEngine; import cern.jet.stat.Descriptive; /** * Used to analyze already-persisted probe-level 'links' and turn them into gene-level coexpression information * (including node degree). The results are tied to a specific Analysis that can be referred to by clients. In practice * only one 'gene2gene' analysis is maintained for each taxon, in which all the datasets for the taxon are used. Results * are then filtered to include only the datasets that the user included. The gene2gene analysis for each taxon must be * updated periodically to include new datasets. * * @author paul * @version $Id: Gene2GenePopulationServiceImpl.java,v 1.43 2012/10/24 21:22:55 cmcdonald Exp $ */ @Component public class Gene2GenePopulationServiceImpl implements Gene2GenePopulationService { private static final int BATCH_SIZE = 1000; private static boolean SINGLE_QUERY_FOR_LINKS = ConfigUtils.getBoolean("store.gene.coexpression.bothways", true); private static Log log = LogFactory.getLog(Gene2GenePopulationServiceImpl.class.getName()); private static RandomEngine randomNumberGenerator = new DRand(new Date()); /** * @param experimentsAnalyzed * @return Map of location in the vector to EE ID. */ public static List<Long> getPositionToIdMap(Collection<Long> experimentIdsAnalyzed) { List<Long> eeIds = new ArrayList<Long>(experimentIdsAnalyzed); Collections.sort(eeIds); List<Long> eeOrderId = new ArrayList<Long>(); for (Long id : eeIds) { eeOrderId.add(id); } return eeOrderId; } /** * @param ggc * @param eePositionToIdMap * @return */ public static List<Long> getSpecificExperimentIds(Gene2GeneCoexpression ggc, List<Long> eePositionToIdMap) { if (ggc.getSpecificityVector() == null) { return new ArrayList<Long>(); } // log.info( BitUtil.prettyPrint( ggc.getSpecificityVector() ) ); return convertBitVector(eePositionToIdMap, ggc.getSpecificityVector()); } /** * @param ggc * @param positionToIDMap * @return */ public static List<Long> getSupportingExperimentIds(Gene2GeneCoexpression ggc, List<Long> positionToIDMap) { return convertBitVector(positionToIDMap, ggc.getDatasetsSupportingVector()); } /** * @param ggc * @param eePositionToIdMap * @return */ public static List<Long> getTestedExperimentIds(Gene2GeneCoexpression ggc, List<Long> eePositionToIdMap) { return convertBitVector(eePositionToIdMap, ggc.getDatasetsTestedVector()); } /** * @param positionToIDMap * @param bitvector * @return */ private static List<Long> convertBitVector(List<Long> positionToIDMap, byte[] bitvector) { List<Long> ids = new ArrayList<Long>(positionToIDMap.size()); boolean[] asBools = BitUtil.asBools(bitvector); assert asBools.length >= positionToIDMap.size(); // padding at the end. for (int i = 0; i < positionToIDMap.size(); i++) { if (asBools[i]) ids.add(positionToIDMap.get(i)); } return ids; } @Autowired private ExpressionExperimentSetService expressionExperimentSetService; @Autowired private ExpressionExperimentService expressionExperimentService; @Autowired private Gene2GeneCoexpressionService gene2GeneCoexpressionService; @Autowired private GeneCoexpressionAnalysisService geneCoexpressionAnalysisService; @Autowired private GeneService geneService; @Autowired private Persister persisterHelper; @Autowired private ProbeLinkCoexpressionAnalyzer probeLinkCoexpressionAnalyzer; @Autowired private ProtocolService protocolService; @Autowired private SecurityService securityService; @Autowired private GeneCoexpressionNodeDegreeService geneCoexpressionNodeDegreeService; // Caution stateful private Map<Gene, GeneCoexpressionNodeDegree> allGeneNodeDegrees = new ConcurrentHashMap<Gene, GeneCoexpressionNodeDegree>(); /* * (non-Javadoc) * * @see ubic.gemma.analysis.expression.coexpression.Gene2GenePopulationService#analyze(java.util.Collection, * java.util.Collection, int, java.lang.String, boolean) */ @Override public void analyze(Collection<ExpressionExperiment> expressionExperiments, Collection<Gene> toUseGenes, int stringency, String analysisName, boolean useDB) { log.info("Initializing gene link analysis ... "); Taxon taxon = toUseGenes.iterator().next().getTaxon(); // assume it is same for all genes ... Collection<GeneCoexpressionAnalysis> oldAnalyses = null; if (useDB) { oldAnalyses = findExistingAnalysis(taxon); } GeneCoexpressionAnalysis analysis = null; if (useDB) { analysis = intializeNewAnalysis(expressionExperiments, taxon, toUseGenes, analysisName, stringency); assert analysis != null; } log.info("Starting gene link analysis '" + analysisName + " on " + toUseGenes.size() + " genes in " + expressionExperiments.size() + " experiments with a stringency of " + stringency + "; store links both ways = " + SINGLE_QUERY_FOR_LINKS); doAnalysis(expressionExperiments, toUseGenes, stringency, analysis); if (useDB) { // FIXME Small risk here: there may be two enabled analyses until the next call is completed. If it fails we // definitely have a problem. Ideally there would be a transaction here...but it would be far too large. disableOldAnalyses(oldAnalyses); } /* * Note that we don't delete the old analysis. That has to be done manually for now -- just in case we need it * for something. */ } /* * (non-Javadoc) * * @see * ubic.gemma.analysis.expression.coexpression.Gene2GenePopulationService#analyze(ubic.gemma.model.genome.Taxon, * java.util.Collection, int, java.lang.String, boolean) */ @Override public void analyze(Taxon taxon, Collection<Gene> toUseGenes, int toUseStringency, String analysisName, boolean useDB) { Collection<ExpressionExperiment> findByTaxon = expressionExperimentService.findByTaxon(taxon); final Collection<Long> untroubled = expressionExperimentService .getUntroubled(EntityUtils.getIds(findByTaxon)); CollectionUtils.filter(findByTaxon, new Predicate() { @Override public boolean evaluate(Object object) { return untroubled.contains(((ExpressionExperiment) object).getId()); } }); if (findByTaxon.size() != untroubled.size()) { log.info("Removed " + (findByTaxon.size() - untroubled.size()) + " experiments with 'trouble' flags, leaving " + untroubled.size()); } this.analyze(expressionExperimentService.loadMultiple(untroubled), toUseGenes, toUseStringency, analysisName, useDB); } /** * @param expressionExperiments * @param taxon * @param toUseGenes * @param analysisName * @param stringency * @return */ @Override public GeneCoexpressionAnalysis intializeNewAnalysis(Collection<ExpressionExperiment> expressionExperiments, Taxon taxon, Collection<Gene> toUseGenes, String analysisName, int stringency) { GeneCoexpressionAnalysisImpl analysis = (GeneCoexpressionAnalysisImpl) GeneCoexpressionAnalysis.Factory .newInstance(); analysis.setDescription("Coexpression analysis for " + taxon.getCommonName() + " using " + expressionExperiments.size() + " expression experiments; stringency=" + stringency); Protocol protocol = createProtocol(expressionExperiments, toUseGenes); analysis.setTaxon(taxon); analysis.setStringency(stringency); analysis.setName(analysisName); analysis.setDescription( "Coexpression analysis of " + expressionExperiments.size() + " EEs from " + taxon.getCommonName()); analysis.setProtocol(protocol); analysis.setEnabled(false); ExpressionExperimentSet eeSet = expressionExperimentSetService .updateAutomaticallyGeneratedExperimentSet(expressionExperiments, taxon); analysis.setExpressionExperimentSetAnalyzed(eeSet); analysis = (GeneCoexpressionAnalysisImpl) persisterHelper.persist(analysis); securityService.makePrivate(analysis); log.info("Done setting up the analysis entity"); return analysis; } /* * (non-Javadoc) * * @see * ubic.gemma.analysis.expression.coexpression.Gene2GenePopulationService#nodeDegreeAnalysis(java.util.Collection, * java.util.Collection, boolean) */ @Override public void nodeDegreeAnalysis(Collection<ExpressionExperiment> expressionExperiments, Collection<Gene> toUseGenes, boolean useDB) { allGeneNodeDegrees.clear(); GeneCoexpressionAnalysis toUseAnalysis = getActiveAnalysis(toUseGenes); int count = 0; for (Gene queryGene : toUseGenes) { assert queryGene != null; if (computeNodeDegree(queryGene, -1, expressionExperiments)) { Integer numberOfLinks = gene2GeneCoexpressionService.getNumberOfLinks(queryGene, toUseAnalysis); allGeneNodeDegrees.get(queryGene).setNumLinks(numberOfLinks); if (++count % 200 == 0) { log.info(count + "/" + toUseGenes.size() + " genes analyzed for node degree"); } } } completeNodeDegreeComputations(useDB); allGeneNodeDegrees.clear(); } /** * Finalize the node degree computation and persist it. * * @param useDB if false, instead of saving to the database it will be printed to stdout. */ private void completeNodeDegreeComputations(boolean useDB) { DoubleArrayList vals = new DoubleArrayList(); DoubleArrayList rawCountVals = new DoubleArrayList(); for (GeneCoexpressionNodeDegree n : this.allGeneNodeDegrees.values()) { // l.add( n.getMedian() ); /* * The final ranks are based on the pvalues, not the medians. */ vals.add(n.getPvalue()); rawCountVals.add(n.getNumLinks()); } DoubleArrayList ranks = Rank.rankTransform(vals); DoubleArrayList rawCountRanks = Rank.rankTransform(rawCountVals); QuantileBin1D f = new QuantileBin1D(true, vals.size(), 0.0, 0.0, vals.size(), randomNumberGenerator); f.addAllOf(vals); log.info("Finalizing node degree computation"); int i = 0; for (Gene gene : this.allGeneNodeDegrees.keySet()) { GeneCoexpressionNodeDegree n = this.allGeneNodeDegrees.get(gene); double rank = ranks.get(i); n.setRank(rank / ranks.size()); n.setRankNumLinks(rawCountRanks.get(i) / rawCountRanks.size()); if (useDB) { geneCoexpressionNodeDegreeService.deleteFor(n.getGene()); geneCoexpressionNodeDegreeService.create(n); } else { System.out.print(String.format("%d\t%s\t%d\t%.2f\t%.2f\t%.2f\t%d\t%.2f\t%s\t%d\t%.2f\n", n.getGene().getId(), n.getGene().getOfficialSymbol(), n.getGene().getTaxon().getId(), n.getMedian(), n.getMedianDeviation(), n.getPvalue(), n.getNumTests(), n.getRank(), n.getDistribution(), n.getNumLinks(), n.getRankNumLinks())); } if (++i % 2000 == 0) { log.info("Completed node degree computation for " + i + " genes"); } } } /** * Compute a string to represent the distribution of node degree values. * * @param vals * @return */ private String computeDistribution(DoubleArrayList vals) { /* * Set up the distribution */ int numBins = 10; Histogram h = new Histogram("", numBins, 0.0, 1.0); h.fill(MatrixUtil.fromList(vals)); int tallestBar = h.getBiggestBinSize(); StringBuilder buf = new StringBuilder(); for (int i = 0; i < numBins; i++) { int f = (int) Math.floor(9.0 * h.getArray()[i] / tallestBar); assert f < 10; // one digit buf.append(f); } String distribution = buf.toString(); return distribution; } /** * @param queryGene * @param usedLinks number of links we stored for this gene. If < 0, this will not be used. * @param expressionExperiments * @return true if node degree was computed. * @see completeNodeDegreeComputations(boolean) for the final computation. */ private boolean computeNodeDegree(Gene queryGene, int usedLinks, Collection<? extends BioAssaySet> expressionExperiments) { Map<BioAssaySet, Double> nodeDegrees = geneService.getGeneCoexpressionNodeDegree(queryGene, expressionExperiments); /* * The values are relative ranks based on probes. The ranks are "per data set". We assume these are uniform * under the null, like pvalues. There are a lot of ties, but this really shouldn't matter. I */ if (nodeDegrees == null || nodeDegrees.isEmpty()) return false; GeneCoexpressionNodeDegree n = GeneCoexpressionNodeDegree.Factory.newInstance(); n.setGene(queryGene); n.setNumTests(nodeDegrees.size()); if (usedLinks >= 0) n.setNumLinks(usedLinks); // we populate the rank later. double[] array = ArrayUtils.toPrimitive(nodeDegrees.values().toArray(new Double[] {})); DoubleArrayList vals = new DoubleArrayList(array); /* * Because everything is rank transformed, we use the median and mad. Later we re-rank everything anyway. We * compute a p-value based on Fisher's method (see above about assuming the ranks are uniform under the null). * This is a bit screwey, because we're going to have a lot of really bad p-values. The final distribution of * pvalues is really messed up. But we're going to just use ranks anyway. */ double pvalue = MetaAnalysis.fisherCombinePvalues(vals); n.setPvalue(pvalue); n.setMedian(Descriptive.median(vals)); n.setMedianDeviation(DescriptiveWithMissing.mad(vals)); String distribution = computeDistribution(vals); n.setDistribution(distribution); // so we can compute summary stats at the end. this.allGeneNodeDegrees.put(queryGene, n); return true; } /** * Create a vector representing the datasets which had specific probes for the query and target genes. A '1' means * it did, '0' means it did not. * * @param nonspecificEE * @param eeIdOrder * @return */ private byte[] computeSpecificityVector(Collection<Long> nonspecificEE, Map<Long, Integer> eeIdOrder) { assert nonspecificEE.size() <= eeIdOrder.size(); byte[] result = new byte[(int) Math.ceil(eeIdOrder.size() / (double) Byte.SIZE)]; /* * Start initialized with 0's (might not be necessary...) */ for (int i = 0, j = result.length; i < j; i++) { result[i] = 0x0; } /* * Set the bits we're using to 1. */ for (int i = 0; i < eeIdOrder.size(); i++) { BitUtil.set(result, i); } /* * Set it so 1=specific 0=nonspecific */ for (Long id : nonspecificEE) { BitUtil.clear(result, eeIdOrder.get(id)); } assert BitUtil.count(result) == eeIdOrder.size() - nonspecificEE.size() : "Got " + BitUtil.count(result) + " ones, expected " + (eeIdOrder.size() - nonspecificEE.size()); return result; } /** * Algorithm: * <ol> * <li>Initialize byte array large enough to hold all the EE information (ceil(numeeids /Byte.SIZE)) * <li>Flip the bit at the right location. * </ol> * * @param idsToFlip * @param eeIdOrder * @return */ private byte[] computeSupportingDatasetVector(Collection<Long> idsToFlip, Map<Long, Integer> eeIdOrder) { byte[] supportVector = new byte[(int) Math.ceil(eeIdOrder.size() / (double) Byte.SIZE)]; for (int i = 0, j = supportVector.length; i < j; i++) { supportVector[i] = 0x0; } for (Long id : idsToFlip) { BitUtil.set(supportVector, eeIdOrder.get(id)); } assert BitUtil.count(supportVector) == idsToFlip.size(); return supportVector; } /** * @param expressionExperiments * @param toUseGenes * @return */ private Protocol createProtocol(Collection<? extends BioAssaySet> expressionExperiments, Collection<Gene> toUseGenes) { log.info("Creating protocol object ... "); Protocol protocol = Protocol.Factory.newInstance(); protocol.setName("Stored Gene2GeneCoexpressions"); protocol.setDescription("Using: " + expressionExperiments.size() + " Expression Experiments, " + toUseGenes.size() + " Genes"); protocol = protocolService.findOrCreate(protocol); return protocol; } /** * @param oldAnalyses */ private void disableOldAnalyses(Collection<GeneCoexpressionAnalysis> oldAnalyses) { if (!oldAnalyses.isEmpty()) { for (GeneCoexpressionAnalysis oldAnalysis : oldAnalyses) { log.info("Disabling old analysis: " + oldAnalysis); oldAnalysis.setEnabled(false); geneCoexpressionAnalysisService.update(oldAnalysis); } } } /** * @param expressionExperiments * @param toUseGenes * @param stringency * @param analysis if null, no results will be saved to the database, and will be printed to stdout instead. * @return genes that were processed. */ private Collection<Long> doAnalysis(Collection<? extends BioAssaySet> expressionExperiments, Collection<Gene> toUseGenes, int stringency, GeneCoexpressionAnalysis analysis) { int totalLinks = 0; Collection<Long> processedGenes = new HashSet<Long>(); Map<Long, Integer> eeIdOrder = ProbeLinkCoexpressionAnalyzerImpl.getOrderingMap(expressionExperiments); Map<Long, Gene> genesToAnalyzeMap = EntityUtils.getIdMap(toUseGenes); try { StopWatch timer = new StopWatch(); timer.start(); for (Gene queryGene : genesToAnalyzeMap.values()) { log.info("Starting: " + queryGene); totalLinks += processGene(expressionExperiments, genesToAnalyzeMap, analysis, eeIdOrder, processedGenes, stringency, queryGene); if (timer.getTime() > 10 * 60 * 1000) { timer.reset(); timer.start(); log.info("Processed " + processedGenes.size() + "/" + toUseGenes.size() + " genes so far, " + totalLinks + " links in total"); } } completeNodeDegreeComputations(analysis != null); if (analysis != null) { // All done... analysis.setDescription(analysis.getDescription() + "; " + totalLinks + " gene pairs stored."); analysis.setEnabled(true); geneCoexpressionAnalysisService.update(analysis); securityService.makePublic(analysis); } log.info(totalLinks + " gene pairs processed."); } catch (Exception e) { log.error("There was an error during analysis!"); if (analysis != null) { log.error("Cleaning up if possible ..."); geneCoexpressionAnalysisService.delete(analysis); } throw new RuntimeException(e); } return processedGenes; } /** * Find the old analysis so we can disable it afterwards * * @param taxon * @return */ private Collection<GeneCoexpressionAnalysis> findExistingAnalysis(Taxon taxon) { Collection<? extends Analysis> analysis = null; if (taxon.getIsSpecies()) { analysis = geneCoexpressionAnalysisService.findByTaxon(taxon); } else { analysis = geneCoexpressionAnalysisService.findByParentTaxon(taxon); } Collection<GeneCoexpressionAnalysis> oldAnalyses = new HashSet<GeneCoexpressionAnalysis>(); for (Analysis a : (Collection<? extends Analysis>) analysis) { assert a instanceof GeneCoexpressionAnalysis; oldAnalyses.add((GeneCoexpressionAnalysis) a); } log.info("Found " + oldAnalyses.size() + " old analyses"); return oldAnalyses; } /** * @param co * @return */ private String format(CoexpressedGenePairValueObject co) { StringBuilder buf = new StringBuilder(); buf.append(co.getQueryGene() + "\t"); buf.append(co.getCoexpressedGeneId() + "\t"); buf.append(co.getPositiveLinkSupport() + "\t"); buf.append(co.getNegativeLinkSupport() + "\t"); buf.append(BitUtil.prettyPrint(co.getDatasetsTestedInBytes())); return buf.toString(); } /** * @param toUseGenes * @return * @throw exception if the genes are not from the same taxon or if the analysis is not found. */ private GeneCoexpressionAnalysis getActiveAnalysis(Collection<Gene> toUseGenes) { Taxon t = null; for (Gene g : toUseGenes) { if (t == null) t = g.getTaxon(); else if (!t.equals(g.getTaxon())) { throw new IllegalArgumentException("Genes must all be from one taxon"); } } Collection<GeneCoexpressionAnalysis> analyses = this.findExistingAnalysis(t); if (analyses.isEmpty()) { throw new IllegalStateException(); } GeneCoexpressionAnalysis toUseAnalysis = null; for (GeneCoexpressionAnalysis a : analyses) { if (a.getEnabled()) { toUseAnalysis = a; break; } } if (toUseAnalysis == null) { throw new IllegalStateException("No active analysis found"); } return toUseAnalysis; } /** * @param taxon * @return */ private Gene2GeneCoexpression getNewGGCOInstance(Taxon taxon) { Gene2GeneCoexpression g2gCoexpression; if (taxon.getCommonName().equalsIgnoreCase("mouse")) g2gCoexpression = MouseGeneCoExpression.Factory.newInstance(); else if (taxon.getCommonName().equalsIgnoreCase("rat")) g2gCoexpression = RatGeneCoExpression.Factory.newInstance(); else if (taxon.getCommonName().equalsIgnoreCase("human")) g2gCoexpression = HumanGeneCoExpression.Factory.newInstance(); else g2gCoexpression = OtherGeneCoExpression.Factory.newInstance(); return g2gCoexpression; } /** * @param eeIdOrder map for creating bit vector data. * @param firstGene the query gene * @param toPersist results to persist * @param analysis Analysis object to associate with links * @param genesToAnalyze list of genes to limit analysis for (links to other genes are ignored) * @param alreadyPersisted to track genes we've already done, so we don't persist links in both directions * @param stringency minimum support to store a link. * @return */ private List<Gene2GeneCoexpression> persistCoexpressions(Map<Long, Integer> eeIdOrder, Gene firstGene, QueryGeneCoexpression toPersist, GeneCoexpressionAnalysis analysis, final Map<Long, Gene> genesToAnalyze, final Collection<Long> alreadyPersisted, int stringency) { assert analysis != null; Taxon taxon = firstGene.getTaxon(); List<Gene2GeneCoexpression> all = new ArrayList<Gene2GeneCoexpression>(); Collection<Gene2GeneCoexpression> batch = new ArrayList<Gene2GeneCoexpression>(); for (CoexpressedGenePairValueObject co : toPersist.getAllGeneCoexpressionData(stringency)) { if (!genesToAnalyze.containsKey(co.getCoexpressedGeneId())) { if (log.isDebugEnabled()) log.debug("coexpressed Gene " + co.getCoexpressedGeneId() + " is not among the genes selected for analysis, so it will be skipped (while analyzing " + firstGene.getOfficialSymbol() + ")"); continue; } Gene secondGene = genesToAnalyze.get(co.getCoexpressedGeneId()); // note we just check the id to avoid any problems with thaw etc. if (secondGene.getId().equals(firstGene.getId())) { /* * This check is 'just in case'; they should have been removed earlier. These can leak in because we 1) * there can be two probes for the same genes that are coexpressed and 2) we allow them in initial query * results */ continue; } /* * If we store the links in both directions, only one query is needed for retrieval. */ if (!SINGLE_QUERY_FOR_LINKS && alreadyPersisted.contains(secondGene.getId())) continue; if (log.isDebugEnabled()) log.debug(firstGene.getName() + " link to " + secondGene.getName()); byte[] testedInVector = co.getDatasetsTestedInBytes(); assert testedInVector != null; byte[] specificityVector = computeSpecificityVector(co.getNonspecificEE(), eeIdOrder); assert specificityVector != null; /* * Note that we are storing the 'raw' link support, which includes 'non-specific' probes. These can be * filtered later. */ if (co.getNegativeLinkSupport() >= stringency) { Gene2GeneCoexpression g2gCoexpression = getNewGGCOInstance(taxon); g2gCoexpression.setDatasetsTestedVector(testedInVector); g2gCoexpression.setSpecificityVector(specificityVector); g2gCoexpression.setSourceAnalysis(analysis); g2gCoexpression.setFirstGene(firstGene); g2gCoexpression.setSecondGene(secondGene); Collection<Long> contributing2NegativeLinks = co.getEEContributing2NegativeLinks(); assert contributing2NegativeLinks.size() == co.getNegativeLinkSupport(); byte[] supportVector = computeSupportingDatasetVector(contributing2NegativeLinks, eeIdOrder); g2gCoexpression.setNumDataSets(co.getNegativeLinkSupport()); g2gCoexpression.setEffect(co.getNegativeScore()); g2gCoexpression.setDatasetsSupportingVector(supportVector); batch.add(g2gCoexpression); if (batch.size() == BATCH_SIZE) { all.addAll(this.gene2GeneCoexpressionService.create(batch)); batch.clear(); } } if (co.getPositiveLinkSupport() >= stringency) { Gene2GeneCoexpression g2gCoexpression = getNewGGCOInstance(taxon); g2gCoexpression.setDatasetsTestedVector(testedInVector); g2gCoexpression.setSpecificityVector(specificityVector); g2gCoexpression.setSourceAnalysis(analysis); g2gCoexpression.setFirstGene(firstGene); g2gCoexpression.setSecondGene(secondGene); Collection<Long> contributing2PositiveLinks = co.getEEContributing2PositiveLinks(); assert contributing2PositiveLinks.size() == co.getPositiveLinkSupport(); byte[] supportVector = computeSupportingDatasetVector(contributing2PositiveLinks, eeIdOrder); g2gCoexpression.setNumDataSets(co.getPositiveLinkSupport()); g2gCoexpression.setEffect(co.getPositiveScore()); g2gCoexpression.setDatasetsSupportingVector(supportVector); batch.add(g2gCoexpression); if (batch.size() == BATCH_SIZE) { all.addAll(this.gene2GeneCoexpressionService.create(batch)); batch.clear(); } } if (log.isDebugEnabled()) log.debug("Persisted: " + firstGene.getOfficialSymbol() + " --> " + secondGene.getOfficialSymbol() + " ( " + co.getNegativeScore() + " , +" + co.getPositiveScore() + " )"); } // bit at the end. if (batch.size() > 0) { all.addAll(this.gene2GeneCoexpressionService.create(batch)); batch.clear(); } if (log.isDebugEnabled() && all.size() > 0) { log.debug("Persisted " + all.size() + " links for " + firstGene.getOfficialSymbol() + " " + firstGene.getOfficialName()); } return all; } /** * @param expressionExperiments * @param genesToAnalyzeMap * @param analysis * @param eeIdOrder * @param processedGenes * @param stringency * @param totalLinks * @param queryGene * @return */ private int processGene(Collection<? extends BioAssaySet> expressionExperiments, Map<Long, Gene> genesToAnalyzeMap, GeneCoexpressionAnalysis analysis, Map<Long, Integer> eeIdOrder, Collection<Long> processedGenes, int stringency, Gene queryGene) { // Do it QueryGeneCoexpression coexpressions = probeLinkCoexpressionAnalyzer.linkAnalysis(queryGene, expressionExperiments, stringency, 0); // persist it or write out StopWatch timer = new StopWatch(); timer.start(); int usedLinks = 0; if (analysis != null) { List<Gene2GeneCoexpression> created = persistCoexpressions(eeIdOrder, queryGene, coexpressions, analysis, genesToAnalyzeMap, processedGenes, stringency); usedLinks = created.size(); timer.stop(); if (timer.getTime() > 2000) { log.info("Persist " + usedLinks + " links: " + timer.getTime() + "ms"); } } else { List<CoexpressedGenePairValueObject> coexps = coexpressions.getAllGeneCoexpressionData(stringency); for (CoexpressedGenePairValueObject co : coexps) { if (!genesToAnalyzeMap.containsKey(co.getCoexpressedGeneId())) { continue; } Gene secondGene = genesToAnalyzeMap.get(co.getCoexpressedGeneId()); if (processedGenes.contains(secondGene)) { continue; } usedLinks++; System.out.println(format(co)); } if (usedLinks > 0) log.info(usedLinks + " links printed for " + queryGene.getOfficialSymbol() + " (" + timer.getTime() + "ms)"); } computeNodeDegree(queryGene, usedLinks, expressionExperiments); processedGenes.add(queryGene.getId()); coexpressions.finalize(); coexpressions = null; return usedLinks; } }