// NOTE(review): removed stray non-Java text ("Java tutorial") that preceded the license header and broke compilation.
/* * The Gemma project. * * Copyright (c) 2006-2008 University of British Columbia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package ubic.gemma.persistence.service.association.coexpression; import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.time.StopWatch; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.hibernate.*; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.orm.hibernate3.support.HibernateDaoSupport; import org.springframework.stereotype.Repository; import org.springframework.transaction.annotation.Transactional; import ubic.basecode.dataStructure.CountingMap; import ubic.basecode.io.ByteArrayConverter; import ubic.basecode.util.BatchIterator; import ubic.gemma.model.analysis.expression.coexpression.GeneCoexpressedGenes; import ubic.gemma.model.analysis.expression.coexpression.GeneCoexpressionTestedIn; import ubic.gemma.model.analysis.expression.coexpression.IdArrayValueObject; import ubic.gemma.model.analysis.expression.coexpression.SupportDetails; import ubic.gemma.model.association.coexpression.ExperimentCoexpressionLink; import ubic.gemma.model.association.coexpression.Gene2GeneCoexpression; import ubic.gemma.model.association.coexpression.GeneCoexpressionNodeDegree; import ubic.gemma.model.association.coexpression.GeneCoexpressionNodeDegreeValueObject; import 
ubic.gemma.model.expression.experiment.BioAssaySet;

import ubic.gemma.model.genome.Gene;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.persistence.util.EntityUtils;

import java.math.BigInteger;
import java.util.*;

/**
 * Manages and queries coexpression 'links' between genes.
 *
 * @author klc
 * @author paul
 * @see ubic.gemma.model.association.coexpression.Gene2GeneCoexpression
 */
@SuppressWarnings("unchecked")
@Repository
public class CoexpressionDaoImpl extends HibernateDaoSupport implements CoexpressionDao {

    /*
     * Important implementation note: For efficiency reasons, it is important that gene-level links be stored clustered
     * by gene in the database: so all links with gene=x are clustered. (the actual order of x1 vs x2 doesn't matter,
     * just so long they are clustered). This makes retrievals much faster for the most common types of queries.
     */

    /**
     * When links drop to support zero, should we remove them? Or leave them under the assumption they might get put
     * back again. Keeping them will help reduce fragmentation. Changing this setting without wiping the database could
     * be problematic...
     */
    private static final boolean DELETE_ORPHAN_LINKS = false;

    /**
     * If there are more datasets than this specified, use a gene-specific query unless it's too few genes specified. If
     * there are just a few genes it's going to be faster to just get the data for the genes and filter by dataset. But
     * for a large number of genes in a small number of datasets, going to the datasets first will be faster. Note that
     * this really depends on how many data sets are available.
     */
    private static final int MAX_DATASETS_FOR_DATASET_FIRST_QUERY = 50;

    /**
     * If no genes are specified, find results common to the given data sets, but only if there aren't too many data
     * sets.
     */
    private static final int MAX_DATASETS_FOR_DATASET_ONLY_QUERY = 20;

    /**
     * If there are fewer genes than this specified, use an experiment-specific query if there aren't too many datasets
     * specified.
If it's just a few genes, it's always going to be faster to just get the data for the genes and filter
     * by dataset.
     */
    private static final int MIN_GENES_FOR_DATASET_FIRST_QUERY = 10;

    // batch sizes for bulk fetch/flush operations; see also the local BATCH_SIZE values in the write paths.
    private static final int BATCH_SIZE = 2048;

    private static final int BATCH_SIZE_SMALL = 8;

    private static final Log log = LogFactory.getLog(CoexpressionDaoImpl.class);

    /**
     * If the stringency is less than this, we will usually want to use a dataset-first query unless the number of
     * datasets is quite large. Note that this setting should depend on how many datasets are in the system in the first
     * place, and is thus species-specific. So this is just a temporary measure.
     */
    // private static final int MIN_STRINGENCY_FOR_GENE_FIRST_QUERY = 6;

    @Autowired
    private CoexpressionCache gene2GeneCoexpressionCache;

    @Autowired
    private GeneTestedInCache geneTestedInCache;

    @Autowired
    public CoexpressionDaoImpl(SessionFactory sessionFactory) {
        super.setSessionFactory(sessionFactory);
    }

    /**
     * Count gene-level links for the given gene in the given experiment, via the per-taxon experiment-link table.
     */
    @Override
    public Integer countLinks(Gene gene, BioAssaySet ee) {
        // Looking at the first gene is enough if we save the flipped versions; we don't get a double-count here because
        // of the constraint on the first gene.
        Session sess = this.getSessionFactory().getCurrentSession();
        Query q = sess.createSQLQuery(
                "select count(*) from " + CoexpressionQueryUtils.getExperimentLinkTableName(gene.getTaxon())
                        + " e where e.EXPERIMENT_FK=:ee and e.GENE1_FK=:g ");
        q.setParameter("ee", ee.getId()).setParameter("g", gene.getId());
        return ((BigInteger) q.uniqueResult()).intValue();
    }

    /*
     * Errors here will be big trouble, leading to corrupt data. It has to be all one transaction.
* */
    /**
     * Create (or add support to) gene-level links for ALL the links of one experiment, plus the corresponding
     * experiment-level links and flipped (gene2->gene1) versions.
     * <p>
     * NOTE(review): despite the "must be one transaction" comment above, there is no @Transactional here — presumably
     * it is declared on the interface or a caller; confirm before relying on it.
     *
     * @param bioAssaySet the experiment the links come from; must not already have links stored
     * @param links       ALL links for this experiment (sorted in place here)
     * @param c           factory for link/support-details entities
     * @param genesTested genes assayed in this experiment (may be null; used to update 'tested-in' info)
     */
    @Override
    public void createOrUpdate(BioAssaySet bioAssaySet, List<NonPersistentNonOrderedCoexpLink> links, LinkCreator c,
            Set<Gene> genesTested) {

        // assumption is that these are _all_ the links for this experiment
        assert !links.isEmpty();
        assert bioAssaySet != null;
        assert c != null;

        Collections.sort(links);

        Session sess = this.getSessionFactory().getCurrentSession();
        sess.setCacheMode(CacheMode.IGNORE);

        // to determine the species
        Gene gene = (Gene) sess.get(Gene.class, links.iterator().next().getFirstGene());
        String geneLinkClassName = CoexpressionQueryUtils.getGeneLinkClassName(gene);

        /*
         * Check that there are no links for this experiment.
         */
        if (this.countLinks(gene.getTaxon(), bioAssaySet) > 0) {
            throw new IllegalStateException(
                    "There are already links for given bioAssaySet; they must be deleted before proceeding");
        }

        /*
         * Attempt to save database trips
         */
        Map<NonPersistentNonOrderedCoexpLink, Boolean> existingResults = this.preFetch(links);

        String s = "from " + geneLinkClassName + " where firstGene =:f and secondGene=:s and positiveCorrelation=:pc";
        Query q = sess.createQuery(s);

        SQLQuery updateFlippedLinkQuery = sess
                .createSQLQuery("UPDATE " + CoexpressionQueryUtils.getGeneLinkTableName(gene.getTaxon())
                        + " SET SUPPORT=:s WHERE FIRST_GENE_FK=:g2 AND SECOND_GENE_FK=:g1 AND POSITIVE=:po");

        // map of linkid to links, for establishing the EE-level links.
        // keep order so for this experiment they are in order.
        TreeMap<Long, NonPersistentNonOrderedCoexpLink> linkIds = new TreeMap<>();

        Set<Long> seenExistingLinks = new HashSet<>(); // for sanity checks.
        Set<NonPersistentNonOrderedCoexpLink> seenNewLinks = new HashSet<>(); // for sanity checks.
        Set<SupportDetails> seenNewSupportDetails = new HashSet<>(); // for sanity checks.
        int numNew = 0;
        int numUpdated = 0;
        int progress = 0;
        int BATCH_SIZE = 1024; // make a multiple of jdbc batch size...

        Map<SupportDetails, Gene2GeneCoexpression> batchToCreate = new LinkedHashMap<>();
        List<Gene2GeneCoexpression> newFlippedLinks = new ArrayList<>();
        Set<Long> genesWithUpdatedData = new HashSet<>();

        sess.flush();
        sess.clear();

        // for each link see if there is already an entry; make a new one if necessary or update the old one.
        CoexpressionDaoImpl.log.info("Starting link processing");
        for (NonPersistentNonOrderedCoexpLink proposedG2G : links) {

            Long firstGene = proposedG2G.getFirstGene();
            Long secondGene = proposedG2G.getSecondGene();

            // There is an index for f+s, but querying one-at-a-time is going to be slow. I attempted to speed it up by
            // fetching all links for a gene when we see it, but this causes problems with data being stale. Prefetching
            // with just the ability to tell if a link is new or not takes a lot of memory and doesn't speed things up
            // much. Trying keeping an index of which links a gene has, so we know whether we need to check the database
            // or not.
            //
            // Currently it takes about 1 minute to process 10k links on a relatively small database, much of this is
            // the findLink call.
            Gene2GeneCoexpression existingLink = this.findLink(q, proposedG2G, existingResults);

            /*
             * To speed this up?
             *
             * - Fetch all links for a gene in one batch, instead of looping over them one at a time. The problem is the
             * flipped links involve other genes that we fetch later in the same transaction, and this all has to be
             * done in one transaction. I experimented with this already
             */

            if (existingLink == null) {
                // initialize the supportdetails
                SupportDetails sd = c.createSupportDetails(firstGene, secondGene,
                        proposedG2G.isPositiveCorrelation());
                sd.addEntity(bioAssaySet.getId());

                assert sd.getNumIds() > 0;
                assert sd.isIncluded(bioAssaySet.getId());

                // Must be unique
                assert !seenNewSupportDetails.contains(sd) : "Already saw " + sd + " while processing " + proposedG2G;

                assert proposedG2G.getLink() != null;
                batchToCreate.put(sd, proposedG2G.getLink());

                if (seenNewLinks.contains(proposedG2G)) {
                    CoexpressionDaoImpl.log.warn(
                            "The data passed had the same new link represented more than once: " + proposedG2G);
                    continue;
                }

                seenNewSupportDetails.add(sd);
                seenNewLinks.add(proposedG2G);

                if (CoexpressionDaoImpl.log.isDebugEnabled())
                    CoexpressionDaoImpl.log.debug("New: " + proposedG2G);
                numNew++;
            } else {
                // This code assumes that the flipped version is in the database, but we don't retrieve it
                // yet. also note that the support of the existing link could be zero, if DELETE_ORPHAN_LINKS = false
                // (or if initializeLinksFromExistingData was used)

                // Sanity check. If this happens, there must be two versions of the same link already in the input.
                if (seenExistingLinks.contains(existingLink.getId())) {
                    throw new IllegalStateException(
                            "The data passed had the same existing link represented more than once: " + existingLink);
                }

                /* sanity check that we aren't adding dataset twice; we might be able make this an assertion instead. */
                if (existingLink.isSupportedBy(bioAssaySet)) {
                    throw new IllegalStateException("Support for this experiment already exists for " + existingLink
                            + ", must be deleted first");
                }

                // cache old support for sanity check
                int oldSupport = existingLink.getSupportDetails().getNumIds();

                // update the support
                existingLink.getSupportDetails().addEntity(bioAssaySet.getId());
                existingLink.updateNumDatasetsSupporting();

                // there is no cascade... on purpose.
                sess.update(existingLink.getSupportDetails());

                assert oldSupport + 1 == existingLink.getNumDatasetsSupporting();
                assert existingLink.getSupportDetails().getNumIds() == oldSupport + 1;

                // track so we add corresponding Experiment-level links later.
                linkIds.put(existingLink.getId(), new NonPersistentNonOrderedCoexpLink(existingLink));
                seenExistingLinks.add(existingLink.getId());

                /*
                 * The flipped link is asserted to be in the database. The support details is already dealt with; we
                 * just have to update the support value.
                 */
                int numFlippedUpdated = updateFlippedLinkQuery
                        .setParameter("s", existingLink.getNumDatasetsSupporting())
                        .setParameter("g2", proposedG2G.getSecondGene())
                        .setParameter("g1", proposedG2G.getFirstGene())
                        .setParameter("po", proposedG2G.isPositiveCorrelation() ? 1 : 0).executeUpdate();
                assert numFlippedUpdated == 1 : "Flipped link missing for " + proposedG2G + " [" + numFlippedUpdated
                        + "]";

                numUpdated++;
                if (CoexpressionDaoImpl.log.isDebugEnabled())
                    CoexpressionDaoImpl.log.debug("Updated: " + proposedG2G);
            }

            genesWithUpdatedData.add(firstGene);
            genesWithUpdatedData.add(secondGene);

            if (++progress % 5000 == 0) {
                CoexpressionDaoImpl.log.info("Processed " + progress + "/" + links.size() + " gene-level links..."
                        + numUpdated + " updated, " + numNew + " new");
            }

            if (batchToCreate.size() >= BATCH_SIZE) {
                newFlippedLinks.addAll(this.saveBatchAndMakeFlipped(sess, linkIds, batchToCreate, c));
            } else if (numUpdated > 0 && numUpdated % BATCH_SIZE == 0) {
                sess.flush();
                sess.clear();
            }

        } // loop over links

        // tail end batch
        if (!batchToCreate.isEmpty()) {
            // we make the flipped links later to optimize their ordering.
            newFlippedLinks.addAll(this.saveBatchAndMakeFlipped(sess, linkIds, batchToCreate, c));
        }

        // flush the updated ones one last time...
        if (numUpdated > 0) {
            sess.flush();
            sess.clear();
        }

        assert links.size() == linkIds.size();

        CoexpressionDaoImpl.log.info(numUpdated + " updated, " + numNew + " new links");

        /*
         * sort and save the accumulated new flipped versions of the new links, which reuse the supportDetails. In the
         * flipped links, the first gene is the second gene and vice versa. Continue to accumulate the flipped links.
         */
        CoexpressionDaoImpl.log.info("Saving " + newFlippedLinks.size() + " flipped versions of new links ...");
        Collections.sort(newFlippedLinks, new Comparator<Gene2GeneCoexpression>() {
            @Override
            public int compare(Gene2GeneCoexpression o1, Gene2GeneCoexpression o2) {
                return o1.getFirstGene().compareTo(o2.getFirstGene());
            }
        });

        progress = 0;
        for (Gene2GeneCoexpression gl : newFlippedLinks) {
            sess.save(gl);
            if (++progress % 5000 == 0) {
                CoexpressionDaoImpl.log.info("Processed " + progress + "/" + newFlippedLinks.size()
                        + " new flipped gene-level links...");
            }
            if (progress % BATCH_SIZE == 0) {
                sess.flush();
                sess.clear();
            }
        }

        /*
         * Save experiment-level links
         */
        CoexpressionDaoImpl.log
                .info("Saving " + linkIds.size() + " experiment-level links (plus flipped versions) ...");
        this.saveExperimentLevelLinks(sess, c, linkIds, bioAssaySet);

        if (genesTested != null)
            this.updatedTestedIn(bioAssaySet, genesTested);

        this.updateGeneCoexpressedWith(links);

        // kick anything we updated out of the cache.
        int numRemovedFromCache = this.gene2GeneCoexpressionCache.remove(genesWithUpdatedData);
        if (numRemovedFromCache > 0)
            CoexpressionDaoImpl.log.info(numRemovedFromCache + " results evicted from cache");

        // flush happens on commit...
        CoexpressionDaoImpl.log.info("Done, flushing changes ...");
    }

    /*
     * Errors here will be big trouble, leading to corrupt data. It has to be all one transaction.
* */
    /**
     * Remove all support contributed by one experiment: decrement gene-level link support (deleting orphans only if
     * DELETE_ORPHAN_LINKS), delete the experiment-level links, and invalidate affected cache entries and 'tested-in'
     * information.
     */
    @Override
    @Transactional
    public void deleteLinks(Taxon t, BioAssaySet experiment) {
        Session sess = this.getSessionFactory().getCurrentSession();
        sess.setCacheMode(CacheMode.IGNORE);

        CoexpressionDaoImpl.log.info("Fetching any old coexpression ...");
        Collection<Gene2GeneCoexpression> links = this.getCoexpression(t, experiment);

        Set<NonPersistentNonOrderedCoexpLink> toRemove = new HashSet<>();

        // even if there are no links, we shouldn't assume we can bail; the 'tested-in' information might be there.
        if (!links.isEmpty()) {
            CoexpressionDaoImpl.log.info("Removing coexpression information for " + experiment + "; updating "
                    + links.size() + " links (count includes flipped versions).");

            // adjust gene-level links
            int count = 0;
            int numWithZeroSupportLeft = 0;
            int BATCH_SIZE = 1024;
            Collection<SupportDetails> supportDetailsToDelete = new HashSet<>();
            Collection<SupportDetails> supportDetailsToUpdate = new HashSet<>();

            Collection<Long> genesAffected = new HashSet<>();

            for (Gene2GeneCoexpression g2g : links) {
                genesAffected.add(g2g.getFirstGene());
                genesAffected.add(g2g.getSecondGene());

                // decrement support; details are shared by both links, just update it once!
                SupportDetails sd = g2g.getSupportDetails();
                if (!supportDetailsToUpdate.contains(sd) && !supportDetailsToDelete.contains(sd)) {
                    /*
                     * If we already saw the supportDetails it might already be zero. But if we didn't, it can't.
                     */
                    assert g2g.getNumDatasetsSupporting() > 0 : "Support was " + g2g.getNumDatasetsSupporting()
                            + " for " + g2g;

                    sd.removeEntity(experiment.getId());
                    assert !sd.getIds().contains(experiment.getId());
                    supportDetailsToUpdate.add(sd);
                }

                g2g.updateNumDatasetsSupporting();
                assert g2g.getNumDatasetsSupporting() >= 0;

                if (g2g.getNumDatasetsSupporting() == 0) {
                    /*
                     * we might still want to keep it, on the presumption that it will get filled back in.
                     */
                    if (CoexpressionDaoImpl.DELETE_ORPHAN_LINKS) {
                        sess.delete(g2g);
                        // it might be in here already (flipped), but that's okay.
                        supportDetailsToDelete.add(sd);
                        // from the quickindex. But leave it there otherwise.
                        toRemove.add(new NonPersistentNonOrderedCoexpLink(g2g));
                    } else {
                        sess.update(g2g);
                    }
                    numWithZeroSupportLeft++;
                } else {
                    sess.update(g2g);
                }

                if (++count % 10000 == 0) {
                    CoexpressionDaoImpl.log
                            .info("Removed support for " + count + " links for " + experiment + "...");
                }
                if (count % BATCH_SIZE == 0) {
                    sess.flush();
                    sess.clear();
                }
            }

            sess.flush();
            sess.clear();

            this.updateModifiedSupportDetails(experiment, supportDetailsToDelete, supportDetailsToUpdate);

            if (CoexpressionDaoImpl.DELETE_ORPHAN_LINKS) {
                CoexpressionDaoImpl.log.info("Adjusted " + links.size()
                        + " gene-level links supported by the experiment; " + numWithZeroSupportLeft
                        + " links removed from the system as support dropped to zero.");
            } else {
                CoexpressionDaoImpl.log.info("Adjusted " + links.size()
                        + " gene-level links supported by the experiment; " + numWithZeroSupportLeft
                        + " gene-level links now have support dropped to zero but they were left in place");
            }

            // remove the ExperimentCoexpressionLinks
            int numDeleted = sess.createQuery(
                    "delete from " + CoexpressionQueryUtils.getExperimentLinkClassName(t) + " where experiment=:ee")
                    .setParameter("ee", experiment).executeUpdate();
            CoexpressionDaoImpl.log.info("Deleted " + numDeleted + " experiment-level links");

            // invalidate the cache.
            int numRemovedFromCache = gene2GeneCoexpressionCache.remove(genesAffected);
            if (numRemovedFromCache > 0)
                CoexpressionDaoImpl.log.info(numRemovedFromCache + " results evicted from cache");
        }

        // we do NOT redo the node degree information, which will be refreshed "periodically"

        // we always have to do this, even if there are no links.
        this.removeTestedIn(t, experiment);

        // update our quick index
        if (!toRemove.isEmpty())
            this.removeCoexpressedWith(toRemove);
    }

    @Override
    @Transactional(readOnly = true)
    public List<CoexpressionValueObject> findCoexpressionRelationships(Gene gene, Collection<Long> bas, int maxResults,
            boolean quick) {

        assert !bas.isEmpty();
        assert gene != null;
        assert maxResults >= 0;

        Collection<Long> g = new HashSet<>();
        g.add(gene.getId());

        assert gene.getTaxon() != null;
        // stringency = bas.size(): results must be supported by all the given data sets.
        Map<Long, List<CoexpressionValueObject>> r = this.getCoexpressionFromCacheOrDb(gene.getTaxon(), g, bas,
                bas.size(), maxResults, quick);

        if (r == null || !r.containsKey(gene.getId()))
            return new ArrayList<>();

        return r.get(gene.getId());
    }

    @Override
    @Transactional(readOnly = true)
    public Map<Long, List<CoexpressionValueObject>> findCoexpressionRelationships(Taxon taxon, Collection<Long> genes,
            Collection<Long> bas, int maxResults, boolean quick) {

        assert !bas.isEmpty();
        assert !genes.isEmpty();
        assert maxResults >= 0;

        Map<Long, List<CoexpressionValueObject>> rrr = this.getCoexpressionFromCacheOrDb(taxon, genes, bas,
                bas.size(), maxResults, quick);

        int total = 0;
        for (Long g : rrr.keySet()) {
            total += rrr.get(g).size();
        }
        CoexpressionDaoImpl.log.info("Found " + total + " coexpression links for " + genes.size() + " genes in "
                + bas.size() + " datasets.");

        return rrr;
    }

    @Override
    @Transactional(readOnly = true)
    public Map<Long, List<CoexpressionValueObject>> findCoexpressionRelationships(Taxon t, Collection<Long> genes,
            Collection<Long> bas, int stringency, int maxResults, boolean quick) {

        assert !bas.isEmpty();
        assert genes != null;
        assert maxResults >= 0; // maxResults is ignored if it is a "my genes only" query.
        assert stringency >= 1;

        Map<Long, List<CoexpressionValueObject>> rrr = this.getCoexpressionFromCacheOrDb(t, genes, bas, stringency,
                maxResults, quick);

        // DEBUG code; count up for logging only.
        int totalLinks = 0;
        for (List<CoexpressionValueObject> list : rrr.values()) {
            totalLinks += list.size();
        }
        if (genes.size() > 1 && totalLinks > 0)
            CoexpressionDaoImpl.log.info("Found " + totalLinks + " coexpression links in total for " + genes.size()
                    + " genes in " + bas.size() + " datasets at stringency=" + stringency + " quick=" + quick
                    + " maxresults per gene=" + maxResults);
        // end debug code

        return rrr;
    }

    @Override
    @Transactional(readOnly = true)
    public Map<Long, List<CoexpressionValueObject>> findInterCoexpressionRelationships(Taxon taxon,
            Collection<Long> genes, Collection<Long> bas, int stringency, boolean quick) {

        assert !bas.isEmpty();
        assert !genes.isEmpty();

        if (bas.size() < stringency) {
            throw new IllegalArgumentException("Stringency is larger than the number of data sets");
        }

        Map<Long, List<CoexpressionValueObject>> results = new HashMap<>();
        Collection<Long> genesNeeded = this.checkCacheForInterGeneLinks(genes, results, stringency);

        if (!genesNeeded.isEmpty()) { // something wasn't in the cache.
            Map<Long, List<CoexpressionValueObject>> dbResults;
            if (bas.size() > CoexpressionDaoImpl.MAX_DATASETS_FOR_DATASET_FIRST_QUERY
                    || genes.size() < CoexpressionDaoImpl.MIN_GENES_FOR_DATASET_FIRST_QUERY) {
                dbResults = this.getInterCoexpressionFromDbViaGenes(taxon, genes, stringency, quick);
            } else {
                dbResults = this.getInterCoexpressionFromDbViaExperiments(taxon, genes, bas, quick);
            }

            /*
             * We can't cache this because it was done with a constraint on the 'found' gene. But it might get added to
             * the queue for cache warm.
             */
            results.putAll(dbResults);
        }

        this.trimAndFinishResults(results, bas, stringency, 0);

        return results;
    }

    @Override
    @Transactional
    public GeneCoexpressionNodeDegreeValueObject updateNodeDegree(Gene g, GeneCoexpressionNodeDegree nd) {
        Session sess = this.getSessionFactory().getCurrentSession();

        List<CoexpressionValueObject> hits = this.getCoexpression(g);

        /*
         * We have to reset the support.
*/
        GeneCoexpressionNodeDegreeValueObject gcndvo = new GeneCoexpressionNodeDegreeValueObject(nd);
        gcndvo.clear();

        assert gcndvo.getMaxSupportNeg() == 0;

        // rebuild the positive/negative support-count histograms from the current links.
        for (CoexpressionValueObject hit : hits) {
            if (hit.isPositiveCorrelation()) {
                gcndvo.increment(hit.getNumDatasetsSupporting(), true);
            } else {
                gcndvo.increment(hit.getNumDatasetsSupporting(), false);
            }
        }

        assert gcndvo.total() == hits.size();

        GeneCoexpressionNodeDegree entity = gcndvo.toEntity();
        nd.setLinkCountsPositive(entity.getLinkCountsPositive());
        nd.setLinkCountsNegative(entity.getLinkCountsNegative());

        if (CoexpressionDaoImpl.log.isDebugEnabled())
            CoexpressionDaoImpl.log.debug("gene=" + g.getId() + " pos="
                    + StringUtils.join(ArrayUtils.toObject(nd.getLinkCountsPositive()), " ") + " neg="
                    + StringUtils.join(ArrayUtils.toObject(nd.getLinkCountsNegative()), " "));

        sess.update(nd);

        // might not be necessary, but presumption is data is stale now...
        this.gene2GeneCoexpressionCache.remove(g.getId());
        this.geneTestedInCache.remove(g.getId());

        return gcndvo;
    }

    /**
     * Fetch all gene-level links for one experiment (both orientations), via the experiment-level link table.
     */
    @Override
    @Transactional(readOnly = true)
    public Collection<CoexpressionValueObject> getCoexpression(Taxon taxon, BioAssaySet experiment, boolean quick) {
        Session sess = this.getSessionFactory().getCurrentSession();

        // could just fetch linkId.
        Query q = sess.createQuery(
                " from " + CoexpressionQueryUtils.getExperimentLinkClassName(taxon) + " where experiment=:ee");
        q.setParameter("ee", experiment);
        List<ExperimentCoexpressionLink> links = q.list();

        Collection<CoexpressionValueObject> results = new HashSet<>();
        if (links.isEmpty()) {
            return results;
        }

        List<Long> linksToFetch = new ArrayList<>();
        for (ExperimentCoexpressionLink link : links) {
            linksToFetch.add(link.getLinkId());
        }

        String q2 = "from " + CoexpressionQueryUtils.getGeneLinkClassName(taxon) + " where id in (:ids)";
        // fetch the gene-level links in batches to keep the IN clause bounded.
        BatchIterator<Long> it = BatchIterator.batches(linksToFetch, 1000);
        for (; it.hasNext();) {
            List<Gene2GeneCoexpression> rawResults = sess.createQuery(q2).setParameterList("ids", it.next()).list();
            for (Gene2GeneCoexpression g2g : rawResults) {
                CoexpressionValueObject g2gvo = new CoexpressionValueObject(g2g);
                results.add(g2gvo);
            }
        }

        if (!quick) {
            this.populateTestedInDetails(results);
        }

        return results;
    }

    /**
     * Warm the cache for one gene; returns the number of links cached, 0 if none, -1 if already cached.
     */
    @Override
    @Transactional(readOnly = true)
    public int queryAndCache(Gene gene) {

        if (gene2GeneCoexpressionCache.get(gene.getId()) != null) {
            // already in the cache.
            return -1;
        }

        CoexpressionDaoImpl.log.debug("Fetching data for gene=" + gene.getId() + " for cache");
        Collection<Long> gg = new HashSet<>();
        gg.add(gene.getId());

        // Map<Long, List<CoexpressionValueObject>> rr = getCoexpressionFromDbViaGenes( gg,
        // CoexpressionQueryUtils.getGeneLinkClassName( gene ), CoexpressionCache.CACHE_QUERY_STRINGENCY, true,
        // true );
        Map<Long, List<CoexpressionValueObject>> rr = this.getCoexpressionFromDbViaGenes2(gg, gene.getTaxon(),
                CoexpressionCache.CACHE_QUERY_STRINGENCY, true);

        List<CoexpressionValueObject> results = rr.get(gene.getId());

        if (results == null || results.isEmpty()) {
            // it is necessary to avoid searching again when there are no results.
gene2GeneCoexpressionCache.cacheCoexpression(gene.getId(), new ArrayList<CoexpressionValueObject>());
            return 0;
        }

        gene2GeneCoexpressionCache.cacheCoexpression(gene.getId(), results);
        return results.size();
    }

    /*
     * This assumes that we're going to do this for all genes, so we get links in both directions eventually. We don't
     * have to explicitly make the flipped links here.
     */
    @Override
    public Map<SupportDetails, Gene2GeneCoexpression> initializeFromOldData(Gene gene, Map<Long, Gene> geneIdMap,
            Map<NonPersistentNonOrderedCoexpLink, SupportDetails> linksSoFar, Set<Long> skipGenes) {

        Session sess = this.getSessionFactory().getCurrentSession();
        LinkCreator c = new LinkCreator(gene.getTaxon());
        String geneLinkTableName = CoexpressionQueryUtils.getGeneLinkTableName(gene.getTaxon());
        // the legacy table has the same name but with "CO_EX" instead of "COEX".
        String oldGeneLinkTableName = geneLinkTableName.replace("COEX", "CO_EX");
        assert oldGeneLinkTableName.contains("CO_EX");

        int BATCH_SIZE = 1024;

        /*
         * Query the old table
         */
        SQLQuery oldLinkQuery = sess.createSQLQuery("select FIRST_GENE_FK, SECOND_GENE_FK, EFFECT from "
                + oldGeneLinkTableName + " where FIRST_GENE_FK=?");
        List<Object[]> oldLinks = oldLinkQuery.setLong(0, gene.getId()).list();

        if (oldLinks.isEmpty()) {
            return null;
        }

        Map<SupportDetails, Gene2GeneCoexpression> linksToSave = new LinkedHashMap<>();

        /*
         * Make new links.
         */
        Collection<NonPersistentNonOrderedCoexpLink> links = new HashSet<>();
        int i = 0;
        for (Object[] o : oldLinks) {
            Long fgid = ((BigInteger) o[0]).longValue();
            Long sgid = ((BigInteger) o[1]).longValue();

            if (skipGenes != null && (skipGenes.contains(fgid) || skipGenes.contains(sgid))) {
                continue;
            }

            Double eff = (Double) o[2];

            // self-links are not carried over.
            if (fgid.equals(sgid)) {
                continue;
            }

            assert geneIdMap.containsKey(fgid);
            assert geneIdMap.containsKey(sgid);

            Gene2GeneCoexpression g2g = c.create(eff, fgid, sgid);

            /*
             * Check if we already have a link like this for the reverse - if so, reuse the supportdetails; the keys of
             * linksSoFar are id-less, so equals() is by genes and direction.
             */
            SupportDetails sdOfFlipped = linksSoFar
                    .get(new NonPersistentNonOrderedCoexpLink(geneIdMap.get(fgid), geneIdMap.get(sgid), eff > 0));

            SupportDetails sd;
            if (sdOfFlipped != null) {
                sd = sdOfFlipped;
            } else {
                // we haven't saved the flipped link already so make a new support details.
                sd = c.createSupportDetails(geneIdMap.get(fgid), geneIdMap.get(sgid), eff > 0);
                sess.save(sd);
            }

            g2g.setNumDatasetsSupporting(0);
            g2g.setSupportDetails(sd);

            assert sd.getId() != null;
            linksToSave.put(sd, g2g);

            links.add(new NonPersistentNonOrderedCoexpLink(g2g));

            if (i++ % BATCH_SIZE == 0) {
                sess.flush();
                sess.clear();
            }
        }

        for (SupportDetails sd : linksToSave.keySet()) {
            assert sd.getId() != null;
            sess.save(linksToSave.get(sd));
            if (i++ % BATCH_SIZE == 0) {
                sess.flush();
                sess.clear();
            }
        }

        this.updateGeneCoexpressedWith(links);

        return linksToSave;
    }

    @Override
    @Transactional(readOnly = true)
    public Map<Gene, Integer> countOldLinks(Collection<Gene> genes) {
        Map<Gene, Integer> results = new HashMap<>();
        Gene g = genes.iterator().next();
        // NOTE(review): assumes all genes in the collection are from the same taxon as the first one — confirm.
        String oldTable = CoexpressionQueryUtils.getGeneLinkTableName(g.getTaxon()).replace("COEXP", "CO_EXP");
        SQLQuery q = this.getSessionFactory().getCurrentSession()
                .createSQLQuery("select count(*) from " + oldTable + " WHERE FIRST_GENE_FK=?");
        int i = 0;
        for (Gene gene : genes) {
            Number c = (Number) q.setParameter(0, gene.getId()).uniqueResult();
            results.put(gene, c.intValue());
            if (++i % 500 == 0) {
                CoexpressionDaoImpl.log.info("Got counts for " + i + " genes, last was " + gene + " and had "
                        + c.intValue() + " links ...");
            }
        }
        return results;
    }

    @Override
    @Transactional
    public void updateRelativeNodeDegrees(Map<Long, List<Double>> relRanksPerGenePositive,
            Map<Long, List<Double>> relRanksPerGeneNegative) {
        Session session = this.getSessionFactory().getCurrentSession();
        ByteArrayConverter bac = new ByteArrayConverter();

        int i = 0; // shared counter so flush/clear batching spans both loops.
        for (Long g : relRanksPerGenePositive.keySet()) {
            i = this.process(relRanksPerGenePositive.get(g), session, bac, i, g, true);
        }
        for (Long g : relRanksPerGeneNegative.keySet()) {
            i = this.process(relRanksPerGeneNegative.get(g), session, bac, i, g, false);
        }
    }

    @SuppressWarnings({ "unused", "WeakerAccess" }) // Possible external use
    public void updateModifiedSupportDetails(BioAssaySet experiment,
            Collection<SupportDetails> supportDetailsToDelete, Collection<SupportDetails> supportDetailsToUpdate) {
        int count;
        int BATCH_SIZE = 1024;
        Session sess = this.getSessionFactory().getCurrentSession();

        /*
         * no cascade, so we have to make sure these get updated.
         */
        count = 0;
        for (SupportDetails sd : supportDetailsToUpdate) {
            sess.update(sd);
            if (++count % 10000 == 0) {
                CoexpressionDaoImpl.log
                        .info("Updated " + count + " support details relevant to " + experiment + "...");
            }
            if (count % BATCH_SIZE == 0) {
                sess.flush();
                sess.clear();
            }
        }
        CoexpressionDaoImpl.log.info("Updated " + count + " support details relevant to " + experiment + "...");

        sess.flush();
        sess.clear();

        count = 0;
        for (SupportDetails sd : supportDetailsToDelete) {
            sess.delete(sd);
            if (++count % 10000 == 0) {
                CoexpressionDaoImpl.log
                        .info("Removed support details for " + count + " links for " + experiment + "...");
            }
            if (count % BATCH_SIZE == 0) {
                sess.flush();
                sess.clear();
            }
        }

        // finish deletes of the sd (this is not really necessary here, but trying to be consistent)
        sess.flush();
        sess.clear();
    }

    @SuppressWarnings({ "unused", "WeakerAccess" }) // Possible external use
    public List<Object[]> getRawCoexpressionFromDbViaGenes(Collection<Long> geneIds, Taxon t, int stringency) {
        String sqlQuery1 = "select ID, POSITIVE, SUPPORT, FIRST_GENE_FK, SECOND_GENE_FK, SUPPORT_DETAILS_FK from "
                + CoexpressionQueryUtils.getGeneLinkTableName(t) + " where FIRST_GENE_FK in (:genes) and SUPPORT>=:s";

        Session sess = this.getSessionFactory().getCurrentSession();
        SQLQuery query1 = sess.createSQLQuery(sqlQuery1);
        query1.setParameterList("genes", geneIds.toArray());
        query1.setParameter("s", Math.max(1, stringency));

        // This is actually pretty fast.
        return (List<Object[]>) query1.list();
    }

    /** Store one gene's relative link-rank array (positive or negative) on its node-degree record. */
    private int process(List<Double> relRanks, Session sess, ByteArrayConverter bac, int i, Long g,
            boolean positive) {
        GeneCoexpressionNodeDegree nd = (GeneCoexpressionNodeDegree) sess.load(GeneCoexpressionNodeDegree.class, g);

        byte[] r = bac.doubleArrayToBytes(relRanks.toArray(new Double[] {}));
        if (positive) {
            nd.setRelativeLinkRanksPositive(r);
        } else {
            nd.setRelativeLinkRanksNegative(r);
        }

        sess.update(nd);
        if (++i % 1024 == 0) {
            sess.flush();
            sess.clear();
        }
        return i;
    }

    /**
     * Build an HQL query for gene-level links of the given genes (excluding zero-support links when orphans are kept).
     *
     * @param geneIds gene ids
     * @param className class name
     * @return query
     */
    private Query buildQuery(Collection<Long> geneIds, String className) {
        String query = "select g2g from " + className + " as g2g";

        // we usually need to get the support details, so we can security-filter the data. Exception is some admin
        // tasks.
        query = query + " where g2g.firstGene in (:geneIds) ";

        if (!CoexpressionDaoImpl.DELETE_ORPHAN_LINKS) {
            // means links could have support of zero, and we don't want those.
            query = query + " and g2g.numDataSetsSupporting > 0 ";
        }

        Query q = this.getSessionFactory().getCurrentSession().createQuery(query);
        q.setParameterList("geneIds", geneIds);
        return q;
    }

    /**
     * Importantly, this method does not filter by stringency or anything.
     *
     * @param genes genes
     * @param results will go here, each list is sorted
     * @return genes which were not found in the cache
     */
    private Collection<Long> checkCache(Collection<Long> genes, Map<Long, List<CoexpressionValueObject>> results) {
        assert results != null;
        assert !genes.isEmpty();

        /*
         * Check cache and initialize the result data structure.
         */
        Collection<Long> geneIdsNeeded = new HashSet<>();
        int resultsFound = 0;
        for (Long g : genes) {
            // cache hit: take the whole cached list; miss: remember the gene for a db fetch
            List<CoexpressionValueObject> cachedResults = this.gene2GeneCoexpressionCache.get(g);
            if (cachedResults != null) {
                resultsFound += cachedResults.size();
                results.put(g, cachedResults);
            } else {
                geneIdsNeeded.add(g);
            }
        }

        if (genes.size() > 1 && geneIdsNeeded.size() < genes.size()) {
            CoexpressionDaoImpl.log
                    .info("Found results for " + (genes.size() - geneIdsNeeded.size()) + " genes in the cache");
            CoexpressionDaoImpl.log
                    .debug("There were " + resultsFound + " results, before any stringency filtering");
        }
        return geneIdsNeeded;
    }

    /**
     * find results for the cached results that include the second gene. No filter for maximum. Importantly, this method
     * *does* filter by stringency. Still need to trimAndFinish.
     *
     * @param genes genes
     * @param results results will be placed here
     * @param stringency used to filter the results from the cache to ones we want
     * @return genes which were not found in the cache.
     */
    private Collection<Long> checkCacheForInterGeneLinks(Collection<Long> genes,
            Map<Long, List<CoexpressionValueObject>> results, int stringency) {

        // is the cache going to help?
        if (stringency < CoexpressionCache.CACHE_QUERY_STRINGENCY)
            return genes;

        Collection<Long> genesNeeded = new HashSet<>();
        int resultsFound = 0;
        for (Long gid : genes) {
            List<CoexpressionValueObject> e = this.gene2GeneCoexpressionCache.get(gid);
            if (e != null) {
                for (CoexpressionValueObject g2g : e) {
                    // check stringency AND *both* genes are in the link.
                    if (g2g.getNumDatasetsSupporting() >= stringency && genes.contains(g2g.getQueryGeneId())
                            && genes.contains(g2g.getCoexGeneId())) {

                        if (!results.containsKey(gid)) {
                            results.put(gid, new ArrayList<CoexpressionValueObject>());
                        }

                        resultsFound++;
                        assert g2g.isFromCache();
                        results.get(gid).add(g2g);
                    }
                }
            } else {
                genesNeeded.add(gid);
            }
        }

        if (genesNeeded.size() < genes.size()) {
            CoexpressionDaoImpl.log.info("Found " + resultsFound + " results for " + (genes.size() - genesNeeded.size())
                    + " genes in the cache at stringency " + stringency);
        }

        return genesNeeded;
    }

    /**
     * Convert raw SQL rows (plus optional support-details rows) into value objects, removing a-b/b-a duplicates.
     *
     * @param rawResults     rows of (ID, POSITIVE, SUPPORT, FIRST_GENE_FK, SECOND_GENE_FK, SUPPORT_DETAILS_FK)
     * @param supportDetails rows of (ID, BYTES) from the support-details table; may be null if not fetched
     * @param geneIds        query gene ids used to mark inter-query links; may be null
     * @return value objects keyed by the "first" gene id of each row
     */
    private Map<Long, List<CoexpressionValueObject>> convertToValueObjects(List<Object[]> rawResults,
            List<Object[]> supportDetails, Collection<Long> geneIds) {

        int removed = 0;
        Set<NonPersistentNonOrderedCoexpLink> allSeen = new HashSet<>(rawResults.size());

        // unwrap the supportDetails into a map.
        Map<Long, Set<Long>> supportDetailsLists = null;
        if (supportDetails != null) {
            supportDetailsLists = new HashMap<>();
            for (Object[] oa : supportDetails) {
                Long id = ((BigInteger) oa[0]).longValue();
                byte[] data = (byte[]) oa[1];
                IdArrayValueObject vo = new IdArrayValueObject(data);
                supportDetailsLists.put(id, vo.getIdsSet());
            }
        }

        StopWatch timer = new StopWatch();
        timer.start();

        Map<Long, List<CoexpressionValueObject>> results = new HashMap<>();

        int numUnsupported = 0;
        int n = 0;
        for (Object[] oa : rawResults) {
            // unpack one raw row; see getRawCoexpressionFromDbViaGenes for the column order
            Long id = ((BigInteger) oa[0]).longValue();
            Boolean pos = (byte) oa[1] > 0;
            Integer support = (Integer) oa[2];
            Long queryGeneId = ((BigInteger) oa[3]).longValue();
            Long secondGene = ((BigInteger) oa[4]).longValue();
            Long supportDetailsId = ((BigInteger) oa[5]).longValue();

            if (support == 0) {
                throw new IllegalArgumentException("Links should not be unsupported: " + id);
            }

            NonPersistentNonOrderedCoexpLink seen = new NonPersistentNonOrderedCoexpLink(queryGeneId, secondGene, pos);

            /*
             * remove duplicates, since each link can be here twice (x->y and y->x). (can happen.)
             */
            if (allSeen.contains(seen)) {
                ++removed;
                continue;
            }
            allSeen.add(seen);

            if (!results.containsKey(queryGeneId)) {
                results.put(queryGeneId, new ArrayList<CoexpressionValueObject>());
            }

            CoexpressionValueObject g2gvo = new CoexpressionValueObject(queryGeneId, secondGene, pos, support,
                    supportDetailsId,
                    supportDetailsLists == null ? null : supportDetailsLists.get(supportDetailsId));
            assert g2gvo.getNumDatasetsSupporting() > 0;

            results.get(queryGeneId).add(g2gvo);

            // mark links where both ends were among the query genes
            if (geneIds != null && geneIds.contains(g2gvo.getCoexGeneId())) {
                g2gvo.setInterQueryLink(true);
            }

            // periodic progress logging; counter and timer are both reset after each report
            if (++n % 1000 == 0 && timer.getTime() > 1000) {
                CoexpressionDaoImpl.log.debug("Process " + n + " coexpressions: " + timer.getTime() + "ms");
                n = 0;
                timer.reset();
                timer.start();
            }
        }

        if (removed > 0)
            CoexpressionDaoImpl.log
                    .debug("Removed " + removed + " duplicate links while converting to value objects");
        //noinspection ConstantConditions // Can change
        if (numUnsupported > 0)
            CoexpressionDaoImpl.log.info("Removed " + numUnsupported + " links that had support of zero.");

        if (results.isEmpty())
            throw new IllegalStateException("Removed everything! (of " + rawResults.size() + " results)");

        return results;
    }

    /**
     * Remove duplicates and convert to value objects. Links are marked as "interQuery" if the geneIds is non-null and
     * the link is between two of them.
     *
     * @param rawResults from the database. The support details might not have been fetched.
     * @param geneIds gene IDs used in the query, can be null
     * @return value objects, organized by the "first" gene of each entity. Note: For some query genes, we might not
     *         have gotten any results.
     */
    private Map<Long, List<CoexpressionValueObject>> convertToValueObjects(List<Gene2GeneCoexpression> rawResults,
            Collection<Long> geneIds) {

        int removed = 0;
        Set<NonPersistentNonOrderedCoexpLink> allSeen = new HashSet<>(rawResults.size()); // raw results from db.
Map<Long, List<CoexpressionValueObject>> results = new HashMap<>(); int numUnsupported = 0; for (Gene2GeneCoexpression g2g : rawResults) { if (g2g.getNumDatasetsSupporting() == 0) { throw new IllegalArgumentException("Links should not be unsupported: " + g2g); } Long queryGeneId = g2g.getFirstGene(); if (geneIds != null && !geneIds.contains(queryGeneId)) { continue; } NonPersistentNonOrderedCoexpLink seen = new NonPersistentNonOrderedCoexpLink(g2g); /* * remove duplicates, since each link can be here twice (x->y and y->x). (can happen; + and - links are * counted separately.) */ if (allSeen.contains(seen)) { ++removed; continue; } allSeen.add(seen); if (!results.containsKey(queryGeneId)) { results.put(queryGeneId, new ArrayList<CoexpressionValueObject>()); } CoexpressionValueObject g2gvo = new CoexpressionValueObject(g2g); assert g2gvo.getNumDatasetsSupporting() > 0; results.get(queryGeneId).add(g2gvo); if (geneIds != null && geneIds.contains(g2gvo.getCoexGeneId())) { g2gvo.setInterQueryLink(true); } } if (removed > 0) CoexpressionDaoImpl.log.debug("Removed " + removed + " duplicate links"); //noinspection ConstantConditions // Can change if (numUnsupported > 0) CoexpressionDaoImpl.log.info("Removed " + numUnsupported + " links that had support of zero."); if (results.isEmpty()) throw new IllegalStateException("Removed everything! (of" + rawResults.size() + " results)"); return results; } private Integer countLinks(Taxon t, BioAssaySet ee) { int rawCount = ((BigInteger) this .getSessionFactory().getCurrentSession().createSQLQuery("select count(*) from " + CoexpressionQueryUtils.getExperimentLinkTableName(t) + " e where e.EXPERIMENT_FK=:ee") .setParameter("ee", ee.getId()).uniqueResult()).intValue(); // this includes the flipped versions. assert rawCount % 2 == 0; return rawCount / 2; } /** * Find link (or null) based on the genes and direction of correlation in the given nonpersistent link. 
     *
     * @param q q
     * @param g2g g2g
     * @param existingResults index of which links already have an entry in the database (possibly with a support of
     *        zero)
     * @return gene 2 gene coexp
     */
    private Gene2GeneCoexpression findLink(Query q, NonPersistentNonOrderedCoexpLink g2g,
            Map<NonPersistentNonOrderedCoexpLink, Boolean> existingResults) {

        Long firstGene = g2g.getFirstGene();
        Long secondGene = g2g.getSecondGene();

        // links are stored with the lower gene id first
        assert firstGene < secondGene;

        // only hit the database when the prefetched index says a row exists for this link
        if (existingResults.containsKey(g2g) && existingResults.get(g2g)) {
            try {
                q.setParameter("f", firstGene);
                q.setParameter("s", secondGene);
                q.setParameter("pc", g2g.isPositiveCorrelation());
                Gene2GeneCoexpression existingLink = (Gene2GeneCoexpression) q.uniqueResult();
                if (CoexpressionDaoImpl.log.isDebugEnabled() && existingLink != null && existingResults.containsKey(g2g)
                        && existingResults.get(g2g))
                    CoexpressionDaoImpl.log.debug("fetched existing link: " + existingLink + " (" + g2g + ") "
                            + existingResults.get(g2g));
                return existingLink; // which can be null
            } catch (HibernateException e) {
                CoexpressionDaoImpl.log.error("Error while searching for: " + g2g + ": " + e.getMessage());
                throw e;
            }
        }
        //
        // it isn't in the existing results we fetched already, so we don't bother checking
        if (CoexpressionDaoImpl.log.isDebugEnabled())
            CoexpressionDaoImpl.log.debug("No existing link for " + g2g);

        return null;
    }

    /**
     * This method is for internal use only since it does not constrain on datasets. E.g. for node degree computations.
     *
     * @param gene gene
     * @return results for the gene.
     */
    private List<CoexpressionValueObject> getCoexpression(Gene gene) {

        // DO NOT change this to use the alternative method getCoexpressionFromDbViaGenes2
        Map<Long, List<CoexpressionValueObject>> r = this.getCoexpressionFromDbViaGenes(EntityUtils.getIds(gene),
                CoexpressionQueryUtils.getGeneLinkClassName(gene));

        List<CoexpressionValueObject> rr = r.get(gene.getId());

        // never return null: callers can iterate the empty list
        if (rr == null) {
            return new ArrayList<>();
        }

        return rr;
    }

    /**
     * @param t t
     * @param experiment ee
     * @return all the links which involve this experiment, including the "flipped" versions.
     */
    private Collection<Gene2GeneCoexpression> getCoexpression(Taxon t, BioAssaySet experiment) {
        Session sess = this.getSessionFactory().getCurrentSession();

        // distinct because ee links are stored twice. However, the flipped versions of the ee links are linked to only
        // the forward version, so we only get half of the g2g links here.
        CoexpressionDaoImpl.log.info("Fetching support details ...");
        List<Long> supportDetails = sess
                .createQuery("select distinct sd.id from " + CoexpressionQueryUtils.getExperimentLinkClassName(t)
                        + " e, " + CoexpressionQueryUtils.getGeneLinkClassName(t)
                        + " g2g join g2g.supportDetails sd where e.experiment=:ee and e.linkId = g2g.id ")
                .setParameter("ee", experiment).list();

        // sorted ids give better locality for the batched refetch below
        Collections.sort(supportDetails);

        List<Gene2GeneCoexpression> results = new ArrayList<>();

        CoexpressionDaoImpl.log.info("Fetching links ...");
        // refetch, this time in a manner that gets the flipped versions too.
int i = 0; BatchIterator<Long> bi = BatchIterator.batches(supportDetails, 1024); for (; bi.hasNext();) { results.addAll(sess .createQuery("from " + CoexpressionQueryUtils.getGeneLinkClassName(t) + " g2g join fetch g2g.supportDetails sd where sd.id in (:ids)") .setParameterList("ids", bi.next()).list()); if (++i % 200 == 0) { CoexpressionDaoImpl.log.info(i + " batches fetched (" + results.size() + " links fetched so far)"); } } assert results.size() % 2 == 0; // not a great check, but we should have flipped versions of every link. CoexpressionDaoImpl.log.info("Fetched " + results.size() + " links"); return results; } /** * Key method. Depending on the input, the query is done experiment-first or gene-first. This is a low-level method * called by several others. * The support and tested-in details (if populated) will reflect only the datasets given. This means that data might * be removed if it no longer meets stringency requirements. * * @param t t * @param genes must be non-null, but can be empty to remove constraint on genes. * @param bas must be non-empty. * @param stringency stringency * @param maxResults max results * @param quick quick * @return map */ private Map<Long, List<CoexpressionValueObject>> getCoexpressionFromCacheOrDb(Taxon t, Collection<Long> genes, Collection<Long> bas, int stringency, int maxResults, boolean quick) { /* * If the stringency is too low (relative to the total number of datasets analyzed in the system), we end up * getting tons of data from the LINK table that then gets thrown out at the SUPPORT_DETAILS phase. Because the * stringency is largely set based on the number of data sets (genes too, but a lesser extent), this is partly * accounted for. But we should check both since the stringency can bee too low. 
*/ assert !bas.isEmpty(); Map<Long, List<CoexpressionValueObject>> results; if (genes.isEmpty() && bas.size() < CoexpressionDaoImpl.MAX_DATASETS_FOR_DATASET_ONLY_QUERY) { /* * Experiment-major mode, no gene constraint: Find links common to the experiments in question at the * requested stringency. This could be quite slow since the cache cannot be used very well, so the caller * has to decide whether to allow this. * * NOTE we could have an experiment-level cache, but it would get big very fast for limited utility. */ if (bas.size() > 1) CoexpressionDaoImpl.log.info("Query in experiment-only mode, no gene constraint, " + bas.size() + " datasets specified, stringency=" + stringency); results = this.getCoexpressionFromDbViaExperiments(t, bas, quick); } else if (bas.size() < CoexpressionDaoImpl.MAX_DATASETS_FOR_DATASET_FIRST_QUERY && genes.size() > CoexpressionDaoImpl.MIN_GENES_FOR_DATASET_FIRST_QUERY) { /* * Experiment-major mode, with gene constraint: get results for the given genes in just the given data sets; * fetch the details after that. */ if (bas.size() > 1) CoexpressionDaoImpl.log.info("Query in experiment-first mode, with gene constraint, " + bas.size() + " datasets specified, stringency=" + stringency); results = this.getCoexpressionFromCacheOrDbViaExperiments(t, genes, bas, stringency, quick); } else if (!genes.isEmpty()) { /* * Gene-major mode: get all the results for the genes; filter for data sets selection separately. 
*/ if (genes.size() > 1) { CoexpressionDaoImpl.log.info("Query in gene-first mode for " + genes.size() + " genes, " + bas.size() + " datasets specified, stringency=" + stringency); } results = this.getCoexpressionFromCacheOrDbViaGenes(t, genes, stringency, quick); } else { throw new IllegalArgumentException( "Query cannot be safely constructed, please provide more constraints to datasets and/or genes"); } this.trimAndFinishResults(results, bas, stringency, maxResults); return results; } /* * Get links from the cache or the database, querying in experiment-first mode, but constrained to involve the given * genes. Does not do the trimming step, nor are the results guaranteed to meet the stringency set. */ private Map<Long, List<CoexpressionValueObject>> getCoexpressionFromCacheOrDbViaExperiments(Taxon t, Collection<Long> genes, Collection<Long> bas, int stringency, boolean quick) { assert stringency <= bas.size(); assert !genes.isEmpty(); Map<Long, List<CoexpressionValueObject>> results = new HashMap<>(); /* * First, check the cache -- if the stringency is >= limit */ Collection<Long> genesNeeded = new HashSet<>(genes); if (stringency >= CoexpressionCache.CACHE_QUERY_STRINGENCY) { genesNeeded = this.checkCache(genes, results); if (genesNeeded.isEmpty()) { return results; } } /* * Get all the data for all the experiments queried, constrained to involve the genes in question. * * This uses the ECL1EFK index, which is of (experiment, gene1, gene2). Note that if there are a lot of genes * this can get slow ... */ Query q = this.getSessionFactory().getCurrentSession() .createQuery(" from " + CoexpressionQueryUtils.getExperimentLinkClassName(t) + " where experiment.id in (:ees) and firstGene in (:genes)"); // May need to batch over genes... 
BatchIterator<Long> it = BatchIterator.batches(bas, CoexpressionDaoImpl.BATCH_SIZE_SMALL); StopWatch timer = new StopWatch(); timer.start(); List<ExperimentCoexpressionLink> links = new ArrayList<>(); for (; it.hasNext();) { q.setParameterList("ees", it.next()).setParameterList("genes", genesNeeded); links.addAll(q.list()); } if (timer.getTime() > 2000) { CoexpressionDaoImpl.log.info("Query for coexp for : " + genes.size() + " genes " + " in " + bas.size() + " experiments: " + timer.getTime() + "ms"); } /* * Track the support for the links among the queried data sets as we go over this in experiment-major mode. */ //noinspection MismatchedQueryAndUpdateOfCollection // We still need to compare it to stringency CountingMap<Long> supportCounts = new CountingMap<>(); List<Long> keepers = new ArrayList<>(); for (ExperimentCoexpressionLink link : links) { assert genes.contains(link.getFirstGene()); if (supportCounts.increment(link.getLinkId()) >= stringency) { keepers.add(link.getLinkId()); } } if (keepers.isEmpty()) { return new HashMap<>(); } return this.loadAndConvertLinks(t, keepers, genes, quick); } /** * Fetch coexpression data for one or more genes, without a constraint on data sets, but with other parameters * possible. It checks the cache, then the database. Results not retrieved from the cache will be immediately cached * (if appropriate) * * @param t taxon * @param genes IDs, assumed to be all from the same taxon * @param stringency minimum level of support required * @param quick whether to fill in the information on which data sets were supporting and how many datasets were * tested. * @return map of gene ids to ranked list of coexpression value objects, which will still need to be trimmed. 
*/ private Map<Long, List<CoexpressionValueObject>> getCoexpressionFromCacheOrDbViaGenes(Taxon t, Collection<Long> genes, int stringency, boolean quick) { Map<Long, List<CoexpressionValueObject>> finalResult = new HashMap<>(); /* * First, check the cache -- if the stringency is > =limit */ Collection<Long> genesNeeded = new HashSet<>(genes); if (stringency >= CoexpressionCache.CACHE_QUERY_STRINGENCY) { genesNeeded = this.checkCache(genes, finalResult); if (genesNeeded.isEmpty()) { return finalResult; } } // we assume the genes are from the same taxon. assert t != null; // fetch rest of genes needed from the database. StopWatch timer = new StopWatch(); timer.start(); int CHUNK_SIZE = 64; // how many genes to get at once. int genesQueried = 0; BatchIterator<Long> geneIdsIt = new BatchIterator<>(genesNeeded, CHUNK_SIZE); int total = 0; for (; geneIdsIt.hasNext();) { StopWatch innertimer = new StopWatch(); innertimer.start(); Collection<Long> batch = geneIdsIt.next(); Map<Long, List<CoexpressionValueObject>> rr = this.getCoexpressionFromDbViaGenes2(batch, t, stringency, !quick); // we should not cache unless everything is populated if (!rr.isEmpty() && stringency <= CoexpressionCache.CACHE_QUERY_STRINGENCY && !quick) { gene2GeneCoexpressionCache.cacheCoexpression(rr); } for (Long g : rr.keySet()) { // could replace with a single putAll but want this assertion for now. assert !finalResult.containsKey(g); finalResult.put(g, rr.get(g)); total += rr.get(g).size(); } if (innertimer.getTime() > 1000 && genesQueried > 0) { CoexpressionDaoImpl.log .debug("Fetched " + total + " coexpression results from db for " + genesQueried + "/" + genesNeeded.size() + " genes needed in " + innertimer.getTime() + "ms"); } genesQueried += batch.size(); } if (timer.getTime() > 10000) { // this raw count is not really relevant - it has to be filtered later. 
CoexpressionDaoImpl.log.debug("Fetched " + total + " coexpression results from db or cache for " + genes.size() + " genes in " + timer.getTime() + "ms"); } return finalResult; } /** * Find links common to the given experiments at stringency given, without any constraint on the genes. * * @param t t * @param bas not too many or else this could be slow, especially if stringency << bas.size(). * @param quick quick */ private Map<Long, List<CoexpressionValueObject>> getCoexpressionFromDbViaExperiments(Taxon t, Collection<Long> bas, boolean quick) { /* * Get all the data for all the experiments queried. We avoid a join on the gene2gene table (defeats purpose). * Distinct okay here because we're not counting stringency based on the raw results here - see comment below. */ Query q = this .getSessionFactory().getCurrentSession().createQuery("select distinct linkId from " + CoexpressionQueryUtils.getExperimentLinkClassName(t) + " where experiment.id in (:ees)") .setParameterList("ees", bas); List<Long> links = q.list(); if (links.isEmpty()) { return new HashMap<>(); } return this.loadAndConvertLinks(t, links, null, quick); } /** * Gene-focused query. Use this if you don't care about which data sets are involved (or if there are many data * sets), for a relatively small number of genes. This DOES NOT cache the results, the caller has to do that. It * also does not check the cache. * * @param geneIds the gene IDs * @param className the class name * @return results without any limit on the size, each list is already sorted. 
*/ private Map<Long, List<CoexpressionValueObject>> getCoexpressionFromDbViaGenes(Collection<Long> geneIds, String className) { Query q = this.buildQuery(geneIds, className); StopWatch timer = new StopWatch(); timer.start(); List<Gene2GeneCoexpression> rawResults = q.list(); if (timer.getTime() > 1000) { CoexpressionDaoImpl.log.debug("Initial coexp query for " + geneIds.size() + "genes took " + timer.getTime() + "ms: " + rawResults.size() + " results"); CoexpressionDaoImpl.log.debug("Query was: " + q.getQueryString()); } if (rawResults.isEmpty()) return new HashMap<>(); timer.reset(); timer.start(); Map<Long, List<CoexpressionValueObject>> results = this.convertToValueObjects(rawResults, geneIds); for (Long g : results.keySet()) { List<CoexpressionValueObject> gc = results.get(g); Collections.sort(gc); } if (timer.getTime() > 100) { CoexpressionDaoImpl.log.debug("Convert to value objects, filter, sort and finish " + rawResults.size() + " results: " + timer.getTime() + "ms"); } return results; } /** * Alternative method: query twice, once to get the coexpression basics and then again to get the support details, * instead of using a join. 
* * @param populateTestedInDetails populate tested in details * @param stringency stringency * @param geneIds gene IDs * @param t taxon */ private Map<Long, List<CoexpressionValueObject>> getCoexpressionFromDbViaGenes2(Collection<Long> geneIds, Taxon t, int stringency, boolean populateTestedInDetails) { StopWatch timer = new StopWatch(); timer.start(); List<Object[]> q1results = this.getRawCoexpressionFromDbViaGenes(geneIds, t, stringency); CoexpressionDaoImpl.log.debug(q1results.size() + " raw coexpression results for " + geneIds.size() + " genes at support>=" + stringency + " " + timer.getTime() + "ms"); if (q1results.isEmpty()) { return new HashMap<>(); } List<Object[]> supportDetails = new ArrayList<>(); /* * Because we are not trimming the results at all here, this can be a lot of data to iterate over, even at * high stringencies. For example, for 20 genes at a stringency of 5, because the query above does not * constrain to data sets, there can be >500 per gene, or >100k links in total. Fetching the support details * here is rather wasteful if we are not retaining the results, but we don't know that until we know which * data sets are supporting. 
*/ BatchIterator<Object[]> batches = BatchIterator.batches(q1results, CoexpressionDaoImpl.BATCH_SIZE); int n = 1; for (Collection<Object[]> batch : batches) { StopWatch timer2 = new StopWatch(); timer2.start(); List<Long> supportDetailsIds = new ArrayList<>(); for (Object[] oa : batch) { Long supportDetailsId = ((BigInteger) oa[5]).longValue(); supportDetailsIds.add(supportDetailsId); } // Note: should never be empty String sqlQuery2 = "select ID,BYTES from " + CoexpressionQueryUtils.getSupportDetailsTableName(t) + " where ID in (:ids)"; SQLQuery query2 = this.getSessionFactory().getCurrentSession().createSQLQuery(sqlQuery2); query2.setParameterList("ids", supportDetailsIds.toArray()); supportDetails.addAll(query2.list()); if (timer2.getTime() > 1000) { CoexpressionDaoImpl.log .debug("Fetch batch " + n + " of support details: " + timer2.getTime() + "ms"); } n++; } CoexpressionDaoImpl.log .debug("Fetched details for " + supportDetails.size() + " coexpressions, " + n + " batches"); if (timer.getTime() > 5000) { CoexpressionDaoImpl.log.info("Coexpression query: " + geneIds.size() + " genes took " + timer.getTime() + "ms: " + q1results.size() + " results"); } timer.reset(); timer.start(); // it might be better to do this in the loop above, incrementally per batch. 
Map<Long, List<CoexpressionValueObject>> results = this.convertToValueObjects(q1results, supportDetails, geneIds); if (timer.getTime() > 100) { CoexpressionDaoImpl.log .info("Convert to value objects " + q1results.size() + " results: " + timer.getTime() + "ms"); } timer.reset(); timer.start(); for (Long g : results.keySet()) { List<CoexpressionValueObject> gc = results.get(g); Collections.sort(gc); if (populateTestedInDetails) { this.populateTestedInDetails(gc); } } if (timer.getTime() > 100) { CoexpressionDaoImpl.log .info("Filter, sort and finish " + q1results.size() + " results: " + timer.getTime() + "ms"); } return results; } /** * Find links among the given genes in the given experiments, querying the experiment-level table. Does not check * the cache. There are easily hundreds of genes, number of experiments would be relatively small (otherwise we * would query gene-major). * * @param bas not too many * @param t taxon * @param genes gene IDs * @param quick quick run */ private Map<Long, List<CoexpressionValueObject>> getInterCoexpressionFromDbViaExperiments(Taxon t, Collection<Long> genes, Collection<Long> bas, boolean quick) { // distinct okay here because we're not counting stringency based on the raw results here. See comment below. Query q = this.getSessionFactory().getCurrentSession() .createQuery("select distinct linkId from " + CoexpressionQueryUtils.getExperimentLinkClassName(t) + " where experiment.id in (:ees) and firstGene in (:genes) and secondGene in (:genes2)") .setParameterList("ees", bas).setParameterList("genes", genes).setParameterList("genes2", genes); StopWatch timer = new StopWatch(); timer.start(); List<Long> links = q.list(); // We cannot batch this because we miss some combinations of links. 
CoexpressionDaoImpl.log .info(links.size() + " distinct gene2gene link ids obtained for experiment-level query for " + genes.size() + "genes in " + bas.size() + " experiments: " + timer.getTime() + "ms"); /* * Track the support for the links seen as we go over this in experiment-major mode. * * WARNING: the following idea is messed up because the links can be in a-b or b-a order, which are separate db * entities, and therefore counted separately here. [Only retain links (keepers) that meet the requested * stringency. Note that the only way we know the support is counting the number of experiments the link is in, * which happens in trimAndFinishResults(). See bug 4411 */ if (links.isEmpty()) { return new HashMap<>(); } return this.loadAndConvertLinks(t, new ArrayList<>(links), genes, quick); } /* * Does not check the cache - this must be done by the caller * */ private Map<Long, List<CoexpressionValueObject>> getInterCoexpressionFromDbViaGenes(Taxon taxon, Collection<Long> genes, int stringency, boolean quick) { if (genes.size() == 0) return new HashMap<>(); Map<Long, List<CoexpressionValueObject>> results = new HashMap<>(); // we assume the genes are from the same taxon. Confirmed: this uses the index (see bug 4055) String g2gClassName = CoexpressionQueryUtils.getGeneLinkClassName(taxon); final String firstQueryString = "select g2g from " + g2gClassName + " as g2g where g2g.firstGene in (:qgene) and g2g.secondGene in (:genes) " + "and g2g.numDataSetsSupporting >= :stringency "; /* * Note: if the number of genes is too large, it may be faster to simply query without the second 'in' clause * and filter the results. 
*/ StopWatch oTimer = new StopWatch(); oTimer.start(); int batchSize = 32; BatchIterator<Long> it = BatchIterator.batches(genes, batchSize); List<CoexpressionValueObject> g2gs = new ArrayList<>(genes.size()); Set<CoexpressionValueObject> seen = new HashSet<>(); for (; it.hasNext();) { Collection<Long> queryGeneBatch = it.next(); StopWatch timer = new StopWatch(); timer.start(); Collection<Gene2GeneCoexpression> r = this.getHibernateTemplate().findByNamedParam(firstQueryString, new String[] { "qgene", "genes", "stringency" }, new Object[] { queryGeneBatch, genes, stringency }); if (timer.getTime() > 5000) { CoexpressionDaoImpl.log.debug("Slow query: " + firstQueryString + " took " + timer.getTime() + "ms (" + queryGeneBatch.size() + " query gene batch, " + genes.size() + " target genes), Stringency=" + stringency); } // raw db results, for a batch of genes, add to the whole. for (Gene2GeneCoexpression g2g : r) { CoexpressionValueObject g2gvo = new CoexpressionValueObject(g2g); // we get the links in 'both directions' so we want to omit them. This means some of the query genes // might not be returned as query genes, since they show up in the 'coexpressed' gene instead. if (seen.contains(g2gvo)) continue; seen.add(g2gvo); g2gvo.setInterQueryLink(true); g2gs.add(g2gvo); } } if (!quick && !g2gs.isEmpty()) { StopWatch timer = new StopWatch(); timer.start(); this.populateTestedInDetails(g2gs); if (timer.getTime() > 2000) { CoexpressionDaoImpl.log.debug("Query genes only,fetch tested-in details " + g2gs.size() + " results took " + timer.getTime() + "ms"); } timer.reset(); timer.start(); } /* * all the genes are guaranteed to be in the query list. 
         */
        for (CoexpressionValueObject g2g : g2gs) {
            // group the accumulated links by their query gene
            if (!results.containsKey(g2g.getQueryGeneId())) {
                results.put(g2g.getQueryGeneId(), new ArrayList<CoexpressionValueObject>());
            }
            results.get(g2g.getQueryGeneId()).add(g2g);
        }

        if (oTimer.getTime() > 2000) {
            CoexpressionDaoImpl.log
                    .info("Query genes only, fetch for " + genes.size() + " genes took " + oTimer.getTime() + "ms");
        }

        for (Long id : results.keySet()) {
            Collections.sort(results.get(id));
        }

        return results;
    }

    /**
     * Fetch the precomputed coexpressed-gene id lists for the given genes.
     *
     * @param ba gene ids to look up
     * @return map of gene id to the ids of its coexpressed genes
     */
    private Map<Long, Collection<Long>> getQuickCoex(Collection<Long> ba) {
        Session sess = this.getSessionFactory().getCurrentSession();
        Collection<GeneCoexpressedGenes> r = sess.createQuery("from GeneCoexpressedGenes where geneId in (:ids)")
                .setParameterList("ids", ba).list();

        Map<Long, Collection<Long>> result = new HashMap<>();
        for (GeneCoexpressedGenes gcog : r) {
            result.put(gcog.getGeneId(), gcog.getIds());
        }

        return result;
    }

    /**
     * Load links given their ids (e.g. retrieved from the EE link tables). This is predicted to be slow when fetching
     * many links, because of random seeks in the g2g table.
     *
     * @param t t
     * @param linkIds to fetch; should be unique. Can already be stringency-filtered to some extent, but this will be
     *        checked again.
     * @param queryGenes can be null if was unconstrained
     * @param quick if true, the 'testedin' details will be populated.
     * @return map
     */
    private Map<Long, List<CoexpressionValueObject>> loadAndConvertLinks(Taxon t, List<Long> linkIds,
            Collection<Long> queryGenes, boolean quick) {

        assert !linkIds.isEmpty();

        /*
         * Note that we are not checking the cache, but we could by getting the firstGene from the EE-level links?
         */
        Query q = this.getSessionFactory().getCurrentSession()
                .createQuery("from " + CoexpressionQueryUtils.getGeneLinkClassName(t)
                        + " g2g join fetch g2g.supportDetails where g2g.id in (:ids)");

        /*
         * It is possible that we are retrieving the same underlying link twice - in the a-b and b-a orientations. Those
         * have to be merged.
         * This is taken care of in the convertToValueObjects
         */
        int BATCH_SIZE = 1024;
        Collections.sort(linkIds); // more efficient querying.
        BatchIterator<Long> idBatches = BatchIterator.batches(linkIds, BATCH_SIZE);

        StopWatch timer = new StopWatch();
        timer.start();
        List<Gene2GeneCoexpression> rawResults = new ArrayList<>();
        for (; idBatches.hasNext();) {
            rawResults.addAll(q.setParameterList("ids", idBatches.next()).list());
        }

        if (rawResults.isEmpty()) {
            CoexpressionDaoImpl.log.warn("Ids were invalid: no results for linkIds including " + linkIds.get(0));
            return new HashMap<>();
        } else if (rawResults.size() < linkIds.size() && rawResults.size() < new HashSet<>(linkIds).size()) {
            // maybe linkIds has repeats?
            CoexpressionDaoImpl.log.warn("Some ids were invalid, only got " + rawResults.size() + ", expected "
                    + linkIds.size() + " results");
        }

        if (timer.getTime() > 2000) {
            CoexpressionDaoImpl.log
                    .info("Load and convert " + rawResults.size() + " links: " + timer.getTime() + "ms");
        }

        Map<Long, List<CoexpressionValueObject>> results = this.convertToValueObjects(rawResults, queryGenes);

        for (Long g : results.keySet()) {
            if (!quick) {
                assert queryGenes == null || queryGenes.contains(g);
                this.populateTestedInDetails(results.get(g));
            }
        }

        return results;
    }

    /**
     * Record the query stringency and result-limit settings on each value object.
     *
     * @param list       value objects to update
     * @param size       stringency used for the query
     * @param maxResults maximum number of results requested
     */
    private void populateSettings(List<CoexpressionValueObject> list, int size, int maxResults) {
        for (CoexpressionValueObject g2g : list) {
            g2g.setQueryStringency(size);
            g2g.setMaxResults(maxResults);
        }
    }

    /**
     * When fetching data. Requires database hits, but values for testedin are cached.
     *
     * @param g2gLinks links
     */
    private void populateTestedInDetails(Collection<CoexpressionValueObject> g2gLinks) {
        assert !g2gLinks.isEmpty();

        StopWatch timer = new StopWatch();
        timer.start();
        // GeneCoexpressionTestedIn are one-per-gene so we first gather up all the unique genes we have to look at.
Map<Long, GeneCoexpressionTestedIn> gcTestedIn = new HashMap<>(); // gene id -> tested-in record (cache or db)
        Set<Long> genes = new HashSet<>(); // gene ids missed in the cache, still to be fetched

        // check the cache for both ends of every link; record misses for the batched fetch below
        for (CoexpressionValueObject gene2GeneCoexpression : g2gLinks) {
            Long queryGeneId = gene2GeneCoexpression.getQueryGeneId();
            GeneCoexpressionTestedIn queryGeneTestedIn = geneTestedInCache.get(queryGeneId);
            if (queryGeneTestedIn == null) {
                genes.add(queryGeneId); // not in the cache
            } else {
                gcTestedIn.put(queryGeneId, queryGeneTestedIn);
            }

            Long coexGeneId = gene2GeneCoexpression.getCoexGeneId();
            GeneCoexpressionTestedIn coexGeneTestedIn = geneTestedInCache.get(coexGeneId);
            if (coexGeneTestedIn == null) {
                genes.add(coexGeneId); // not in the cache
            } else {
                gcTestedIn.put(coexGeneId, coexGeneTestedIn);
            }
        }

        if (!genes.isEmpty()) {
            // fetch the GeneCoexpressionTestedIn information for those genes which were not cached.
            Query q = this.getSessionFactory().getCurrentSession()
                    .createQuery("from GeneCoexpressionTestedIn g where geneId in (:genes)");

            int BATCH_SIZE = 512;
            int n = 0; // batch counter, used for logging only
            for (BatchIterator<Long> it = BatchIterator.batches(genes, BATCH_SIZE); it.hasNext();) {
                Collection<Long> g = it.next();
                q.setParameterList("genes", g);
                List<GeneCoexpressionTestedIn> list = q.list();
                Map<Long, GeneCoexpressionTestedIn> idMap = EntityUtils.getIdMap(list, "getGeneId");
                // cache the freshly fetched records for later calls
                geneTestedInCache.cache(idMap);
                gcTestedIn.putAll(idMap);
                ++n;
            }

            if (timer.getTime() > 1000)
                CoexpressionDaoImpl.log.debug(
                        "Query for tested-in details for " + genes.size() + " genes: " + timer.getTime() + " ms (" + n
                                + " batches), values fetched or from cache size=" + gcTestedIn.size());
        }

        timer.reset();
        timer.start();

        // copy it into the g2g value objects.
for (CoexpressionValueObject g2g : g2gLinks) {
            assert g2g.getNumDatasetsSupporting() > 0 : g2g + " has support less than 1";

            Long id1 = g2g.getQueryGeneId();
            Long id2 = g2g.getCoexGeneId();
            GeneCoexpressionTestedIn geneCoexpressionTestedIn1 = gcTestedIn.get(id1);
            GeneCoexpressionTestedIn geneCoexpressionTestedIn2 = gcTestedIn.get(id2);

            // both genes of a link must have tested-in data, and each must have been tested somewhere
            if (geneCoexpressionTestedIn1 == null || geneCoexpressionTestedIn2 == null) {
                throw new IllegalStateException("Was missing GeneCoexpressionTestedIn data for genes in " + g2g);
            }

            if (geneCoexpressionTestedIn1.getNumDatasetsTestedIn() == 0
                    || geneCoexpressionTestedIn2.getNumDatasetsTestedIn() == 0) {
                throw new IllegalStateException(g2g + ": had no data sets tested in: "
                        + StringUtils.join(geneCoexpressionTestedIn1.getIds(), ",") + " :: "
                        + StringUtils.join(geneCoexpressionTestedIn2.getIds(), ","));
            }

            // the link counts as tested only in the datasets where BOTH genes were tested (set intersection)
            Set<Long> testedIn = geneCoexpressionTestedIn1.andSet(geneCoexpressionTestedIn2);

            if (testedIn.isEmpty()) {
                throw new IllegalStateException(g2g + ": had no data sets tested in: "
                        + StringUtils.join(geneCoexpressionTestedIn1.getIds(), ",") + " :: "
                        + StringUtils.join(geneCoexpressionTestedIn2.getIds(), ","));
            }

            g2g.setTestedInDatasets(testedIn);
        }

        if (timer.getTime() > 100)
            CoexpressionDaoImpl.log
                    .debug("Populate into value obects: " + timer.getTime() + "ms (" + g2gLinks.size() + " links)");

    }

    /**
     * Prefetch information on links, so when we go looking for a particular link we can decide faster. EXPERIMENTAL.
* * @param links links * @return Map */ private Map<NonPersistentNonOrderedCoexpLink, Boolean> preFetch(List<NonPersistentNonOrderedCoexpLink> links) { StopWatch timer = new StopWatch(); timer.start(); Map<NonPersistentNonOrderedCoexpLink, Boolean> result = new HashMap<>(); Map<Long, Set<Long>> linksToMap = CoexpressionQueryUtils.linksToMap(links); int BATCH_SIZE = 512; BatchIterator<Long> b = BatchIterator.batches(linksToMap.keySet(), BATCH_SIZE); Map<Long, Collection<Long>> coexg = new HashMap<>(); for (; b.hasNext();) { Collection<Long> ba = b.next(); coexg.putAll(this.getQuickCoex(ba)); } // compare the links in hand with for (NonPersistentNonOrderedCoexpLink li : links) { Collection<Long> g1h = coexg.get(li.getFirstGene()); if (g1h != null && g1h.contains(li.getSecondGene())) { result.put(li, true); continue; } // this seems redundant. Collection<Long> g2h = coexg.get(li.getSecondGene()); if (g2h != null && g2h.contains(li.getFirstGene())) { result.put(li, true); } /* never are adding false */ // result.put( li, false ); } if (!result.isEmpty()) CoexpressionDaoImpl.log.info("Prefetched link data for " + result.size() + "/" + links.size() + " links in " + timer.getTime() + "ms"); return result; } private void removeCoexpressedWith(Set<NonPersistentNonOrderedCoexpLink> toRemove) { Map<Long, Set<Long>> tr = CoexpressionQueryUtils.linksToMap(toRemove); Session sess = this.getSessionFactory().getCurrentSession(); int i = 0; for (Long g : tr.keySet()) { this.removeGeneCoexpressedWith(g, tr.get(g)); if (i++ % 1000 == 0) { sess.flush(); sess.clear(); } } } private void removeGeneCoexpressedWith(Long geneId, Collection<Long> removedGenes) { Session sess = this.getSessionFactory().getCurrentSession(); GeneCoexpressedGenes gcti = (GeneCoexpressedGenes) sess .createQuery("from GeneCoexpressedGenes where geneId = :id").setParameter("id", geneId) .uniqueResult(); // note this might be a no-op. 
if (gcti != null) {
            // FIX: the query above can legitimately find no GeneCoexpressedGenes record for this gene (the
            // pre-existing "might be a no-op" comment); previously the loop dereferenced gcti unconditionally
            // and would throw a NullPointerException in that case. Treat the missing record as a no-op.
            for (Long g : removedGenes) {
                gcti.removeEntity(g);
                sess.update(gcti);
            }
        }
    }

    /**
     * Reverting the "genes-tested-in" information is annoying: we don't know which genes to fix ahead of time. So we
     * have to check all genes for the taxon.
     *
     * @param t taxon whose genes are checked
     * @param experiment the experiment to remove from the 'tested-in' records
     */
    private void removeTestedIn(Taxon t, BioAssaySet experiment) {

        Session sess = this.getSessionFactory().getCurrentSession();

        // all gene ids for the taxon; not all of them will have a tested-in record
        List<Long> geneids = sess.createQuery("select id from Gene where taxon = :t").setParameter("t", t).list();
        CoexpressionDaoImpl.log
                .info("Removing 'tested-in' information for up to " + geneids.size() + " genes for " + experiment);

        BatchIterator<Long> it = BatchIterator.batches(geneids, 1000);
        for (; it.hasNext();) {
            Collection<Long> next = it.next();
            for (GeneCoexpressionTestedIn gcti : (Collection<GeneCoexpressionTestedIn>) sess
                    .createQuery("from GeneCoexpressionTestedIn where geneId in (:ids)")
                    .setParameterList("ids", next).list()) {
                // note this might be a no-op: the gene may not have been tested in this experiment.
                gcti.removeEntity(experiment.getId());
                sess.update(gcti);
            }
            sess.flush();
            sess.clear();
        }

    }

    /**
     * Save a batch of <strong>new</strong> links, and construct the to-be-persisted flipped versions.
     *
     * @param session session
     * @param linkIds will be updated with the ids of the links which were saved.
     * @param batch to save; will be cleared by this call.
     * @param c to create flipped versions of appropriate class
     * @return flipped versions which we will accumulate, sort and save later.
     */
    private List<Gene2GeneCoexpression> saveBatchAndMakeFlipped(Session session,
            Map<Long, NonPersistentNonOrderedCoexpLink> linkIds, Map<SupportDetails, Gene2GeneCoexpression> batch,
            LinkCreator c) {

        StopWatch timer = new StopWatch();
        timer.start();
        List<Gene2GeneCoexpression> flipped = new ArrayList<>();
        for (SupportDetails sd : batch.keySet()) {

            // have to do this first otherwise adding the ID changes hashcode...
Gene2GeneCoexpression g2g = batch.get(sd);

            assert g2g != null;

            // persist the support details first so it has an ID before being attached to the link
            session.save(sd);

            assert sd.getNumIds() > 0;

            g2g.setSupportDetails(sd);

            assert sd.getNumIds() > 0;
            assert g2g.getNumDatasetsSupporting() > 0;
            assert g2g.getSupportDetails().getNumIds() > 0;

            // make a copy that has the genes flipped; reuse the supportDetails.
            Gene2GeneCoexpression flippedG2g = c.create(g2g.isPositiveCorrelation() ? 1 : -1, g2g.getSecondGene(),
                    g2g.getFirstGene());
            flippedG2g.setSupportDetails(g2g.getSupportDetails());
            flipped.add(flippedG2g);

            assert flippedG2g.getFirstGene().equals(g2g.getSecondGene());
            assert flippedG2g.getSecondGene().equals(g2g.getFirstGene());
        }

        // now save the links themselves, recording the generated ids for the caller
        for (Gene2GeneCoexpression g2g : batch.values()) {
            Long id = (Long) session.save(g2g);
            linkIds.put(id, new NonPersistentNonOrderedCoexpLink(g2g));
        }

        session.flush();
        session.clear();
        batch.clear();

        if (timer.getTime() > 1000) {
            CoexpressionDaoImpl.log.info("Saved batch: " + timer.getTime() + "ms");
        }
        return flipped;
    }

    /**
     * Save the experiment-level links, forward and flipped, for the given gene-level link ids.
     *
     * @param sess session
     * @param c creator for the appropriate taxon-specific link classes
     * @param links gene-level link ids (TreeMap, so iteration is in id order) with their links
     * @param bioAssaySet the experiment the links are being recorded for
     */
    private void saveExperimentLevelLinks(Session sess, LinkCreator c,
            TreeMap<Long, NonPersistentNonOrderedCoexpLink> links, BioAssaySet bioAssaySet) {
        int progress = 0;
        int BATCH_SIZE = 1024;
        List<ExperimentCoexpressionLink> flippedLinks = new ArrayList<>();
        for (Long linkid : links.keySet()) {
            NonPersistentNonOrderedCoexpLink link = links.get(linkid);
            ExperimentCoexpressionLink ecl = c.createEELink(bioAssaySet, linkid, link.getFirstGene(),
                    link.getSecondGene());

            /*
             * At same time, create flipped versions, but save them later for ordering. Notice that we use the SAME link
             * ID - not the one for the flipped version in the gene2gene table.
             *
             * Ideally we would ensure that the gene2gene link ID used is the same for all links that are between
             * the same pair of genes. That would let us be able to easily count the support directly from an
             * experiment-level query, without going to the supportDetails. I do not believe the current code guarantees
             * this.
*/
            flippedLinks.add(c.createEELink(bioAssaySet, linkid, link.getSecondGene(), link.getFirstGene()));

            sess.save(ecl);

            if (++progress % 50000 == 0) {
                CoexpressionDaoImpl.log
                        .info("Created " + progress + "/" + links.size() + " experiment-level links...");
            }

            if (progress % BATCH_SIZE == 0) {
                sess.flush();
                sess.clear();
            }
        }

        sess.flush();
        sess.clear();

        /*
         * Sort the flipped links by the first gene
         */
        Collections.sort(flippedLinks, new Comparator<ExperimentCoexpressionLink>() {
            @Override
            public int compare(ExperimentCoexpressionLink o1, ExperimentCoexpressionLink o2) {
                return o1.getFirstGene().compareTo(o2.getFirstGene());
            }
        });

        /*
         * Save the flipped ones.
         */
        progress = 0;
        for (ExperimentCoexpressionLink fl : flippedLinks) {
            sess.save(fl);

            if (++progress % 50000 == 0) {
                CoexpressionDaoImpl.log
                        .info("Created " + progress + "/" + links.size() + " flipped experiment-level links...");
            }

            if (progress % BATCH_SIZE == 0) {
                sess.flush();
                sess.clear();
            }
        }

        // one for the road.
        sess.flush();
        sess.clear();
    }

    /**
     * Trim results to reflect those in the given data sets, at the selected stringency. This is required for security
     * (partly) but also to remove "irrelevant" results not queried for. Genes which have no results left after
     * filtering will be removed. We also remove (per-gene) links that go over the set maxResults limit, unless it is an
     * inter-query link. It is thus possible that some links at a given stringency will not be returned.
     *
     * @param results - map of gene to list of coexpressions
     * @param bas can be null if there is no constraint. Stringency filter will still be applied.
     *        NOTE(review): the implementation currently asserts a non-empty collection — confirm the null case.
     * @param stringency used to filter, and to populate the settings in the VOs.
     * @param maxResults used to filter per-gene, and to populate the settings in the VOs. 0 means no limit.
*/ private void trimAndFinishResults(Map<Long, List<CoexpressionValueObject>> results, Collection<Long> bas, int stringency, int maxResults) { assert stringency > 0; assert !bas.isEmpty(); Set<Long> toRemove = new HashSet<>(); for (Long g : results.keySet()) { /* * The results are already sorted at this point, in decreasing stringency. (??) */ int kept = 0; for (Iterator<CoexpressionValueObject> it = results.get(g).iterator(); it.hasNext();) { CoexpressionValueObject g2g = it.next(); if (g2g.getNumDatasetsSupporting() < stringency || !g2g.trimDatasets(bas, stringency)) { it.remove(); } else if (maxResults > 0 && kept >= maxResults && !g2g.isInterQueryLink()) { // only keep up to maxResults, but always keep inter-query links. it.remove(); } else { // System.err.println( g2g ); kept++; } /* * We're removing individual results for a given query gene; if we have now run out of them, we will * remove the gene entirely from the results. */ if (results.get(g).isEmpty()) { toRemove.add(g); } assert g2g.getNumDatasetsSupporting() <= bas.size();// test for bug 4036 } if (!results.get(g).isEmpty()) { this.populateSettings(results.get(g), stringency, maxResults); } } if (!toRemove.isEmpty()) { for (Long id : toRemove) { results.remove(id); } if (CoexpressionDaoImpl.log.isDebugEnabled()) { if (results.isEmpty()) { CoexpressionDaoImpl.log.debug("After trimming, no genes had results at stringency=" + stringency + "(" + toRemove.size() + " genes)"); } else { CoexpressionDaoImpl.log.debug("After trimming, " + toRemove.size() + " genes had no results at stringency=" + stringency); } } } } /** * Mark the genes as being tested for coexpression in the data set and persist the information in the database. This * is run at the tail end of coexpression analysis for the data set. 
* * @param ee the data set * @param genesTested the genes */ private void updatedTestedIn(BioAssaySet ee, Collection<Gene> genesTested) { Session sess = this.getSessionFactory().getCurrentSession(); Query q = sess.createQuery("from GeneCoexpressionTestedIn where geneId in (:ids)"); Set<Long> seenGenes = new HashSet<>(); Collection<Long> geneids = EntityUtils.getIds(genesTested); BatchIterator<Long> bi = new BatchIterator<>(geneids, 512); for (; bi.hasNext();) { q.setParameterList("ids", bi.next()); List<GeneCoexpressionTestedIn> list = q.list(); int count = 0; for (GeneCoexpressionTestedIn gcti : list) { // int old = gcti.getNumIds(); // debug code gcti.addEntity(ee.getId()); sess.update(gcti); // gcti.setBytes( gcti.getBytes() ); assert gcti.isIncluded(ee.getId()); seenGenes.add(gcti.getGeneId()); if (++count % 256 == 0) { sess.flush(); sess.clear(); } } } if (!seenGenes.isEmpty()) { CoexpressionDaoImpl.log.info("Updated tested-in information for " + seenGenes.size() + " genes"); this.geneTestedInCache.clearCache(); // TODO do it just for the genes changed. } sess.flush(); sess.clear(); // discover genes which don't have an entry at all. geneids.removeAll(seenGenes); if (geneids.isEmpty()) { return; } CoexpressionDaoImpl.log.info("Adding tested-in information for " + geneids.size() + " genes"); int count = 0; for (Long id : geneids) { GeneCoexpressionTestedIn gcti = new GeneCoexpressionTestedIn(id); gcti.addEntity(ee.getId()); assert gcti.isIncluded(ee.getId()); assert gcti.getNumIds() == 1; sess.save(gcti); if (++count % 256 == 0) { sess.flush(); sess.clear(); } } } /** * Update the index about which genes have links. 
* * @param links links */ private void updateGeneCoexpressedWith(Collection<NonPersistentNonOrderedCoexpLink> links) { Map<Long, Set<Long>> coexpressions = CoexpressionQueryUtils.linksToMap(links); Session sess = this.getSessionFactory().getCurrentSession(); int i = 0; for (Long g : coexpressions.keySet()) { GeneCoexpressedGenes gcti = (GeneCoexpressedGenes) sess .createQuery("from GeneCoexpressedGenes where geneId = :id").setParameter("id", g) .uniqueResult(); if (gcti == null) { gcti = new GeneCoexpressedGenes(g); sess.save(gcti); } gcti.addEntities(coexpressions.get(g)); assert gcti.getIds().size() > 0; assert gcti.getIds().contains(coexpressions.get(g).iterator().next()); if (++i % 1000 == 0) { CoexpressionDaoImpl.log.info( "Updated gene-coexpressed-with information for " + i + " genes, last was geneid=" + g); sess.flush(); sess.clear(); } } CoexpressionDaoImpl.log .info("Updated gene-coexpressed-with information for " + coexpressions.size() + " genes."); } }