Java tutorial
/* * The Gemma project * * Copyright (c) 2008 University of British Columbia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package ubic.gemma.core.analysis.expression.coexpression; import org.apache.commons.lang3.time.StopWatch; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.annotation.Lazy; import org.springframework.stereotype.Component; import ubic.gemma.core.genome.gene.service.GeneService; import ubic.gemma.model.association.coexpression.GeneCoexpressionNodeDegreeValueObject; import ubic.gemma.model.expression.experiment.ExpressionExperimentValueObject; import ubic.gemma.model.genome.Taxon; import ubic.gemma.model.genome.gene.GeneValueObject; import ubic.gemma.persistence.service.analysis.expression.coexpression.CoexpressionAnalysisService; import ubic.gemma.persistence.service.association.coexpression.CoexpressionService; import ubic.gemma.persistence.service.association.coexpression.CoexpressionValueObject; import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService; import ubic.gemma.persistence.util.EntityUtils; import java.util.*; /** * @author paul */ @Component @Lazy public class GeneCoexpressionSearchServiceImpl implements GeneCoexpressionSearchService { /** * If the query involves this many genes, we switch to looking only among the genes. The limit here is entirely * driven by performance considerations: too many genes and we get too many results. */ private static final int THRESHOLD_TRIGGER_QUERY_GENES_ONLY = 50; private static final Log log = LogFactory.getLog(GeneCoexpressionSearchServiceImpl.class.getName()); private final ExpressionExperimentService expressionExperimentService; private final CoexpressionService coexpressionService; private final CoexpressionAnalysisService coexpressionAnalysisService; private final GeneService geneService; @Autowired public GeneCoexpressionSearchServiceImpl(ExpressionExperimentService expressionExperimentService, CoexpressionService coexpressionService, CoexpressionAnalysisService coexpressionAnalysisService, GeneService geneService) { this.expressionExperimentService = expressionExperimentService; this.coexpressionService = coexpressionService; this.coexpressionAnalysisService = coexpressionAnalysisService; this.geneService = geneService; } @Override public CoexpressionMetaValueObject coexpressionSearch(Collection<Long> inputEeIds, Collection<Long> genes, int stringency, int maxResults, boolean queryGenesOnly) { return this.doCoexpressionSearch(inputEeIds, genes, stringency, maxResults, queryGenesOnly, false); } @Override public CoexpressionMetaValueObject coexpressionSearchQuick(Collection<Long> eeIds, Collection<Long> queryGenes, int stringency, int maxResults, boolean queryGenesOnly) { return this.doCoexpressionSearch(eeIds, queryGenes, stringency, maxResults, queryGenesOnly, true); } /** * Convert CoexpressionValueObjects to CoexpressionValueObjectExts * * @param queryGeneIds all those included in the query */ private List<CoexpressionValueObjectExt> addExtCoexpressionValueObjects(GeneValueObject queryGene, Collection<CoexpressionValueObject> coexp, boolean queryGenesOnly, Collection<Long> queryGeneIds) { List<CoexpressionValueObjectExt> results = new ArrayList<>(); Collection<Long> coexpGenes = new HashSet<>(); for (CoexpressionValueObject cvo : coexp) { coexpGenes.add(cvo.getCoexGeneId()); } // database hit. loadValueObjects is too slow. Map<Long, GeneValueObject> coexpedGenes = EntityUtils .getIdMap(geneService.loadValueObjectsByIds(coexpGenes)); for (CoexpressionValueObject cvo : coexp) { /* * sanity check, this is sort of debug code. Each link has to contain at least one query gene. */ if (queryGenesOnly) { if (!queryGeneIds.contains(cvo.getCoexGeneId())) { GeneCoexpressionSearchServiceImpl.log.warn( "coexpression for non-query genes obtained unexpectedly when doing queryGenesOnly " + cvo.getCoexGeneId() + " is not a query"); continue; } if (!queryGeneIds.contains(cvo.getQueryGeneId())) { GeneCoexpressionSearchServiceImpl.log.warn( "coexpression for non-query genes obtained unexpectedly when doing queryGenesOnly " + cvo.getQueryGeneId() + " is not a query"); continue; } } CoexpressionValueObjectExt ecVo = new CoexpressionValueObjectExt(); ecVo.setQueryGene(queryGene); GeneValueObject foundGene = coexpedGenes.get(cvo.getCoexGeneId()); if (!queryGeneIds.contains(foundGene.getId())) { foundGene.setIsQuery(false); } else { foundGene.setIsQuery(true); } ecVo.setFoundGene(foundGene); if (cvo.isPositiveCorrelation()) { ecVo.setPosSupp(cvo.getNumDatasetsSupporting()); } else { ecVo.setNegSupp(cvo.getNumDatasetsSupporting()); } // when 'quick', these will not necessarily be set. ecVo.setNumTestedIn(cvo.getNumDatasetsTestedIn()); ecVo.setSupportingExperiments(cvo.getSupportingDatasets()); ecVo.setSortKey(); results.add(ecVo); assert ecVo.getPosSupp() > 0 || ecVo.getNegSupp() > 0; assert ecVo.getFoundGene() != null; assert ecVo.getQueryGene() != null; } return results; } /** * @param genes 1 or more. * @param stringency; this may be modified to control the number of results, unless "queryGenesOnly" is true. * @param maxResults per gene, not including the query genes themselves. Ignored if this is 'queryGenesOnly' * @param queryGenesOnly will be ignored if number of genes is 1. * @return CoexpressionMetaValueObject, in which the results are already populated and sorted. */ private CoexpressionMetaValueObject doCoexpressionSearch(Collection<Long> inputEeIds, Collection<Long> genes, int stringency, final int maxResults, final boolean queryGenesOnly, final boolean quick) { if (genes.isEmpty()) { CoexpressionMetaValueObject r = new CoexpressionMetaValueObject(); r.setErrorState("No genes selected"); return r; } boolean actuallyUseQueryGeneOnly = queryGenesOnly && genes.size() > 1; Taxon taxon = this.geneService.load(genes.iterator().next()).getTaxon(); List<ExpressionExperimentValueObject> eevos = this.getFilteredEEVos(inputEeIds, taxon); CoexpressionMetaValueObject result = this.initValueObject(genes, eevos); if (eevos.isEmpty()) { result = new CoexpressionMetaValueObject(); result.setErrorState("No experiments selected"); return result; } Collection<Long> eeIds = EntityUtils.getIds(eevos); Map<Long, List<CoexpressionValueObject>> allCoexpressions; // Note: auto-choose stringency on client size not always giving something reasonable. Also: not clear we want // to do this auto-adjust for 'query genes only'. if (genes.size() > GeneCoexpressionSearchServiceImpl.THRESHOLD_TRIGGER_QUERY_GENES_ONLY) { if (!actuallyUseQueryGeneOnly) { GeneCoexpressionSearchServiceImpl.log.info("Switching to 'query genes only'"); } actuallyUseQueryGeneOnly = true; } if (stringency < 1) stringency = 1; if (!queryGenesOnly) { stringency = Math.max(stringency, this.chooseStringency(actuallyUseQueryGeneOnly, eeIds.size(), genes.size())); GeneCoexpressionSearchServiceImpl.log .info("Stringency set to " + stringency + " based on number of experiments (" + eeIds.size() + ") and genes (" + genes.size() + ") queried"); } else { GeneCoexpressionSearchServiceImpl.log .info("Query gene only: stringency maintained at requested value=" + stringency); } assert stringency >= 1 || eeIds.size() == 1; // HACK drop the stringency until we get some results. int stepSize = 3; while (true) { if (actuallyUseQueryGeneOnly) { // note that maxResults is ignored. if (genes.size() < 2) { // should be impossible - could assert. throw new IllegalArgumentException( "cannot do inter-gene coexpression search with only one gene"); } allCoexpressions = coexpressionService.findInterCoexpressionRelationships(taxon, genes, eeIds, stringency, quick); } else { allCoexpressions = coexpressionService.findCoexpressionRelationships(taxon, genes, eeIds, stringency, maxResults, quick); } int minimum_stringency_for_requery = 2; if (allCoexpressions.isEmpty() && stringency > minimum_stringency_for_requery) { stringency -= stepSize; // step size completely made up. stringency = Math.max(minimum_stringency_for_requery, stringency); // keep stringency at least 2. GeneCoexpressionSearchServiceImpl.log.info("No results, re-querying with stringency=" + stringency); } else { // no results. break; } } GeneCoexpressionSearchServiceImpl.log.info("Final actual stringency used was " + stringency); result.setQueryStringency(stringency); result.setQueryGenesOnly(actuallyUseQueryGeneOnly); Set<Long> queryGeneIds = allCoexpressions.keySet(); assert genes.containsAll(queryGeneIds); Map<Long, GeneValueObject> idMap = EntityUtils.getIdMap(geneService.loadValueObjectsByIds(queryGeneIds)); int k = 0; for (Long queryGene : queryGeneIds) { Collection<CoexpressionValueObject> coexpressions = allCoexpressions.get(queryGene); List<CoexpressionValueObjectExt> results = this.addExtCoexpressionValueObjects(idMap.get(queryGene), coexpressions, actuallyUseQueryGeneOnly, genes); result.getResults().addAll(results); CoexpressionSummaryValueObject summary = new CoexpressionSummaryValueObject(queryGene); summary.setDatasetsAvailable(eevos.size()); summary.setLinksFound(coexpressions.size()); result.getSummaries().put(queryGene, summary); if (++k % 100 == 0) { GeneCoexpressionSearchServiceImpl.log.info("Processed results for " + k + " query genes ..."); } } Collections.sort(result.getResults()); // FIXME we might want to suppress this in some situations if (!queryGenesOnly) { result.trim(); } this.populateNodeDegrees(result); return result; } /** * Populate node degree. Note that this ignores the datasets that were used in the query - the statistics are for * 'all' data sets. */ private void populateNodeDegrees(CoexpressionMetaValueObject result) { StopWatch timer = new StopWatch(); timer.start(); Collection<Long> allUsedGenes = new HashSet<>(); for (CoexpressionValueObjectExt coexp : result.getResults()) { allUsedGenes.add(coexp.getQueryGene().getId()); allUsedGenes.add(coexp.getFoundGene().getId()); } Map<Long, GeneCoexpressionNodeDegreeValueObject> nodeDegrees = new HashMap<>(); if (allUsedGenes.size() > 0) { nodeDegrees = coexpressionService.getNodeDegrees(allUsedGenes); } for (CoexpressionValueObjectExt coexp : result.getResults()) { // if node degree info is out of date. if (!nodeDegrees.containsKey(coexp.getQueryGene().getId()) || !nodeDegrees.containsKey(coexp.getFoundGene().getId())) { continue; } GeneCoexpressionNodeDegreeValueObject queryGeneNodeDegree = nodeDegrees .get(coexp.getQueryGene().getId()); GeneCoexpressionNodeDegreeValueObject foundGeneNodeDegree = nodeDegrees .get(coexp.getFoundGene().getId()); boolean pos = coexp.getNegSupp() == 0; coexp.setQueryGeneNodeDegree(queryGeneNodeDegree.getLinksWithMinimumSupport(coexp.getSupport(), pos)); coexp.setFoundGeneNodeDegree(foundGeneNodeDegree.getLinksWithMinimumSupport(coexp.getSupport(), pos)); coexp.setQueryGeneNodeDegreeRank(queryGeneNodeDegree.getRankAtMinimumSupport(coexp.getSupport(), pos)); coexp.setFoundGeneNodeDegreeRank(foundGeneNodeDegree.getRankAtMinimumSupport(coexp.getSupport(), pos)); } assert result.getSummaries() != null; for (Long g : nodeDegrees.keySet()) { if (!result.getSummaries().containsKey(g)) continue; result.getSummaries().get(g).setCoexpNodeDegree(nodeDegrees.get(g)); } if (timer.getTime() > 100) { GeneCoexpressionSearchServiceImpl.log .info("Populate node degree for " + result.getResults().size() + ": " + timer.getTime() + "ms"); } } /** * Security-filter the experiments, remove troubled ones, and retain only those that have an analysis. * * @param eeIds can be null, in which case all available (to user) IDs are gotten */ private List<ExpressionExperimentValueObject> getFilteredEEVos(Collection<Long> eeIds, Taxon taxon) { List<ExpressionExperimentValueObject> securityFilteredEevos; if (eeIds == null || eeIds.isEmpty()) { // all valid experiments for taxon. securityFilteredEevos = new ArrayList<>(expressionExperimentService .loadValueObjects(coexpressionAnalysisService.getExperimentsWithAnalysis(taxon), false)); } else { securityFilteredEevos = new ArrayList<>(expressionExperimentService .loadValueObjects(coexpressionAnalysisService.getExperimentsWithAnalysis(eeIds), false)); } List<ExpressionExperimentValueObject> eevos = new ArrayList<>(); StopWatch timerFilterTroubled = new StopWatch(); timerFilterTroubled.start(); // only keep untroubled experiments for (ExpressionExperimentValueObject eevo : securityFilteredEevos) { if (!eevo.getTroubled()) { eevos.add(eevo); } } if (timerFilterTroubled.getTime() > 100) { GeneCoexpressionSearchServiceImpl.log .info("Filtering eevos took " + timerFilterTroubled.getTime() + "ms"); } return eevos; } /** * Use some rough heuristics (based on manual testing) to choose an initial stringency for queries. This value could * get adjusted upwards during postprocessing to limit the result set sizes. But this should be set as high as * possible to speed up the first stages. */ private Integer chooseStringency(boolean queryGenesOnly, int numExperimentsQueried, int numGenesQueried) { /* * 0.01 means if we assay 800 genes the stringency will be increased by 8. */ double geneSlope = 0.01; /* * If we're doing queryGeneOnly, we can leave the stringency a little lower. */ double geneMinimum = 1 - (queryGenesOnly ? 1 : 0); /* * 0.05 means that we increase the stringency by 1 for every 20 data sets. */ double expSlope = 0.05; double expMinimum = 2; if (numExperimentsQueried < 5) { return (int) Math.min(numExperimentsQueried, expMinimum); } // choose initial level based on the number of genes selected int baseline = (int) (Math.ceil(geneMinimum + geneSlope * numGenesQueried)); if (baseline > numExperimentsQueried) { return numExperimentsQueried; } if (queryGenesOnly) baseline--; return (int) (Math.ceil(baseline + expSlope * numExperimentsQueried)); } private CoexpressionMetaValueObject initValueObject(Collection<Long> genes, List<ExpressionExperimentValueObject> eevos) { CoexpressionMetaValueObject result = new CoexpressionMetaValueObject(); result.setQueryGenes(new ArrayList<>(geneService.loadValueObjectsByIdsLiter(genes))); result.setResults(new ArrayList<CoexpressionValueObjectExt>()); result.setSummaries(new HashMap<Long, CoexpressionSummaryValueObject>()); result.setNumDatasetsQueried(eevos.size()); return result; } }