Java tutorial
/* * The Gemma project * * Copyright (c) 2007 University of British Columbia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package ubic.gemma.core.loader.expression; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import ubic.gemma.core.analysis.expression.AnalysisUtilService; import ubic.gemma.core.analysis.preprocess.VectorMergingService; import ubic.gemma.core.analysis.service.ExpressionExperimentVectorManipulatingService; import ubic.gemma.model.common.quantitationtype.PrimitiveType; import ubic.gemma.model.common.quantitationtype.QuantitationType; import ubic.gemma.model.expression.arrayDesign.ArrayDesign; import ubic.gemma.model.expression.arrayDesign.TechnologyType; import ubic.gemma.model.expression.bioAssay.BioAssay; import ubic.gemma.model.expression.bioAssayData.BioAssayDimension; import ubic.gemma.model.expression.bioAssayData.DesignElementDataVector; import ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector; import ubic.gemma.model.expression.biomaterial.BioMaterial; import ubic.gemma.model.expression.designElement.CompositeSequence; import ubic.gemma.model.expression.experiment.ExpressionExperiment; import ubic.gemma.model.expression.experiment.ExpressionExperimentSubSet; import ubic.gemma.model.genome.biosequence.BioSequence; import ubic.gemma.persistence.service.analysis.expression.sampleCoexpression.SampleCoexpressionAnalysisService; import ubic.gemma.persistence.service.expression.arrayDesign.ArrayDesignService; import ubic.gemma.persistence.service.expression.bioAssay.BioAssayService; import ubic.gemma.persistence.service.expression.bioAssayData.BioAssayDimensionService; import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService; import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentSubSetService; /** * Switch an expression experiment from one array design to another. This is valuable when the EE uses more than on AD, * and a merged AD exists. The following steps are needed: * <ul> * * <li>Delete old analyses of this experiment and processeddata vectors * <li>For each array design, for each probe, identify the matching probe on the merged AD. Have to deal with situation * <li>where more than one occurrence of each sequence is found. * <li>all DEDVs must be switched to use the new AD's design elements * <li>all bioassays must be switched to the new AD. * <li>update the EE description * <li>commit changes. * <li>Computed processed data vectors * </ul> * This also handles the case of multisamples-per-platform - NOT the case of one-sample-per-platform but multiple * platforms; for that you have to run VectorMerging. For more nutty situations this will probably create a mess. * * @author pavlidis * @see VectorMergingService */ @Component public class ExpressionExperimentPlatformSwitchService extends ExpressionExperimentVectorManipulatingService { /** * Used to identify design elements that have no sequence associated with them. */ public static final BioSequence NULL_BIOSEQUENCE; private static final Log log = LogFactory.getLog(ExpressionExperimentPlatformSwitchService.class.getName()); static { NULL_BIOSEQUENCE = BioSequence.Factory.newInstance(); ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE.setName("______NULL______"); ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE.setId(-1L); } @Autowired private ArrayDesignService arrayDesignService; @Autowired private BioAssayDimensionService bioAssayDimensionService; @Autowired private BioAssayService bioAssayService; @Autowired private ExpressionExperimentService expressionExperimentService; @Autowired private ExperimentPlatformSwitchHelperService helperService; @Autowired private SampleCoexpressionAnalysisService sampleCoexpressionAnalysisService; @Autowired private ExpressionExperimentSubSetService subsetService; @Autowired private AnalysisUtilService analysisUtilService; /** * If you know the array designs are already in a merged state, you should use switchExperimentToMergedPlatform * * @param ee ee * @param arrayDesign The array design to switch to. If some samples already use that array design, nothing will be * changed for them. * @return the switched experiment */ public ExpressionExperiment switchExperimentToArrayDesign(ExpressionExperiment ee, ArrayDesign arrayDesign) { assert arrayDesign != null; // remove stuff that will be in the way. processedExpressionDataVectorService.removeProcessedDataVectors(ee); sampleCoexpressionAnalysisService.removeForExperiment(ee); for (ExpressionExperimentSubSet subset : expressionExperimentService.getSubSets(ee)) { subsetService.remove(subset); } analysisUtilService.deleteOldAnalyses(ee); // get relation between sequence and designelements. Map<BioSequence, Collection<CompositeSequence>> designElementMap = new HashMap<>(); Collection<CompositeSequence> elsWithNoSeq = new HashSet<>(); this.populateCSeq(arrayDesign, designElementMap, elsWithNoSeq); ee = expressionExperimentService.thaw(ee); if (elsWithNoSeq.size() > 0) { ExpressionExperimentPlatformSwitchService.log .info(elsWithNoSeq.size() + " elements on the new platform have no associated sequence."); designElementMap.put(ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE, elsWithNoSeq); } boolean multiPlatformPerSample = this.switchPlatform(ee, arrayDesign); /* * For a multiplatform-per-sample case ("Case 2", platforms are disjoint): (note that some samples might just be * on one platform...) * 1. Pick a BAD that can be used for all DataVectors (it has all BioAssays in it). * 2. Switch vectors to use it - may require adding NaNs and reordering the vectors (switchDataForPlatform) * 3. Delete the Bioassays that are using other BADs (cleanupUnused) * * For Case 1 (each sample run on one platform, all platforms are related/similar) Simply switch the vectors * (switchDataForPlatform) */ /* * Now we have to get the BADs. Problem to watch out for: they might not be the same length, we need one that * includes all BioMaterials. */ Collection<BioAssayDimension> unusedBADs = new HashSet<>(); BioAssayDimension targetBioAssayDimension = null; if (multiPlatformPerSample) { targetBioAssayDimension = this.doMultiSample(ee, unusedBADs); } if (multiPlatformPerSample && targetBioAssayDimension == null) { throw new RuntimeException( "Data set cannot be switched to merged platform: no suitable bioassaydimension found"); } /* * To account for cases where we have no data loaded yet. */ boolean hasData = expressionExperimentService.getQuantitationTypes(ee).size() > 0 && ee.getRawExpressionDataVectors().size() > 0; Collection<ArrayDesign> oldArrayDesigns = expressionExperimentService.getArrayDesignsUsed(ee); Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements = new HashMap<>(); int totalVectorsSwitched = 0; for (ArrayDesign oldAd : oldArrayDesigns) { totalVectorsSwitched += this.switchDataForPlatform(ee, arrayDesign, designElementMap, targetBioAssayDimension, usedDesignElements, oldAd); } if (totalVectorsSwitched == 0 && hasData) { throw new RuntimeException("No vectors were switched"); } String descriptionUpdate = "[Switched to use " + arrayDesign.getShortName() + " by Gemma]"; if (!ee.getDescription().contains(descriptionUpdate)) { ee.setDescription(ee.getDescription() + " " + descriptionUpdate); } helperService.persist(ee, arrayDesign); log.info("Finishing up"); if (targetBioAssayDimension != null && !unusedBADs.isEmpty()) { this.cleanupUnused(unusedBADs, targetBioAssayDimension); } ee = expressionExperimentService.load(ee.getId()); // refresh after cleanup ee = expressionExperimentService.thawLite(ee); if (hasData) { ee = processedExpressionDataVectorService.createProcessedDataVectors(ee); // this still fails sometimes? works fine if run later by cli } return ee; } /** * Automatically identify an appropriate merged platform * * @param expExp ee * @return ee */ public ExpressionExperiment switchExperimentToMergedPlatform(ExpressionExperiment expExp) { ArrayDesign arrayDesign = this.locateMergedDesign(expExp); if (arrayDesign == null) throw new IllegalArgumentException("Experiment has no merged design to switch to"); return this.switchExperimentToArrayDesign(expExp, arrayDesign); } private boolean switchPlatform(ExpressionExperiment ee, ArrayDesign arrayDesign) { boolean multiPlatformPerSample = false; for (BioAssay assay : ee.getBioAssays()) { // Switch the assay to use the desired platform assay.setOriginalPlatform(assay.getArrayDesignUsed()); assay.setArrayDesignUsed(arrayDesign); /* * Side effect: Detect cases of each-sample-run-on-multiple-platforms - we need to merge the bioassays, too. * That means fiddling with the BioAssayDimensions (BADs). We do this check here to avoid unnecessarily * inspecting BADs. */ if (!multiPlatformPerSample && assay.getSampleUsed().getBioAssaysUsedIn().size() > 1) { multiPlatformPerSample = true; } } return multiPlatformPerSample; } /** * Remove bioassays that are no longer needed * * @param unusedBADs * @param maxBAD */ private void cleanupUnused(Collection<BioAssayDimension> unusedBADs, BioAssayDimension maxBAD) { ExpressionExperimentPlatformSwitchService.log.info("Checking for unused BioAssays after merge"); Collection<BioAssay> removed = new HashSet<>(); for (BioAssayDimension bioAssayDimension : unusedBADs) { List<BioAssay> bioAssays = bioAssayDimension.getBioAssays(); for (BioAssay ba : bioAssays) { if (!maxBAD.getBioAssays().contains(ba) && !removed.contains(ba)) { try { ExpressionExperimentPlatformSwitchService.log.info("Deleting unused bioassay: " + ba); bioAssayService.remove(ba); removed.add(ba); } catch (Exception e) { log.error("Failed to delete: " + ba, e); } } } try { bioAssayDimensionService.remove(bioAssayDimension); log.info("Removed unused bioAssayDimension ID=" + bioAssayDimension.getId()); } catch (Exception e) { log.warn("Failed to clean up unused bioassaydimension with ID=" + bioAssayDimension.getId() + ": " + e.getMessage()); } } log.info("Removed " + removed.size() + " usused bioassays"); } /** * Find the bioassaydimension that covers all the biomaterials. * * @param ee * @param unusedBADs * @return */ private BioAssayDimension doMultiSample(ExpressionExperiment ee, Collection<BioAssayDimension> unusedBADs) { int maxSize = 0; BioAssayDimension maxBAD = null; for (BioAssay ba : ee.getBioAssays()) { Collection<BioAssayDimension> oldBioAssayDims = bioAssayService.findBioAssayDimensions(ba); for (BioAssayDimension bioAssayDim : oldBioAssayDims) { unusedBADs.add(bioAssayDim); int size = bioAssayDim.getBioAssays().size(); if (size > maxSize) { maxSize = size; maxBAD = bioAssayDim; } } } assert unusedBADs.size() > 1; // otherwise we shouldn't be here. unusedBADs.remove(maxBAD); /* * Make sure all biomaterials in the study are included in the chosen bioassaydimension. If not, we'd have * to make a new BAD. I haven't implemented that case. */ if (maxBAD != null) { Collection<BioMaterial> bmsInmaxBAD = new HashSet<>(); for (BioAssay ba : maxBAD.getBioAssays()) { bmsInmaxBAD.add(ba.getSampleUsed()); } for (BioAssay ba : ee.getBioAssays()) { if (!bmsInmaxBAD.contains(ba.getSampleUsed())) { ExpressionExperimentPlatformSwitchService.log .warn("This experiment looked like it had samples run on more than one platform, " + "but it also has no BioAssayDimension that is eligible to accomodate all samples (Example: " + ba.getSampleUsed()); maxBAD = null; break; } } } return maxBAD; } /** * Determine whether the two bioassaydimensions are the same, based on the samples used. Note it is inefficient to * call this over and over but it's not a big deal so far. * * @param currentOrder current order * @param desiredOrder desired order * @return boolean */ private boolean equivalent(List<BioAssay> currentOrder, List<BioAssay> desiredOrder) { if (currentOrder.size() != desiredOrder.size()) { return false; } for (int i = 0; i < currentOrder.size(); i++) { if (!currentOrder.get(i).getSampleUsed().equals(desiredOrder.get(i).getSampleUsed())) { return false; } } return true; } private ArrayDesign locateMergedDesign(ExpressionExperiment expExp) { // get the array designs for this EE ArrayDesign arrayDesign = null; Collection<ArrayDesign> oldArrayDesigns = expressionExperimentService.getArrayDesignsUsed(expExp); // find the AD they have been merged into, make sure it is exists and they are all merged into the same AD. for (ArrayDesign design : oldArrayDesigns) { ArrayDesign mergedInto = design.getMergedInto(); mergedInto = arrayDesignService.thaw(mergedInto); if (mergedInto == null) { throw new IllegalArgumentException( design + " used by " + expExp + " is not merged into another design"); } // TODO: go up the merge tree to find the root. This is too slow. // while ( mergedInto.getMergedInto() != null ) { // mergedInto = arrayDesignService.thaw( mergedInto.getMergedInto() ); // } if (arrayDesign == null) { arrayDesign = mergedInto; arrayDesign = arrayDesignService.thaw(arrayDesign); } if (!mergedInto.equals(arrayDesign)) { throw new IllegalArgumentException( design + " used by " + expExp + " is not merged into " + arrayDesign); } } return arrayDesign; } /** * Set up a map of sequences to elements for the platform we're switching to * * @param arrayDesign * @param designElementMap * @param elsWithNoSeq */ private void populateCSeq(ArrayDesign arrayDesign, Map<BioSequence, Collection<CompositeSequence>> designElementMap, Collection<CompositeSequence> elsWithNoSeq) { for (CompositeSequence cs : arrayDesign.getCompositeSequences()) { BioSequence bs = cs.getBiologicalCharacteristic(); if (bs == null) { elsWithNoSeq.add(cs); } else { if (!designElementMap.containsKey(bs)) { designElementMap.put(bs, new HashSet<CompositeSequence>()); } designElementMap.get(bs).add(cs); } } } /** * @param designElementMap Mapping of sequences to probes for the platform that is being switch from. This is used * to identify new candidates. * @param usedDesignElements probes from the new design that have already been assigned to probes from the old * design. If things are done correctly (the old design was merged into the new) then there should be enough. * Map is of the new design probe to the old design probe it was used for (this is debugging information) * @param vector vector * @param bad BioAssayDimension to use, if necessary. If this is null or already the one used, it's ignored. * Otherwise the vector data will be rewritten to match it. * @return true if a switch was made * @throws IllegalStateException if there is no (unused) design element matching the vector's biosequence */ private boolean processVector(Map<BioSequence, Collection<CompositeSequence>> designElementMap, Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements, RawExpressionDataVector vector, BioAssayDimension bad) { CompositeSequence oldDe = vector.getDesignElement(); Collection<CompositeSequence> newElCandidates; BioSequence seq = oldDe.getBiologicalCharacteristic(); if (seq == null) { newElCandidates = designElementMap.get(ExpressionExperimentPlatformSwitchService.NULL_BIOSEQUENCE); } else { newElCandidates = designElementMap.get(seq); } if (newElCandidates == null || newElCandidates.isEmpty()) { return false; // throw new IllegalStateException( // "There are no candidates probes for sequence: " + seq + "('null' should be okay)" ); } for (CompositeSequence newEl : newElCandidates) { if (!usedDesignElements.containsKey(newEl)) { vector.setDesignElement(newEl); usedDesignElements.put(newEl, new HashSet<BioAssayDimension>()); usedDesignElements.get(newEl).add(vector.getBioAssayDimension()); break; } if (!usedDesignElements.get(newEl).contains(vector.getBioAssayDimension())) { /* * Then it's okay to use it. */ vector.setDesignElement(newEl); usedDesignElements.get(newEl).add(vector.getBioAssayDimension()); break; } } if (bad != null && !vector.getBioAssayDimension().equals(bad)) { /* * 1. Check if they are already the same; then just switch it to the desired BAD * 2. If not, then the vector data has to be rewritten. */ this.vectorReWrite(vector, bad); } return true; } /** * Switch the vectors from one platform to the other, using the bioassaydimension passed if non-null * * @param ee * @param arrayDesign * @param designElementMap * @param targetBioAssayDimension - if null we keep the current one; otherwise we rewrite data datavectors to use * the new one. * @param usedDesignElements * @param oldAd * @return how many vectors were switched */ private int switchDataForPlatform(ExpressionExperiment ee, ArrayDesign arrayDesign, Map<BioSequence, Collection<CompositeSequence>> designElementMap, BioAssayDimension targetBioAssayDimension, Map<CompositeSequence, Collection<BioAssayDimension>> usedDesignElements, ArrayDesign oldAd) { if (oldAd.equals(arrayDesign)) return 0; oldAd = arrayDesignService.thaw(oldAd); if (oldAd.getCompositeSequences().size() == 0 && !oldAd.getTechnologyType().equals(TechnologyType.SEQUENCING)) { /* * Bug 3451 - this is okay if it is a RNA-seq experiment etc. prior to data upload. */ throw new IllegalStateException(oldAd + " has no elements"); } int totalSwitched = 0; Collection<QuantitationType> qts = expressionExperimentService.getQuantitationTypes(ee, oldAd); ExpressionExperimentPlatformSwitchService.log .info("Processing " + qts.size() + " quantitation types for vectors on " + oldAd); for (QuantitationType type : qts) { // use each design element only once per quantitation type + bioassaydimension per array design usedDesignElements.clear(); // assumption: we have no processed data. They should have been deleted at this point. Collection<RawExpressionDataVector> vecsForQt = new HashSet<>(); // int count = 0; for (RawExpressionDataVector vector : ee.getRawExpressionDataVectors()) { if (!vector.getQuantitationType().equals(type)) { continue; } CompositeSequence oldDe = vector.getDesignElement(); if (!oldDe.getArrayDesign().equals(oldAd)) { continue; } vecsForQt.add(vector); } if (vecsForQt.isEmpty()) { /* * This can happen when the quantitation types vary for the array designs. */ log.info("No vectors for " + type + " on " + oldAd + " (Might be okay; not all QTs might be on all platforms)"); continue; } log.info("Switching " + vecsForQt.size() + " vectors for " + type + " from " + oldAd.getShortName() + " to " + arrayDesign.getShortName() + (targetBioAssayDimension == null ? "" : ", BioAssayDimension=" + targetBioAssayDimension)); rawExpressionDataVectorService.thaw(vecsForQt); for (RawExpressionDataVector vector : vecsForQt) { if (this.processVector(designElementMap, usedDesignElements, vector, targetBioAssayDimension)) { count++; } } if (count != vecsForQt.size()) { throw new IllegalStateException( "Found matches for only " + count + "/" + vecsForQt.size() + " vectors for " + type + " from " + oldAd.getShortName() + (targetBioAssayDimension == null ? "" : ", BioAssayDimension=" + targetBioAssayDimension)); } // sanity check. this is all fine. // for ( RawExpressionDataVector v : vecsForQt ) { // if ( v.getQuantitationType().equals( type ) // && !arrayDesign.equals( v.getDesignElement().getArrayDesign() ) ) { // throw new IllegalStateException( "A raw vector for QT =" + v.getQuantitationType() // + " was not correctly switched to the target platform " + arrayDesign + ", it was on " // + v.getDesignElement().getArrayDesign() + " while switching from " + oldAd ); // } // } //this check shouldn't be necessary for (RawExpressionDataVector v : ee.getRawExpressionDataVectors()) { if (v.getQuantitationType().equals(type) && !arrayDesign.equals(v.getDesignElement().getArrayDesign()) && v.getDesignElement().getArrayDesign().equals(oldAd)) { throw new IllegalStateException("A raw vector " + v + "for QT =" + v.getQuantitationType() + " was not correctly switched to the target platform " + arrayDesign + ", it was still on " + v.getDesignElement().getArrayDesign()); } } totalSwitched += count; } return totalSwitched; } /** * Rearrange/expand a vector as necessary to use the given BioAssayDimension. Only used for multiplatform case of * samples run on multiple platforms. * * @param vector vector * @param bad to be used as the replacement. */ private void vectorReWrite(DesignElementDataVector vector, BioAssayDimension bad) { List<BioAssay> desiredOrder = bad.getBioAssays(); List<BioAssay> currentOrder = vector.getBioAssayDimension().getBioAssays(); if (this.equivalent(currentOrder, desiredOrder)) { // Easy, we can just switch it. vector.setBioAssayDimension(bad); return; } /* * We remake the data vector following the new ordering. */ PrimitiveType representation = vector.getQuantitationType().getRepresentation(); Object missingVal; if (representation.equals(PrimitiveType.DOUBLE)) { missingVal = Double.NaN; } else if (representation.equals(PrimitiveType.STRING)) { missingVal = ""; } else if (representation.equals(PrimitiveType.INT)) { missingVal = 0; } else if (representation.equals(PrimitiveType.BOOLEAN)) { missingVal = false; } else { throw new UnsupportedOperationException("Missing values in data vectors of type " + representation + " not supported (when processing " + vector); } List<Object> oldData = new ArrayList<>(); super.convertFromBytes(oldData, vector.getQuantitationType().getRepresentation(), vector); /* * Now data has the old data, so we need to rearrange it to match, inserting missings as necessary. */ Map<BioMaterial, Integer> bm2loc = new HashMap<>(); int i = 0; List<Object> newData = new ArrayList<>(); // initialize for (BioAssay ba : desiredOrder) { bm2loc.put(ba.getSampleUsed(), i++); newData.add(missingVal); } // Put data into new locations int j = 0; for (BioAssay ba : currentOrder) { Integer loc = bm2loc.get(ba.getSampleUsed()); assert loc != null; newData.set(loc, oldData.get(j++)); } byte[] newDataAr = converter.toBytes(newData.toArray()); vector.setData(newDataAr); vector.setBioAssayDimension(bad); } }