Java tutorial
/* * The Gemma project * * Copyright (c) 2010 University of British Columbia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package ubic.gemma.analysis.expression.diff; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import org.apache.commons.collections.Transformer; import org.apache.commons.collections.TransformerUtils; import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.time.StopWatch; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.springframework.context.annotation.Scope; import org.springframework.stereotype.Component; import ubic.basecode.dataStructure.matrix.DenseDoubleMatrix; import ubic.basecode.dataStructure.matrix.DoubleMatrix; import ubic.basecode.dataStructure.matrix.ObjectMatrix; import ubic.basecode.math.DesignMatrix; import ubic.basecode.math.LeastSquaresFit; import ubic.basecode.math.MathUtil; import ubic.basecode.math.MatrixStats; import ubic.basecode.math.MeanVarianceEstimator; import ubic.basecode.util.r.type.LinearModelSummary; import ubic.gemma.analysis.preprocess.filter.ExpressionExperimentFilter; import ubic.gemma.analysis.util.ExperimentalDesignUtils; import ubic.gemma.datastructure.matrix.ExpressionDataDoubleMatrix; import ubic.gemma.datastructure.matrix.ExpressionDataMatrixColumnSort; import ubic.gemma.datastructure.matrix.MatrixWriter; import ubic.gemma.model.analysis.Direction; import ubic.gemma.model.analysis.expression.diff.ContrastResult; import ubic.gemma.model.analysis.expression.diff.DifferentialExpressionAnalysis; import ubic.gemma.model.analysis.expression.diff.DifferentialExpressionAnalysisResult; import ubic.gemma.model.analysis.expression.diff.ExpressionAnalysisResultSet; import ubic.gemma.model.analysis.expression.diff.HitListSize; import ubic.gemma.model.common.quantitationtype.QuantitationType; import ubic.gemma.model.common.quantitationtype.ScaleType; import ubic.gemma.model.expression.bioAssay.BioAssay; import ubic.gemma.model.expression.biomaterial.BioMaterial; import ubic.gemma.model.expression.designElement.CompositeSequence; import ubic.gemma.model.expression.experiment.BioAssaySet; import ubic.gemma.model.expression.experiment.ExperimentalFactor; import ubic.gemma.model.expression.experiment.ExpressionExperiment; import ubic.gemma.model.expression.experiment.ExpressionExperimentSubSet; import ubic.gemma.model.expression.experiment.FactorType; import ubic.gemma.model.expression.experiment.FactorValue; import ubic.gemma.model.genome.Gene; import cern.colt.matrix.DoubleMatrix1D; import cern.colt.matrix.DoubleMatrix2D; import cern.jet.math.Functions; /** * Handles fitting linear models with continuous or fixed-level covariates. Data are always log-transformed. * <p> * Interactions can be included if a DifferentialExpressionAnalysisConfig is passed as an argument to 'run'. Currently * we only support interactions if there are two factors in the model (no more). * <p> * One factor can be constant (the same value for all samples); such a factor will be analyzed by looking at the * intercept in the fitted model. This is only appropriate for 'non-reference' designs on ratiometric arrays. * <p> * This also supports subsetting the data based on a factor. For example, a data set with "tissue" as a factor could be * analyzed per-tissue rather than with tissue as a covariate. * * @author paul * @version $Id: LinearModelAnalyzer.java,v 1.83 2013/05/28 19:23:41 paul Exp $ */ @Component @Scope(value = "prototype") public class LinearModelAnalyzer extends AbstractDifferentialExpressionAnalyzer { private static Log log = LogFactory.getLog(LinearModelAnalyzer.class); /** * Preset levels for which we will store the HitListSizes. */ private static final double[] qValueThresholdsForHitLists = new double[] { 0.001, 0.005, 0.01, 0.05, 0.1 }; /* * (non-Javadoc) * * @see ubic.gemma.analysis.expression.diff.DiffExAnalyzer#computeHitListSizes(java.util.List, java.util.Map) */ @Override public Collection<HitListSize> computeHitListSizes(Collection<DifferentialExpressionAnalysisResult> results, Map<CompositeSequence, Collection<Gene>> probeToGeneMap) { Collection<HitListSize> hitListSizes = new HashSet<HitListSize>(); StopWatch timer = new StopWatch(); timer.start(); double maxThreshold = MathUtil.max(qValueThresholdsForHitLists); assert probeToGeneMap != null; Collection<Gene> allGenes = new HashSet<Gene>(); for (Collection<Gene> genes : probeToGeneMap.values()) { allGenes.addAll(genes); } // maps from Doubles are a bit dodgy... Map<Double, Integer> upCounts = new HashMap<Double, Integer>(); Map<Double, Integer> downCounts = new HashMap<Double, Integer>(); Map<Double, Integer> eitherCounts = new HashMap<Double, Integer>(); Map<Double, Integer> upCountGenes = new HashMap<Double, Integer>(); Map<Double, Integer> downCountGenes = new HashMap<Double, Integer>(); Map<Double, Integer> eitherCountGenes = new HashMap<Double, Integer>(); Collection<Gene> seenGenes = new HashSet<Gene>(); for (DifferentialExpressionAnalysisResult r : results) { Double corrP = r.getCorrectedPvalue(); if (corrP == null || corrP > maxThreshold) { continue; } CompositeSequence probe = r.getProbe(); Collection<Gene> genesForProbe = probeToGeneMap.get(probe); int numGenes = 0; if (genesForProbe != null) { numGenes = countNumberOfGenesNotSeenAlready(genesForProbe, seenGenes); } if (numGenes == 0) { // This is okay; it might mean we already counted it as differentially expressed. } Collection<ContrastResult> crs = r.getContrasts(); boolean up = false; boolean down = false; /* * We set up and down to be true (either or both) if at least on contrast is shown. */ for (ContrastResult cr : crs) { Double lf = cr.getLogFoldChange(); if (lf == null) { /* * A contrast which is actually not valid, so it won't be counted in the hit list. */ continue; } else if (lf < 0) { down = true; } else if (lf > 0) { up = true; } } for (double thresh : qValueThresholdsForHitLists) { if (!upCounts.containsKey(thresh)) { upCounts.put(thresh, 0); upCountGenes.put(thresh, 0); } if (!downCounts.containsKey(thresh)) { downCounts.put(thresh, 0); downCountGenes.put(thresh, 0); } if (!eitherCounts.containsKey(thresh)) { eitherCounts.put(thresh, 0); eitherCountGenes.put(thresh, 0); } if (corrP < thresh) { if (up) { upCounts.put(thresh, upCounts.get(thresh) + 1); upCountGenes.put(thresh, upCountGenes.get(thresh) + numGenes); } if (down) { downCounts.put(thresh, downCounts.get(thresh) + 1); downCountGenes.put(thresh, downCountGenes.get(thresh) + numGenes); } eitherCounts.put(thresh, eitherCounts.get(thresh) + 1); eitherCountGenes.put(thresh, eitherCountGenes.get(thresh) + numGenes); } } } for (double thresh : qValueThresholdsForHitLists) { // Ensure we don't set values to null. Integer up = upCounts.get(thresh) == null ? 0 : upCounts.get(thresh); Integer down = downCounts.get(thresh) == null ? 0 : downCounts.get(thresh); Integer either = eitherCounts.get(thresh) == null ? 0 : eitherCounts.get(thresh); Integer upGenes = upCountGenes.get(thresh) == null ? 0 : upCountGenes.get(thresh); Integer downGenes = downCountGenes.get(thresh) == null ? 0 : downCountGenes.get(thresh); Integer eitherGenes = eitherCountGenes.get(thresh) == null ? 0 : eitherCountGenes.get(thresh); assert !(allGenes.size() < upGenes || allGenes.size() < downGenes || allGenes .size() < eitherGenes) : "There were more genes differentially expressed than exist in the experment"; HitListSize upS = HitListSize.Factory.newInstance(thresh, up, Direction.UP, upGenes); HitListSize downS = HitListSize.Factory.newInstance(thresh, down, Direction.DOWN, downGenes); HitListSize eitherS = HitListSize.Factory.newInstance(thresh, either, Direction.EITHER, eitherGenes); hitListSizes.add(upS); hitListSizes.add(downS); hitListSizes.add(eitherS); assert upGenes <= eitherGenes : "More genes upregulated than upregulated+downregulated"; assert downGenes <= eitherGenes : "More genes downregulated than upregulated+downregulated"; } if (timer.getTime() > 1000) { log.info("Hitlist computation: " + timer.getTime() + "ms"); } return hitListSizes; } /** * Determine if any factor should be treated as the intercept term. * * @param factors * @param quantitationType * @return */ @Override public ExperimentalFactor determineInterceptFactor(Collection<ExperimentalFactor> factors, QuantitationType quantitationType) { ExperimentalFactor interceptFactor = null; for (ExperimentalFactor experimentalFactor : factors) { /* * Check if we need to treat the intercept as a factor. */ boolean useI = checkIfNeedToTreatAsIntercept(experimentalFactor, quantitationType); if (useI && interceptFactor != null) { throw new IllegalStateException("Can only deal with one constant factor (intercept)"); } else if (useI) { interceptFactor = experimentalFactor; } } return interceptFactor; } /** * @param resultList * @param probeToGeneMap * @return */ @Override public int getNumberOfGenesTested(Collection<DifferentialExpressionAnalysisResult> resultList, Map<CompositeSequence, Collection<Gene>> probeToGeneMap) { Collection<Gene> gs = new HashSet<Gene>(); for (DifferentialExpressionAnalysisResult d : resultList) { CompositeSequence probe = d.getProbe(); if (probeToGeneMap.containsKey(probe)) { gs.addAll(probeToGeneMap.get(probe)); } } return gs.size(); } /** * @param matrix on which to perform regression. * @param config containing configuration of factors to include. Any interactions or subset configuration is * ignored. Data are <em>NOT</em> log transformed unless they come in that way. (the qvaluethreshold will be * ignored) * @param retainScale if true, the data retain the global mean (intercept) * @return residuals from the regression. */ @Override public ExpressionDataDoubleMatrix regressionResiduals(ExpressionDataDoubleMatrix matrix, DifferentialExpressionAnalysisConfig config, boolean retainScale) { if (config.getFactorsToInclude().isEmpty()) { log.warn("No factors"); return matrix; // FIXME perhaps return a copy instead. OR throw an ex. } /* * Always retain all. */ config.setQvalueThreshold(null); /* * Note that this method relies on similar code to doAnalysis, for the setup stages. */ List<ExperimentalFactor> factors = config.getFactorsToInclude(); List<BioMaterial> samplesUsed = ExperimentalDesignUtils.getOrderedSamples(matrix, factors); Map<ExperimentalFactor, FactorValue> baselineConditions = ExperimentalDesignUtils .getBaselineConditions(samplesUsed, factors); ObjectMatrix<String, String, Object> designMatrix = ExperimentalDesignUtils.buildDesignMatrix(factors, samplesUsed, baselineConditions); DesignMatrix properDesignMatrix = new DesignMatrix(designMatrix, true); ExpressionDataDoubleMatrix dmatrix = new ExpressionDataDoubleMatrix(samplesUsed, matrix); DoubleMatrix<CompositeSequence, BioMaterial> namedMatrix = dmatrix.getMatrix(); DoubleMatrix<String, String> sNamedMatrix = makeDataMatrix(designMatrix, namedMatrix); // perform weighted least squares regression on COUNT data QuantitationType quantitationType = dmatrix.getQuantitationTypes().iterator().next(); LeastSquaresFit fit = null; if (quantitationType.getScale().equals(ScaleType.COUNT)) { log.info(" Calculating residuals of weighted least squares regression on COUNT data "); MeanVarianceEstimator mv = new MeanVarianceEstimator(properDesignMatrix, sNamedMatrix); fit = new LeastSquaresFit(properDesignMatrix, sNamedMatrix, mv.getWeights()); } else { fit = new LeastSquaresFit(properDesignMatrix, sNamedMatrix); } DoubleMatrix2D residuals = fit.getResiduals(); if (retainScale) { DoubleMatrix1D intercept = fit.getCoefficients().viewRow(0); for (int i = 0; i < residuals.rows(); i++) { residuals.viewRow(i).assign(Functions.plus(intercept.get(i))); } } DoubleMatrix<CompositeSequence, BioMaterial> f = new DenseDoubleMatrix<CompositeSequence, BioMaterial>( residuals.toArray()); f.setRowNames(dmatrix.getMatrix().getRowNames()); f.setColumnNames(dmatrix.getMatrix().getColNames()); return new ExpressionDataDoubleMatrix(dmatrix, f); } /* * (non-Javadoc) * * @see * ubic.gemma.analysis.expression.diff.AbstractDifferentialExpressionAnalyzer#run(ubic.gemma.model.expression.experiment * .ExpressionExperiment, ubic.gemma.analysis.expression.diff.DifferentialExpressionAnalysisConfig) */ @Override public Collection<DifferentialExpressionAnalysis> run(ExpressionExperiment expressionExperiment, DifferentialExpressionAnalysisConfig config) { ExpressionDataDoubleMatrix dmatrix = expressionDataMatrixService .getProcessedExpressionDataMatrix(expressionExperiment); return run(expressionExperiment, dmatrix, config); } /** * @param expressionExperiment * @param dmatrix * @param config * @return */ @Override public Collection<DifferentialExpressionAnalysis> run(ExpressionExperiment expressionExperiment, ExpressionDataDoubleMatrix dmatrix, DifferentialExpressionAnalysisConfig config) { /* * I apologize for this being so complicated. Basically there are four phases: * * 1. Get the data matrix and factors * * 2. Determine baseline groups; build model and contrasts * * 3. Run the analysis * * 4. Postprocess the analysis * * By far the most complex is #2 -- it depends on which factors and what kind they are. */ /* * Initialize our matrix and factor lists... */ List<ExperimentalFactor> factors = config.getFactorsToInclude(); List<BioMaterial> samplesUsed = ExperimentalDesignUtils.getOrderedSamples(dmatrix, factors); dmatrix = new ExpressionDataDoubleMatrix(samplesUsed, dmatrix); // enforce ordering /* * Do the analysis, by subsets if requested */ Collection<DifferentialExpressionAnalysis> results = new HashSet<DifferentialExpressionAnalysis>(); ExperimentalFactor subsetFactor = config.getSubsetFactor(); if (subsetFactor != null) { if (factors.contains(subsetFactor)) { throw new IllegalStateException( "Subset factor cannot also be included in the analysis [ Factor was: " + subsetFactor + "]"); } Map<FactorValue, ExpressionDataDoubleMatrix> subsets = makeSubSets(config, dmatrix, samplesUsed, subsetFactor); /* * Now analyze each subset */ for (FactorValue subsetFactorValue : subsets.keySet()) { log.info("Analyzing subset: " + subsetFactorValue); List<BioMaterial> bioMaterials = ExperimentalDesignUtils .getOrderedSamples(subsets.get(subsetFactorValue), factors); /* * make a EESubSet */ ExpressionExperimentSubSet eesubSet = ExpressionExperimentSubSet.Factory.newInstance(); eesubSet.setSourceExperiment(expressionExperiment); eesubSet.setName("Subset for " + subsetFactorValue); Collection<BioAssay> bioAssays = new HashSet<BioAssay>(); for (BioMaterial bm : bioMaterials) { bioAssays.addAll(bm.getBioAssaysUsedIn()); } eesubSet.getBioAssays().addAll(bioAssays); Collection<ExperimentalFactor> subsetFactors = fixFactorsForSubset(subsets.get(subsetFactorValue), eesubSet, factors); DifferentialExpressionAnalysisConfig subsetConfig = fixConfigForSubset(factors, config); if (subsetFactors.isEmpty()) { log.warn("Experimental design is not valid for subset: " + subsetFactorValue + "; skipping"); continue; } /* * Run analysis on the subset. */ DifferentialExpressionAnalysis analysis = doAnalysis(eesubSet, subsetConfig, subsets.get(subsetFactorValue), bioMaterials, new ArrayList<ExperimentalFactor>(subsetFactors), subsetFactorValue); if (analysis == null) { log.warn("No analysis results were obtained for subset: " + subsetFactorValue); continue; } results.add(analysis); } } else { /* * Analyze the whole thing as one */ DifferentialExpressionAnalysis analysis = doAnalysis(expressionExperiment, config, dmatrix, samplesUsed, factors, null); if (analysis == null) { log.warn("No analysis results were obtained"); } else { results.add(analysis); } } return results; } /* * (non-Javadoc) * * @see ubic.gemma.analysis.expression.diff.DiffExAnalyzer#run(ubic.gemma.model.expression.experiment. * ExpressionExperimentSubSet, ubic.gemma.analysis.expression.diff.DifferentialExpressionAnalysisConfig) */ @Override public DifferentialExpressionAnalysis run(ExpressionExperimentSubSet subset, DifferentialExpressionAnalysisConfig config) { /* * Start by setting it up like the full experiment. */ ExpressionDataDoubleMatrix dmatrix = expressionDataMatrixService .getProcessedExpressionDataMatrix(subset.getSourceExperiment()); ExperimentalFactor ef = config.getSubsetFactor(); Collection<BioMaterial> bmtmp = new HashSet<BioMaterial>(); for (BioAssay ba : subset.getBioAssays()) { bmtmp.add(ba.getSampleUsed()); } List<BioMaterial> samplesInSubset = new ArrayList<BioMaterial>(bmtmp); FactorValue subsetFactorValue = null; for (BioMaterial bm : samplesInSubset) { Collection<FactorValue> fvs = bm.getFactorValues(); for (FactorValue fv : fvs) { if (fv.getExperimentalFactor().equals(ef)) { if (subsetFactorValue == null) { subsetFactorValue = fv; } else if (!subsetFactorValue.equals(fv)) { throw new IllegalStateException( "This subset has more than one factor value for the supposed subsetfactor: " + fv + " and " + subsetFactorValue); } } } } samplesInSubset = ExpressionDataMatrixColumnSort.orderByExperimentalDesign(samplesInSubset, config.getFactorsToInclude()); // slice. ExpressionDataDoubleMatrix subsetMatrix = new ExpressionDataDoubleMatrix(samplesInSubset, dmatrix); Collection<ExperimentalFactor> subsetFactors = fixFactorsForSubset(dmatrix, subset, config.getFactorsToInclude()); if (subsetFactors.isEmpty()) { log.warn("Experimental design is not valid for subset: " + subsetFactorValue + "; skipping"); return null; } DifferentialExpressionAnalysisConfig subsetConfig = fixConfigForSubset(config.getFactorsToInclude(), config); DifferentialExpressionAnalysis analysis = doAnalysis(subset, subsetConfig, subsetMatrix, samplesInSubset, config.getFactorsToInclude(), subsetFactorValue); if (analysis == null) { throw new IllegalStateException("Subset could not be analyzed with config: " + config); } return analysis; } /** * @param experimentalFactor * @param quantitationType * @return boolean true if we need the intercept. */ protected boolean checkIfNeedToTreatAsIntercept(ExperimentalFactor experimentalFactor, QuantitationType quantitationType) { if (experimentalFactor.getFactorValues().size() == 1) { if (quantitationType.getIsRatio()) { return true; } throw new IllegalArgumentException( "Cannot deal with constant factors unless the data are ratiometric non-reference design"); } return false; } /** * Create files that can be used in R to check the results. * * @param dmatrix * @param designMatrix */ protected void outputForDebugging(ExpressionDataDoubleMatrix dmatrix, ObjectMatrix<String, String, Object> designMatrix) { MatrixWriter mw = new MatrixWriter(); try { mw.write(new FileWriter(File.createTempFile("data.", ".txt")), dmatrix, null, true, false); ubic.basecode.io.writer.MatrixWriter<String, String> dem = new ubic.basecode.io.writer.MatrixWriter<String, String>( new FileWriter(File.createTempFile("design.", ".txt"))); dem.writeMatrix(designMatrix, true); } catch (IOException e) { e.printStackTrace(); } } /** * Assigns the design matrix columns as factors, defines contrasts. We use treatment contrasts, comparing to a * baseline condition. * * @param designMatrix * @param baselineConditions */ protected void setupFactors(ObjectMatrix<String, String, Object> designMatrix, Map<ExperimentalFactor, FactorValue> baselineConditions) { for (ExperimentalFactor factor : baselineConditions.keySet()) { if (factor.getFactorValues().size() < 2) { continue; } String factorName = ExperimentalDesignUtils.nameForR(factor); Object[] column = designMatrix.getColumn(designMatrix.getColIndexByName(factorName)); if (ExperimentalDesignUtils.isContinuous(factor)) { double[] colD = new double[column.length]; for (int i = 0; i < column.length; i++) { colD[i] = (Double) column[i]; } } else { String[] colS = new String[column.length]; for (int i = 0; i < column.length; i++) { colS[i] = (String) column[i]; } } } } /** * Build the formula, omitting the factor taking the place of the intercept, if need be. (Mostly for R but with side * effect of populating the interactionFactorLists and label2Factors) * * @param config * @param label2Factors * @param interceptFactor * @param interactionFactorLists * @return */ private String buildModelFormula(final DifferentialExpressionAnalysisConfig config, final Map<String, Collection<ExperimentalFactor>> label2Factors, final ExperimentalFactor interceptFactor, final List<String[]> interactionFactorLists) { String modelFormula; String factTerm = ""; for (String nameInR : label2Factors.keySet()) { if (interceptFactor != null && label2Factors.get(nameInR).size() == 1 && label2Factors.get(nameInR).iterator().next().equals(interceptFactor)) { continue; } factTerm = factTerm + " " + nameInR + " +"; } factTerm = factTerm.replaceFirst("\\+$", ""); /* * Add interaction terms */ boolean hasInteractionTerms = config.getInteractionsToInclude() != null && !config.getInteractionsToInclude().isEmpty(); if (hasInteractionTerms) { for (Collection<ExperimentalFactor> interactionTerms : config.getInteractionsToInclude()) { List<String> interactionFactorNames = new ArrayList<String>(); for (ExperimentalFactor factor : interactionTerms) { interactionFactorNames.add(ExperimentalDesignUtils.nameForR(factor)); } factTerm = factTerm + " + " + StringUtils.join(interactionFactorNames, "*"); // in the R // statement interactionFactorLists.add(interactionFactorNames.toArray(new String[] {})); // In the ANOVA table. String factTableLabel = StringUtils.join(interactionFactorNames, ":"); label2Factors.put(factTableLabel, new HashSet<ExperimentalFactor>()); label2Factors.get(factTableLabel).addAll(interactionTerms); } } modelFormula = " ~ " + factTerm; return modelFormula; } /** * @param genesForProbe * @param seenGenes * @return */ private int countNumberOfGenesNotSeenAlready(Collection<Gene> genesForProbe, Collection<Gene> seenGenes) { int numGenes = 0; if (genesForProbe != null) { for (Gene g : genesForProbe) { if (seenGenes.contains(g)) { continue; } numGenes++; } seenGenes.addAll(genesForProbe); } return numGenes; } /** * @param bioAssaySet source data * @param config * @param dmatrix data * @param samplesUsed analyzed * @param factors included in the model * @param subsetFactorValue null unless analyzing a subset (only used for book-keeping) * @return analysis, or null if there was a problem. */ private DifferentialExpressionAnalysis doAnalysis(BioAssaySet bioAssaySet, DifferentialExpressionAnalysisConfig config, ExpressionDataDoubleMatrix dmatrix, List<BioMaterial> samplesUsed, List<ExperimentalFactor> factors, FactorValue subsetFactorValue) { if (factors.isEmpty()) { log.error("Must provide at least one factor"); return null; } if (samplesUsed.size() <= factors.size()) { log.error("Must have more samples than factors"); return null; } final Map<String, Collection<ExperimentalFactor>> label2Factors = getRNames(factors); Map<ExperimentalFactor, FactorValue> baselineConditions = ExperimentalDesignUtils .getBaselineConditions(samplesUsed, factors); QuantitationType quantitationType = dmatrix.getQuantitationTypes().iterator().next(); ExperimentalFactor interceptFactor = determineInterceptFactor(factors, quantitationType); /* * Build our factor terms, with interactions handled specially */ List<String[]> interactionFactorLists = new ArrayList<String[]>(); ObjectMatrix<String, String, Object> designMatrix = ExperimentalDesignUtils.buildDesignMatrix(factors, samplesUsed, baselineConditions); setupFactors(designMatrix, baselineConditions); String modelFormula = ""; boolean oneSampleTtest = interceptFactor != null && factors.size() == 1; if (oneSampleTtest) { modelFormula = " "; } else { modelFormula = buildModelFormula(config, label2Factors, interceptFactor, interactionFactorLists); } dmatrix = filterAndLogTransform(quantitationType, dmatrix); DoubleMatrix<CompositeSequence, BioMaterial> namedMatrix = dmatrix.getMatrix(); if (log.isDebugEnabled()) outputForDebugging(dmatrix, designMatrix); /* * PREPARATION FOR 'NATIVE' FITTING */ DoubleMatrix<String, String> sNamedMatrix = makeDataMatrix(designMatrix, namedMatrix); DesignMatrix properDesignMatrix = makeDesignMatrix(designMatrix, interactionFactorLists, baselineConditions); /* * Run the analysis NOTE this can be simplified if we strip out R code. */ final Map<String, LinearModelSummary> rawResults = runAnalysis(namedMatrix, sNamedMatrix, label2Factors, modelFormula, properDesignMatrix, interceptFactor, interactionFactorLists, baselineConditions, quantitationType); if (rawResults.size() == 0) { log.error("Got no results from the analysis"); return null; } /* * Initialize data structures we need to hold results */ // this used to be a Set, but a List is much faster. Map<String, List<DifferentialExpressionAnalysisResult>> resultLists = new HashMap<String, List<DifferentialExpressionAnalysisResult>>(); Map<String, List<Double>> pvaluesForQvalue = new HashMap<String, List<Double>>(); for (String factorName : label2Factors.keySet()) { if (properDesignMatrix.getDroppedFactors().contains(factorName)) { continue; } resultLists.put(factorName, new ArrayList<DifferentialExpressionAnalysisResult>()); pvaluesForQvalue.put(factorName, new ArrayList<Double>()); } addinteraction: for (String[] fs : interactionFactorLists) { for (String f : fs) { if (properDesignMatrix.getDroppedFactors().contains(f)) { continue addinteraction; } } String intF = StringUtils.join(fs, ":"); resultLists.put(intF, new ArrayList<DifferentialExpressionAnalysisResult>()); pvaluesForQvalue.put(intF, new ArrayList<Double>()); } if (pvaluesForQvalue.isEmpty()) { log.warn( "No results were obtained for the current stage of analysis, possibly due to dropped factors."); return null; } /* * Create result objects for each model fit. Keeping things in order is important. */ final Transformer rowNameExtractor = TransformerUtils.invokerTransformer("getId"); boolean warned = false; int notUsable = 0; int processed = 0; for (CompositeSequence el : namedMatrix.getRowNames()) { if (++processed % 15000 == 0) { log.info("Processed results for " + processed + " elements ..."); } LinearModelSummary lm = rawResults.get(rowNameExtractor.transform(el).toString()); if (log.isDebugEnabled()) log.debug(el.getName() + "\n" + lm); if (lm == null) { if (!warned) { log.warn("No result for " + el + ", further warnings suppressed"); warned = true; } notUsable++; continue; } for (String factorName : label2Factors.keySet()) { if (!pvaluesForQvalue.containsKey(factorName)) { // was dropped. continue; } Double overallPValue = null; DifferentialExpressionAnalysisResult probeAnalysisResult = DifferentialExpressionAnalysisResult.Factory .newInstance(); probeAnalysisResult.setProbe(el); if (lm.getCoefficients() == null) { // probeAnalysisResult.setPvalue( null ); // pvaluesForQvalue.get( factorName ).add( overallPValue ); // resultLists.get( factorName ).add( probeAnalysisResult ); notUsable++; continue; } Collection<ExperimentalFactor> factorsForName = label2Factors.get(factorName); if (factorsForName.size() > 1) { /* * Interactions */ if (factorsForName.size() > 2) { log.error("Handling more than two-way interactions is not implemented"); return null; } assert factorName.contains(":"); String[] factorNames = StringUtils.split(factorName, ":"); assert factorNames.length == factorsForName.size(); overallPValue = lm.getInteractionEffectP(factorNames); if (overallPValue != null && !Double.isNaN(overallPValue)) { Map<String, Double> interactionContrastTStats = lm.getContrastTStats(factorName); Map<String, Double> interactionContrastCoeffs = lm.getContrastCoefficients(factorName); Map<String, Double> interactionContrastPValues = lm.getContrastPValues(factorName); for (String term : interactionContrastPValues.keySet()) { Double contrastPvalue = interactionContrastPValues.get(term); makeContrast(probeAnalysisResult, factorsForName, term, factorName, contrastPvalue, interactionContrastTStats, interactionContrastCoeffs); } } else { if (!warned) { log.warn("Interaction could not be computed for " + el + ", further warnings suppressed"); warned = true; } if (log.isDebugEnabled()) log.debug("Interaction could not be computed for " + el + ", further warnings suppressed"); notUsable++; // will over count? continue; } } else { /* * Main effect */ assert factorsForName.size() == 1; ExperimentalFactor experimentalFactor = factorsForName.iterator().next(); if (interceptFactor != null && factorsForName.size() == 1 && experimentalFactor.equals(interceptFactor)) { overallPValue = lm.getInterceptP(); } else { overallPValue = lm.getMainEffectP(factorName); } /* * Add contrasts unless overallpvalue is NaN */ if (overallPValue != null && !Double.isNaN(overallPValue)) { Map<String, Double> mainEffectContrastTStats = lm.getContrastTStats(factorName); Map<String, Double> mainEffectContrastPvalues = lm.getContrastPValues(factorName); Map<String, Double> mainEffectContrastCoeffs = lm.getContrastCoefficients(factorName); for (String term : mainEffectContrastPvalues.keySet()) { Double contrastPvalue = mainEffectContrastPvalues.get(term); makeContrast(probeAnalysisResult, factorsForName, term, factorName, contrastPvalue, mainEffectContrastTStats, mainEffectContrastCoeffs); } } else { if (!warned) { log.warn("ANOVA could not be done for " + experimentalFactor + " on " + el + ", further warnings suppressed"); warned = true; } if (log.isDebugEnabled()) log.debug("ANOVA could not be done for " + experimentalFactor + " on " + el); notUsable++; // will over count? continue; } } assert !Double.isNaN(overallPValue) : "We should not be keeping non-number pvalues (null or NaNs)"; probeAnalysisResult.setPvalue(nan2Null(overallPValue)); pvaluesForQvalue.get(factorName).add(overallPValue); resultLists.get(factorName).add(probeAnalysisResult); } // over terms } // over probes if (notUsable > 0) { log.info(notUsable + " elements or results were not usable - model could not be fit, etc."); } getRanksAndQvalues(resultLists, pvaluesForQvalue); DifferentialExpressionAnalysis expressionAnalysis = makeAnalysisEntity(bioAssaySet, config, label2Factors, baselineConditions, interceptFactor, interactionFactorLists, oneSampleTtest, resultLists, subsetFactorValue); log.info("Analysis processing phase done ..."); return expressionAnalysis; } /** * Log transform if necessary, do any required filtering prior to analysis. * * @param quantitationType * @param dmatrix */ private ExpressionDataDoubleMatrix filterAndLogTransform(QuantitationType quantitationType, ExpressionDataDoubleMatrix dmatrix) { ScaleType scaleType = findScale(quantitationType, dmatrix.getMatrix()); if (scaleType.equals(ScaleType.LOG2)) { log.info("Data is already on a log2 scale"); } else if (scaleType.equals(ScaleType.LN)) { log.info(" **** Converting from ln to log2 **** "); MatrixStats.convertToLog2(dmatrix.getMatrix(), Math.E); } else if (scaleType.equals(ScaleType.LOG10)) { log.info(" **** Converting from log10 to log2 **** "); MatrixStats.convertToLog2(dmatrix.getMatrix(), 10); } else if (scaleType.equals(ScaleType.LINEAR)) { log.info(" **** LOG TRANSFORMING **** "); MatrixStats.logTransform(dmatrix.getMatrix()); } else if (scaleType.equals(ScaleType.COUNT)) { // note: counts per million log2 transform is already taken care of in MeanVarianceEstimator } else { throw new UnsupportedOperationException("Can't figure out what scale the data are on"); } /* * We do this second because doing it first causes some kind of subtle problem ... (round off? I could not * really track this down). * * Remove zero-variance rows, but also rows that have lots of equal values even if variance is non-zero. This * happens when data is "clipped" (e.g., all values under 10 set to 10). */ int r = dmatrix.rows(); dmatrix = ExpressionExperimentFilter.zeroVarianceFilter(dmatrix); if (dmatrix.rows() < r) { log.info((r - dmatrix.rows()) + " rows removed due to low variance"); } r = dmatrix.rows(); /* FIXME refactor as constant. Idea is to skip this if the data set is really small, but this is totally ad hoc. */ if (dmatrix.columns() > 4) { dmatrix = ExpressionExperimentFilter.tooFewDistinctValues(dmatrix, 0.5); if (dmatrix.rows() < r) { log.info((r - dmatrix.rows()) + " rows removed due to too many identical values"); } } return dmatrix; } /** * @param quantitationType * @param namedMatrix * @return * @see ExpressionExperimentFilter for a related implementation. */ private ScaleType findScale(QuantitationType quantitationType, DoubleMatrix<CompositeSequence, BioMaterial> namedMatrix) { if (quantitationType.getScale() != null) { if (quantitationType.getScale().equals(ScaleType.LOG2)) { return ScaleType.LOG2; } else if (quantitationType.getScale().equals(ScaleType.LOG10)) { return ScaleType.LOG10; } else if (quantitationType.getScale().equals(ScaleType.LN)) { return ScaleType.LN; } else if (quantitationType.getScale().equals(ScaleType.COUNT)) { return ScaleType.COUNT; } else if (quantitationType.getScale().equals(ScaleType.LOGBASEUNKNOWN)) { throw new UnsupportedOperationException( "Sorry, data on an unknown log scale is not supported. Please check the quantitation types, " + "and make sure the data is expressed in terms of log2 or un-logged data (" + quantitationType + ")"); } } for (int i = 0; i < namedMatrix.rows(); i++) { for (int j = 0; j < namedMatrix.columns(); j++) { double v = namedMatrix.get(i, j); if (v > 20) { log.debug("Data has large values, doesn't look log transformed"); return ScaleType.LINEAR; } } } throw new UnsupportedOperationException( "Data look log tranformed, not sure about base (" + quantitationType + ")"); } /** * Remove all configurations that have to do with factors that aren't in the selected factors. * * @param samplesInSubset * @param factors the factors that will be included * @param config * @return an updated config; the baselines are cleared; subset is cleared; interactions are only kept if they only * involve the given factors. */ private DifferentialExpressionAnalysisConfig fixConfigForSubset(List<ExperimentalFactor> factors, DifferentialExpressionAnalysisConfig config) { DifferentialExpressionAnalysisConfig newConfig = new DifferentialExpressionAnalysisConfig(); // // /* // * Drop factors that are constant in the subset. // */ // Map<ExperimentalFactor, Collection<FactorValue>> ef2FvsUsedInSubset = new HashMap<ExperimentalFactor, // Collection<FactorValue>> (); // for ( BioMaterial bm : samplesInSubset ) { // for ( FactorValue fv : bm.getFactorValues() ) { // ExperimentalFactor ef = fv.getExperimentalFactor(); // if ( !ef2FvsUsedInSubset.containsKey( ef ) ) { // ef2FvsUsedInSubset.put( ef, new HashSet<FactorValue>() ); // } // ef2FvsUsedInSubset.get( ef ).add( fv ); // } // } // // Collection<ExperimentalFactor> efsToUse = new HashSet<ExperimentalFactor>(); // for ( ExperimentalFactor ef : factors ) { // Collection<FactorValue> fvsUsed = ef2FvsUsedInSubset.get( ef ); // if ( fvsUsed.size() > 1 ) { // efsToUse.add( ef ); // } // } newConfig.setBaseLineFactorValues(null); if (!config.getInteractionsToInclude().isEmpty()) { Collection<Collection<ExperimentalFactor>> newInteractionsToInclude = new HashSet<Collection<ExperimentalFactor>>(); for (Collection<ExperimentalFactor> interactors : config.getInteractionsToInclude()) { if (factors.containsAll(interactors)) { newInteractionsToInclude.add(interactors); } } newConfig.setInteractionsToInclude(newInteractionsToInclude); } newConfig.setSubsetFactor(null); newConfig.setFactorsToInclude(factors); newConfig.setQvalueThreshold(config.getQvalueThreshold()); return newConfig; } /** * Remove factors which are no longer usable, based on the subset. * * @param expressionDataDoubleMatrix * @param eesubSet * @param factors * @return */ private Collection<ExperimentalFactor> fixFactorsForSubset(ExpressionDataDoubleMatrix dmatrix, ExpressionExperimentSubSet eesubSet, List<ExperimentalFactor> factors) { List<ExperimentalFactor> result = new ArrayList<ExperimentalFactor>(); QuantitationType quantitationType = dmatrix.getQuantitationTypes().iterator().next(); ExperimentalFactor interceptFactor = determineInterceptFactor(factors, quantitationType); /* * Remove any constant factors, unless they are that intercept. */ for (ExperimentalFactor f : factors) { if (f.getType().equals(FactorType.CONTINUOUS)) { result.add(f); } else if (interceptFactor != null && interceptFactor.equals(f)) { result.add(f); } else { Collection<FactorValue> levels = new HashSet<FactorValue>(); for (BioAssay ba : eesubSet.getBioAssays()) { BioMaterial bm = ba.getSampleUsed(); for (FactorValue fv : bm.getFactorValues()) { if (fv.getExperimentalFactor().equals(f)) { levels.add(fv); } } } if (levels.size() > 1) { result.add(f); } else { log.info("Dropping " + f + " from subset"); } } } return result; } /** * Needed to compute the number of genes tested/detected. * * @param resultLists * @return */ private Map<CompositeSequence, Collection<Gene>> getProbeToGeneMap( Map<String, ? extends Collection<DifferentialExpressionAnalysisResult>> resultLists) { Map<CompositeSequence, Collection<Gene>> result = new HashMap<CompositeSequence, Collection<Gene>>(); for (Collection<DifferentialExpressionAnalysisResult> resultList : resultLists.values()) { for (DifferentialExpressionAnalysisResult d : resultList) { CompositeSequence probe = d.getProbe(); result.put(probe, new HashSet<Gene>()); } } // testing environment, etc. if (result.isEmpty()) { return new HashMap<CompositeSequence, Collection<Gene>>(); } return compositeSequenceService.getGenes(result.keySet()); } /** * Fill in the ranks and qvalues in the results. * * @param resultLists * @param pvaluesForQvalue Map of factorName to results. */ private void getRanksAndQvalues( Map<String, ? extends Collection<DifferentialExpressionAnalysisResult>> resultLists, Map<String, List<Double>> pvaluesForQvalue) { /* * qvalues and ranks, requires second pass over the result objects. */ for (String fName : pvaluesForQvalue.keySet()) { Collection<Double> pvals = pvaluesForQvalue.get(fName); if (pvals.isEmpty()) { log.warn("No pvalues for " + fName + ", ignoring."); continue; } log.info(pvals.size() + " pvalues for " + fName); Double[] pvalArray = pvals.toArray(new Double[] {}); for (int i = 0; i < pvalArray.length; i++) { assert pvalArray[i] != null; assert !Double.isNaN(pvalArray[i]); // if ( Double.isNaN( pvalArray[i] ) ) { // log.warn( "Pvalue was NaN" ); // } // // if ( pvalArray[i] == null ) { // log.warn( "Pvalue was null" ); // pvalArray[i] = Double.NaN; // } } double[] ranks = super.computeRanks(ArrayUtils.toPrimitive(pvalArray)); if (ranks == null) { log.error("Ranks could not be computed " + fName); continue; } assert pvalArray.length == resultLists.get(fName).size() : pvalArray.length + " != " + resultLists.get(fName).size(); assert pvalArray.length == ranks.length; double[] qvalues = super.benjaminiHochberg(pvalArray); assert qvalues.length == resultLists.get(fName).size(); if (qvalues == null) { log.warn("Corrected pvalues could not be computed for " + fName); continue; } int i = 0; for (DifferentialExpressionAnalysisResult pr : resultLists.get(fName)) { pr.setCorrectedPvalue(nan2Null(qvalues[i])); pr.setRank(nan2Null(ranks[i])); i++; } } } /** * @param factors * @return */ private Map<String, Collection<ExperimentalFactor>> getRNames(List<ExperimentalFactor> factors) { final Map<String, Collection<ExperimentalFactor>> label2Factors = new LinkedHashMap<String, Collection<ExperimentalFactor>>(); for (ExperimentalFactor experimentalFactor : factors) { label2Factors.put(ExperimentalDesignUtils.nameForR(experimentalFactor), new HashSet<ExperimentalFactor>()); label2Factors.get(ExperimentalDesignUtils.nameForR(experimentalFactor)).add(experimentalFactor); } return label2Factors; } /** * @param bioAssaySet * @param config * @param label2Factors * @param baselineConditions * @param interceptFactor * @param interactionFactorLists * @param oneSampleTtest * @param resultLists * @param subsetFactorValue * @return Analysis (nonpersistent) */ private DifferentialExpressionAnalysis makeAnalysisEntity(BioAssaySet bioAssaySet, DifferentialExpressionAnalysisConfig config, final Map<String, Collection<ExperimentalFactor>> label2Factors, Map<ExperimentalFactor, FactorValue> baselineConditions, ExperimentalFactor interceptFactor, List<String[]> interactionFactorLists, boolean oneSampleTtest, Map<String, ? extends Collection<DifferentialExpressionAnalysisResult>> resultLists, FactorValue subsetFactorValue) { DifferentialExpressionAnalysis expressionAnalysis = super.initAnalysisEntity(bioAssaySet); /* * Complete analysis config */ expressionAnalysis.setName(this.getClass().getSimpleName()); expressionAnalysis.setDescription("Linear model with " + config.getFactorsToInclude().size() + " factors" + (interceptFactor == null ? "" : " with intercept treated as factor") + (interactionFactorLists.isEmpty() ? "" : " with interaction") + (subsetFactorValue == null ? "" : "Using subset " + bioAssaySet + " subset value= " + subsetFactorValue)); expressionAnalysis.setSubsetFactorValue(subsetFactorValue); Collection<ExpressionAnalysisResultSet> resultSets = makeResultSets(label2Factors, baselineConditions, oneSampleTtest, expressionAnalysis, resultLists); expressionAnalysis.setResultSets(resultSets); return expressionAnalysis; } /** * Add a contrast to the given result. * * @param probeAnalysisResult * @param experimentalFactor * @param term * @param factorName * @param contrastPvalue * @param tstats * @param coeffs */ private void makeContrast(DifferentialExpressionAnalysisResult probeAnalysisResult, Collection<ExperimentalFactor> experimentalFactors, String term, String factorName, Double contrastPvalue, Map<String, Double> tstats, Map<String, Double> coeffs) { assert experimentalFactors.size() == 1 || experimentalFactors.size() == 2; Double contrastTstat = tstats.get(term); Double coefficient = coeffs.get(term); // no reason to store this. Note: Tiny coefficient could also be treated as a missing value, but hopefully such // situations would have been dealt with at a lower level. if (coefficient == null || contrastTstat == null) { return; } ContrastResult contrast = ContrastResult.Factory.newInstance(); contrast.setPvalue(nan2Null(contrastPvalue)); contrast.setTstat(nan2Null(contrastTstat)); contrast.setCoefficient(nan2Null(coefficient)); List<ExperimentalFactor> factorList = new ArrayList<ExperimentalFactor>(experimentalFactors); boolean isInteraction = false; if (factorList.size() == 2) { isInteraction = true; } /* * The coefficient can be treated as fold-change if the data are log-transformed. This is because the * coefficient in the contrast is the (fitted;estimated) difference between the means, and log(x) - log(y) = * log(x/y). Limma uses this same trick. */ contrast.setLogFoldChange(nan2Null(coefficient)); if (term.contains(ExperimentalDesignUtils.FACTOR_VALUE_RNAME_PREFIX)) { // otherwise, it's continuous, and // we don't put in a // factorvalue. String[] terms = new String[2]; String[] factorNames = new String[2]; if (term.contains(":")) { terms = StringUtils.split(term, ":"); factorNames = StringUtils.split(factorName, ":"); } else { terms[0] = term; factorNames[0] = factorName; } String firstTerm = terms[0]; String secondTerm = terms[1]; Long factorValueId = null; try { factorValueId = Long.parseLong( firstTerm.replace(factorNames[0] + ExperimentalDesignUtils.FACTOR_VALUE_RNAME_PREFIX, "")); } catch (NumberFormatException e) { throw new RuntimeException("Failed to parse: " + firstTerm + " into a factorvalue id"); } for (ExperimentalFactor f : factorList) { for (FactorValue fv : f.getFactorValues()) { if (fv.getId().equals(factorValueId)) { contrast.setFactorValue(fv); break; } } } if (isInteraction) { log.debug("Interaction term"); assert secondTerm != null; try { factorValueId = Long.parseLong(secondTerm .replace(factorNames[1] + ExperimentalDesignUtils.FACTOR_VALUE_RNAME_PREFIX, "")); } catch (NumberFormatException e) { throw new RuntimeException("Failed to parse: " + secondTerm + " into a factorvalue id"); } for (ExperimentalFactor f : factorList) { for (FactorValue fv : f.getFactorValues()) { if (fv.getId().equals(factorValueId)) { contrast.setSecondFactorValue(fv); break; } } } if (contrast.getSecondFactorValue() == null) { throw new IllegalStateException("Failed to get interaction contrast second factorvalue"); } } if (contrast.getFactorValue() == null) { throw new IllegalStateException("Failed to get contrast factorvalue"); } if (contrast.getSecondFactorValue() != null && contrast.getSecondFactorValue().equals(contrast.getFactorValue())) { throw new IllegalStateException( "Contrast for interactions must be for two different factor values, got the same one twice"); } } probeAnalysisResult.getContrasts().add(contrast); } /** * @param designMatrix * @param namedMatrix * @return */ private DoubleMatrix<String, String> makeDataMatrix(ObjectMatrix<String, String, Object> designMatrix, DoubleMatrix<CompositeSequence, BioMaterial> namedMatrix) { /* * Convert the data into a string-keyed matrix. */ DoubleMatrix<String, String> sNamedMatrix = new DenseDoubleMatrix<String, String>(namedMatrix.asArray()); for (int i = 0; i < namedMatrix.rows(); i++) { sNamedMatrix.addRowName(namedMatrix.getRowName(i).getId().toString()); } sNamedMatrix.setColumnNames(designMatrix.getRowNames()); return sNamedMatrix; } /** * @param designMatrix * @param interactionFactorLists * @param baselineConditions * @return */ private DesignMatrix makeDesignMatrix(ObjectMatrix<String, String, Object> designMatrix, List<String[]> interactionFactorLists, Map<ExperimentalFactor, FactorValue> baselineConditions) { /* * Determine the factors and interactions to include. */ DesignMatrix properDesignMatrix = new DesignMatrix(designMatrix, true); if (!(interactionFactorLists == null) && !interactionFactorLists.isEmpty()) { for (String[] in : interactionFactorLists) { // we actually only support (tested etc.) one interaction at a time, but this should actually work for // multiple properDesignMatrix.addInteraction(in); } } for (ExperimentalFactor ef : baselineConditions.keySet()) { if (ExperimentalDesignUtils.isContinuous(ef)) { continue; } String factorName = ExperimentalDesignUtils.nameForR(ef); String baselineFactorValue = ExperimentalDesignUtils.nameForR(baselineConditions.get(ef), true); /* * If this is a subset, it is possible the baseline chosen is not eligible for the subset. */ log.info(ef); assert baselineConditions.get(ef).getExperimentalFactor().equals(ef) : baselineConditions.get(ef) + " is not a value of " + ef; properDesignMatrix.setBaseline(factorName, baselineFactorValue); } return properDesignMatrix; } /** * @param label2Factors * @param baselineConditions * @param oneSampleTtest * @param expressionAnalysis * @param resultLists * @return */ private Collection<ExpressionAnalysisResultSet> makeResultSets( final Map<String, Collection<ExperimentalFactor>> label2Factors, Map<ExperimentalFactor, FactorValue> baselineConditions, boolean oneSampleTtest, DifferentialExpressionAnalysis expressionAnalysis, Map<String, ? extends Collection<DifferentialExpressionAnalysisResult>> resultLists) { Map<CompositeSequence, Collection<Gene>> probeToGeneMap = getProbeToGeneMap(resultLists); /* * Result sets */ log.info("Processing " + resultLists.size() + " resultSets"); // StopWatch timer = new StopWatch(); // timer.start(); Collection<ExpressionAnalysisResultSet> resultSets = new HashSet<ExpressionAnalysisResultSet>(); for (String fName : resultLists.keySet()) { Collection<DifferentialExpressionAnalysisResult> results = resultLists.get(fName); Collection<ExperimentalFactor> factorsUsed = new HashSet<ExperimentalFactor>(); factorsUsed.addAll(label2Factors.get(fName)); FactorValue baselineGroup = null; if (!oneSampleTtest && factorsUsed.size() == 1 /* not interaction */) { ExperimentalFactor factor = factorsUsed.iterator().next(); assert baselineConditions.containsKey(factor); baselineGroup = baselineConditions.get(factor); } Collection<HitListSize> hitListSizes = computeHitListSizes(results, probeToGeneMap); int numberOfProbesTested = results.size(); int numberOfGenesTested = getNumberOfGenesTested(results, probeToGeneMap); // make List into Set ExpressionAnalysisResultSet resultSet = ExpressionAnalysisResultSet.Factory.newInstance(factorsUsed, numberOfProbesTested, numberOfGenesTested, null, baselineGroup, new HashSet<DifferentialExpressionAnalysisResult>(results), expressionAnalysis, null /* * pvalue * dists */, hitListSizes); resultSets.add(resultSet); log.info("Finished with result set for " + fName); } return resultSets; } /** * @param config * @param dmatrix * @param samplesUsed * @param subsetFactor * @return */ private Map<FactorValue, ExpressionDataDoubleMatrix> makeSubSets(DifferentialExpressionAnalysisConfig config, ExpressionDataDoubleMatrix dmatrix, List<BioMaterial> samplesUsed, ExperimentalFactor subsetFactor) { if (subsetFactor.getType().equals(FactorType.CONTINUOUS)) { throw new IllegalArgumentException("You cannot subset on a continuous factor (has a Measurement)"); } if (config.getFactorsToInclude().contains(subsetFactor)) { throw new IllegalArgumentException( "You cannot analyze a factor and use it for subsetting at the same time."); } Map<FactorValue, List<BioMaterial>> subSetSamples = new HashMap<FactorValue, List<BioMaterial>>(); for (FactorValue fv : subsetFactor.getFactorValues()) { assert fv.getMeasurement() == null; subSetSamples.put(fv, new ArrayList<BioMaterial>()); } for (BioMaterial sample : samplesUsed) { boolean ok = false; for (FactorValue fv : sample.getFactorValues()) { if (fv.getExperimentalFactor().equals(subsetFactor)) { subSetSamples.get(fv).add(sample); ok = true; break; } } if (!ok) { throw new IllegalArgumentException( "Cannot subset on a factor unless each sample has a value for it. Missing value for: " + sample + " " + sample.getBioAssaysUsedIn()); } } Map<FactorValue, ExpressionDataDoubleMatrix> subMatrices = new HashMap<FactorValue, ExpressionDataDoubleMatrix>(); for (FactorValue fv : subSetSamples.keySet()) { List<BioMaterial> samplesInSubset = subSetSamples.get(fv); if (samplesInSubset.isEmpty()) { throw new IllegalArgumentException("The subset was empty for fv: " + fv); } assert samplesInSubset.size() < samplesUsed.size(); samplesInSubset = ExpressionDataMatrixColumnSort.orderByExperimentalDesign(samplesInSubset, config.getFactorsToInclude()); ExpressionDataDoubleMatrix subMatrix = new ExpressionDataDoubleMatrix(samplesInSubset, dmatrix); subMatrices.put(fv, subMatrix); } return subMatrices; } /** * Important bit. Run the analysis * * @param namedMatrix * @param factorNameMap * @param modelFormula * @param interactionFactorLists * @param interceptFactor * @param designMatrix * @param baselineConditions * @param quantitationType * @return results */ private Map<String, LinearModelSummary> runAnalysis( final DoubleMatrix<CompositeSequence, BioMaterial> namedMatrix, final DoubleMatrix<String, String> sNamedMatrix, final Map<String, Collection<ExperimentalFactor>> factorNameMap, final String modelFormula, DesignMatrix designMatrix, ExperimentalFactor interceptFactor, List<String[]> interactionFactorLists, Map<ExperimentalFactor, FactorValue> baselineConditions, QuantitationType quantitationType) { final Map<String, LinearModelSummary> rawResults = new ConcurrentHashMap<String, LinearModelSummary>(); Future<?> f = runAnalysisFuture(designMatrix, sNamedMatrix, rawResults, quantitationType); StopWatch timer = new StopWatch(); timer.start(); long lasttime = 0; // this analysis should take just 10 or 20 seconds for most data sets. double MAX_ANALYSIS_TIME = 60 * 1000 * 20; // 20 minutes. double updateIntervalMillis = 60 * 1000;// 1 minute while (!f.isDone()) { try { Thread.sleep(1000); if (timer.getTime() - lasttime > updateIntervalMillis) { log.info(String.format("Analysis running, %.1f minutes elapsed ...", timer.getTime() / 60000.00)); lasttime = timer.getTime(); } } catch (InterruptedException e) { log.warn("Analysis interrupted!"); return rawResults; } if (timer.getTime() > MAX_ANALYSIS_TIME) { log.error("Analysis is taking too long, something bad must have happened; cancelling"); f.cancel(true); throw new RuntimeException("Analysis was taking too long, it was cancelled"); } } if (timer.getTime() > updateIntervalMillis) { log.info(String.format("Analysis finished in %.1f minutes.", timer.getTime() / 60000.00)); } try { f.get(); } catch (InterruptedException e) { log.warn("Job was interrupted"); return rawResults; } catch (ExecutionException e) { throw new RuntimeException(e); } assert rawResults.size() == namedMatrix.rows() : "expected " + namedMatrix.rows() + " results, got " + rawResults.size(); return rawResults; } /** * Linear models solved * * @param designMatrix * @param data * @param rawResults * @param quantitationType * @return */ private Future<?> runAnalysisFuture(final DesignMatrix designMatrix, final DoubleMatrix<String, String> data, final Map<String, LinearModelSummary> rawResults, final QuantitationType quantitationType) { ExecutorService service = Executors.newSingleThreadExecutor(); Future<?> f = service.submit(new Runnable() { @Override public void run() { StopWatch timer = new StopWatch(); timer.start(); LeastSquaresFit fit = null; if (quantitationType.getScale().equals(ScaleType.COUNT)) { log.info(" Performing weighted least squares regression on COUNT data "); MeanVarianceEstimator mv = new MeanVarianceEstimator(designMatrix, data); fit = new LeastSquaresFit(designMatrix, data, mv.getWeights()); } else { fit = new LeastSquaresFit(designMatrix, data); } log.info("Model fit data matrix " + data.rows() + " x " + data.columns() + ": " + timer.getTime() + "ms"); timer.reset(); timer.start(); Map<String, LinearModelSummary> res = fit.summarizeByKeys(true); log.info("Model summarize/ANOVA: " + timer.getTime() + "ms"); rawResults.putAll(res); log.info("Analysis phase done ..."); } }); service.shutdown(); return f; } }