ubic.gemma.analysis.expression.diff.DifferentialExpressionAnalyzerServiceImpl.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.analysis.expression.diff.DifferentialExpressionAnalyzerServiceImpl.java

Source

/*
 og* The Gemma project
 * 
 * Copyright (c) 2006 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.analysis.expression.diff;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.commons.lang.time.StopWatch;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import ubic.basecode.dataStructure.matrix.DenseDoubleMatrix;
import ubic.basecode.dataStructure.matrix.DoubleMatrix;
import ubic.basecode.io.ByteArrayConverter;
import ubic.basecode.io.writer.MatrixWriter;
import ubic.basecode.math.distribution.Histogram;
import ubic.basecode.util.FileTools;
import ubic.gemma.analysis.service.ExpressionDataFileService;
import ubic.gemma.model.analysis.expression.diff.DifferentialExpressionAnalysis;
import ubic.gemma.model.analysis.expression.diff.DifferentialExpressionAnalysisResult;
import ubic.gemma.model.analysis.expression.diff.DifferentialExpressionAnalysisService;
import ubic.gemma.model.analysis.expression.diff.DifferentialExpressionResultService;
import ubic.gemma.model.analysis.expression.diff.ExpressionAnalysisResultSet;
import ubic.gemma.model.analysis.expression.diff.HitListSize;
import ubic.gemma.model.analysis.expression.diff.PvalueDistribution;
import ubic.gemma.model.common.auditAndSecurity.AuditTrailService;
import ubic.gemma.model.common.auditAndSecurity.eventType.DifferentialExpressionAnalysisEvent;
import ubic.gemma.model.common.auditAndSecurity.eventType.FailedDifferentialExpressionAnalysisEvent;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.expression.designElement.CompositeSequenceService;
import ubic.gemma.model.expression.experiment.BioAssaySet;
import ubic.gemma.model.expression.experiment.ExperimentalFactor;
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
import ubic.gemma.model.expression.experiment.ExpressionExperimentSubSet;
import ubic.gemma.model.expression.experiment.FactorValue;
import ubic.gemma.model.genome.Gene;

/**
 * Differential expression service to run the differential expression analysis (and persist the results using the
 * appropriate data access objects).
 * <p>
 * Note that there is also a DifferentialExpressionAnalysisService (which handled CRUD for analyses). In contrast this
 * _does_ the analysis.
 * 
 * @author keshav
 * @version $Id: DifferentialExpressionAnalyzerServiceImpl.java,v 1.28 2013/05/10 21:39:18 paul Exp $
 */
@Component
public class DifferentialExpressionAnalyzerServiceImpl implements DifferentialExpressionAnalyzerService {

    /**
     * Defines the different types of analyses our linear modeling framework supports:
     * <ul>
     * <li>GENERICLM - generic linear regression (interactions are omitted, but this could change)
     * <li>OSTTEST - one sample t-test
     * <li>OWA - one-way ANOVA
     * <li>TTEST - two sample t-test
     * <li>TWA - two way ANOVA with interaction
     * <li>TWANI - two-way ANOVA with no interaction
     * </ul>
     * 
     * @author Paul
     * @version $Id: DifferentialExpressionAnalyzerServiceImpl.java,v 1.28 2013/05/10 21:39:18 paul Exp $
     */
    public enum AnalysisType {
        GENERICLM, OSTTEST /* one-sample */, OWA /* one-way ANOVA */, TTEST, TWA /* with interactions */, TWANI /*
                                                                                                                 * no
                                                                                                                 * interactions
                                                                                                                 */
    }

    public static final String FACTOR_NAME_MANGLING_DELIMITER = "__";

    private static Log log = LogFactory.getLog(DifferentialExpressionAnalyzerServiceImpl.class);

    private static final int MINIMUM_NUMBER_OF_HITS_TO_SAVE = 50;

    @Autowired
    private AnalysisSelectionAndExecutionService analysisSelectionAndExecutionService;

    @Autowired
    private AuditTrailService auditTrailService = null;

    @Autowired
    private CompositeSequenceService compositeSequenceService;

    @Autowired
    private DifferentialExpressionAnalysisService differentialExpressionAnalysisService = null;

    @Autowired
    private DifferentialExpressionResultService differentialExpressionResultService;

    @Autowired
    private ExpressionDataFileService expressionDataFileService;

    @Autowired
    private DifferentialExpressionAnalysisHelperService helperService;

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.analysis.expression.diff.DifferentialExpressionAnalyzerService#deleteOldAnalyses(ubic.gemma.model.
     * expression.experiment.ExpressionExperiment)
     */
    @Override
    public int deleteAnalyses(ExpressionExperiment expressionExperiment) {
        Collection<DifferentialExpressionAnalysis> diffAnalysis = differentialExpressionAnalysisService
                .findByInvestigation(expressionExperiment);

        int result = 0;
        if (diffAnalysis == null || diffAnalysis.isEmpty()) {
            log.debug("No differential expression analyses to delete for " + expressionExperiment.getShortName());
            return result;
        }

        for (DifferentialExpressionAnalysis de : diffAnalysis) {
            log.info("Deleting old differential expression analysis for experiment "
                    + expressionExperiment.getShortName() + ": Analysis ID=" + de.getId());
            differentialExpressionAnalysisService.delete(de);

            deleteStatistics(expressionExperiment, de);
            deleteAnalysisFiles(de);
            result++;
        }

        return result;
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.analysis.expression.diff.DifferentialExpressionAnalyzerService#deleteOldAnalysis(ubic.gemma.model.
     * expression.experiment.ExpressionExperiment,
     * ubic.gemma.model.analysis.expression.diff.DifferentialExpressionAnalysis)
     */
    @Override
    public void deleteAnalysis(ExpressionExperiment expressionExperiment,
            DifferentialExpressionAnalysis existingAnalysis) {
        log.info("Deleting old differential expression analysis for experiment "
                + expressionExperiment.getShortName() + " Analysis ID=" + existingAnalysis.getId());
        differentialExpressionAnalysisService.delete(existingAnalysis);

        deleteStatistics(expressionExperiment, existingAnalysis);
        try {
            expressionDataFileService.deleteDiffExArchiveFile(existingAnalysis);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Remove old files which will otherwise be cruft.
     * 
     * @param ee
     * @param analysis
     */
    public void deleteStatistics(ExpressionExperiment ee, DifferentialExpressionAnalysis analysis) {

        File f = prepareDirectoryForDistributions(ee);

        String histFileName = FileTools.cleanForFileName(ee.getShortName()) + ".an" + analysis.getId() + "."
                + "pvalues" + DifferentialExpressionFileUtils.PVALUE_DIST_SUFFIX;
        File oldf = new File(f, histFileName);
        if (oldf.exists() && oldf.canWrite()) {
            if (!oldf.delete()) {
                log.warn("Could not delete: " + oldf);
            }
        }

        // histFileName = FileTools.cleanForFileName( ee.getShortName() ) + ".an" + analysis.getId() + "." + "qvalues"
        // + DifferentialExpressionFileUtils.PVALUE_DIST_SUFFIX;
        // oldf = new File( f, histFileName );
        // if ( oldf.exists() && oldf.canWrite() ) {
        // if ( !oldf.delete() ) {
        // log.warn( "Could not delete: " + oldf );
        // }
        // }
        //
        // histFileName = FileTools.cleanForFileName( ee.getShortName() ) + ".an" + analysis.getId() + "." + "scores"
        // + DifferentialExpressionFileUtils.PVALUE_DIST_SUFFIX;
        // oldf = new File( f, histFileName );
        // if ( oldf.exists() && oldf.canWrite() ) {
        // if ( !oldf.delete() ) {
        // log.warn( "Could not delete: " + oldf );
        // }
        // }
    }

    /*
     * 
     */
    @Override
    public Collection<ExpressionAnalysisResultSet> extendAnalysis(ExpressionExperiment ee,
            DifferentialExpressionAnalysis toUpdate) {

        /*
         * One way to do this is redo without saving, and then copy the results over to the given result sets that
         * match. But that requires matching up old and new result sets.
         */
        differentialExpressionAnalysisService.thaw(toUpdate);
        DifferentialExpressionAnalysisConfig config = copyConfig(toUpdate, null);

        assert config.getQvalueThreshold() == null;

        Collection<DifferentialExpressionAnalysis> results = redoWithoutSave(ee, toUpdate, config);

        /*
         * Match up old and new...
         */

        extendResultSets(results, toUpdate.getResultSets());
        return toUpdate.getResultSets();

    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.analysis.expression.diff.DifferentialExpressionAnalyzerService#getAnalyses(ubic.gemma.model.expression
     * .experiment.ExpressionExperiment)
     */
    @Override
    public Collection<DifferentialExpressionAnalysis> getAnalyses(ExpressionExperiment expressionExperiment) {
        Collection<DifferentialExpressionAnalysis> expressionAnalyses = differentialExpressionAnalysisService
                .getAnalyses(expressionExperiment);
        differentialExpressionAnalysisService.thaw(expressionAnalyses);
        return expressionAnalyses;
    }

    /**
     * Made public for testing purposes only.
     * 
     * @param expressionExperiment
     * @param analysis
     * @param config
     * @return
     */
    @Override
    public DifferentialExpressionAnalysis persistAnalysis(ExpressionExperiment expressionExperiment,
            DifferentialExpressionAnalysis analysis, DifferentialExpressionAnalysisConfig config) {

        deleteOldAnalyses(expressionExperiment, analysis, config.getFactorsToInclude());
        StopWatch timer = new StopWatch();
        timer.start();
        Collection<ExpressionAnalysisResultSet> resultSets = analysis.getResultSets();

        analysis.setResultSets(new HashSet<ExpressionAnalysisResultSet>());

        // first transaction, gets us an ID
        DifferentialExpressionAnalysis persistentAnalysis = helperService.persistStub(analysis);

        // second set of transactions creates the empty resultSets.
        for (ExpressionAnalysisResultSet rs : resultSets) {
            Collection<DifferentialExpressionAnalysisResult> results = rs.getResults();

            rs.setResults(new HashSet<DifferentialExpressionAnalysisResult>());
            ExpressionAnalysisResultSet prs = helperService.create(rs);
            assert prs != null;
            for (DifferentialExpressionAnalysisResult r : results) {
                r.setResultSet(prs);
            }
            analysis.getResultSets().add(prs);
            rs.getResults().addAll(results);

            prs.setQvalueThresholdForStorage(config.getQvalueThreshold());
            addPvalueDistribution(prs);

        }

        // we do this here because now we have IDs for everything.
        expressionDataFileService.getDiffExpressionAnalysisArchiveFile(expressionExperiment, analysis, resultSets);

        for (ExpressionAnalysisResultSet rs : resultSets) {
            removeUnwantedResults(config.getQvalueThreshold(), rs.getResults());
        }

        // third transaction - add results.
        log.info("Saving results");
        helperService.addResults(persistentAnalysis, resultSets);

        // final transaction: audit.
        auditTrailService.addUpdateEvent(expressionExperiment,
                DifferentialExpressionAnalysisEvent.Factory.newInstance(),
                persistentAnalysis.getDescription() + "; analysis id=" + persistentAnalysis.getId());

        if (timer.getTime() > 5000) {
            log.info("Save results: " + timer.getTime() + "ms");
        }

        return persistentAnalysis;

    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.analysis.expression.diff.DifferentialExpressionAnalyzerService#redoAnalysis(ubic.gemma.model.expression
     * .experiment.ExpressionExperiment, ubic.gemma.model.analysis.expression.diff.DifferentialExpressionAnalysis)
     */
    @Override
    public Collection<DifferentialExpressionAnalysis> redoAnalysis(ExpressionExperiment ee,
            DifferentialExpressionAnalysis copyMe) {
        return this.redoAnalysis(ee, copyMe, DifferentialExpressionAnalysisConfig.DEFAULT_QVALUE_THRESHOLD);
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.analysis.expression.diff.DifferentialExpressionAnalyzerService#redoAnalysis(ubic.gemma.model.expression
     * .experiment.ExpressionExperiment, ubic.gemma.model.analysis.expression.diff.DifferentialExpressionAnalysis,
     * java.lang.Double)
     */
    @Override
    public Collection<DifferentialExpressionAnalysis> redoAnalysis(ExpressionExperiment ee,
            DifferentialExpressionAnalysis copyMe, Double qValueThreshold) {

        if (!differentialExpressionAnalysisService.canDelete(copyMe)) {
            throw new IllegalArgumentException(
                    "Cannot redo the analysis because it is included in a meta-analysis (or something). "
                            + "Delete the constraining entity first.");
        }

        differentialExpressionAnalysisService.thaw(copyMe);

        log.info("Will base analysis on old one: " + copyMe);
        DifferentialExpressionAnalysisConfig config = copyConfig(copyMe, qValueThreshold);

        Collection<DifferentialExpressionAnalysis> results = redoWithoutSave(ee, copyMe, config);

        /*
         * Finally, persist it.
         */
        return persistAnalyses(ee, results, config);
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.analysis.expression.diff.DifferentialExpressionAnalyzerService#runDifferentialExpressionAnalyses(ubic
     * .gemma.model.expression.experiment.ExpressionExperiment,
     * ubic.gemma.analysis.expression.diff.DifferentialExpressionAnalysisConfig)
     */
    @Override
    public Collection<DifferentialExpressionAnalysis> runDifferentialExpressionAnalyses(
            ExpressionExperiment expressionExperiment, DifferentialExpressionAnalysisConfig config) {
        try {
            Collection<DifferentialExpressionAnalysis> diffExpressionAnalyses = analysisSelectionAndExecutionService
                    .analyze(expressionExperiment, config);

            diffExpressionAnalyses = persistAnalyses(expressionExperiment, diffExpressionAnalyses, config);

            return diffExpressionAnalyses;
        } catch (Exception e) {
            auditTrailService.addUpdateEvent(expressionExperiment,
                    FailedDifferentialExpressionAnalysisEvent.Factory.newInstance(),
                    ExceptionUtils.getFullStackTrace(e));
            throw new RuntimeException(e);
        }
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.analysis.expression.diff.DifferentialExpressionAnalyzerService#updateScoreDistributionFiles(ubic.gemma
     * .model.expression.experiment.ExpressionExperiment)
     */
    @Override
    public void updateScoreDistributionFiles(ExpressionExperiment ee) {

        Collection<DifferentialExpressionAnalysis> analyses = this.getAnalyses(ee);

        if (analyses.size() == 0) {
            log.info("No  analyses for experiment " + ee.getShortName()
                    + ".  The differential expression analysis may not have been run on this experiment yet.");
            return;
        }

        for (DifferentialExpressionAnalysis analysis : analyses) {
            writeDistributions(ee, this.differentialExpressionAnalysisService.thawFully(analysis));
        }

    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.analysis.expression.diff.DifferentialExpressionAnalyzerService#updateSummaries(ubic.gemma.model.analysis
     * .expression.diff.DifferentialExpressionAnalysis)
     */
    @Override
    public void updateSummaries(DifferentialExpressionAnalysis analysis) {

        DiffExAnalyzer lma = analysisSelectionAndExecutionService.getAnalyzer();
        Map<CompositeSequence, Collection<Gene>> probe2GeneMap = new HashMap<CompositeSequence, Collection<Gene>>();

        log.info("Reading the analysis ...");
        differentialExpressionAnalysisService.thaw(analysis);
        for (ExpressionAnalysisResultSet resultSet : analysis.getResultSets()) {
            differentialExpressionResultService.thaw(resultSet);
            List<DifferentialExpressionAnalysisResult> results = new ArrayList<DifferentialExpressionAnalysisResult>(
                    resultSet.getResults());
            for (DifferentialExpressionAnalysisResult d : results) {
                CompositeSequence probe = d.getProbe();
                probe2GeneMap.put(probe, new HashSet<Gene>());
            }
        }

        log.info("Retrieving gene-element information ...");
        probe2GeneMap = compositeSequenceService.getGenes(probe2GeneMap.keySet());

        if (probe2GeneMap.isEmpty())
            throw new IllegalStateException("The elements do not map to genes");

        int i = 1;
        for (ExpressionAnalysisResultSet resultSet : analysis.getResultSets()) {
            log.info("Updating stats for " + i + "/" + analysis.getResultSets().size()
                    + " resultsets for the analysis");
            List<DifferentialExpressionAnalysisResult> results = new ArrayList<DifferentialExpressionAnalysisResult>(
                    resultSet.getResults());
            Collection<HitListSize> hitlists = lma.computeHitListSizes(results, probe2GeneMap);
            resultSet.getHitListSizes().clear();
            resultSet.getHitListSizes().addAll(hitlists);
            resultSet.setNumberOfGenesTested(lma.getNumberOfGenesTested(results, probe2GeneMap));
            resultSet.setNumberOfProbesTested(results.size());
            differentialExpressionResultService.update(resultSet);
            i++;
        }

        log.info("Writing distributions");
        writeDistributions(analysis.getExperimentAnalyzed(), analysis);
        log.info("Done");

    }

    /**
     * @param expressionExperiment
     * @param newAnalysis
     * @param factors
     * @return
     */
    protected int deleteOldAnalyses(ExpressionExperiment expressionExperiment,
            DifferentialExpressionAnalysis newAnalysis, Collection<ExperimentalFactor> factors) {
        Collection<DifferentialExpressionAnalysis> diffAnalyses = differentialExpressionAnalysisService
                .findByInvestigation(expressionExperiment);
        int numDeleted = 0;
        if (diffAnalyses == null || diffAnalyses.isEmpty()) {
            log.info("No differential expression analyses to delete for " + expressionExperiment.getShortName());
            return numDeleted;
        }

        this.differentialExpressionAnalysisService.thaw(diffAnalyses);

        for (DifferentialExpressionAnalysis existingAnalysis : diffAnalyses) {

            Collection<ExperimentalFactor> factorsInAnalysis = new HashSet<ExperimentalFactor>();

            for (ExpressionAnalysisResultSet resultSet : existingAnalysis.getResultSets()) {
                factorsInAnalysis.addAll(resultSet.getExperimentalFactors());
            }

            FactorValue subsetFactorValueForExisting = existingAnalysis.getSubsetFactorValue();

            /*
             * Match if: factors are the same, and if this is a subset, it's the same subset factorvalue.
             */
            if (factorsInAnalysis.size() == factors.size() && factorsInAnalysis.containsAll(factors)
                    && (subsetFactorValueForExisting == null
                            || subsetFactorValueForExisting.equals(newAnalysis.getSubsetFactorValue()))) {

                log.info("Deleting analysis with ID=" + existingAnalysis.getId());
                deleteAnalysis(expressionExperiment, existingAnalysis);

                numDeleted++;
            }
        }

        if (numDeleted == 0) {
            log.info("None of the other existing analyses were eligible for deletion");
        }
        return numDeleted;
    }

    /**
     * @param resultSet
     * @return
     */
    private Histogram addPvalueDistribution(ExpressionAnalysisResultSet resultSet) {
        Histogram pvalHist = new Histogram("", 100, 0.0, 1.0);

        for (DifferentialExpressionAnalysisResult result : resultSet.getResults()) {

            Double pvalue = result.getPvalue();
            if (pvalue != null)
                pvalHist.fill(pvalue);
        }

        PvalueDistribution pvd = PvalueDistribution.Factory.newInstance();
        pvd.setNumBins(100);
        ByteArrayConverter bac = new ByteArrayConverter();
        pvd.setBinCounts(bac.doubleArrayToBytes(pvalHist.getArray()));
        resultSet.setPvalueDistribution(pvd); // do not save yet.
        return pvalHist;
    }

    /**
     * @param temprs
     * @param oldrs
     * @return
     */
    private boolean configsAreEqual(ExpressionAnalysisResultSet temprs, ExpressionAnalysisResultSet oldrs) {
        return temprs.getBaselineGroup().equals(oldrs.getBaselineGroup())
                && temprs.getExperimentalFactors().size() == oldrs.getExperimentalFactors().size()
                && temprs.getExperimentalFactors().containsAll(oldrs.getExperimentalFactors());
    }

    /**
     * @param copyMe
     * @param qValueThreshold
     * @return
     */
    private DifferentialExpressionAnalysisConfig copyConfig(DifferentialExpressionAnalysis copyMe,
            Double qValueThreshold) {
        DifferentialExpressionAnalysisConfig config = new DifferentialExpressionAnalysisConfig();
        config.setQvalueThreshold(qValueThreshold); // this might be the same as the original, or not.

        if (copyMe.getSubsetFactorValue() != null) {
            config.setSubsetFactor(copyMe.getSubsetFactorValue().getExperimentalFactor());
        }

        Collection<ExpressionAnalysisResultSet> resultSets = copyMe.getResultSets();
        Collection<ExperimentalFactor> factorsFromOldExp = new HashSet<ExperimentalFactor>();
        for (ExpressionAnalysisResultSet rs : resultSets) {
            Collection<ExperimentalFactor> oldfactors = rs.getExperimentalFactors();
            factorsFromOldExp.addAll(oldfactors);

            /*
             * If we included the interaction before, include it again.
             */
            if (oldfactors.size() == 2) {
                log.info("Including interaction term");
                config.getInteractionsToInclude().add(oldfactors);
            }

        }

        if (factorsFromOldExp.isEmpty()) {
            throw new IllegalStateException("Old analysis didn't have any factors");
        }

        config.getFactorsToInclude().addAll(factorsFromOldExp);
        return config;
    }

    /**
     * Delete any flat files that might have been generated.
     * 
     * @param expressionExperiment
     * @param de
     */
    private void deleteAnalysisFiles(DifferentialExpressionAnalysis analysis) {
        try {
            expressionDataFileService.deleteDiffExArchiveFile(analysis);
        } catch (IOException e) {
            log.error("Error during deletion of old files for analyses to be deleted: " + e.getMessage());
        }
    }

    /**
     * @param oldrs
     * @param temprs
     */
    private void extendResultSet(ExpressionAnalysisResultSet oldrs, ExpressionAnalysisResultSet temprs) {
        assert oldrs.getId() != null;

        /*
         * Copy the results over.
         */
        Map<CompositeSequence, DifferentialExpressionAnalysisResult> p2der = new HashMap<CompositeSequence, DifferentialExpressionAnalysisResult>();

        for (DifferentialExpressionAnalysisResult der : oldrs.getResults()) {
            p2der.put(der.getProbe(), der);
        }

        Collection<DifferentialExpressionAnalysisResult> toAdd = new ArrayList<DifferentialExpressionAnalysisResult>();
        for (DifferentialExpressionAnalysisResult newr : temprs.getResults()) {
            if (!p2der.containsKey(newr.getProbe())) {
                toAdd.add(newr);

            }
            newr.setResultSet(oldrs);
        }

        if (toAdd.isEmpty()) {
            log.warn("Somewhat surprisingly, no new results were added");
        } else {
            log.info(toAdd.size() + " transient results added to the old analysis result set: " + oldrs.getId());
        }

        boolean added = oldrs.getResults().addAll(toAdd);
        assert added;

        assert oldrs.getResults().size() >= toAdd.size();
    }

    /**
     * @param results
     * @param toUpdateResultSets
     */
    private void extendResultSets(Collection<DifferentialExpressionAnalysis> results,
            Collection<ExpressionAnalysisResultSet> toUpdateResultSets) {
        for (DifferentialExpressionAnalysis a : results) {
            boolean found = false;
            // we should find a matching version for each resultset.

            for (ExpressionAnalysisResultSet oldrs : toUpdateResultSets) {

                assert oldrs.getId() != null;
                oldrs = this.differentialExpressionResultService.thaw(oldrs);

                for (ExpressionAnalysisResultSet temprs : a.getResultSets()) {
                    /*
                     * Compare the config
                     */
                    if (configsAreEqual(temprs, oldrs)) {
                        found = true;

                        extendResultSet(oldrs, temprs);

                        break;
                    }
                }

                if (!found)
                    throw new IllegalStateException("Failed to find a matching existing result set for " + oldrs);
            }

        }
    }

    /**
     * @param factorNames
     * @param histograms
     * @param dists
     */
    private void fillDists(List<String> factorNames, List<Histogram> histograms,
            DoubleMatrix<String, String> dists) {

        assert !histograms.isEmpty();

        int i = 0;
        for (Histogram h : histograms) {
            if (i == 0) {
                dists.setRowNames(Arrays.asList(h.getBinEdgesStrings()));
                dists.setColumnNames(factorNames);
            }

            double[] binHeights = h.getArray();
            for (int j = 0; j < binHeights.length; j++) {
                dists.set(j, i, binHeights[j]);

            }
            i++;
        }
    }

    /**
     * @param expressionExperiment
     * @param diffExpressionAnalyses
     * @return
     */
    private Collection<DifferentialExpressionAnalysis> persistAnalyses(ExpressionExperiment expressionExperiment,
            Collection<DifferentialExpressionAnalysis> diffExpressionAnalyses,
            DifferentialExpressionAnalysisConfig config) {

        Collection<DifferentialExpressionAnalysis> results = new HashSet<DifferentialExpressionAnalysis>();
        for (DifferentialExpressionAnalysis analysis : diffExpressionAnalyses) {
            DifferentialExpressionAnalysis persistentAnalysis = persistAnalysis(expressionExperiment, analysis,
                    config);
            results.add(persistentAnalysis);
        }
        return results;
    }

    /**
     * @param expressionExperiment
     * @return the directory where the files should be written.
     */
    private File prepareDirectoryForDistributions(BioAssaySet expressionExperiment) {
        if (expressionExperiment instanceof ExpressionExperimentSubSet) {
            ExpressionExperimentSubSet ss = (ExpressionExperimentSubSet) expressionExperiment;
            ExpressionExperiment source = ss.getSourceExperiment();

            File dir = DifferentialExpressionFileUtils.getBaseDifferentialDirectory(
                    FileTools.cleanForFileName(source.getShortName()) + ".Subset" + ss.getId());
            FileTools.createDir(dir.toString());
            return dir;
        } else if (expressionExperiment instanceof ExpressionExperiment) {
            File dir = DifferentialExpressionFileUtils.getBaseDifferentialDirectory(
                    FileTools.cleanForFileName(((ExpressionExperiment) expressionExperiment).getShortName()));
            FileTools.createDir(dir.toString());
            return dir;
        } else {
            throw new IllegalStateException(
                    "Cannot handle bioassay sets of type=" + expressionExperiment.getClass());
        }

    }

    /**
     * @param ee
     * @param copyMe
     * @param config
     * @return
     */
    private Collection<DifferentialExpressionAnalysis> redoWithoutSave(ExpressionExperiment ee,
            DifferentialExpressionAnalysis copyMe, DifferentialExpressionAnalysisConfig config) {

        Collection<DifferentialExpressionAnalysis> results = new HashSet<DifferentialExpressionAnalysis>();

        BioAssaySet experimentAnalyzed = copyMe.getExperimentAnalyzed();
        assert experimentAnalyzed != null;
        if (experimentAnalyzed.equals(ee)) {
            results = analysisSelectionAndExecutionService.analyze(ee, config);
        } else if (experimentAnalyzed instanceof ExpressionExperimentSubSet
                && ((ExpressionExperimentSubSet) experimentAnalyzed).getSourceExperiment().equals(ee)) {
            DifferentialExpressionAnalysis subsetAnalysis = analysisSelectionAndExecutionService
                    .analyze((ExpressionExperimentSubSet) experimentAnalyzed, config);

            results.add(subsetAnalysis);
        } else {
            throw new IllegalStateException(
                    "Cannot redo an analysis for one experiment if the analysis is for another (" + ee
                            + " is the proposed target, but analysis is from " + experimentAnalyzed);
        }

        return results;
    }

    /**
     * @param qvalueThreshold
     * @param results
     */
    private void removeUnwantedResults(Double qvalueThreshold,
            Collection<DifferentialExpressionAnalysisResult> results) {

        if (qvalueThreshold == null) {
            log.info("No qvalue threshold was set, retaining all " + results.size() + " results");
            return;
        }
        Double workingThreshold = qvalueThreshold;

        int i = trimAboveThreshold(results, workingThreshold);

        /*
         * We want to have a minimum number so we always have something to look at.
         */
        if (i < MINIMUM_NUMBER_OF_HITS_TO_SAVE && results.size() > MINIMUM_NUMBER_OF_HITS_TO_SAVE) {
            List<DifferentialExpressionAnalysisResult> rl = new ArrayList<DifferentialExpressionAnalysisResult>(
                    results);
            Collections.sort(rl, new Comparator<DifferentialExpressionAnalysisResult>() {
                @Override
                public int compare(DifferentialExpressionAnalysisResult o1,
                        DifferentialExpressionAnalysisResult o2) {
                    return o1.getPvalue().compareTo(o2.getPvalue());
                }
            });

            int indexOfLast = Math.min(results.size(), MINIMUM_NUMBER_OF_HITS_TO_SAVE) - 1;
            workingThreshold = rl.get(indexOfLast).getCorrectedPvalue();

            if (workingThreshold == null || Double.isNaN(workingThreshold)) {
                throw new IllegalStateException("Threshold was null or NaN");
            }
            i = trimAboveThreshold(results, workingThreshold);
        }

        log.info("Retained " + i + " results meeting qvalue of " + workingThreshold);

        /*
         * If we set a maximum value, it has to be some fraction of the total genes, at which point the results should
         * be discarded as too non-specific. We can't throw an exception, as there might be other factors in the same
         * analysis that are okay.
         */

    }

    /**
     * Print the distributions to a file.
     * 
     * @param extraSuffix eg qvalue, pvalue, score
     * @param histograms each column is a histogram
     * @param expressionExperiment
     * @param resulstsets in the same order as columns in the
     */
    private void saveDistributionMatrixToFile(DoubleMatrix<String, String> histograms,
            BioAssaySet expressionExperiment, List<ExpressionAnalysisResultSet> resultSetList) {

        Long analysisId = resultSetList.iterator().next().getAnalysis().getId();

        assert analysisId != null;

        File f = prepareDirectoryForDistributions(expressionExperiment);

        String shortName = null;

        if (expressionExperiment instanceof ExpressionExperimentSubSet) {
            ExpressionExperimentSubSet ss = (ExpressionExperimentSubSet) expressionExperiment;
            ExpressionExperiment source = ss.getSourceExperiment();
            shortName = source.getShortName();
        } else if (expressionExperiment instanceof ExpressionExperiment) {
            shortName = ((ExpressionExperiment) expressionExperiment).getShortName();
        } else {
            throw new IllegalStateException(
                    "Cannot handle bioassay sets of type=" + expressionExperiment.getClass());
        }

        String histFileName = FileTools.cleanForFileName(shortName) + ".an" + analysisId + "." + "pvalues"
                + DifferentialExpressionFileUtils.PVALUE_DIST_SUFFIX;

        File outputFile = new File(f, histFileName);

        // log.info( outputFile );
        try {
            FileWriter out = new FileWriter(outputFile, true /*
                                                              * clobber, but usually won't since file names are per
                                                              * analyssi
                                                              */);
            out.write("# Gemma: differential expression statistics - " + "pvalues" + "\n");
            out.write("# Generated=" + (new Date()) + "\n");
            out.write(ExpressionDataFileService.DISCLAIMER);
            out.write("# exp=" + expressionExperiment.getId() + " " + shortName + " "
                    + expressionExperiment.getName() + " \n");

            for (ExpressionAnalysisResultSet resultSet : resultSetList) {
                BioAssaySet analyzedSet = resultSet.getAnalysis().getExperimentAnalyzed();

                if (analyzedSet instanceof ExpressionExperimentSubSet) {
                    out.write("# SubSet id=" + analyzedSet.getId() + " " + analyzedSet.getName() + " "
                            + analyzedSet.getDescription() + "\n");
                }

                Long resultSetId = resultSet.getId();
                out.write("# ResultSet id=" + resultSetId + ", analysisId=" + analysisId + "\n");

            }
            MatrixWriter<String, String> writer = new MatrixWriter<String, String>(out);
            writer.writeMatrix(histograms, true);

        } catch (IOException e) {
            throw new RuntimeException(e);
        }

    }

    /**
     * @param results
     * @param qvalueThreshold
     * @return
     */
    private int trimAboveThreshold(Collection<DifferentialExpressionAnalysisResult> results,
            Double qvalueThreshold) {
        int i = 0;
        for (Iterator<DifferentialExpressionAnalysisResult> it = results.iterator(); it.hasNext();) {
            DifferentialExpressionAnalysisResult r = it.next();

            if (r.getPvalue() == null || Double.isNaN(r.getPvalue()) || r.getCorrectedPvalue() == null
                    || r.getCorrectedPvalue() >= qvalueThreshold) {
                it.remove();
            } else {
                i++;
            }

        }
        return i;
    }

    /**
     * Print the p-value and score distributions. Note that if there are multiple analyses for the experiment, each one
     * will get a set of files. The analysis must be 'thawed' already.
     * 
     * @param expressionExperiment
     * @param diffExpressionAnalysis - could be on a subset of the experiment.
     */
    private void writeDistributions(BioAssaySet expressionExperiment,
            DifferentialExpressionAnalysis diffExpressionAnalysis) {

        assert diffExpressionAnalysis.getId() != null;

        /*
         * write histograms
         * 
         * Of 1) pvalues, // (following not used now) 2) scores, 3) qvalues
         * 
         * Put all pvalues in one file etc so we don't get 9 files for a 2x anova with interactions.
         */
        StopWatch timer = new StopWatch();
        timer.start();
        List<Histogram> pvalueHistograms = new ArrayList<Histogram>();

        List<ExpressionAnalysisResultSet> resultSetList = new ArrayList<ExpressionAnalysisResultSet>();
        resultSetList.addAll(diffExpressionAnalysis.getResultSets());

        List<String> factorNames = new ArrayList<String>();

        for (ExpressionAnalysisResultSet resultSet : resultSetList) {

            String factorName = "";

            // these will be headings on the
            for (ExperimentalFactor factor : resultSet.getExperimentalFactors()) {
                // Make a unique column heading.
                factorName = factorName + (factorName.equals("") ? "" : ":") + factor.getName()
                        + FACTOR_NAME_MANGLING_DELIMITER + factor.getId();
            }
            factorNames.add(factorName);

            Histogram pvalHist = addPvalueDistribution(resultSet);
            this.differentialExpressionResultService.update(resultSet);

            pvalueHistograms.add(pvalHist);

        }

        DoubleMatrix<String, String> pvalueDists = new DenseDoubleMatrix<String, String>(100, resultSetList.size());

        fillDists(factorNames, pvalueHistograms, pvalueDists);

        saveDistributionMatrixToFile(pvalueDists, expressionExperiment, resultSetList);

        if (timer.getTime() > 5000) {
            log.info("Done writing distributions: " + timer.getTime() + "ms");
        }
    }

}