ubic.gemma.persistence.service.expression.bioAssayData.ProcessedExpressionDataVectorDaoImpl.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.persistence.service.expression.bioAssayData.ProcessedExpressionDataVectorDaoImpl.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2008 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.persistence.service.expression.bioAssayData;

import org.apache.commons.lang3.time.StopWatch;
import org.hibernate.*;
import org.hibernate.type.LongType;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Repository;
import ubic.basecode.dataStructure.matrix.DenseDoubleMatrix;
import ubic.basecode.dataStructure.matrix.DoubleMatrix;
import ubic.basecode.util.BatchIterator;
import ubic.gemma.core.analysis.preprocess.normalize.QuantileNormalizer;
import ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix;
import ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrixUtil;
import ubic.gemma.model.common.quantitationtype.QuantitationType;
import ubic.gemma.model.common.quantitationtype.QuantitationTypeImpl;
import ubic.gemma.model.common.quantitationtype.ScaleType;
import ubic.gemma.model.common.quantitationtype.StandardQuantitationType;
import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
import ubic.gemma.model.expression.arrayDesign.TechnologyType;
import ubic.gemma.model.expression.bioAssay.BioAssay;
import ubic.gemma.model.expression.bioAssay.BioAssayValueObject;
import ubic.gemma.model.expression.bioAssayData.*;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.expression.designElement.CompositeSequenceValueObject;
import ubic.gemma.model.expression.experiment.BioAssaySet;
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
import ubic.gemma.model.expression.experiment.ExpressionExperimentSubSet;
import ubic.gemma.model.expression.experiment.ExpressionExperimentValueObject;
import ubic.gemma.model.genome.Gene;
import ubic.gemma.persistence.service.AbstractDao;
import ubic.gemma.persistence.util.CommonQueries;
import ubic.gemma.persistence.util.EntityUtils;

import java.util.*;

/**
 * @author Paul
 */
@Repository
public class ProcessedExpressionDataVectorDaoImpl extends
        DesignElementDataVectorDaoImpl<ProcessedExpressionDataVector> implements ProcessedExpressionDataVectorDao {

    /**
     * Don't attempt to renormalize data that is smaller than this. This avoids unnecessary normalization in tests, and
     * in data sets where normalization is more likely to harm than good.
     */
    private static final int MIN_SIZE_FOR_RENORMALIZATION = 4000;

    // Per-experiment/per-gene cache of processed vectors; cleared whenever vectors are (re)created or removed.
    private final ProcessedDataVectorCache processedDataVectorCache;

    /**
     * @param sessionFactory           Hibernate session factory (injected)
     * @param processedDataVectorCache cache of processed vector value objects, invalidated when vectors change
     */
    @Autowired
    public ProcessedExpressionDataVectorDaoImpl(SessionFactory sessionFactory,
            ProcessedDataVectorCache processedDataVectorCache) {
        super(ProcessedExpressionDataVector.class, sessionFactory);
        this.processedDataVectorCache = processedDataVectorCache;
    }

    /**
     * Clear the entire processed-data-vector cache (all experiments).
     */
    @Override
    public void clearCache() {
        processedDataVectorCache.clearCache();
    }

    /**
     * Compute and persist the processed data vectors for an experiment from its preferred raw vectors.
     * <p>
     * Steps: remove any existing processed vectors and their quantitation types, mask missing values (two-channel
     * platforms only), ensure the data are on a log2 scale, quantile-normalize when suitable, then build the new
     * vectors, attach them to the experiment along with the masked quantitation type, and update the vector count.
     * The per-experiment cache entry is cleared at the end.
     *
     * @param ee the experiment; only its id is used, the entity is re-fetched in the current session
     * @return the updated, session-attached experiment
     * @throws IllegalStateException    if ee is null or a duplicated design element is encountered
     * @throws IllegalArgumentException if the experiment has no preferred data vectors
     */
    @Override
    public ExpressionExperiment createProcessedDataVectors(ExpressionExperiment ee) {
        if (ee == null) {
            throw new IllegalStateException("ExpressionExperiment cannot be null");
        }

        /*
         * Initialize objects, get some parameters, locate the vectors we're going to use.
         */
        ExpressionExperiment expressionExperiment = (ExpressionExperiment) this.getSessionFactory()
                .getCurrentSession().get(ExpressionExperiment.class, ee.getId());

        assert expressionExperiment != null;

        this.removeProcessedDataVectors(expressionExperiment);

        Hibernate.initialize(expressionExperiment);
        Hibernate.initialize(expressionExperiment.getQuantitationTypes());
        Hibernate.initialize(expressionExperiment.getProcessedExpressionDataVectors());

        expressionExperiment.getProcessedExpressionDataVectors().clear();

        AbstractDao.log.info("Computing processed expression vectors for " + expressionExperiment);

        // Missing-value masking only applies to two-channel platforms.
        boolean isTwoChannel = this.isTwoChannel(expressionExperiment);

        Collection<RawExpressionDataVector> missingValueVectors = new HashSet<>();
        if (isTwoChannel) {
            missingValueVectors = this.getMissingValueVectors(expressionExperiment);
        }

        Collection<RawExpressionDataVector> rawPreferredDataVectors = this
                .getPreferredDataVectors(expressionExperiment);
        if (rawPreferredDataVectors.isEmpty()) {
            throw new IllegalArgumentException("No preferred data vectors for " + expressionExperiment);
        }

        removeDuplicateElements(rawPreferredDataVectors);

        // The masked QT is derived from the QT of an arbitrary preferred vector (all share the same QT).
        RawExpressionDataVector preferredDataVectorExemplar = rawPreferredDataVectors.iterator().next();
        QuantitationType preferredMaskedDataQuantitationType = this
                .getPreferredMaskedDataQuantitationType(preferredDataVectorExemplar.getQuantitationType());

        /* log-transform if necessary */
        Collection<RawExpressionDataVector> preferredDataVectors = ensureLog2Scale(rawPreferredDataVectors,
                preferredMaskedDataQuantitationType);

        Map<CompositeSequence, DoubleVectorValueObject> maskedVectorObjects = this
                .maskAndUnpack(preferredDataVectors, missingValueVectors);

        /*
         * Note that we used to not normalize count data, but we've removed this restriction; and in any case we have
         * moved to using non-count summaries for the primary data type.
         */
        if (preferredMaskedDataQuantitationType.getType().equals(StandardQuantitationType.COUNT)) {
            /*
             * Backfill target
             */
            AbstractDao.log.warn("Preferred data are counts; please convert to log2cpm");
        }

        // Skip normalization for ratio data and for small data sets (see MIN_SIZE_FOR_RENORMALIZATION).
        if (!preferredMaskedDataQuantitationType.getIsRatio()
                && maskedVectorObjects.size() > ProcessedExpressionDataVectorDaoImpl.MIN_SIZE_FOR_RENORMALIZATION) {
            AbstractDao.log.info("Normalizing the data");
            this.renormalize(maskedVectorObjects);
        } else {
            AbstractDao.log.info("Normalization skipped for this data set (not suitable)");
        }

        /*
         * Done with processing, now build the vectors and persist; Do a sanity check that we don't have more than we
         * should
         */
        int i = 0;
        Collection<CompositeSequence> seenDes = new HashSet<>();
        for (CompositeSequence cs : maskedVectorObjects.keySet()) {

            DoubleVectorValueObject dvvo = maskedVectorObjects.get(cs);

            if (seenDes.contains(cs)) {
                // defensive programming, this happens.
                throw new IllegalStateException("Duplicated design element: " + cs
                        + "; make sure the experiment has only one 'preferred' quantitation type. "
                        + "Perhaps you need to run vector merging following an array design switch?");
            }

            // NOTE(review): the detached 'ee' parameter is passed here rather than the session-attached
            // 'expressionExperiment' — confirm this is intentional.
            ProcessedExpressionDataVector vec = (ProcessedExpressionDataVector) dvvo.toDesignElementDataVector(ee,
                    cs, preferredMaskedDataQuantitationType);

            expressionExperiment.getProcessedExpressionDataVectors().add(vec);
            seenDes.add(cs);
            if (++i % 5000 == 0) {
                AbstractDao.log.info(i + " vectors built");
            }
        }

        AbstractDao.log.info("Persisting " + expressionExperiment.getProcessedExpressionDataVectors().size()
                + " processed data vectors");

        expressionExperiment.getQuantitationTypes().add(preferredMaskedDataQuantitationType);
        expressionExperiment
                .setNumberOfDataVectors(expressionExperiment.getProcessedExpressionDataVectors().size());

        this.getSessionFactory().getCurrentSession().update(expressionExperiment);
        assert expressionExperiment.getNumberOfDataVectors() != null;

        // The old cached vectors are now stale; drop them.
        this.processedDataVectorCache.clearCache(expressionExperiment.getId());

        return expressionExperiment;

    }

    /**
     * Get all processed data arrays for an experiment; delegates with a limit of -1
     * (presumably meaning 'no limit' — see the int-limit overload; TODO confirm).
     */
    @Override
    public Collection<DoubleVectorValueObject> getProcessedDataArrays(BioAssaySet expressionExperiment) {
        return this.getProcessedDataArrays(expressionExperiment, -1);
    }

    /**
     * Get processed data arrays for the given genes in a single experiment; delegates to the
     * multi-experiment variant.
     */
    @Override
    public Collection<DoubleVectorValueObject> getProcessedDataArrays(BioAssaySet expressionExperiment,
            Collection<Long> genes) {
        // Wrap the single experiment in a mutable set, as the handler expects a collection.
        Collection<BioAssaySet> expressionExperiments = new HashSet<>(
                Collections.singleton(expressionExperiment));
        return this.handleGetProcessedExpressionDataArrays(expressionExperiments, genes);
    }

    /**
     * Get processed data arrays for one experiment (or subset), up to {@code limit} vectors.
     *
     * @param ee    the experiment or subset; subsets are resolved via getExperiment
     * @param limit maximum number of vectors to fetch (presumably <= 0 means no limit — TODO confirm against
     *              getProcessedVectors)
     * @return vector value objects; if the data are 'ragged' (multiple bioassay dimensions), vectors are unpacked
     *         against the longest dimension
     */
    @Override
    public Collection<DoubleVectorValueObject> getProcessedDataArrays(BioAssaySet ee, int limit) {

        Collection<ProcessedExpressionDataVector> pedvs = this.getProcessedVectors(this.getExperiment(ee), limit);

        if (pedvs.isEmpty()) {
            AbstractDao.log.warn("No processed vectors for experiment " + ee);
            return new HashSet<>();
        }

        Collection<Long> probes = new ArrayList<>();
        for (ProcessedExpressionDataVector pedv : pedvs) {
            probes.add(pedv.getDesignElement().getId());
        }

        // Defensive: at this point probes has one entry per vector, so this branch should be unreachable.
        if (probes.isEmpty()) {
            return this.unpack(pedvs).values();
        }

        Map<Long, Collection<Long>> cs2gene = CommonQueries.getCs2GeneMapForProbes(probes,
                this.getSessionFactory().getCurrentSession());

        Collection<BioAssayDimension> bioAssayDimensions = this.getBioAssayDimensions(ee);

        // Single dimension: straightforward unpack, no alignment issues possible.
        if (bioAssayDimensions.size() == 1) {
            return this.unpack(pedvs, cs2gene).values();
        }

        /*
         * deal with 'misalignment problem'
         */

        BioAssayDimension longestBad = this.checkRagged(bioAssayDimensions);

        if (longestBad != null) {
            return this.unpack(pedvs, cs2gene, longestBad);
        }
        return this.unpack(pedvs, cs2gene).values();
    }

    /**
     * Get processed data arrays for the given genes across several experiments/subsets.
     */
    @Override
    public Collection<DoubleVectorValueObject> getProcessedDataArrays(
            Collection<? extends BioAssaySet> expressionExperiments, Collection<Long> genes) {
        return this.handleGetProcessedExpressionDataArrays(expressionExperiments, genes);
    }

    /**
     * Get processed data arrays restricted to particular probes, across several experiments/subsets.
     */
    @Override
    public Collection<DoubleVectorValueObject> getProcessedDataArraysByProbe(Collection<? extends BioAssaySet> ees,
            Collection<CompositeSequence> probes) {
        // Nothing requested: skip the query entirely.
        if (probes.isEmpty()) {
            return new HashSet<>();
        }
        // The id-based variant does the actual work.
        return this.getProcessedDataArraysByProbeIds(ees, EntityUtils.getIds(probes));
    }

    /**
     * Get processed data arrays for particular probe ids in a single experiment/subset.
     */
    @Override
    public Collection<DoubleVectorValueObject> getProcessedDataArraysByProbeIds(BioAssaySet ee,
            Collection<Long> probes) {
        return this.getProcessedDataArraysByProbeIds(Collections.singleton(ee), probes);
    }

    /**
     * Fetch all processed vectors for the given experiment, as entities (no masking/unpacking).
     */
    @Override
    public Collection<ProcessedExpressionDataVector> getProcessedVectors(ExpressionExperiment ee) {
        //language=HQL
        final String queryString = " from ProcessedExpressionDataVector dedv where dedv.expressionExperiment.id = :ee";

        //noinspection unchecked
        return this.getSessionFactory().getCurrentSession().createQuery(queryString).setParameter("ee", ee.getId())
                .list();
    }

    /**
     * Get expression ranks (by mean or max, per {@code method}) for the given genes across several experiments,
     * organized per experiment and per gene. Probes with a null stored rank are reported as NaN.
     *
     * @param expressionExperiments experiments to query
     * @param genes                 genes of interest
     * @param method                which rank (mean or max) to collect
     * @return map of experiment -> (gene -> ranks from all probes mapping to that gene); empty if no probes are
     *         found for the genes
     */
    @Override
    public Map<ExpressionExperiment, Map<Gene, Collection<Double>>> getRanks(
            Collection<ExpressionExperiment> expressionExperiments, Collection<Gene> genes, RankMethod method) {

        Collection<ArrayDesign> arrayDesigns = CommonQueries.getArrayDesignsUsed(
                EntityUtils.getIds(expressionExperiments), this.getSessionFactory().getCurrentSession()).keySet();

        // this could be further improved by getting probes specific to experiments in batches.
        Map<CompositeSequence, Collection<Gene>> cs2gene = CommonQueries.getCs2GeneMap(genes, arrayDesigns,
                this.getSessionFactory().getCurrentSession());

        if (cs2gene.isEmpty()) {
            AbstractDao.log.warn("No composite sequences found for genes");
            return new HashMap<>();
        }
        Map<ExpressionExperiment, Map<Gene, Collection<Double>>> result = new HashMap<>();

        // Batch the probes to keep the 'in' clause a manageable size.
        BatchIterator<CompositeSequence> batchIterator = new BatchIterator<>(cs2gene.keySet(), 500);

        for (Collection<CompositeSequence> batch : batchIterator) {

            //language=HQL
            final String queryString = "select distinct dedv.expressionExperiment, dedv.designElement, dedv.rankByMean, "
                    + "dedv.rankByMax from ProcessedExpressionDataVector dedv "
                    + " where dedv.designElement in ( :cs ) and dedv.expressionExperiment in (:ees) ";

            // FIX: ':cs' is bound to a collection for an 'in' clause, so setParameterList must be used
            // (setParameter does not expand collection parameters; cf. the single-experiment getRanks).
            List qr = this.getSessionFactory().getCurrentSession().createQuery(queryString)
                    .setParameterList("cs", batch).setParameterList("ees", expressionExperiments).list();

            for (Object o : qr) {
                Object[] oa = (Object[]) o;
                ExpressionExperiment e = (ExpressionExperiment) oa[0];
                CompositeSequence d = (CompositeSequence) oa[1];
                // Null stored ranks are reported as NaN.
                Double rMean = oa[2] == null ? Double.NaN : (Double) oa[2];
                Double rMax = oa[3] == null ? Double.NaN : (Double) oa[3];

                Map<Gene, Collection<Double>> rMap = result.get(e);
                if (rMap == null) {
                    rMap = new HashMap<>();
                    result.put(e, rMap);
                }

                Collection<Gene> genes4probe = cs2gene.get(d);

                this.addToGene(method, rMap, rMean, rMax, genes4probe);
            }
        }
        return result;
    }

    /**
     * Get expression ranks (by mean or max, per {@code method}) for the given genes in a single experiment.
     * Probes with a null stored rank are reported as NaN.
     *
     * @param expressionExperiment experiment to query
     * @param genes                genes of interest
     * @param method               which rank (mean or max) to collect
     * @return map of gene -> ranks from all probes mapping to that gene; empty if no probes are found
     */
    @Override
    public Map<Gene, Collection<Double>> getRanks(ExpressionExperiment expressionExperiment, Collection<Gene> genes,
            RankMethod method) {
        Map<CompositeSequence, Collection<Gene>> cs2gene = CommonQueries.getCs2GeneMap(genes,
                this.getSessionFactory().getCurrentSession());
        // idiom: isEmpty() instead of keySet().size() == 0
        if (cs2gene.isEmpty()) {
            AbstractDao.log.warn("No composite sequences found for genes");
            return new HashMap<>();
        }

        //language=HQL
        final String queryString = "select distinct dedv.designElement, dedv.rankByMean, dedv.rankByMax from ProcessedExpressionDataVector dedv "
                + " where dedv.designElement in ( :cs ) and dedv.expressionExperiment.id = :eeid ";

        List qr = this.getSessionFactory().getCurrentSession().createQuery(queryString)
                .setParameterList("cs", cs2gene.keySet()).setParameter("eeid", expressionExperiment.getId()).list();

        Map<Gene, Collection<Double>> result = new HashMap<>();
        for (Object o : qr) {
            Object[] oa = (Object[]) o;
            CompositeSequence d = (CompositeSequence) oa[0];
            // Null stored ranks are reported as NaN.
            Double rMean = oa[1] == null ? Double.NaN : (Double) oa[1];
            Double rMax = oa[2] == null ? Double.NaN : (Double) oa[2];

            Collection<Gene> genes4probe = cs2gene.get(d);

            this.addToGene(method, result, rMean, rMax, genes4probe);
        }
        return result;

    }

    /**
     * Get the rank (mean or max, per {@code method}) for every probe in the experiment.
     * Probes with a null stored rank are reported as NaN.
     */
    @Override
    public Map<CompositeSequence, Double> getRanks(ExpressionExperiment expressionExperiment, RankMethod method) {
        //language=HQL
        final String queryString = "select dedv.designElement, dedv.rankByMean, dedv.rankByMax from ProcessedExpressionDataVector dedv "
                + "where dedv.expressionExperiment.id = :ee";
        List rows = this.getSessionFactory().getCurrentSession().createQuery(queryString)
                .setParameter("ee", expressionExperiment.getId()).list();
        Map<CompositeSequence, Double> ranks = new HashMap<>();
        for (Object row : rows) {
            Object[] cols = (Object[]) row;
            CompositeSequence probe = (CompositeSequence) cols[0];
            Double meanRank = cols[1] == null ? Double.NaN : (Double) cols[1];
            Double maxRank = cols[2] == null ? Double.NaN : (Double) cols[2];
            if (method == RankMethod.mean) {
                ranks.put(probe, meanRank);
            } else if (method == RankMethod.max) {
                ranks.put(probe, maxRank);
            }
            // Any other method: no rank is recorded for the probe.
        }
        return ranks;

    }

    /**
     * Get both the mean and max ranks for the given genes, organized per experiment, per gene, per probe.
     * <p>
     * Unlike the other getRanks methods, probes with a null mean or max rank are skipped entirely (no NaN
     * substitution), though the (possibly empty) per-gene map is still created.
     *
     * @param expressionExperiments experiments to query
     * @param genes                 genes of interest
     * @return map of experiment -> gene -> probe -> {meanRank, maxRank}; empty if no probes found for the genes
     */
    @Override
    public Map<ExpressionExperiment, Map<Gene, Map<CompositeSequence, Double[]>>> getRanksByProbe(
            Collection<ExpressionExperiment> expressionExperiments, Collection<Gene> genes) {
        Map<CompositeSequence, Collection<Gene>> cs2gene = CommonQueries.getCs2GeneMap(genes,
                this.getSessionFactory().getCurrentSession());

        // idiom: isEmpty() instead of keySet().size() == 0
        if (cs2gene.isEmpty()) {
            AbstractDao.log.warn("No composite sequences found for genes");
            return new HashMap<>();
        }

        //language=HQL
        final String queryString = "select distinct dedv.expressionExperiment, dedv.designElement, dedv.rankByMean, dedv.rankByMax "
                + "from ProcessedExpressionDataVector dedv " + " inner join dedv.designElement de  "
                + " where dedv.designElement.id in ( :cs ) and dedv.expressionExperiment.id in (:ees) ";

        List qr = this.getSessionFactory().getCurrentSession().createQuery(queryString)
                .setParameterList("cs", EntityUtils.getIds(cs2gene.keySet()))
                .setParameterList("ees", EntityUtils.getIds(expressionExperiments)).list();

        Map<ExpressionExperiment, Map<Gene, Map<CompositeSequence, Double[]>>> resultNew = new HashMap<>();
        for (Object o : qr) {
            Object[] oa = (Object[]) o;
            ExpressionExperiment e = (ExpressionExperiment) oa[0];
            CompositeSequence d = (CompositeSequence) oa[1];
            Double rMean = (Double) oa[2];
            Double rMax = (Double) oa[3];

            Map<Gene, Map<CompositeSequence, Double[]>> rMapNew = resultNew.get(e);
            if (rMapNew == null) {
                rMapNew = new HashMap<>();
                resultNew.put(e, rMapNew);
            }

            Collection<Gene> genes4probe = cs2gene.get(d);

            for (Gene gene : genes4probe) {
                Map<CompositeSequence, Double[]> probe2ranks = rMapNew.get(gene);
                if (probe2ranks == null) {
                    probe2ranks = new HashMap<>();
                    rMapNew.put(gene, probe2ranks);
                }

                // return BOTH mean and max; skip the probe when either rank is missing.
                if (rMean == null || rMax == null) {
                    continue;
                }
                // idiom: local renamed from 'MeanMax' to follow lowerCamelCase
                Double[] meanMax = new Double[] { rMean, rMax };
                probe2ranks.put(d, meanMax);
            }
        }
        return resultNew;
    }

    /**
     * Remove the processed data vectors for an experiment along with their (now orphaned) quantitation types.
     * <p>
     * The vectors themselves are removed by clearing the experiment's collection (relying on the mapping's
     * orphan handling); the quantitation types are detached from the experiment and then bulk-deleted.
     *
     * @param expressionExperiment the experiment; re-fetched if not attached to the current session
     */
    @Override
    public void removeProcessedDataVectors(ExpressionExperiment expressionExperiment) {
        assert expressionExperiment != null;
        assert expressionExperiment.getId() != null;

        // Make sure we operate on a session-attached entity.
        if (!this.getSessionFactory().getCurrentSession().contains(expressionExperiment)) {
            expressionExperiment = (ExpressionExperiment) this.getSessionFactory().getCurrentSession()
                    .get(ExpressionExperiment.class, expressionExperiment.getId());
        }

        /*
         * Get quantitation types that will be removed.
         */
        //noinspection unchecked
        List<QuantitationType> qtsToRemove = this.getSessionFactory().getCurrentSession()
                .createQuery("select distinct p.quantitationType from ExpressionExperiment e "
                        + "inner join e.processedExpressionDataVectors p where e.id = :id")
                .setParameter("id", expressionExperiment.getId()).list();

        expressionExperiment.getProcessedExpressionDataVectors().clear();

        if (!qtsToRemove.isEmpty()) {
            AbstractDao.log.info("Deleting " + qtsToRemove.size() + " old quantitation types");
            expressionExperiment.getQuantitationTypes().removeAll(qtsToRemove);
            this.getSessionFactory().getCurrentSession().update(expressionExperiment);
            // FIX: the delete statement was previously built but never run (missing executeUpdate()),
            // leaving orphaned quantitation types in the database.
            this.getSessionFactory().getCurrentSession()
                    .createQuery("delete from QuantitationTypeImpl where id in (:ids)")
                    .setParameterList("ids", EntityUtils.getIds(qtsToRemove))
                    .executeUpdate();
        }
    }

    /**
     * Make sure the data are on a log2 scale, converting them if necessary.
     * <p>
     * Side effect: when a conversion happens, {@code preferredMaskedDataQuantitationType} is mutated
     * (scale set to LOG2) and updated in the current session.
     *
     * @param rawPreferredDataVectors             raw preferred data vectors
     * @param preferredMaskedDataQuantitationType preferred masked data QT; consulted for the current scale
     * @return a new collection containing the (possibly converted) vectors
     */
    public Collection<RawExpressionDataVector> ensureLog2Scale(
            Collection<RawExpressionDataVector> rawPreferredDataVectors,
            QuantitationType preferredMaskedDataQuantitationType) {
        Collection<RawExpressionDataVector> preferredDataVectors = new HashSet<>();

        if (!preferredMaskedDataQuantitationType.getScale().equals(ScaleType.LOG2)) {
            log.info("Converting from " + preferredMaskedDataQuantitationType.getScale() + " to log2");
            ExpressionDataDoubleMatrix matrix = ExpressionDataDoubleMatrixUtil.ensureLog2Scale(
                    preferredMaskedDataQuantitationType, new ExpressionDataDoubleMatrix(rawPreferredDataVectors));
            preferredDataVectors.addAll(matrix.toRawDataVectors());
            preferredMaskedDataQuantitationType.setScale(ScaleType.LOG2);
            this.getSession().update(preferredMaskedDataQuantitationType);
        } else {
            preferredDataVectors.addAll(rawPreferredDataVectors);
        }
        return preferredDataVectors;
    }

    /**
     * Find all processed vectors that use the given bioassay dimension; duplicates are collapsed into a set.
     */
    @Override
    public Collection<ProcessedExpressionDataVector> find(BioAssayDimension bioAssayDimension) {
        final String hql = "select d from ProcessedExpressionDataVector d where d.bioAssayDimension = :bad";
        //noinspection unchecked
        List<ProcessedExpressionDataVector> hits = this.getSessionFactory().getCurrentSession()
                .createQuery(hql)
                .setParameter("bad", bioAssayDimension)
                .list();
        return new HashSet<>(hits);
    }

    /**
     * Find all processed vectors having any of the given quantitation types; duplicates are collapsed into a set.
     */
    @Override
    public Collection<ProcessedExpressionDataVector> find(Collection<QuantitationType> quantitationTypes) {
        final String hql = "select dev from ProcessedExpressionDataVector dev where  "
                + "  dev.quantitationType in ( :quantitationTypes) ";
        //noinspection unchecked
        List<ProcessedExpressionDataVector> hits = this.getSessionFactory().getCurrentSession()
                .createQuery(hql)
                .setParameterList("quantitationTypes", quantitationTypes)
                .list();
        return new HashSet<>(hits);

    }

    /**
     * Find all processed vectors on a given platform with a given quantitation type, eagerly fetching the
     * bioassay dimension, design element, and quantitation type.
     */
    @Override
    public Collection<ProcessedExpressionDataVector> find(ArrayDesign arrayDesign,
            QuantitationType quantitationType) {
        final String hql = "select dev from ProcessedExpressionDataVector dev  inner join fetch dev.bioAssayDimension bd "
                + " inner join fetch dev.designElement de inner join fetch dev.quantitationType inner join de.arrayDesign ad where ad.id = :adid "
                + "and dev.quantitationType = :quantitationType ";
        //noinspection unchecked
        List<ProcessedExpressionDataVector> hits = this.getSessionFactory().getCurrentSession()
                .createQuery(hql)
                .setParameter("quantitationType", quantitationType)
                .setParameter("adid", arrayDesign.getId())
                .list();
        return new HashSet<>(hits);
    }

    /**
     * Find processed vectors for the given design elements with a given quantitation type.
     * Returns an empty set when no design elements are supplied.
     */
    @Override
    public Collection<ProcessedExpressionDataVector> find(Collection<CompositeSequence> designElements,
            QuantitationType quantitationType) {
        // Guard: nothing to look up.
        if (designElements == null || designElements.isEmpty()) {
            return new HashSet<>();
        }

        final String hql = "select dev from ProcessedExpressionDataVector as dev inner join dev.designElement as de "
                + " where de in (:des) and dev.quantitationType = :qt";
        //noinspection unchecked
        return this.getSessionFactory().getCurrentSession()
                .createQuery(hql)
                .setParameterList("des", designElements)
                .setParameter("qt", quantitationType)
                .list();
    }

    /**
     * Bulk-delete all processed vectors for the given design element.
     */
    @Override
    public void removeDataForCompositeSequence(final CompositeSequence compositeSequence) {
        final String dedvRemovalQuery = "delete ProcessedExpressionDataVector dedv where dedv.designElement = ?";
        int deleted = this.getHibernateTemplate().bulkUpdate(dedvRemovalQuery, compositeSequence);
        AbstractDao.log.info("Deleted: " + deleted);
    }

    /**
     * Bulk-delete all processed vectors with the given quantitation type.
     */
    @Override
    public void removeDataForQuantitationType(final QuantitationType quantitationType) {
        final String dedvRemovalQuery = "delete from ProcessedExpressionDataVector as dedv where dedv.quantitationType = ?";
        int deleted = this.getHibernateTemplate().bulkUpdate(dedvRemovalQuery, quantitationType);
        AbstractDao.log.info("Deleted " + deleted + " data vector elements");
    }
    //
    //    @Override
    //    public ExpressionExperiment createProcessedDataVectors( ExpressionExperiment ee,
    //            Collection<ProcessedExpressionDataVector> data ) {
    //        if ( ee == null ) {
    //            throw new IllegalStateException( "ExpressionExperiment cannot be null" );
    //        }
    //
    //        ExpressionExperiment expressionExperiment = ( ExpressionExperiment ) this.getSessionFactory()
    //                .getCurrentSession().get( ExpressionExperiment.class, ee.getId() );
    //
    //        assert expressionExperiment != null;
    //
    //        this.removeProcessedDataVectors( expressionExperiment );
    //
    //        Hibernate.initialize( expressionExperiment );
    //        Hibernate.initialize( expressionExperiment.getQuantitationTypes() );
    //
    //        data = ExpressionDataDoubleMatrixUtil.ensureLog2Scale( data );
    //
    //        QuantitationType qt = data.iterator().next().getQuantitationType(); // assumes all are same.
    //        qt.setScale( ScaleType.LOG2 );
    //        this.getSession().update( qt );
    //
    //        expressionExperiment.getProcessedExpressionDataVectors().addAll( data );
    //
    //        this.getSessionFactory().getCurrentSession().saveOrUpdate( qt );
    //        expressionExperiment.getQuantitationTypes().add( data.iterator().next().getQuantitationType() );
    //        expressionExperiment.setNumberOfDataVectors( expressionExperiment.getProcessedExpressionDataVectors().size() );
    //
    //        this.getSessionFactory().getCurrentSession().update( expressionExperiment );
    //        assert expressionExperiment.getNumberOfDataVectors() != null;
    //
    //        this.processedDataVectorCache.clearCache( expressionExperiment.getId() );
    //
    //        return expressionExperiment;
    //
    //    }

    /**
     * Append the probe-level rank selected by {@code method} (mean or max) to every gene mapped to the probe,
     * creating each gene's rank list on first use.
     */
    private void addToGene(RankMethod method, Map<Gene, Collection<Double>> result, Double rMean, Double rMax,
            Collection<Gene> genes4probe) {
        for (Gene gene : genes4probe) {
            Collection<Double> ranks = result.get(gene);
            if (ranks == null) {
                ranks = new ArrayList<>();
                result.put(gene, ranks);
            }
            if (method == RankMethod.mean) {
                ranks.add(rMean);
            } else if (method == RankMethod.max) {
                ranks.add(rMax);
            }
            // Any other method: nothing is recorded, mirroring the original switch default.
        }
    }

    /**
     * Store newly fetched vectors in the cache, bucketed by experiment id and gene id.
     *
     * @param newResults Always provide full vectors, not subsets.
     */
    private void cacheResults(Collection<DoubleVectorValueObject> newResults) {
        /*
         * Break up by gene and EE to cache collections of vectors for EE-gene combos.
         */
        Map<Long, Map<Long, Collection<DoubleVectorValueObject>>> mapForCache = this.makeCacheMap(newResults);
        int i = 0;
        for (Long eeid : mapForCache.keySet()) {
            for (Long g : mapForCache.get(eeid).keySet()) {
                i++;
                this.processedDataVectorCache.addToCache(eeid, g, mapForCache.get(eeid).get(g));
            }
        }
        // WARNING cache size() can be slow, esp. terracotta.
        AbstractDao.log.info("Cached " + i + ", input " + newResults.size() + "; total cached: "
        /* + this.processedDataVectorCache.size() */ );
    }

    /**
     * We cache vectors at the experiment level. If we need subsets, we have to slice them out.
     * <p>
     * NOTE(review): {@code genesToSearch} accumulates across experiments and is never reset inside the loop, so
     * once any gene misses the cache for one experiment, every subsequent experiment is added to
     * {@code needToSearch} even if it is fully cached — confirm this is intended.
     *
     * @param bioAssaySets  that we exactly need the data for.
     * @param genes         that might have cached results
     * @param results       from the cache will be put here
     * @param needToSearch  experiments that need to be searched (not fully cached); this will be populated
     * @param genesToSearch that still need to be searched (not in cache)
     */
    private void checkCache(Collection<? extends BioAssaySet> bioAssaySets, Collection<Long> genes,
            Collection<DoubleVectorValueObject> results, Collection<ExpressionExperiment> needToSearch,
            Collection<Long> genesToSearch) {

        for (BioAssaySet ee : bioAssaySets) {

            // Resolve subsets to their source experiment; cached data are keyed at the experiment level.
            ExpressionExperiment experiment = null;
            boolean needSubSet = false;
            if (ee instanceof ExpressionExperiment) {
                experiment = (ExpressionExperiment) ee;
            } else if (ee instanceof ExpressionExperimentSubSet) {
                experiment = ((ExpressionExperimentSubSet) ee).getSourceExperiment();
                needSubSet = true;
            }

            assert experiment != null;

            for (Long g : genes) {
                Collection<DoubleVectorValueObject> obs = processedDataVectorCache.get(ee, g);
                if (obs != null) {
                    // Cache hit: slice down to the subset's assays if required.
                    if (needSubSet) {
                        obs = this.sliceSubSet((ExpressionExperimentSubSet) ee, obs);
                    }
                    results.addAll(obs);
                } else {
                    genesToSearch.add(g);
                }
            }
            /*
             * This experiment is not fully cached for the genes in question.
             */
            if (genesToSearch.size() > 0) {
                needToSearch.add(experiment);
            }
        }
    }

    /**
     * Find the bioassay dimension with the most bioassays, used to cope with 'ragged' data (fewer bioassays per
     * biomaterial in some dimensions than others).
     * <p>
     * NOTE(review): despite the name, no raggedness test actually occurs — the original recorded the first-seen
     * size in a local that was never compared (dead code, removed here). Whenever the input is non-empty, the
     * largest dimension is returned; callers treat a non-null result as "unpack against this dimension".
     *
     * @param bioAssayDimensions candidate dimensions
     * @return the dimension with the most bioassays, or null if the input collection is empty
     */
    private BioAssayDimension checkRagged(Collection<BioAssayDimension> bioAssayDimensions) {
        int longest = -1;
        BioAssayDimension longestBad = null;
        for (BioAssayDimension bad : bioAssayDimensions) {
            Collection<BioAssay> assays = bad.getBioAssays();
            if (assays.size() > longest) {
                longest = assays.size();
                longestBad = bad;
            }
        }
        return longestBad;
    }

    /**
     * Quantile-normalize the matrix and copy the normalized rows back into the corresponding vectors in place.
     *
     * @param matrix  data matrix, one row per probe
     * @param vectors vectors to be overwritten with normalized values, keyed by probe
     */
    private void doQuantileNormalization(DoubleMatrix<CompositeSequence, Integer> matrix,
            Map<CompositeSequence, DoubleVectorValueObject> vectors) {

        QuantileNormalizer<CompositeSequence, Integer> normalizer = new QuantileNormalizer<>();
        DoubleMatrix<CompositeSequence, Integer> normalizedMatrix = normalizer.normalize(matrix);

        for (int rowIndex = 0; rowIndex < normalizedMatrix.rows(); rowIndex++) {
            CompositeSequence probe = normalizedMatrix.getRowName(rowIndex);
            double[] normalizedRow = normalizedMatrix.getRow(rowIndex);
            // overwrite the vector's backing array with the normalized values.
            double[] target = vectors.get(probe).getData();
            System.arraycopy(normalizedRow, 0, target, 0, normalizedRow.length);
        }

    }

    /**
     * Fetch the BioAssayDimensions associated with an experiment's bioassays. For a subset, the source
     * experiment's dimensions are returned.
     *
     * @param ee experiment or subset
     * @return the distinct BioAssayDimensions
     */
    private Collection<BioAssayDimension> getBioAssayDimensions(BioAssaySet ee) {
        if (!(ee instanceof ExpressionExperiment)) {
            // subset: delegate to the source experiment.
            return this.getBioAssayDimensions(this.getExperiment(ee));
        }

        StopWatch timer = new StopWatch();
        timer.start();
        // was a raw List; typed now so callers get an unchecked-free result.
        //noinspection unchecked
        List<BioAssayDimension> r = this.getSessionFactory().getCurrentSession().createQuery(
                // this does not look efficient.
                "select distinct bad from ExpressionExperiment e, BioAssayDimension bad"
                        + " inner join e.bioAssays b inner join bad.bioAssays badba where e = :ee and b in (badba) ")
                .setParameter("ee", ee).list();
        timer.stop();
        if (timer.getTime() > 100) {
            AbstractDao.log.info("Fetch " + r.size() + " bioassayDimensions for experiment id=" + ee.getId()
                    + ": " + timer.getTime() + "ms");
        }
        return r;
    }

    /**
     * Fetch the BioAssayDimensions for several experiments in one query, falling back to the single-experiment
     * query when only one experiment is given.
     *
     * @param ees experiments
     * @return map of experiment to its distinct BioAssayDimensions
     */
    private Map<BioAssaySet, Collection<BioAssayDimension>> getBioAssayDimensions(
            Collection<ExpressionExperiment> ees) {
        Map<BioAssaySet, Collection<BioAssayDimension>> result = new HashMap<>();

        // single experiment: the simpler per-experiment query suffices.
        if (ees.size() == 1) {
            ExpressionExperiment onlyOne = ees.iterator().next();
            result.put(onlyOne, this.getBioAssayDimensions(onlyOne));
            return result;
        }

        StopWatch timer = new StopWatch();
        timer.start();
        //noinspection unchecked
        List<Object> rows = this.getSessionFactory().getCurrentSession()
                .createQuery("select distinct e, bad from ExpressionExperiment e, BioAssayDimension bad"
                        + " inner join e.bioAssays b inner join bad.bioAssays badba where e in (:ees) and b in (badba) ")
                .setParameterList("ees", ees).list();

        for (Object row : rows) {
            Object[] tuple = (Object[]) row;
            BioAssaySet experiment = (BioAssaySet) tuple[0];
            Collection<BioAssayDimension> dims = result.get(experiment);
            if (dims == null) {
                dims = new HashSet<BioAssayDimension>();
                result.put(experiment, dims);
            }
            dims.add((BioAssayDimension) tuple[1]);
        }
        if (timer.getTime() > 100) {
            AbstractDao.log.info("Fetch " + rows.size() + " bioAssayDimensions for " + ees.size() + " experiment(s): "
                    + timer.getTime() + "ms");
        }

        return result;

    }

    /**
     * Pre-fetch and construct the BioAssayDimensionValueObjects, on the basis that the data probably share just one
     * (or a few) BioAssayDimensions rather than each vector carrying its own. See bug 3629 for details.
     *
     * @param data data vectors
     * @return map of each distinct BioAssayDimension to a single shared value object
     */
    private Map<BioAssayDimension, BioAssayDimensionValueObject> getBioAssayDimensionValueObjects(
            Collection<? extends DesignElementDataVector> data) {
        Map<BioAssayDimension, BioAssayDimensionValueObject> badVos = new HashMap<>();
        for (DesignElementDataVector vector : data) {
            BioAssayDimension dim = vector.getBioAssayDimension();
            if (badVos.containsKey(dim)) {
                continue; // already converted this dimension.
            }
            badVos.put(dim, new BioAssayDimensionValueObject(dim));
        }
        return badVos;
    }

    /**
     * Resolve the ExpressionExperiment behind a BioAssaySet: the set itself, or a subset's source experiment.
     *
     * @param bas an ExpressionExperiment or ExpressionExperimentSubSet
     * @return the underlying experiment, never null
     * @throws UnsupportedOperationException for any other BioAssaySet implementation
     */
    private ExpressionExperiment getExperiment(BioAssaySet bas) {
        if (bas instanceof ExpressionExperiment) {
            return (ExpressionExperiment) bas;
        }
        if (bas instanceof ExpressionExperimentSubSet) {
            ExpressionExperiment source = ((ExpressionExperimentSubSet) bas).getSourceExperiment();
            assert source != null;
            return source;
        }
        throw new UnsupportedOperationException("Couldn't handle a " + bas.getClass());
    }

    /**
     * Determine the experiments that bioAssaySets refer to.
     *
     * @param bioAssaySets - either ExpressionExperiment or ExpressionExperimentSubSet (which has an associated
     *                     ExpressionExperiment, which is what we're after)
     * @return the distinct underlying experiments; this can be smaller than the input when two bioAssaySets come
     * from (or are) the same experiment
     */
    private Collection<ExpressionExperiment> getExperiments(Collection<? extends BioAssaySet> bioAssaySets) {
        Collection<ExpressionExperiment> experiments = new HashSet<>();
        for (BioAssaySet bas : bioAssaySets) {
            experiments.add(this.getExperiment(bas));
        }
        return experiments;
    }

    /**
     * Fetch the 'present/absent' (missing value) raw vectors for an experiment.
     *
     * @param ee experiment
     * @return raw vectors whose quantitation type is PRESENTABSENT
     */
    private Collection<RawExpressionDataVector> getMissingValueVectors(ExpressionExperiment ee) {
        //language=HQL
        final String hql = "select dedv from RawExpressionDataVector dedv "
                + "inner join dedv.quantitationType q where q.type = 'PRESENTABSENT'"
                + " and dedv.expressionExperiment  = :ee ";
        Query query = this.getSessionFactory().getCurrentSession().createQuery(hql).setParameter("ee", ee);
        //noinspection unchecked
        return query.list();
    }

    /**
     * Retrieve the RAW data vectors for the preferred quantitation type.
     *
     * @param ee experiment
     * @return raw vectors flagged isPreferred
     */
    private Collection<RawExpressionDataVector> getPreferredDataVectors(ExpressionExperiment ee) {
        //language=HQL
        final String hql = "select dedv from RawExpressionDataVector dedv inner join dedv.quantitationType q "
                + " where q.isPreferred = true  and dedv.expressionExperiment.id = :ee";
        Query query = this.getSessionFactory().getCurrentSession().createQuery(hql)
                .setParameter("ee", ee.getId());
        //noinspection unchecked
        return query.list();
    }

    /**
     * Make a quantitation type for attaching to the new processed data - always log2 transformed
     *
     * @param preferredQt preferred QT
     * @return a newly persisted QuantitationType modelled on the preferred one
     */
    private QuantitationType getPreferredMaskedDataQuantitationType(QuantitationType preferredQt) {
        QuantitationType processedQt = QuantitationType.Factory.newInstance();

        // identity of the new type.
        processedQt.setName(preferredQt.getName() + " - Processed version");
        processedQt.setDescription(
                "Processed data (as per Gemma) for analysis, based on the preferred quantitation type raw data");

        // properties carried over from the preferred quantitation type.
        processedQt.setGeneralType(preferredQt.getGeneralType());
        processedQt.setRepresentation(preferredQt.getRepresentation()); // better be a number!
        processedQt.setScale(preferredQt.getScale());
        processedQt.setIsBackgroundSubtracted(preferredQt.getIsBackgroundSubtracted());
        processedQt.setIsBatchCorrected(preferredQt.getIsBatchCorrected());
        processedQt.setIsRecomputedFromRawData(preferredQt.getIsRecomputedFromRawData()); // By "RAW" we mean CEL files or Fastq etc.
        processedQt.setIsNormalized(preferredQt.getIsNormalized());
        processedQt.setIsRatio(preferredQt.getIsRatio());
        processedQt.setType(preferredQt.getType());

        // flags specific to processed data.
        processedQt.setIsBackground(false);
        processedQt.setIsPreferred(false); // This is the correct thing to do because it's not raw data.
        processedQt.setIsMaskedPreferred(true);

        Long newId = (Long) this.getSessionFactory().getCurrentSession().save(processedQt);
        return (QuantitationType) this.getSessionFactory().getCurrentSession()
                .load(QuantitationTypeImpl.class, newId);
    }

    /**
     * Fetch processed data vectors for the given probes across the given experiments/subsets, consulting the
     * per-(experiment, gene) cache first and falling back to the database for cache misses and for probes with
     * no gene mapping.
     *
     * @param ees      ExpressionExperiments and/or ExpressionExperimentSubSets to fetch from
     * @param probeIds ids of CompositeSequences (probes) of interest
     * @return value objects, sliced down to subset samples where a subset was requested
     */
    private Collection<DoubleVectorValueObject> getProcessedDataArraysByProbeIds(
            Collection<? extends BioAssaySet> ees, Collection<Long> probeIds) {
        Collection<DoubleVectorValueObject> results = new HashSet<>();

        Map<Long, Collection<Long>> cs2gene = CommonQueries.getCs2GeneMapForProbes(probeIds,
                this.getSessionFactory().getCurrentSession());

        // Probes with no gene mapping cannot go through the gene-keyed cache; handle them separately.
        Map<Long, Collection<Long>> noGeneProbes = new HashMap<>();
        for (Long pid : probeIds) {
            if (!cs2gene.containsKey(pid) || cs2gene.get(pid).isEmpty()) {
                noGeneProbes.put(pid, new HashSet<Long>());
                cs2gene.remove(pid);
            }
        }

        AbstractDao.log.info(cs2gene.size() + " probes associated with a gene; " + noGeneProbes.size() + " not");

        /*
         * To Check the cache we need the list of genes 1st. Get from CS2Gene list then check the cache.
         */
        Collection<Long> genes = new HashSet<>();
        for (Long cs : cs2gene.keySet()) {
            genes.addAll(cs2gene.get(cs));
        }

        // Cache hits go straight into 'results'; misses populate needToSearch/genesToSearch.
        Collection<ExpressionExperiment> needToSearch = new HashSet<>();
        Collection<Long> genesToSearch = new HashSet<>();
        this.checkCache(ees, genes, results, needToSearch, genesToSearch);

        if (!results.isEmpty())
            AbstractDao.log.info(results.size() + " vectors fetched from cache");

        Map<ProcessedExpressionDataVector, Collection<Long>> rawResults = new HashMap<>();

        /*
         * Small problem: noGeneProbes are never really cached since we use the gene as part of that.
         */
        if (!noGeneProbes.isEmpty()) {
            // Resolve subsets to their source experiments before fetching.
            Collection<ExpressionExperiment> eesForNoGeneProbes = new HashSet<>();
            for (BioAssaySet ee : ees) {
                if (ee instanceof ExpressionExperiment) {
                    eesForNoGeneProbes.add((ExpressionExperiment) ee);
                } else {
                    eesForNoGeneProbes.add(((ExpressionExperimentSubSet) ee).getSourceExperiment());
                }
            }
            needToSearch.addAll(eesForNoGeneProbes);
            rawResults.putAll(this.getProcessedVectors(EntityUtils.getIds(eesForNoGeneProbes), noGeneProbes));
        }

        if (!rawResults.isEmpty())
            AbstractDao.log.info(rawResults.size() + " vectors retrieved so far, for noGeneProbes");

        /*
         * Non-cached items.
         */
        if (!needToSearch.isEmpty()) {
            rawResults.putAll(this.getProcessedVectors(EntityUtils.getIds(needToSearch), cs2gene));
        }

        if (!rawResults.isEmpty())
            AbstractDao.log.info(rawResults.size() + " vectors retrieved so far, after fetching non-cached.");

        /*
         * Deal with possibility of 'gaps' and unpack the vectors.
         */
        Collection<DoubleVectorValueObject> newResults = new HashSet<>();
        for (ExpressionExperiment ee : needToSearch) {

            Collection<BioAssayDimension> bioAssayDimensions = this.getBioAssayDimensions(ee);

            // NOTE(review): rawResults holds vectors for ALL experiments in needToSearch, not just 'ee', so the
            // whole map is unpacked on every loop iteration; with more than one experiment this looks like it can
            // process the same vectors repeatedly (deduplicated only by the HashSet) — confirm intent.
            if (bioAssayDimensions.size() == 1) {
                newResults.addAll(this.unpack(rawResults));
            } else {
                /*
                 * See handleGetProcessedExpressionDataArrays(Collection<? extends BioAssaySet>, Collection<Gene>,
                 * boolean) and bug 1704.
                 */
                BioAssayDimension longestBad = this.checkRagged(bioAssayDimensions);
                assert longestBad != null;
                newResults.addAll(this.unpack(rawResults, longestBad));

            }

            if (!newResults.isEmpty()) {
                this.cacheResults(newResults);

                // slice to subset samples where a subset was requested; plain-experiment vectors pass through.
                newResults = this.sliceSubsets(ees, newResults);

                results.addAll(newResults);
            }
        }

        return results;
    }

    /**
     * Fetch processed vectors for the given probes, optionally restricted to a set of experiments.
     *
     * @param ees     experiment ids; if null or empty, all experiments are searched
     * @param cs2gene Map of probe to genes.
     * @return map of vectors to genes.
     */
    private Map<ProcessedExpressionDataVector, Collection<Long>> getProcessedVectors(Collection<Long> ees,
            Map<Long, Collection<Long>> cs2gene) {

        // no experiment restriction given: search across all experiments.
        if (ees == null || ees.isEmpty()) {
            return this.getVectorsForProbesInExperiments(cs2gene);
        }

        Map<ProcessedExpressionDataVector, Collection<Long>> vectors = new HashMap<>();
        for (Long eeId : ees) {
            vectors.putAll(this.getVectorsForProbesInExperiments(eeId, cs2gene));
        }
        return vectors;

    }

    /**
     * @param limit if non-null and positive, you will get a random set of vectors for the experiment
     * @param ee    ee
     * @return processed data vectors
     */
    private Collection<ProcessedExpressionDataVector> getProcessedVectors(ExpressionExperiment ee, Integer limit) {

        /*
         * Treat any non-positive limit as "no limit". Previously limit == 0 fell through to the limited query
         * and setMaxResults(0), contradicting the documented contract that only a positive limit restricts the
         * result.
         */
        if (limit == null || limit <= 0) {
            return this.getProcessedVectors(ee);
        }

        StopWatch timer = new StopWatch();
        timer.start();
        List<ProcessedExpressionDataVector> result;

        Integer availableVectorCount = ee.getNumberOfDataVectors();
        if (availableVectorCount == null || availableVectorCount == 0) {
            AbstractDao.log.info("Experiment does not have vector count populated.");
            // cannot fix this here, because we're read-only.
        }

        Query q = this.getSessionFactory().getCurrentSession()
                .createQuery(" from ProcessedExpressionDataVector dedv where dedv.expressionExperiment.id = :ee");
        q.setParameter("ee", ee.getId(), LongType.INSTANCE);
        q.setMaxResults(limit);
        // pick a random window of vectors when we know how many there are.
        if (availableVectorCount != null && availableVectorCount > limit) {
            q.setFirstResult(new Random().nextInt(availableVectorCount - limit));
        }

        // we should already be read-only, so this is probably pointless.
        q.setReadOnly(true);

        // and so this probably doesn't do anything useful.
        q.setFlushMode(FlushMode.MANUAL);

        //noinspection unchecked
        result = q.list();
        if (timer.getTime() > 1000)
            AbstractDao.log
                    .info("Fetch " + limit + " vectors from " + ee.getShortName() + ": " + timer.getTime() + "ms");

        if (result.isEmpty()) {
            AbstractDao.log.warn("Experiment does not have any processed data vectors");
            return result;
        }

        this.thaw(result); // needed?
        return result;
    }

    /**
     * This is an important method for fetching vectors: check the per-(experiment, gene) cache, fetch cache misses
     * from the database, unpack them (aligning 'ragged' dimensions), cache the new results, and slice for subsets.
     *
     * @param genes gene ids of interest
     * @param ees   experiments and/or subsets
     * @return vectors, possibly subsetted.
     */
    private Collection<DoubleVectorValueObject> handleGetProcessedExpressionDataArrays(
            Collection<? extends BioAssaySet> ees, Collection<Long> genes) {

        // ees must be thawed first as currently implemented (?)

        Collection<DoubleVectorValueObject> results = new HashSet<>();

        /*
         * Check the cache.
         */
        Collection<ExpressionExperiment> needToSearch = new HashSet<>();
        Collection<Long> genesToSearch = new HashSet<>();
        this.checkCache(ees, genes, results, needToSearch, genesToSearch);
        AbstractDao.log.info("Using " + results.size() + " DoubleVectorValueObject(s) from cache");

        // everything was cached; nothing more to do.
        if (needToSearch.size() == 0) {
            return results;
        }

        /*
         * Get items not in the cache.
         */
        AbstractDao.log.info("Searching for vectors for " + genes.size() + " genes from " + needToSearch.size()
                + " experiments not in cache");

        // map the missing genes to probes on the platforms actually used by these experiments.
        Collection<ArrayDesign> arrays = CommonQueries
                .getArrayDesignsUsed(EntityUtils.getIds(this.getExperiments(ees)),
                        this.getSessionFactory().getCurrentSession())
                .keySet();
        assert !arrays.isEmpty();
        Map<Long, Collection<Long>> cs2gene = CommonQueries.getCs2GeneIdMap(genesToSearch,
                EntityUtils.getIds(arrays), this.getSessionFactory().getCurrentSession());

        if (cs2gene.size() == 0) {
            if (results.isEmpty()) {
                AbstractDao.log.warn("No composite sequences found for genes");
                return new HashSet<>();
            }
            return results;
        }

        /*
         * Fill in the map, because we want to track information on the specificity of the probes used in the data
         * vectors.
         */
        cs2gene = CommonQueries.getCs2GeneMapForProbes(cs2gene.keySet(),
                this.getSessionFactory().getCurrentSession());

        Map<ProcessedExpressionDataVector, Collection<Long>> processedDataVectors = this
                .getProcessedVectors(EntityUtils.getIds(needToSearch), cs2gene);

        Map<BioAssaySet, Collection<BioAssayDimension>> bioAssayDimensions = this
                .getBioAssayDimensions(needToSearch);

        Collection<DoubleVectorValueObject> newResults = new HashSet<>();

        /*
         * This loop is to ensure that we don't get misaligned vectors for experiments that use more than one array
         * design. See bug 1704. This isn't that common, so we try to break out as soon as possible.
         */
        for (BioAssaySet bas : needToSearch) {

            Collection<BioAssayDimension> dims = bioAssayDimensions.get(bas);

            if (dims == null || dims.isEmpty()) {
                AbstractDao.log.warn("BioAssayDimensions were null/empty unexpectedly.");
                continue;
            }

            /*
             * Get the vectors for just this experiment. This is made more efficient by removing things from the map
             * each time through.
             */
            Map<ProcessedExpressionDataVector, Collection<Long>> vecsForBas = new HashMap<>();
            if (needToSearch.size() == 1) {
                vecsForBas = processedDataVectors;
            } else {
                // isolate the vectors for the current experiment.
                for (Iterator<ProcessedExpressionDataVector> it = processedDataVectors.keySet().iterator(); it
                        .hasNext();) {
                    ProcessedExpressionDataVector v = it.next();
                    if (v.getExpressionExperiment().equals(bas)) {
                        vecsForBas.put(v, processedDataVectors.get(v));
                        it.remove(); // since we're done with it.
                    }
                }
            }

            /*
             * Now see if anything is 'ragged' (fewer bioassays per biomaterial than in some other vector)
             */
            if (dims.size() == 1) {
                newResults.addAll(this.unpack(vecsForBas));
            } else {
                // multiple dimensions: align everything to the longest one so columns match up.
                BioAssayDimension longestBad = this.checkRagged(dims);
                if (longestBad == null) {
                    newResults.addAll(this.unpack(vecsForBas));
                } else {
                    newResults.addAll(this.unpack(vecsForBas, longestBad));
                }
            }
        }

        /*
         * Finally....
         */

        if (!newResults.isEmpty()) {
            // cache the full-experiment vectors, then slice for any requested subsets.
            this.cacheResults(newResults);
            newResults = this.sliceSubsets(ees, newResults);
            results.addAll(newResults);
        }

        return results;

    }

    /**
     * @param expressionExperiment ee
     * @return true if any platform used by the ee is two-channel (including dual-mode)
     * @throws IllegalStateException if any platform lacks a technology type
     */
    private boolean isTwoChannel(ExpressionExperiment expressionExperiment) {
        boolean twoChannel = false;
        Collection<ArrayDesign> platforms = CommonQueries.getArrayDesignsUsed(expressionExperiment,
                this.getSessionFactory().getCurrentSession());
        for (ArrayDesign platform : platforms) {
            TechnologyType tt = platform.getTechnologyType();

            if (tt == null) {
                throw new IllegalStateException(
                        "Array designs must have a technology type assigned before processed vector computation");
            }

            // deliberately no early exit: every platform must be validated for a technology type.
            if (TechnologyType.TWOCOLOR.equals(tt) || TechnologyType.DUALMODE.equals(tt)) {
                twoChannel = true;
            }
        }
        return twoChannel;
    }

    /**
     * Index the given vectors for caching: experiment id -> gene id -> vectors for that gene.
     *
     * @param newResults vectors to index
     * @return nested cache map
     */
    private Map<Long, Map<Long, Collection<DoubleVectorValueObject>>> makeCacheMap(
            Collection<DoubleVectorValueObject> newResults) {
        Map<Long, Map<Long, Collection<DoubleVectorValueObject>>> mapForCache = new HashMap<>();
        for (DoubleVectorValueObject vector : newResults) {
            Long eeId = vector.getExpressionExperiment().getId();
            Map<Long, Collection<DoubleVectorValueObject>> byGene = mapForCache.get(eeId);
            if (byGene == null) {
                byGene = new HashMap<Long, Collection<DoubleVectorValueObject>>();
                mapForCache.put(eeId, byGene);
            }
            // a vector can map to several genes; file it under each.
            for (Long geneId : vector.getGenes()) {
                Collection<DoubleVectorValueObject> vecsForGene = byGene.get(geneId);
                if (vecsForGene == null) {
                    vecsForGene = new HashSet<DoubleVectorValueObject>();
                    byGene.put(geneId, vecsForGene);
                }
                vecsForGene.add(vector);
            }
        }
        return mapForCache;
    }

    /**
     * Mask missing values. This is mostly for two-color (ratiometric) data.
     *
     * @param preferredData    preferred raw data vectors
     * @param missingValueData 'present/absent' vectors; may be empty
     * @return unpacked preferred data with absent entries set to NaN; every vector is flagged as masked
     */
    private Map<CompositeSequence, DoubleVectorValueObject> maskAndUnpack(
            Collection<RawExpressionDataVector> preferredData,
            Collection<RawExpressionDataVector> missingValueData) {
        Map<CompositeSequence, DoubleVectorValueObject> unpackedData = this.unpack(preferredData);

        // no mask vectors at all: everything counts as present.
        if (missingValueData.size() == 0) {
            AbstractDao.log.debug("There is no separate missing data information, simply using the data as is");
            for (DoubleVectorValueObject vector : unpackedData.values()) {
                vector.setMasked(true);
            }
            return unpackedData;
        }

        // index the mask vectors by design element for lookup.
        Map<CompositeSequenceValueObject, BooleanVectorValueObject> missingValueMap = new HashMap<>();
        for (BooleanVectorValueObject maskVector : this.unpackBooleans(missingValueData)) {
            missingValueMap.put(maskVector.getDesignElement(), maskVector);
        }

        boolean warned = false;
        for (DoubleVectorValueObject vector : unpackedData.values()) {
            double[] data = vector.getData();
            CompositeSequenceValueObject de = vector.getDesignElement();
            BooleanVectorValueObject maskVector = missingValueMap.get(de);

            if (maskVector == null) {
                if (!warned && AbstractDao.log.isWarnEnabled())
                    AbstractDao.log.warn("No mask vector for " + de
                            + ", additional warnings for missing masks for this job will be skipped");
                // we're missing a mask vector for it for some reason, but still flag it as effectively masked.
                vector.setMasked(true);
                warned = true;
                continue;
            }

            boolean[] mask = maskVector.getData();
            if (mask.length != data.length) {
                throw new IllegalStateException("Missing value data didn't match data length");
            }

            // NaN-out entries that the mask marks as absent.
            for (int i = 0; i < data.length; i++) {
                if (!mask[i]) {
                    data[i] = Double.NaN;
                }
            }
            vector.setMasked(true);
        }

        return unpackedData;
    }

    /**
     * Remove rows that are duplicates for the same design element. This can happen for data sets that were merged.
     * We arbitrarily throw one out (the first vector seen for each design element is kept).
     *
     * @param rawPreferredDataVectors vectors to deduplicate in place
     */
    private void removeDuplicateElements(Collection<RawExpressionDataVector> rawPreferredDataVectors) {
        int maxWarn = 10;
        int warned = 0;
        Set<CompositeSequence> seenDes = new HashSet<>();
        Collection<RawExpressionDataVector> toRemove = new HashSet<>();
        for (RawExpressionDataVector rdv : rawPreferredDataVectors) {
            CompositeSequence de = rdv.getDesignElement();

            // Set.add returns false when the element was already present, i.e. a duplicate.
            if (!seenDes.add(de)) {
                /*
                 * Log at most maxWarn duplicates; the old '<=' bound logged maxWarn + 1 and emitted one more
                 * warning after announcing that further warnings would be skipped.
                 */
                if (warned < maxWarn) {
                    log.info("Duplicate vector for: " + de);
                    warned++;
                    if (warned == maxWarn) {
                        log.info("Further warnings skipped");
                    }
                }
                toRemove.add(rdv);
            }
        }
        if (!toRemove.isEmpty()) {
            rawPreferredDataVectors.removeAll(toRemove);
            log.info("Removed " + toRemove.size() + " duplicate elements, " + rawPreferredDataVectors.size()
                    + " remain");
        }
    }

    /**
     * Quantile normalize data. This should be one of the last steps in processing before persisting
     *
     * @param vectors vectors, updated in place with normalized values
     */
    private void renormalize(Map<CompositeSequence, DoubleVectorValueObject> vectors) {

        int numCols = vectors.values().iterator().next().getBioAssayDimension().getBioAssays().size();
        DoubleMatrix<CompositeSequence, Integer> mat = new DenseDoubleMatrix<>(vectors.size(), numCols);
        for (int col = 0; col < numCols; col++) {
            mat.setColumnName(col, col);
        }

        // copy each vector into its own matrix row; every vector must span the full set of bioassays.
        int row = 0;
        for (Map.Entry<CompositeSequence, DoubleVectorValueObject> entry : vectors.entrySet()) {
            DoubleVectorValueObject vec = entry.getValue();
            double[] data = vec.getData();

            if (data.length != numCols) {
                throw new IllegalStateException(
                        "Normalization failed: perhaps vector merge needs to be run on this experiment? (vector length="
                                + data.length + "; " + numCols + " bioAssays in bioassaydimension ID="
                                + vec.getBioAssayDimension().getId());
            }
            for (int col = 0; col < numCols; col++) {
                mat.set(row, col, data[col]);
            }
            mat.setRowName(entry.getKey(), row);
            row++;
        }

        this.doQuantileNormalization(mat, vectors);

        assert mat.rows() == vectors.size();

    }

    /**
     * Given an ExpressionExperimentSubset and vectors from the source experiment, produce vectors that include just
     * the data for the subset.
     *
     * @param ee  the subset
     * @param obs vectors from the source experiment
     * @return sliced vectors (empty if obs is null or empty)
     */
    private Collection<DoubleVectorValueObject> sliceSubSet(ExpressionExperimentSubSet ee,
            Collection<DoubleVectorValueObject> obs) {

        Collection<DoubleVectorValueObject> sliced = new HashSet<>();
        if (obs == null || obs.isEmpty())
            return sliced;

        // re-associate the subset with the session and make sure its bioassays are loaded.
        this.getSessionFactory().getCurrentSession().buildLockRequest(LockOptions.NONE).lock(ee);
        Hibernate.initialize(ee.getBioAssays());

        DoubleVectorValueObject exemplar = obs.iterator().next();

        // synthetic dimension (id -1) covering only the subset's bioassays, in the exemplar's order.
        BioAssayDimensionValueObject subsetDim = new BioAssayDimensionValueObject(-1L);
        subsetDim.setName("Subset of :" + exemplar.getBioAssayDimension().getName());
        subsetDim.setDescription("Subset slice");
        subsetDim.setSourceBioAssayDimension(exemplar.getBioAssayDimension());
        subsetDim.setIsSubset(true);

        Collection<Long> subsetBioAssayIds = EntityUtils.getIds(ee.getBioAssays());
        List<BioAssayValueObject> sliceBioAssays = new ArrayList<>();
        for (BioAssayValueObject ba : exemplar.getBioAssays()) {
            if (subsetBioAssayIds.contains(ba.getId())) {
                sliceBioAssays.add(ba);
            }
        }
        subsetDim.addBioAssays(sliceBioAssays);

        for (DoubleVectorValueObject vec : obs) {
            sliced.add(new DoubleVectorValueObject(ee, vec, subsetDim));
        }

        return sliced;
    }

    /**
     * @param ees  Experiments and/or subsets required
     * @param vecs vectors to select from and if necessary slice, obviously from the given ees.
     * @return vectors for the requested sets: vectors for a plain experiment pass through unchanged, while vectors
     * for a subset are sliced down to the subset's samples.
     */
    private Collection<DoubleVectorValueObject> sliceSubsets(Collection<? extends BioAssaySet> ees,
            Collection<DoubleVectorValueObject> vecs) {
        Collection<DoubleVectorValueObject> results = new HashSet<>();
        if (vecs == null || vecs.isEmpty())
            return results;

        for (BioAssaySet bas : ees) {
            if (!(bas instanceof ExpressionExperimentSubSet)) {
                // plain experiment: keep its vectors as-is.
                for (DoubleVectorValueObject vec : vecs) {
                    if (vec.getExpressionExperiment().getId().equals(bas.getId())) {
                        results.add(vec);
                    }
                }
                continue;
            }

            ExpressionExperimentSubSet subset = (ExpressionExperimentSubSet) bas;
            Long sourceExperimentId = subset.getSourceExperiment().getId();
            for (DoubleVectorValueObject vec : vecs) {
                if (!vec.getExpressionExperiment().getId().equals(sourceExperimentId)) {
                    continue;
                }
                // slice one vector at a time, matching the original per-vector behavior.
                Collection<DoubleVectorValueObject> single = new HashSet<>();
                single.add(vec);
                results.addAll(this.sliceSubSet(subset, single));
            }
        }

        return results;
    }

    /**
     * Unpack raw vectors into value objects keyed by probe.
     *
     * @param data vectors to unpack
     * @return map of design element to its value object
     */
    private Map<CompositeSequence, DoubleVectorValueObject> unpack(
            Collection<? extends DesignElementDataVector> data) {
        // one shared BioAssayDimensionValueObject per distinct dimension (see bug 3629).
        Map<BioAssayDimension, BioAssayDimensionValueObject> badVos = this.getBioAssayDimensionValueObjects(data);
        Map<CompositeSequence, DoubleVectorValueObject> unpacked = new HashMap<>();
        for (DesignElementDataVector vector : data) {
            BioAssayDimensionValueObject badVo = badVos.get(vector.getBioAssayDimension());
            unpacked.put(vector.getDesignElement(), new DoubleVectorValueObject(vector, badVo));
        }
        return unpacked;
    }

    /**
     * Unpack raw vectors into value objects keyed by probe, annotating each with its mapped genes.
     *
     * @param data       vectors to unpack
     * @param cs2GeneMap map of design element id to gene ids
     * @return map of design element to its value object
     */
    private Map<CompositeSequence, DoubleVectorValueObject> unpack(
            Collection<? extends DesignElementDataVector> data, Map<Long, Collection<Long>> cs2GeneMap) {
        Map<BioAssayDimension, BioAssayDimensionValueObject> badVos = this.getBioAssayDimensionValueObjects(data);
        Map<CompositeSequence, DoubleVectorValueObject> unpacked = new HashMap<>();
        for (DesignElementDataVector vector : data) {
            CompositeSequence de = vector.getDesignElement();
            BioAssayDimensionValueObject badVo = badVos.get(vector.getBioAssayDimension());
            unpacked.put(de, new DoubleVectorValueObject(vector, cs2GeneMap.get(de.getId()), badVo));
        }
        return unpacked;
    }

    /**
     * Unpack raw vectors, aligning each to the given ('longest') dimension to handle ragged data.
     *
     * @param data       vectors to unpack
     * @param cs2GeneMap map of design element id to gene ids
     * @param longestBad the reference dimension all vectors are padded/aligned to
     * @return unpacked value objects
     */
    private Collection<DoubleVectorValueObject> unpack(Collection<? extends DesignElementDataVector> data,
            Map<Long, Collection<Long>> cs2GeneMap, BioAssayDimension longestBad) {
        Map<BioAssayDimension, BioAssayDimensionValueObject> badVos = this.getBioAssayDimensionValueObjects(data);
        Collection<DoubleVectorValueObject> unpacked = new HashSet<>();
        for (DesignElementDataVector vector : data) {
            BioAssayDimensionValueObject badVo = badVos.get(vector.getBioAssayDimension());
            Collection<Long> geneIds = cs2GeneMap.get(vector.getDesignElement().getId());
            unpacked.add(new DoubleVectorValueObject(vector, badVo, geneIds, longestBad));
        }
        return unpacked;
    }

    /**
     * Unpack vectors that already carry their gene mapping.
     *
     * @param data map of vector to its gene ids
     * @return unpacked value objects
     */
    private Collection<DoubleVectorValueObject> unpack(
            Map<? extends DesignElementDataVector, Collection<Long>> data) {
        Map<BioAssayDimension, BioAssayDimensionValueObject> badVos = this
                .getBioAssayDimensionValueObjects(data.keySet());
        Collection<DoubleVectorValueObject> unpacked = new HashSet<>();
        for (Map.Entry<? extends DesignElementDataVector, Collection<Long>> entry : data.entrySet()) {
            DesignElementDataVector vector = entry.getKey();
            BioAssayDimensionValueObject badVo = badVos.get(vector.getBioAssayDimension());
            unpacked.add(new DoubleVectorValueObject(vector, entry.getValue(), badVo));
        }
        return unpacked;
    }

    /**
     * Unpack processed vectors with their gene mapping, aligning each to the given ('longest') dimension to handle
     * ragged data.
     *
     * @param data       map of vector to its gene ids
     * @param longestBad the reference dimension all vectors are aligned to
     * @return unpacked value objects
     */
    private Collection<? extends DoubleVectorValueObject> unpack(
            Map<ProcessedExpressionDataVector, Collection<Long>> data, BioAssayDimension longestBad) {
        Map<BioAssayDimension, BioAssayDimensionValueObject> badVos = this
                .getBioAssayDimensionValueObjects(data.keySet());
        Collection<DoubleVectorValueObject> unpacked = new HashSet<>();
        for (Map.Entry<ProcessedExpressionDataVector, Collection<Long>> entry : data.entrySet()) {
            ProcessedExpressionDataVector vector = entry.getKey();
            BioAssayDimensionValueObject badVo = badVos.get(vector.getBioAssayDimension());
            unpacked.add(new DoubleVectorValueObject(vector, badVo, entry.getValue(), longestBad));
        }
        return unpacked;
    }

    /**
     * Unpack boolean ('present/absent') vectors into value objects.
     *
     * @param data vectors to unpack
     * @return boolean value objects
     */
    private Collection<BooleanVectorValueObject> unpackBooleans(
            Collection<? extends DesignElementDataVector> data) {
        Map<BioAssayDimension, BioAssayDimensionValueObject> badVos = this.getBioAssayDimensionValueObjects(data);
        Collection<BooleanVectorValueObject> unpacked = new HashSet<>();
        for (DesignElementDataVector vector : data) {
            BioAssayDimensionValueObject badVo = badVos.get(vector.getBioAssayDimension());
            unpacked.add(new BooleanVectorValueObject(vector, badVo));
        }
        return unpacked;
    }
}