ubic.gemma.core.visualization.ExperimentalDesignVisualizationServiceImpl.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.core.visualization.ExperimentalDesignVisualizationServiceImpl.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2008-2009 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package ubic.gemma.core.visualization;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import ubic.basecode.dataStructure.matrix.DoubleMatrix;
import ubic.basecode.dataStructure.matrix.DoubleMatrixFactory;
import ubic.basecode.graphics.ColorMap;
import ubic.basecode.graphics.ColorMatrix;
import ubic.basecode.graphics.MatrixDisplay;
import ubic.gemma.core.datastructure.matrix.EmptyExpressionMatrix;
import ubic.gemma.core.datastructure.matrix.ExpressionDataMatrix;
import ubic.gemma.core.datastructure.matrix.ExpressionDataMatrixColumnSort;
import ubic.gemma.model.expression.bioAssay.BioAssay;
import ubic.gemma.model.expression.bioAssay.BioAssayValueObject;
import ubic.gemma.model.expression.bioAssayData.BioAssayDimension;
import ubic.gemma.model.expression.bioAssayData.BioAssayDimensionValueObject;
import ubic.gemma.model.expression.bioAssayData.DoubleVectorValueObject;
import ubic.gemma.model.expression.biomaterial.BioMaterial;
import ubic.gemma.model.expression.experiment.*;
import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService;

import java.awt.*;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Tools for visualizing experimental designs. The idea is to generate an overview of the design that can be put over
 * heat maps or line graphs.
 *
 * @author paul
 */
@Component
public class ExperimentalDesignVisualizationServiceImpl implements ExperimentalDesignVisualizationService {

    /**
     * Cache of layouts for experiments, keyed by experiment ID.
     * <p>
     * The key is the ID of the experiment value object attached to a vector, which may be a subset ID (see
     * {@code prepare}). Each layout maps a bioassay to a map of experimental factor to the numeric placeholder used
     * to position that bioassay. Entries are added by {@code prepare} and {@code getExperimentalDesignLayout},
     * extended in place by {@code extendLayout}, and removed by {@code clearCaches}.
     */
    private final Map<Long, LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>>> cachedLayouts = new ConcurrentHashMap<>();
    // Logger named after the runtime class of this instance.
    private final Log log = LogFactory.getLog(this.getClass().getName());
    // Used to load and thaw experiments and to fetch their bioassay dimensions.
    private final ExpressionExperimentService expressionExperimentService;

    /**
     * Creates the service.
     *
     * @param expressionExperimentService service kept for later use when loading and thawing experiments and when
     *                                    fetching their bioassay dimensions
     */
    @Autowired
    public ExperimentalDesignVisualizationServiceImpl(ExpressionExperimentService expressionExperimentService) {
        this.expressionExperimentService = expressionExperimentService;
    }

    /**
     * Reorders the given vectors so that their data follow the experimental design layout of their experiment.
     * <p>
     * For every vector that is not yet flagged as reorganized, the array returned by {@code getData()} is rewritten
     * in the new order, the vector is flagged as reorganized, and (in a second pass) its bioassay dimension is
     * reordered to match. Vectors that are {@code null}, already reorganized, or for which no layout can be found
     * are skipped.
     *
     * @param dedVs the vectors to reorder; may be {@code null}, in which case an empty map is returned
     * @return the layouts that were used, keyed by the ID of the experiment attached to the vectors (which might be
     *         a subset ID). For sliced vectors the layout is trimmed to the bioassays of the first such vector seen.
     */
    @Override
    public Map<Long, LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>>> sortVectorDataByDesign(
            Collection<DoubleVectorValueObject> dedVs) {

        // cachedLayouts.clear(); // uncomment FOR DEBUGGING.

        if (dedVs == null) {
            return new HashMap<>(0);
        }

        Map<Long, LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>>> returnedLayouts = new HashMap<>(
                dedVs.size());

        StopWatch timer = new StopWatch();
        timer.start();

        /*
         * This is shared across experiments that might show up in the dedVs; this should be okay...saves computation.
         * This is the only slow part.
         */
        this.prepare(dedVs);

        /*
         * This loop is not a performance issue.
         */
        Map<DoubleVectorValueObject, List<BioAssayValueObject>> newOrderingsForBioAssayDimensions = new HashMap<>();
        for (DoubleVectorValueObject vec : dedVs) {

            // prepare() tolerates null vectors, so this loop has to as well.
            if (vec == null || vec.isReorganized()) {
                continue;
            }

            assert !vec.getBioAssays().isEmpty();

            ExpressionExperimentValueObject ee = vec.getExpressionExperiment();
            Long eeId = ee.getId(); // might be subset id.

            // Single lookup: the cache never holds null values, so null means "absent".
            LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> layout = cachedLayouts
                    .get(eeId);

            if (layout == null && ee instanceof ExpressionExperimentSubsetValueObject) {
                // subset: fall back to the layout of the source experiment.
                Long sourceId = ((ExpressionExperimentSubsetValueObject) ee).getSourceExperiment();
                if (sourceId != null) {
                    layout = cachedLayouts.get(sourceId);
                }
            }

            if (layout == null || layout.isEmpty()) {
                log.error("Did not find cached layout for " + vec.getId());
                continue;
            }

            List<BioAssayValueObject> newOrdering = new ArrayList<>(layout.keySet());

            newOrdering.retainAll(vec.getBioAssays());

            /*
             * This can happen if the vectors are out of whack with the bioassays - e.g. two platforms were used but
             * merging is not done. See bug 3775. Skipping the ordering is not the right thing to do.
             */
            if (newOrdering.isEmpty()) {

                if (this.allNaN(vec)) {
                    // reordering will have no effect.
                    continue;
                }

                /*
                 * Add to the layout.
                 */
                layout = this.extendLayout(vec, eeId);
                newOrdering = new ArrayList<>(layout.keySet());
                newOrdering.retainAll(vec.getBioAssays());
                assert !newOrdering.isEmpty();
            }

            newOrderingsForBioAssayDimensions.put(vec, newOrdering);

            Map<BioAssayValueObject, Integer> ordering = this.getOrdering(newOrdering);

            if (!returnedLayouts.containsKey(eeId)) {
                if (vec.isSliced()) {
                    // only keep the bioassays that are actually present in the sliced vector.
                    LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> trimmedLayout = new LinkedHashMap<>();

                    for (BioAssayValueObject baVo : newOrdering) {
                        trimmedLayout.put(baVo, layout.get(baVo));
                    }

                    returnedLayouts.put(eeId, trimmedLayout);

                } else {
                    returnedLayouts.put(eeId, layout);
                }
            }

            /*
             * Might be a faster way.
             */
            double[] data = vec.getData();
            double[] dol = ArrayUtils.clone(data); // snapshot of the old order; data is overwritten below.

            // assert ordering.size() == data.length : "got " + ordering.size() + " expected " + data.length;

            List<BioAssayValueObject> oldOrdering = vec.getBioAssayDimension().getBioAssays();
            int j = 0;
            if (log.isTraceEnabled()) {
                log.trace("Old order: " + StringUtils.join(ArrayUtils.toObject(data), ","));
            }
            for (BioAssayValueObject ba : oldOrdering) {

                Integer targetIndex = ordering.get(ba);

                if (targetIndex == null) {
                    // bioassay is not part of the new ordering; its value is expected to be missing.
                    assert Double.isNaN(dol[j]);
                    j++;
                    continue;
                }

                data[targetIndex] = dol[j++];

            }
            if (log.isTraceEnabled()) {
                log.trace("New order: " + StringUtils.join(ArrayUtils.toObject(data), ","));
            }

            vec.setReorganized(true);

        }

        // Second pass: bring the bioassay dimensions in line with the reordered data.
        for (DoubleVectorValueObject vec : dedVs) {
            if (vec == null || vec.getBioAssayDimension().isReordered()) {
                continue;
            }
            List<BioAssayValueObject> newOrdering = newOrderingsForBioAssayDimensions.get(vec);
            if (newOrdering == null) {
                continue; // data was empty, etc.
            }
            vec.getBioAssayDimension().reorder(newOrdering);
        }

        if (timer.getTime() > 1500) {
            log.info("Sort vectors by design: " + timer.getTime() + "ms");
        }

        return returnedLayouts;

    }

    /**
     * Drops the cached layout stored under the given experiment ID, if there is one.
     *
     * @param eeId ID under which the layout was cached
     */
    @Override
    public void clearCaches(Long eeId) {
        clearCachedLayouts(eeId);
    }

    /**
     * Test method for now, shows how this can be used. Builds a matrix from the layout of the experiment (one row
     * per factor, one column per bioassay) and writes it as an image to a temporary ".png" file.
     *
     * @param e ee
     */
    @SuppressWarnings("unused") // Test method for now, shows how this can be used.
    protected void plotExperimentalDesign(ExpressionExperiment e) {
        LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> layout = this
                .getExperimentalDesignLayout(e);

        List<String> factorLabels = new ArrayList<>();
        List<String> assayLabels = new ArrayList<>();
        List<double[]> factorRows = new ArrayList<>();

        int column = 0;
        for (Map.Entry<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> assayEntry : layout
                .entrySet()) {
            assayLabels.add(assayEntry.getKey().getName());

            int row = 0;
            for (Map.Entry<ExperimentalFactor, Double> factorEntry : assayEntry.getValue().entrySet()) {
                if (column == 0) {
                    // Rows (and their labels) are allocated while visiting the first bioassay only.
                    ExperimentalFactor factor = factorEntry.getKey();
                    factorRows.add(new double[layout.size()]);
                    factorLabels.add(factor.getName() + " ( id=" + factor.getId() + ")"); // make sure they are unique.
                }
                double position = factorEntry.getValue();
                factorRows.get(row)[column] = position;
                row++;
            }
            column++;
        }

        double[][] values = factorRows.toArray(new double[factorRows.size()][]);

        DoubleMatrix<String, String> designMatrix = DoubleMatrixFactory.dense(values);
        designMatrix.setRowNames(factorLabels);
        designMatrix.setColumnNames(assayLabels);

        ColorMatrix<String, String> colors = new ColorMatrix<>(designMatrix, ColorMap.GREENRED_COLORMAP,
                Color.GRAY);

        try {
            File target = File.createTempFile(e.getShortName() + "_", ".png");
            this.writeImage(colors, target);
        } catch (IOException e1) {
            throw new RuntimeException(e1);
        }
    }

    /**
     * @return true if every value returned by {@code vec.getData()} is NaN (also true when there are no values)
     */
    private boolean allNaN(DoubleVectorValueObject vec) {
        for (double value : vec.getData()) {
            if (!Double.isNaN(value)) {
                // one real value is enough to decide.
                return false;
            }
        }
        return true;
    }

    /**
     * Removes the layout cached under the given ID; does nothing if no layout is cached under it.
     */
    private void clearCachedLayouts(Long eeId) {
        cachedLayouts.remove(eeId);
    }

    /**
     * See bug 3775. For experiments which have more than one bioassay dimension, we typically have to "extend" the
     * layout to include more bioassays. Because the ordering is defined by the factor values associated with the
     * underlying biomaterials, this is going to be okay.
     * <p>
     * The layout is recomputed from all bioassay dimensions of the experiment behind the vector and merged into the
     * layout cached under {@code eeId}. If nothing is cached under {@code eeId} any more (for example because
     * {@code clearCaches} ran in the meantime), the recomputed layout is cached and returned instead.
     *
     * @param vec  vector whose bioassays were not found in the cached layout
     * @param eeId ID under which the layout to extend is cached
     * @return the layout now cached under {@code eeId}; never null
     */
    private LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> extendLayout(
            DoubleVectorValueObject vec, Long eeId) {
        BioAssayDimensionValueObject bioAssayDimension = this.getBioAssayDimensionForVector(vec);

        ExpressionExperimentValueObject ee = vec.getExpressionExperiment();
        ExpressionExperiment actualEe = this.getExperimentForVector(vec, ee);

        LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> extension = this
                .getExperimentalDesignLayout(actualEe, expressionExperimentService.getBioAssayDimensions(actualEe));

        for (BioAssayValueObject vbaVo : bioAssayDimension.getBioAssays()) {
            assert extension.containsKey(vbaVo);
        }

        for (BioAssayValueObject vbaVo : vec.getBioAssays()) {
            assert extension.containsKey(vbaVo);
        }

        // Look the entry up once: it may have been evicted since the caller found it.
        LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> cached = cachedLayouts
                .get(eeId);
        if (cached == null) {
            cachedLayouts.put(eeId, extension);
            return extension;
        }

        cached.putAll(extension);
        return cached;
    }

    /**
     * @return the source bioassay dimension of the vector's bioassay dimension when one is set, otherwise the
     *         vector's own bioassay dimension
     */
    private BioAssayDimensionValueObject getBioAssayDimensionForVector(DoubleVectorValueObject vec) {
        BioAssayDimensionValueObject own = vec.getBioAssayDimension();
        BioAssayDimensionValueObject source = own.getSourceBioAssayDimension();
        BioAssayDimensionValueObject resolved = source != null ? source : own;

        assert resolved.getId() != null;

        // this actually doesn't really matter, but we're wasting time redoing it.
        assert !resolved.isReordered();
        assert !resolved.getIsSubset();
        return resolved;
    }

    /**
     * Returns the layout for the experiment, computing and caching it on first use.
     * <p>
     * On a cache miss the bioassay dimensions are fetched, the experiment is thawed, and the computed layout is
     * cached under the ID of the thawed experiment.
     *
     * @param e the experiment
     * @return the layout: a map of bioassays to map of factors to doubles that represent the position in the layout
     */
    private LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> getExperimentalDesignLayout(
            ExpressionExperiment e) {

        // Single lookup instead of containsKey + get, which could return null if the entry is evicted in between.
        LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> cached = cachedLayouts
                .get(e.getId());
        if (cached != null) {
            return cached;
        }

        Collection<BioAssayDimension> bds = expressionExperimentService.getBioAssayDimensions(e);
        ExpressionExperiment thawed = this.expressionExperimentService.thawLite(e);

        LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> result = this
                .getExperimentalDesignLayout(thawed, bds);

        cachedLayouts.put(thawed.getId(), result);

        return result;
    }

    /**
     * Computes the layout of an experiment from its bioassay dimensions.
     * <p>
     * Bioassays are visited in the order given by
     * {@link ExpressionDataMatrixColumnSort#orderByExperimentalDesign}. For each factor value of the underlying
     * biomaterial, the position is the numeric value of its measurement when that can be parsed, otherwise the
     * factor value ID used as a placeholder. If the experiment has no factors, every bioassay is mapped to a single
     * dummy factor named "No factors" with position 0.
     *
     * @param experiment the experiment; its experimental design is read, so it must be non-null
     * @param bds        the BioAssayDimensions of the experiment. Entities are only needed to avoid making
     *                   ExpressionMatrix use value objects, otherwise we could use BioAssayDimensionValueObjects
     * @return A "Layout": a map of bioassays to map of factors to doubles that represent the position in the layout.
     */
    private LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> getExperimentalDesignLayout(
            ExpressionExperiment experiment, Collection<BioAssayDimension> bds) {

        LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> result = new LinkedHashMap<>();

        ExpressionDataMatrix<Object> mat = new EmptyExpressionMatrix(bds);

        // This is the place the actual sort order is determined.
        List<BioMaterial> bms = ExpressionDataMatrixColumnSort.orderByExperimentalDesign(mat);

        // Placeholder position for each factor value, keyed by factor value ID.
        Map<Long, Double> fvV = new HashMap<>();

        assert experiment != null;
        assert experiment.getExperimentalDesign() != null;
        if (experiment.getExperimentalDesign().getExperimentalFactors().isEmpty()) {
            // Case of no experimental design; just put in a dummy factor.
            ExperimentalFactor dummyFactor = ExperimentalFactor.Factory.newInstance();
            dummyFactor.setName("No factors");
            for (BioMaterial bm : bms) {
                int j = mat.getColumnIndex(bm);

                Collection<BioAssay> bas = mat.getBioAssaysForColumn(j);

                for (BioAssay ba : bas) {
                    BioAssayValueObject baVo = new BioAssayValueObject(ba, false);
                    LinkedHashMap<ExperimentalFactor, Double> positions = new LinkedHashMap<>();
                    positions.put(dummyFactor, 0.0);
                    result.put(baVo, positions);
                }

            }

            return result;
        }

        assert !experiment.getExperimentalDesign().getExperimentalFactors().isEmpty();
        /*
         * Choose values to use as placeholders.
         */
        for (ExperimentalFactor ef : experiment.getExperimentalDesign().getExperimentalFactors()) {
            // factors without values can happen if the design isn't complete; they contribute nothing.
            for (FactorValue fv : ef.getFactorValues()) {
                assert fv.getId() != null;
                // the id is just used as a convenience.
                fvV.put(fv.getId(), fv.getId().doubleValue());
            }
        }

        assert !fvV.isEmpty();
        assert !bms.isEmpty();

        // if the same biomaterial was used in more than one bioassay (thus more than one bioassay dimension), and they
        // are in the same column, this is resolved here; we assign the same layout value for both bioassays, so the
        // ordering is the same for vectors coming from
        // either bioassay dimension.
        for (BioMaterial bm : bms) {
            int j = mat.getColumnIndex(bm);

            Collection<BioAssay> bas = mat.getBioAssaysForColumn(j);

            Collection<FactorValue> fvs = bm.getFactorValues();

            for (BioAssay ba : bas) {
                BioAssayValueObject baVo = new BioAssayValueObject(ba, false);
                LinkedHashMap<ExperimentalFactor, Double> positions = new LinkedHashMap<>(fvs.size());
                result.put(baVo, positions);
                for (FactorValue fv : fvs) {
                    assert fv.getId() != null;
                    assert fvV.containsKey(fv.getId());
                    ExperimentalFactor ef = fv.getExperimentalFactor();

                    // Default to the placeholder; override with the measurement when it is numeric.
                    Double value = fvV.get(fv.getId());
                    String measured = fv.getMeasurement() == null ? null : fv.getMeasurement().getValue();
                    if (measured != null) {
                        // A null string would make parseDouble throw NullPointerException, hence the guard.
                        try {
                            value = Double.parseDouble(measured);
                        } catch (NumberFormatException nfe) {
                            // not good: measurement is not numeric, keep the placeholder.
                        }
                    }
                    assert value != null;
                    positions.put(ef, value);

                }
            }

        }
        return result;
    }

    /**
     * Loads and thaws the experiment a vector belongs to.
     * <p>
     * For a sliced vector whose experiment is a subset, the source experiment of the subset is loaded. If the vector
     * is sliced but its experiment is not a subset, an error is logged and the experiment is loaded by its own ID.
     *
     * @param vec the vector
     * @param ee  the experiment value object of the vector; its ID is used when the vector is not a subset slice
     * @return the experiment; if the vector is for a subset, we return the source experiment
     */
    private ExpressionExperiment getExperimentForVector(DoubleVectorValueObject vec,
            ExpressionExperimentValueObject ee) {
        /*
         * The following is the really slow part if we don't use a cache.
         */
        Long idToLoad = ee.getId();

        if (vec.isSliced()) {
            /*
             * Then we are looking at a subset, so associate it with the original experiment.
             */
            ExpressionExperimentValueObject vectorEe = vec.getExpressionExperiment();
            if (vectorEe instanceof ExpressionExperimentSubsetValueObject) {
                ExpressionExperimentSubsetValueObject eesvo = (ExpressionExperimentSubsetValueObject) vectorEe;

                if (eesvo.getSourceExperiment() == null) {
                    log.error("Vector is sliced, but the source experiment is null!");
                }

                idToLoad = eesvo.getSourceExperiment();
            } else {
                // Not a subset: there is no source experiment to resolve, so keep the experiment's own ID.
                log.error("Vector is sliced, but the experiment is not a subset!");
            }
        }

        ExpressionExperiment actualEe = expressionExperimentService.load(idToLoad);
        return expressionExperimentService.thawLiter(actualEe);
    }

    /**
     * Get the order that bioassays need to be in for the 'real' data.
     *
     * @param bioAssaysInOrder the bioassays in their target order; expected to be non-empty
     * @return map from each bioassay to its zero-based position in the given list
     */
    private Map<BioAssayValueObject, Integer> getOrdering(List<BioAssayValueObject> bioAssaysInOrder) {
        assert !bioAssaysInOrder.isEmpty();

        Map<BioAssayValueObject, Integer> positions = new HashMap<>();
        ListIterator<BioAssayValueObject> cursor = bioAssaysInOrder.listIterator();
        while (cursor.hasNext()) {
            // nextIndex() must be read before next() advances the cursor.
            int position = cursor.nextIndex();
            positions.put(cursor.next(), position);
        }
        return positions;
    }

    /**
     * Makes sure a layout is cached for the experiment of every given vector, so it can be re-used later.
     * <p>
     * Null vectors and vectors already flagged as reorganized are skipped. Layouts are cached under the ID of the
     * experiment value object attached to the vector. When that experiment is a subset and the layout of its source
     * experiment is already cached, that layout is registered under the subset ID as well instead of being
     * recomputed; otherwise the experiment is loaded and the layout is computed from its bioassay dimensions.
     *
     * @param dvvOs the vectors; may be null, in which case nothing happens
     */
    private void prepare(Collection<DoubleVectorValueObject> dvvOs) {

        if (dvvOs == null) {
            return;
        }

        for (DoubleVectorValueObject vec : dvvOs) {
            if (vec == null) {
                log.debug("DoubleVectorValueObject is null");
                continue;
            }

            if (vec.isReorganized()) {
                // wouldn't normally be the case...
                continue;
            }

            ExpressionExperimentValueObject ee = vec.getExpressionExperiment();

            /*
             * Problem: we can't have two layouts for one experiment, which is actually required if there is more than
             * one bioassay dimension. However, this rarely matters. See bug 3775
             */
            if (cachedLayouts.containsKey(ee.getId())) {
                continue;
            }

            if (ee instanceof ExpressionExperimentSubsetValueObject) {
                Long sourceId = ((ExpressionExperimentSubsetValueObject) ee).getSourceExperiment();
                LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> sourceLayout =
                        sourceId == null ? null : cachedLayouts.get(sourceId);
                if (sourceLayout != null) {
                    // Same layout as would be computed below (it is derived from the source experiment); register
                    // it under the subset ID so lookups by the vector's own experiment ID succeed.
                    cachedLayouts.put(ee.getId(), sourceLayout);
                    continue;
                }
            }

            // Only called for the sanity checks it performs on the vector's bioassay dimension.
            BioAssayDimensionValueObject bioAssayDimension = this.getBioAssayDimensionForVector(vec);

            ExpressionExperiment actualEe = this.getExperimentForVector(vec, ee);

            assert bioAssayDimension != null;
            LinkedHashMap<BioAssayValueObject, LinkedHashMap<ExperimentalFactor, Double>> experimentalDesignLayout = this
                    .getExperimentalDesignLayout(actualEe,
                            expressionExperimentService.getBioAssayDimensions(actualEe));

            cachedLayouts.put(ee.getId(), experimentalDesignLayout);

        }

    }

    /**
     * Test method. Saves the matrix as an image at the absolute path of the given file, using 18x18 cells.
     *
     * @param matrix     the color matrix to draw
     * @param outputFile where the image is written
     * @throws IOException propagated from saving the image
     */
    private void writeImage(ColorMatrix<String, String> matrix, File outputFile) throws IOException {
        MatrixDisplay<String, String> display = new MatrixDisplay<>(matrix);
        Dimension cellSize = new Dimension(18, 18);
        display.setCellSize(cellSize);

        String targetPath = outputFile.getAbsolutePath();
        display.saveImage(matrix, targetPath, true, false, true);
    }

}