ubic.gemma.core.analysis.preprocess.TwoChannelMissingValuesImpl.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.core.analysis.preprocess.TwoChannelMissingValuesImpl.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2006 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.analysis.preprocess;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import ubic.basecode.io.ByteArrayConverter;
import ubic.basecode.math.distribution.Histogram;
import ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix;
import ubic.gemma.core.datastructure.matrix.ExpressionDataMatrixRowElement;
import ubic.gemma.model.common.quantitationtype.*;
import ubic.gemma.model.expression.bioAssayData.BioAssayDimension;
import ubic.gemma.model.expression.bioAssayData.DesignElementDataVector;
import ubic.gemma.model.expression.bioAssayData.ProcessedExpressionDataVector;
import ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
import ubic.gemma.persistence.service.common.quantitationtype.QuantitationTypeService;
import ubic.gemma.persistence.service.expression.bioAssayData.ProcessedExpressionDataVectorService;
import ubic.gemma.persistence.service.expression.bioAssayData.RawExpressionDataVectorService;
import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService;

import java.util.Collection;
import java.util.HashSet;

/**
 * Computes a missing value matrix for ratiometric data sets.
 * <p>
 * Supported formats and special cases:
 * <ul>
 * <li>Genepix: CH1B_MEDIAN etc; (various versions)</li>
 * <li>Incyte GEMTools: RAW_DATA etc (no background values)</li>
 * <li>Quantarray: CH1_BKD etc</li>
 * <li>F635.Median / F532.Median (genepix as rendered in some data sets)</li>
 * <li>CH1_SMTM (found in GPL230)</li>
 * <li>Caltech (GPL260)</li>
 * <li>Agilent (Ch2BkgMedian etc or CH2_SIG_MEAN etc)</li>
 * <li>GSE3251 (ch1.Background etc)
 * <li>GPL560 (*_CY3 vs *CY5)
 * <li>GSE1501 (NormCH2)
 * </ul>
 * <p>
 * The missing values are computed with the following considerations with respect to available data
 * </p>
 * <ol>
 * <li>If the preferred quantitation type data is a missing value, then the data are considered missing (for
 * consistency).</li>
 * <li>We then do additional checks if there is 'signal' data available.
 * <li>If there are background values, they are used to compute signal-to-noise ratios</li>
 * <li>If the signal values already contain missing data, these are still considered missing.</li>
 * <li>If there are no background values, we try to compute a threshold based on a quantile of the signal</li>
 * <li>Otherwise, values will be considered 'present' unless the signal values are zero or missing.</li>
 * </ol>
 *
 * @author pavlidis
 */
@Component
public class TwoChannelMissingValuesImpl implements TwoChannelMissingValues {

    private static final int QUANTILE_OF_SIGNAL_TO_USE_IF_NO_BKG_AVAILABLE = 1;
    private static final Log log = LogFactory.getLog(TwoChannelMissingValuesImpl.class.getName());

    @Autowired
    private ExpressionExperimentService expressionExperimentService;
    @Autowired
    private QuantitationTypeService quantitationTypeService;
    @Autowired
    private TwoChannelMissingValueHelperService twoChannelMissingValueHelperService;
    @Autowired
    private RawExpressionDataVectorService rawExpressionDataVectorService;
    @Autowired
    private ProcessedExpressionDataVectorService processedExpressionDataVectorService;

    @Override
    public Collection<RawExpressionDataVector> computeMissingValues(ExpressionExperiment ee) {
        return this.computeMissingValues(ee, TwoChannelMissingValues.DEFAULT_SIGNAL_TO_NOISE_THRESHOLD, null);
    }

    @Override
    public Collection<RawExpressionDataVector> computeMissingValues(ExpressionExperiment ee,
            double signalToNoiseThreshold, Collection<Double> extraMissingValueIndicators) {

        ee = expressionExperimentService.thawLite(ee);
        Collection<QuantitationType> usefulQuantitationTypes = ExpressionDataMatrixBuilder
                .getUsefulQuantitationTypes(ee);
        StopWatch timer = new StopWatch();
        timer.start();
        TwoChannelMissingValuesImpl.log.info("Loading vectors ...");

        Collection<RawExpressionDataVector> rawVectors = rawExpressionDataVectorService
                .find(usefulQuantitationTypes);
        Collection<ProcessedExpressionDataVector> procVectors = new HashSet<>();

        if (rawVectors.isEmpty()) {
            procVectors = processedExpressionDataVectorService.find(usefulQuantitationTypes);
            processedExpressionDataVectorService.thaw(procVectors);
        } else {
            rawExpressionDataVectorService.thaw(rawVectors);
        }

        timer.stop();
        this.logTimeInfo(timer, procVectors.size() + rawVectors.size());

        Collection<? extends DesignElementDataVector> builderVectors = new HashSet<>(
                rawVectors.isEmpty() ? procVectors : rawVectors);

        ExpressionDataMatrixBuilder builder = new ExpressionDataMatrixBuilder(builderVectors);
        Collection<BioAssayDimension> dims = builder.getBioAssayDimensions();

        /*
         * Note we have to do this one array design at a time, because we are producing DesignElementDataVectors which
         * must be associated with the correct BioAssayDimension.
         */
        TwoChannelMissingValuesImpl.log.info("Study has " + dims.size() + " bioassaydimensions");

        if (extraMissingValueIndicators != null && extraMissingValueIndicators.size() > 0) {
            TwoChannelMissingValuesImpl.log.info(
                    "There are " + extraMissingValueIndicators.size() + " manually-set missing value indicators");
        }

        ExpressionDataDoubleMatrix preferredData = builder.getPreferredData();
        ExpressionDataDoubleMatrix bkgDataA = builder.getBackgroundChannelA();
        ExpressionDataDoubleMatrix bkgDataB = builder.getBackgroundChannelB();
        ExpressionDataDoubleMatrix signalDataA = builder.getSignalChannelA();
        ExpressionDataDoubleMatrix signalDataB = builder.getSignalChannelB();

        if (builder.isAnyMissing()) {
            if (bkgDataA != null) {
                for (QuantitationType qt : bkgDataA.getQuantitationTypes()) {
                    if (builder.getNumMissingValues(qt) > 0) {
                        TwoChannelMissingValuesImpl.log.warn("Missing values in bkgDataA");
                        break;
                    }
                }
            }
            if (bkgDataB != null) {
                for (QuantitationType qt : bkgDataB.getQuantitationTypes()) {
                    if (builder.getNumMissingValues(qt) > 0) {
                        TwoChannelMissingValuesImpl.log.warn("Missing values in bkgDataB");
                        break;
                    }
                }
            }
            if (signalDataA != null) {
                for (QuantitationType qt : signalDataA.getQuantitationTypes()) {
                    if (builder.getNumMissingValues(qt) > 0) {
                        TwoChannelMissingValuesImpl.log.warn("Missing values in signalDataA");
                        break;
                    }
                }
            }
            if (signalDataB != null) {
                for (QuantitationType qt : signalDataB.getQuantitationTypes()) {
                    if (builder.getNumMissingValues(qt) > 0) {
                        TwoChannelMissingValuesImpl.log.warn("Missing values in signalDataB");
                        break;
                    }
                }
            }
        }

        Collection<RawExpressionDataVector> dimRes = this.computeMissingValues(ee, preferredData, signalDataA,
                signalDataB, bkgDataA, bkgDataB, signalToNoiseThreshold, extraMissingValueIndicators);

        return new HashSet<>(dimRes);
    }

    /**
     * Attempt to compute 'missing value' information for a two-channel data set. We attempt to do this even if we are
     * missing background intensity information or one intensity channel, though obviously it is better to have all four
     * sets of values.
     *
     * @param bkgChannelA                 background channel A
     * @param bkgChannelB                 background channel B
     * @param extraMissingValueIndicators extra missing value indicators
     * @param preferred                   preferred matrix
     * @param signalChannelA              signal channel A
     * @param signalChannelB              signal channel B
     * @param signalToNoiseThreshold      noise threshold
     * @param source                      the source
     * @return DesignElementDataVectors corresponding to a new PRESENTCALL quantitation type for the design elements and
     * biomaterial dimension represented in the inputs.
     */
    private Collection<RawExpressionDataVector> computeMissingValues(ExpressionExperiment source,
            ExpressionDataDoubleMatrix preferred, ExpressionDataDoubleMatrix signalChannelA,
            ExpressionDataDoubleMatrix signalChannelB, ExpressionDataDoubleMatrix bkgChannelA,
            ExpressionDataDoubleMatrix bkgChannelB, double signalToNoiseThreshold,
            Collection<Double> extraMissingValueIndicators) {

        boolean okToProceed = this.validate(preferred, signalChannelA, signalChannelB, bkgChannelA, bkgChannelB,
                signalToNoiseThreshold);
        Collection<RawExpressionDataVector> results = new HashSet<>();

        if (!okToProceed) {
            TwoChannelMissingValuesImpl.log.warn("Missing value computation cannot proceed");
            return results;
        }

        ByteArrayConverter converter = new ByteArrayConverter();

        int count = 0;

        ExpressionDataDoubleMatrix baseChannel = signalChannelA == null ? signalChannelB : signalChannelA;

        Double signalThreshold = Double.NaN;
        if (bkgChannelA == null && bkgChannelB == null) {
            signalThreshold = this.computeSignalThreshold(preferred, signalChannelA, signalChannelB, baseChannel);
        }
        QuantitationType present = this.getMissingDataQuantitationType(signalToNoiseThreshold, signalThreshold);
        source.getQuantitationTypes().add(present);
        for (ExpressionDataMatrixRowElement element : baseChannel.getRowElements()) {
            count = this.examineVector(source, preferred, signalChannelA, signalChannelB, bkgChannelA, bkgChannelB,
                    signalToNoiseThreshold, extraMissingValueIndicators, results, converter, count, baseChannel,
                    signalThreshold, present, element);

        }
        TwoChannelMissingValuesImpl.log.info("Finished: " + count + " vectors examined for missing values");

        results = twoChannelMissingValueHelperService.persist(source, results);

        return results;
    }

    private int examineVector(ExpressionExperiment source, ExpressionDataDoubleMatrix preferred,
            ExpressionDataDoubleMatrix signalChannelA, ExpressionDataDoubleMatrix signalChannelB,
            ExpressionDataDoubleMatrix bkgChannelA, ExpressionDataDoubleMatrix bkgChannelB,
            double signalToNoiseThreshold, Collection<Double> extraMissingValueIndicators,
            Collection<RawExpressionDataVector> results, ByteArrayConverter converter, int count,
            ExpressionDataDoubleMatrix baseChannel, Double signalThreshold, QuantitationType present,
            ExpressionDataMatrixRowElement element) {
        CompositeSequence designElement = element.getDesignElement();

        RawExpressionDataVector vect = RawExpressionDataVector.Factory.newInstance();
        vect.setQuantitationType(present);
        vect.setExpressionExperiment(source);
        vect.setDesignElement(designElement);
        assert baseChannel != null;
        vect.setBioAssayDimension(baseChannel.getBioAssayDimension(designElement));

        int numCols = preferred.columns(designElement);

        Boolean[] detectionCalls = new Boolean[numCols];
        Double[] prefRow = preferred.getRow(designElement);

        Double[] signalA = null;
        if (signalChannelA != null) {
            signalA = signalChannelA.getRow(designElement);
        }

        Double[] signalB = null;
        if (signalChannelB != null) {
            signalB = signalChannelB.getRow(designElement);
        }
        Double[] bkgA = null;
        Double[] bkgB = null;

        if (bkgChannelA != null)
            bkgA = bkgChannelA.getRow(designElement);

        if (bkgChannelB != null)
            bkgB = bkgChannelB.getRow(designElement);

        // columns only for this design element!
        boolean gaps = false; // we use this to track
        for (int col = 0; col < numCols; col++) {

            if (this.checkMissingValue(extraMissingValueIndicators, detectionCalls, prefRow, col))
                continue;

            Double bkgAV = Double.NaN;
            Double bkgBV = Double.NaN;

            if (bkgA != null)
                bkgAV = bkgA[col];
            if (bkgB != null)
                bkgBV = bkgB[col];

            Double sigAV = (signalA == null || signalA[col] == null) ? Double.NaN : signalA[col];
            Double sigBV = (signalB == null || signalB[col] == null) ? Double.NaN : signalB[col];

            /*
             * Missing values here wreak havoc. Sometimes in multiarray studies data are missing.
             */
            Boolean call = this.computeCall(signalToNoiseThreshold, signalThreshold, sigAV, sigBV, bkgAV, bkgBV);

            if (call == null)
                gaps = true;

            detectionCalls[col] = call;
        }

        if (gaps) {
            this.fillGapsInCalls(detectionCalls);
        }

        vect.setData(converter.booleanArrayToBytes(ArrayUtils.toPrimitive(detectionCalls)));
        results.add(vect);

        if (++count % 4000 == 0) {
            TwoChannelMissingValuesImpl.log.info(count + " vectors examined for missing values, " + results.size()
                    + " vectors generated so far.");
        }
        return count;
    }

    private boolean checkMissingValue(Collection<Double> extraMissingValueIndicators, Boolean[] detectionCalls,
            Double[] prefRow, int col) {
        // If the "preferred" value is already missing, we retain that, or if it is a special value
        Double pref = prefRow == null ? Double.NaN : prefRow[col];
        if (pref.isNaN() || (extraMissingValueIndicators != null && extraMissingValueIndicators.contains(pref))) {
            detectionCalls[col] = false;
            return true;
        }
        return false;
    }

    /**
     * Determine a threshold based on the data.
     */
    private Double computeSignalThreshold(ExpressionDataDoubleMatrix preferred,
            ExpressionDataDoubleMatrix signalChannelA, ExpressionDataDoubleMatrix signalChannelB,
            ExpressionDataDoubleMatrix baseChannel) {

        Double min = Double.MAX_VALUE;
        Double max = Double.MIN_VALUE;

        for (ExpressionDataMatrixRowElement element : baseChannel.getRowElements()) {
            CompositeSequence designElement = element.getDesignElement();

            int numCols = preferred.columns(designElement);
            for (int col = 0; col < numCols; col++) {

                Double[] signalA = null;
                if (signalChannelA != null) {
                    signalA = signalChannelA.getRow(designElement);
                }

                Double[] signalB = null;
                if (signalChannelB != null) {
                    signalB = signalChannelB.getRow(designElement);
                }

                Double sigAV = (signalA == null || signalA[col] == null) ? Double.NaN : signalA[col];
                Double sigBV = (signalB == null || signalB[col] == null) ? Double.NaN : signalB[col];

                if (!sigAV.isNaN() && sigAV < min) {
                    min = sigAV;
                } else if (!sigBV.isNaN() && sigBV < min) {
                    min = sigBV;
                } else if (!sigAV.isNaN() && sigAV > max) {
                    max = sigAV;
                } else if (!sigBV.isNaN() && sigBV > max) {
                    max = sigBV;
                }

            }
        }

        Histogram h = new Histogram("range", 100, min, max);
        for (ExpressionDataMatrixRowElement element : baseChannel.getRowElements()) {
            CompositeSequence designElement = element.getDesignElement();

            int numCols = preferred.columns(designElement);
            for (int col = 0; col < numCols; col++) {

                Double[] signalA = null;
                if (signalChannelA != null) {
                    signalA = signalChannelA.getRow(designElement);
                }

                Double[] signalB = null;
                if (signalChannelB != null) {
                    signalB = signalChannelB.getRow(designElement);
                }

                Double sigAV = (signalA == null || signalA[col] == null) ? Double.NaN : signalA[col];
                Double sigBV = (signalB == null || signalB[col] == null) ? Double.NaN : signalB[col];

                if (!sigAV.isNaN())
                    h.fill(sigAV);
                if (!sigBV.isNaN())
                    h.fill(sigBV);

            }
        }

        Double thresh = h
                .getApproximateQuantile(TwoChannelMissingValuesImpl.QUANTILE_OF_SIGNAL_TO_USE_IF_NO_BKG_AVAILABLE);

        TwoChannelMissingValuesImpl.log.info("Threshold based on signal=" + thresh);

        return thresh;
    }

    /**
     * Deal with cases when the data we are using to make calls are themselves missing. Note that in other cases where
     * we can't compute missingness at all, we assume everything is present.
     */
    private void fillGapsInCalls(Boolean[] detectionCalls) {
        /*
         * Make a decision on those.
         */
        // double fractionThreshold = 0.5; // if half of calls we made are present, we call gaps pesent
        double fractionThreshold = 0.0; // call all gaps present, unless everything in the row is absent (or missing)
        int numPresentCall = 0;
        int numCalls = 0;
        for (Boolean b : detectionCalls) {
            if (b != null) {
                if (b)
                    numPresentCall++;
                numCalls++;
            }
        }
        boolean decide = numCalls > 0 && numPresentCall / (double) numCalls > fractionThreshold;

        for (int i = 0; i < detectionCalls.length; i++) {
            if (detectionCalls[i] == null)
                detectionCalls[i] = decide;
        }
    }

    /**
     * Decide if the data point is 'present': it has to be above the threshold in one of the channels.
     *
     * @param signalThreshold might be used if we don't have background measurements.
     * @param bkgAV           can be null
     * @param bkgBV           can be null
     * @return call, or null if no decision could be made due to NaN in the values given.
     */
    private Boolean computeCall(double signalToNoiseThreshold, Double signalThreshold, Double sigAV, Double sigBV,
            Double bkgAV, Double bkgBV) {

        if (!sigAV.isNaN() && !bkgAV.isNaN() && sigAV > bkgAV * signalToNoiseThreshold)
            return true;

        if (!sigBV.isNaN() && !bkgBV.isNaN() && sigBV > bkgBV * signalToNoiseThreshold)
            return true;

        // if no background valeues, use the signal threshold, if we have one; both values must meet.
        if (!Double.isNaN(signalThreshold) && bkgAV.isNaN() && bkgBV.isNaN()) {
            return (sigAV > signalThreshold || sigBV > signalThreshold);
        }

        // if both signals are unusable, false.
        if ((sigAV.isNaN() || sigAV == 0) && (sigBV.isNaN() || sigBV == 0)) {
            return false;
        }

        /*
         * We couldn't decide because none of the above calculations could be done.
         */
        if ((sigAV.isNaN() || bkgAV.isNaN()) && (sigBV.isNaN() || bkgBV.isNaN()))
            return null;

        // default: keep.
        return true;
    }

    /**
     * Construct the quantitation type that will be used for the generated DesignElementDataVEctors.
     */
    private QuantitationType getMissingDataQuantitationType(double signalToNoiseThreshold, Double signalThreshold) {
        QuantitationType present = QuantitationType.Factory.newInstance();
        present.setName("Detection call");

        if (!signalThreshold.isNaN()) {
            present.setDescription(
                    "Detection call based on signal threshold of " + signalThreshold + " (Computed by Gemma)");
        } else {
            present.setDescription("Detection call based on signal to noise threshold of " + signalToNoiseThreshold
                    + " (Computed by Gemma)");
        }
        present.setGeneralType(GeneralType.CATEGORICAL);
        present.setIsBackground(false);
        present.setRepresentation(PrimitiveType.BOOLEAN);
        present.setScale(ScaleType.OTHER);
        present.setIsPreferred(false);
        present.setIsMaskedPreferred(false);
        present.setIsBackgroundSubtracted(false);
        present.setIsNormalized(false);
        present.setIsRatio(false);
        present.setType(StandardQuantitationType.PRESENTABSENT);
        present.setIsBatchCorrected(false);
        present.setIsRecomputedFromRawData(false);

        return this.quantitationTypeService.create(present);
    }

    private void logTimeInfo(StopWatch timer, int size) {

        TwoChannelMissingValuesImpl.log
                .info(String.format("Loaded in %.2fs. Thawing %d vectors", timer.getTime() / 1000.0, size));
    }

    /**
     * Check to make sure all the pieces are correctly in place to do the computation.
     *
     * @return true if okay, false if not.
     */
    private boolean validate(ExpressionDataDoubleMatrix preferred, ExpressionDataDoubleMatrix signalChannelA,
            ExpressionDataDoubleMatrix signalChannelB, ExpressionDataDoubleMatrix bkgChannelA,
            ExpressionDataDoubleMatrix bkgChannelB, double signalToNoiseThreshold) {
        // not exhaustive...
        if (preferred == null || (signalChannelA == null && signalChannelB == null)) {
            TwoChannelMissingValuesImpl.log.warn(
                    "Must have at least preferred and one intensity data matrix, missing value computation should not proceed");
            return false;
        }

        if ((bkgChannelA != null && bkgChannelA.rows() == 0) || (bkgChannelB != null && bkgChannelB.rows() == 0)) {
            TwoChannelMissingValuesImpl.log.warn("Background values must not be empty when non-null");
            return false;
        }

        if (signalChannelA != null && signalChannelB != null) {
            if (!(signalChannelA.rows() == signalChannelB.rows())) {
                TwoChannelMissingValuesImpl.log.warn("Collection sizes probably should match in channel A and B "
                        + signalChannelA.rows() + " != " + signalChannelB.rows());
            }

            if (!(signalChannelA.rows() == preferred.rows())) { // vectors with all-missing data are already removed
                TwoChannelMissingValuesImpl.log
                        .warn("Collection sizes probably should match in channel A and preferred type "
                                + signalChannelA.rows() + " != " + preferred.rows());
            }
            int numSamplesA = signalChannelA.columns();
            int numSamplesB = signalChannelB.columns();

            if (numSamplesA != numSamplesB || numSamplesB != preferred.columns()) {
                TwoChannelMissingValuesImpl.log.warn("Number of samples doesn't match!");
                return false;
            }
        }

        if ((bkgChannelA != null && bkgChannelB != null) && bkgChannelA.rows() != bkgChannelB.rows())
            TwoChannelMissingValuesImpl.log.warn("Collection sizes probably should match for background  "
                    + bkgChannelA.rows() + " != " + bkgChannelB.rows());

        if (signalToNoiseThreshold <= 0.0) {
            TwoChannelMissingValuesImpl.log.warn("Signal-to-noise threshold must be greater than zero");
            return false;
        }

        return true;

    }

}