ubic.gemma.analysis.expression.diff.DifferentialExpressionAnalysisUtil.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.analysis.expression.diff.DifferentialExpressionAnalysisUtil.java

Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2007 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.analysis.expression.diff;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import ubic.gemma.datastructure.matrix.ExpressionDataMatrix;
import ubic.gemma.model.expression.bioAssay.BioAssay;
import ubic.gemma.model.expression.biomaterial.BioMaterial;
import ubic.gemma.model.expression.experiment.BioAssaySet;
import ubic.gemma.model.expression.experiment.ExperimentalFactor;
import ubic.gemma.model.expression.experiment.FactorType;
import ubic.gemma.model.expression.experiment.FactorValue;

/**
 * A helper class for the differential expression analyzers. This class contains helper methods commonly needed when
 * performing an analysis.
 * 
 * @author keshav
 * @version $Id: DifferentialExpressionAnalysisUtil.java,v 1.8 2013/05/17 14:20:16 paul Exp $
 */
public class DifferentialExpressionAnalysisUtil {

    private static Log log = LogFactory.getLog(DifferentialExpressionAnalysisUtil.class);

    /**
     * Returns true if the block design is complete and there are at least 2 biological replicates for each "group",
     * false otherwise. When determining completeness, a biomaterial's factor values are only considered if they are
     * equivalent to one of the input experimental factors.
     * 
     * @param expressionExperiment
     * @param factors to consider completeness for.
     * @return boolean
     */
    public static boolean blockComplete(BioAssaySet expressionExperiment, Collection<ExperimentalFactor> factors) {

        Collection<BioMaterial> biomaterials = getBioMaterials(expressionExperiment);

        /*
         * Get biomaterials with only those factor values equal to the factor values in the input factors. Only these
         * factor values in each biomaterial will be used to determine completeness.
         */
        Collection<BioMaterial> biomaterialsWithGivenFactorValues = filterFactorValuesFromBiomaterials(factors,
                biomaterials);

        boolean completeBlock = checkBlockDesign(biomaterialsWithGivenFactorValues, factors);

        boolean hasAllReps = checkBiologicalReplicates(expressionExperiment, factors);

        return completeBlock && hasAllReps;
    }

    /**
     * See if there are at least two samples for each factor value combination.
     * 
     * @param expressionExperiment
     * @param factors
     * @return
     */
    protected static boolean checkBiologicalReplicates(BioAssaySet expressionExperiment,
            Collection<ExperimentalFactor> factors) {

        Collection<BioMaterial> biomaterials = getBioMaterials(expressionExperiment);

        for (BioMaterial firstBm : biomaterials) {

            Collection<FactorValue> factorValuesToCheck = getRelevantFactorValues(factors, firstBm);

            boolean match = false;
            for (BioMaterial secondBm : biomaterials) {

                if (firstBm.equals(secondBm))
                    continue;

                Collection<FactorValue> factorValuesToCompareTo = getRelevantFactorValues(factors, secondBm);

                if (factorValuesToCheck.size() == factorValuesToCompareTo.size()
                        && factorValuesToCheck.containsAll(factorValuesToCompareTo)) {
                    log.debug("Replicate found for biomaterial " + firstBm + ".");
                    match = true;
                    break;
                }
            }
            if (!match) {
                log.warn("No replicate found for biomaterial " + firstBm + ".");
                return false;
            }
        }

        return true;

    }

    /**
     * Check that <em>at least one</em> of the given factors is valid: the factorvalues are measurements, or that there
     * are at least two assays for at least one factor value.
     * 
     * @param expressionExperiment
     * @param experimentalFactors
     * @return
     */
    public static boolean checkValidForLm(BioAssaySet expressionExperiment,
            Collection<ExperimentalFactor> experimentalFactors) {
        for (ExperimentalFactor experimentalFactor : experimentalFactors) {
            boolean ok = checkValidForLm(expressionExperiment, experimentalFactor);
            if (ok)
                return true;
        }
        return false;
    }

    /**
     * Check that the factorvalues are measurements, or that there are at least two assays for at least one factor
     * value. Otherwise the model fit will be perfect and pvalues will not be returned.
     * 
     * @param expressionExperiment
     * @param experimentalFactor
     * @return true if it's okay, false otherwise.
     */
    public static boolean checkValidForLm(BioAssaySet expressionExperiment, ExperimentalFactor experimentalFactor) {

        if (experimentalFactor.getFactorValues().size() < 2) {
            return false;
        }

        if (experimentalFactor.getType().equals(FactorType.CONTINUOUS)) {
            return true;
        }

        /*
         * Check to make sure more than one factor value is actually used.
         */
        boolean replicatesok = false;
        Map<FactorValue, Integer> counts = new HashMap<FactorValue, Integer>();
        for (BioAssay ba : (Collection<BioAssay>) expressionExperiment.getBioAssays()) {
            BioMaterial bm = ba.getSampleUsed();
            for (FactorValue fv : bm.getFactorValues()) {
                if (fv.getExperimentalFactor().equals(experimentalFactor)) {

                    if (!counts.containsKey(fv)) {
                        counts.put(fv, 0);
                    }

                    counts.put(fv, counts.get(fv) + 1);
                    if (counts.get(fv) > 1) {
                        replicatesok = true;
                    }

                }

            }
        }

        if (counts.size() < 2) {
            log.warn(experimentalFactor + " has only " + counts.size()
                    + " levels used in the current set, it cannot be analyzed");
            return false;
        }

        return replicatesok;
    }

    /**
     * Returns a List of all the different types of biomaterials across all bioassays in the experiment. If there is
     * more than one biomaterial per bioassay, a {@link RuntimeException} is thrown.
     * 
     * @param expressionExperiment
     * @return
     * @throws Exception
     */
    public static List<BioMaterial> getBioMaterialsForBioAssays(ExpressionDataMatrix<?> matrix) {

        List<BioMaterial> biomaterials = new ArrayList<BioMaterial>();

        Collection<BioAssay> assays = new ArrayList<BioAssay>();
        for (int i = 0; i < matrix.columns(); i++) {
            Collection<BioAssay> bioassays = matrix.getBioAssaysForColumn(i);
            /*
             * Note: we could use addAll here. The mapping of bioassays to biomaterials is many-to-one (e.g., when the
             * HGU133A and B arrays were both used on the same set of samples).
             */
            assays.add(bioassays.iterator().next());
        }

        for (BioAssay assay : assays) {
            BioMaterial material = assay.getSampleUsed();

            biomaterials.add(material);

        }

        return biomaterials;
    }

    /**
     * Returns the factors that can be used by R for a one way anova or t-test. There requirement here is that there is
     * only one factor value per factor per biomaterial (of course), and all factor values are from the same
     * experimental factor.
     * <p>
     * FIXME use the ExperimentalFactor as the input, not the FactorValues.
     * 
     * @param factorValues
     * @param samplesUsed
     * @return list of strings representing the factor, in the same order as the supplied samplesUsed.
     */
    public static List<String> getRFactorsFromFactorValuesForOneWayAnova(Collection<FactorValue> factorValues,
            List<BioMaterial> samplesUsed) {

        List<String> rFactors = new ArrayList<String>();

        for (BioMaterial biomaterial : samplesUsed) {
            Collection<FactorValue> factorValuesFromBioMaterial = biomaterial.getFactorValues();

            /*
             * Actually, it's fine if there are multiple factorValues, so long as they are from different factors.
             */

            if (factorValuesFromBioMaterial.size() != 1) {

                /*
                 * Check to see if they are from different factors.
                 */
                ExperimentalFactor seen = null;
                for (FactorValue fv : factorValuesFromBioMaterial) {
                    if (seen != null && fv.getExperimentalFactor().equals(seen)) {
                        throw new RuntimeException(
                                "There should only be one factorvalue per factor per biomaterial, " + biomaterial
                                        + " had multiple values for " + seen);
                    }
                    seen = fv.getExperimentalFactor();
                }
            }

            boolean found = false;
            for (FactorValue candidate : factorValuesFromBioMaterial) {
                if (factorValues.contains(candidate)) {
                    rFactors.add(candidate.getId() + "_f");
                    found = true;
                    break;
                }
            }

            if (!found) {
                throw new IllegalStateException("No match for factorvalue on " + biomaterial
                        + " among possible factorValues: " + StringUtils.join(factorValues, "\n"));
            }
        }
        return rFactors;
    }

    /**
     * Returns the factors that can be used by R for a two way anova. Each sample must have a factor value equal to one
     * of the supplied factor values. This assumes that "equals" works correctly on the factor values.
     * 
     * @param experimentalFactor
     * @param samplesUsed the samples we want to assign to the various factors
     * @return R factor representation, in the same order as the given samplesUsed.
     */
    public static List<String> getRFactorsFromFactorValuesForTwoWayAnova(ExperimentalFactor experimentalFactor,
            List<BioMaterial> samplesUsed) {

        List<String> rFactors = new ArrayList<String>();

        for (BioMaterial sampleUsed : samplesUsed) {
            Collection<FactorValue> factorValuesFromBioMaterial = sampleUsed.getFactorValues();
            boolean match = false;

            for (FactorValue factorValue : factorValuesFromBioMaterial) {
                for (FactorValue candidateMatch : experimentalFactor.getFactorValues()) {
                    if (candidateMatch.equals(factorValue)) {
                        rFactors.add(factorValue.getId().toString());
                        match = true;
                        break;
                    }
                }
            }
            if (!match)
                throw new IllegalStateException(
                        "None of the Factor values of the biomaterial match the supplied factor values.");
        }

        return rFactors;
    }

    /**
     * Returns true if all of the following conditions hold true: each biomaterial has more than 2 factor values, each
     * biomaterial has a factor value from one of the input factors paired with a factor value from the other input
     * factors, and all factor values from 1 factor have been paired with all factor values from the other factors,
     * across all biomaterials.
     * 
     * @param biomaterials
     * @param factorValues
     * @return false if not a complete block design.
     */
    protected static boolean checkBlockDesign(Collection<BioMaterial> biomaterials,
            Collection<ExperimentalFactor> experimentalFactors) {

        Collection<Set<FactorValue>> factorValuePairings = generateFactorValuePairings(experimentalFactors);

        /* check to see if the biomaterial's factor value pairing is one of the possible combinations */
        Map<Collection<FactorValue>, BioMaterial> seenPairings = new HashMap<Collection<FactorValue>, BioMaterial>();
        for (BioMaterial m : biomaterials) {

            Collection<FactorValue> factorValuesFromBioMaterial = m.getFactorValues();

            if (factorValuesFromBioMaterial.size() < experimentalFactors.size()) {
                log.warn("Biomaterial must have at least " + experimentalFactors.size()
                        + "factor value.  Incomplete block design. " + m);
                return false;
            }

            /*
             * Find a combination of factors used in the model that this biomaterial has.
             */
            boolean ok = false;
            for (Set<FactorValue> pairing : factorValuePairings) {
                if (factorValuesFromBioMaterial.containsAll(pairing)) {
                    ok = true;
                    break;
                }
            }

            if (!ok) {
                /*
                 * This amounts to a missing value.
                 */
                throw new IllegalArgumentException(
                        "Biomaterial does not have a combination of factors matching the model; design error?: "
                                + m);
                // continue;
            }

            seenPairings.put(factorValuesFromBioMaterial, m);
        }
        if (seenPairings.size() != factorValuePairings.size()) {
            log.warn("Biomaterial not paired with all factor values for each of " + experimentalFactors.size()
                    + " experimental factors.  Found " + seenPairings.size() + " pairings but should have "
                    + factorValuePairings.size() + ".  Incomplete block design.");
            return false;
        }
        return true;

    }

    /**
     * Generates all possible factor value pairings for the given experimental factors.
     * 
     * @param experimentalFactors
     * @return A collection of hashsets, where each hashset is a pairing.
     */
    protected static Collection<Set<FactorValue>> generateFactorValuePairings(
            Collection<ExperimentalFactor> experimentalFactors) {
        /* set up the possible pairings */
        Collection<FactorValue> allFactorValues = new HashSet<FactorValue>();
        for (ExperimentalFactor experimentalFactor : experimentalFactors) {
            allFactorValues.addAll(experimentalFactor.getFactorValues());
        }

        Collection<FactorValue> allFactorValuesCopy = allFactorValues;

        Collection<Set<FactorValue>> factorValuePairings = new HashSet<Set<FactorValue>>();

        for (FactorValue factorValue : allFactorValues) {
            for (FactorValue f : allFactorValuesCopy) {
                if (f.getExperimentalFactor().equals(factorValue.getExperimentalFactor()))
                    continue;

                HashSet<FactorValue> factorValuePairing = new HashSet<FactorValue>();
                factorValuePairing.add(factorValue);
                factorValuePairing.add(f);

                if (!factorValuePairings.contains(factorValuePairing)) {
                    factorValuePairings.add(factorValuePairing);
                }
            }
        }
        return factorValuePairings;
    }

    /**
     * Returns biomaterials with 'filtered' factor values. That is, each biomaterial will only contain those factor
     * values equivalent to a factor value from one of the input experimental factors.
     * <p>
     * FIXME I don't like the way this code modifies the factorvalues associated with the biomaterial.
     * 
     * @param factors
     * @param biomaterials
     * @return Collection<BioMaterial>
     */
    private static Collection<BioMaterial> filterFactorValuesFromBiomaterials(
            Collection<ExperimentalFactor> factors, Collection<BioMaterial> biomaterials) {

        assert !biomaterials.isEmpty();
        assert !factors.isEmpty();

        Collection<FactorValue> allFactorValuesFromGivenFactors = new HashSet<FactorValue>();
        for (ExperimentalFactor ef : factors) {
            allFactorValuesFromGivenFactors.addAll(ef.getFactorValues());
        }

        Collection<BioMaterial> biomaterialsWithGivenFactorValues = new HashSet<BioMaterial>();
        int numHaveAny = 0;
        for (BioMaterial b : biomaterials) {
            Collection<FactorValue> biomaterialFactorValues = b.getFactorValues();
            Collection<FactorValue> factorValuesToConsider = new HashSet<FactorValue>();
            factorValuesToConsider.addAll(biomaterialFactorValues);
            for (FactorValue biomaterialFactorValue : biomaterialFactorValues) {
                numHaveAny++;
                if (!allFactorValuesFromGivenFactors.contains(biomaterialFactorValue)) {
                    factorValuesToConsider.remove(biomaterialFactorValue);
                }
            }
            b.setFactorValues(factorValuesToConsider);
            biomaterialsWithGivenFactorValues.add(b);
        }

        if (numHaveAny == 0) {
            throw new IllegalStateException("No biomaterials had any factor values");
        }

        return biomaterialsWithGivenFactorValues;
    }

    /**
     * Returns a collection of all the different types of biomaterials across all bioassays in the experiment.
     * 
     * @param expressionExperiment
     * @return
     */
    private static List<BioMaterial> getBioMaterials(BioAssaySet ee) {

        List<BioMaterial> biomaterials = new ArrayList<BioMaterial>();

        /* look for 1 bioassay/matrix column and 1 biomaterial/bioassay */
        for (BioAssay assay : (Collection<BioAssay>) ee.getBioAssays()) {
            BioMaterial material = assay.getSampleUsed();
            biomaterials.add(material);
        }

        return biomaterials;
    }

    /**
     * Isolate a biomaterial's factor values for a specific factor(s).
     * 
     * @param factors
     * @param biomaterial
     * @return the factor values the biomaterial has for the given factors.
     */
    private static Collection<FactorValue> getRelevantFactorValues(Collection<ExperimentalFactor> factors,
            BioMaterial biomaterial) {
        Collection<FactorValue> factorValues = biomaterial.getFactorValues();

        Collection<FactorValue> factorValuesToCheck = new HashSet<FactorValue>();
        for (FactorValue factorValue : factorValues) {
            if (factors.contains(factorValue.getExperimentalFactor())) {
                factorValuesToCheck.add(factorValue);
            }
        }
        return factorValuesToCheck;
    }

}