ubic.gemma.core.analysis.preprocess.batcheffects.BatchInfoPopulationHelperServiceImpl.java Source code

Introduction

Here is the source code for ubic.gemma.core.analysis.preprocess.batcheffects.BatchInfoPopulationHelperServiceImpl.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2012 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
package ubic.gemma.core.analysis.preprocess.batcheffects;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import ubic.gemma.core.analysis.util.ExperimentalDesignUtils;
import ubic.gemma.model.association.GOEvidenceCode;
import ubic.gemma.model.common.description.Characteristic;
import ubic.gemma.model.expression.biomaterial.BioMaterial;
import ubic.gemma.model.expression.experiment.*;
import ubic.gemma.persistence.service.expression.biomaterial.BioMaterialService;
import ubic.gemma.persistence.service.expression.experiment.ExperimentalDesignService;
import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService;

import java.text.DateFormat;
import java.util.*;

/**
 * @author paul
 */
@Service
public class BatchInfoPopulationHelperServiceImpl implements BatchInfoPopulationHelperService {

    /**
     * For RNA-seq, the minimum number of samples per batch, if possible
     */
    private static final double MINIMUM_SAMPLES_PER_RNASEQ_BATCH = 2.0;

    /**
     * For microarrays (that come with scan timestamps): how many hours do we allow to pass between samples before we
     * consider them to be a separate batch (if they are not run on the same day). This 'slack' is necessary to allow
     * for the possibility that all the hybridizations were run together, but the scanning took a while to complete. Of
     * course we are still always recording the actual dates; this is only used for the creation of ExperimentalFactors.
     * Making this value too small causes the data to be broken into many batches. I experimented with a value of 2, but
     * this seemed too low. Anything greater than 24 doesn't make much sense.
     */
    private static final int MAX_GAP_BETWEEN_SAMPLES_TO_BE_SAME_BATCH = 8;

    private static final Log log = LogFactory.getLog(BatchInfoPopulationHelperServiceImpl.class);

    private static final String FASTQ_HEADER_EXTRACTION_FAILURE_INDICATOR = "FAILURE";

    @Autowired
    private BioMaterialService bioMaterialService = null;

    @Autowired
    private ExperimentalDesignService experimentalDesignService;

    @Autowired
    private ExpressionExperimentService experimentService;

    @Override
    @Transactional
    public ExperimentalFactor createRnaSeqBatchFactor(ExpressionExperiment ee, Map<BioMaterial, String> headers) {
        /*
         * Go through the headers and convert to factor values.
         */
        Map<String, Collection<String>> batchIdToHeaders = this.convertHeadersToBatches(headers.values());

        Map<String, FactorValue> headerToFv = new HashMap<>();
        ExperimentalFactor ef = createExperimentalFactor(ee, batchIdToHeaders, headerToFv);
        bioMaterialService.associateBatchFactor(headers, headerToFv);

        return ef;
    }

    @Override
    @Transactional
    public ExperimentalFactor createBatchFactor(ExpressionExperiment ee, Map<BioMaterial, Date> dates) {

        /*
         * Go through the dates and convert to factor values.
         */
        Map<String, Collection<Date>> datesToBatch = this.convertDatesToBatches(new HashSet<>(dates.values()));

        Map<Date, FactorValue> d2fv = new HashMap<>();
        ExperimentalFactor ef = createExperimentalFactor(ee, datesToBatch, d2fv);
        bioMaterialService.associateBatchFactor(dates, d2fv);

        return ef;
    }

    /**
     * Apply some heuristics to condense the dates down to batches: dates that are very close together (on the same
     * day, or within MAX_GAP_BETWEEN_SAMPLES_TO_BE_SAME_BATCH hours of each other) are treated as the same batch, and
     * singleton batches are avoided (see the implementation for details).
     *
     * @param  allDates all dates
     * @return          map of batch identifiers to dates
     */
    Map<String, Collection<Date>> convertDatesToBatches(Collection<Date> allDates) {
        List<Date> lDates = new ArrayList<>(allDates);
        Collections.sort(lDates);
        Map<String, Collection<Date>> result = new LinkedHashMap<>();

        int batchNum = 1;
        DateFormat df = DateFormat.getDateInstance(DateFormat.SHORT);
        String batchDateString = "";

        boolean mergedAnySingletons = false;

        /*
         * Iterate over dates
         */
        Date lastDate = null;
        Date nextDate;
        for (int i = 0; i < lDates.size(); i++) {
            Date currentDate = lDates.get(i);

            if (i < lDates.size() - 1) {
                nextDate = lDates.get(i + 1);
            } else {
                nextDate = null;
            }

            if (lastDate == null) {
                // Start our first batch.
                batchDateString = this.formatBatchName(batchNum, df, currentDate);
                result.put(batchDateString, new HashSet<Date>());
                result.get(batchDateString).add(currentDate);
                lastDate = currentDate;
                continue;
            }

            /*
             * Decide whether we have entered a new batch.
             *
             * Rules:
             *
             * - Processing on the same day is always considered the same batch.
             *
             * - Gaps of less than MAX_GAP_BETWEEN_SAMPLES_TO_BE_SAME_BATCH hours are considered the same batch even if
             * the day is different. This allows for "overnight running" of batches.
             *
             * And then two rules that keep us from having batches with just one sample. Such batches buy us nothing at
             * all.
             *
             * - A "singleton" batch at the end of the series is always combined with the last batch.
             *
             * - A "singleton" batch in the middle is combined with either the next or previous batch, whichever is
             * nearer in time.
             */
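            // Illustrative example (hypothetical dates): scans at Jan 1 09:00, Jan 1 22:00 and Jan 2 04:00 all fall
            // into one batch (same day, then an overnight gap of under MAX_GAP_BETWEEN_SAMPLES_TO_BE_SAME_BATCH
            // hours), while a scan at Jan 5 09:00 would start a new batch; if that new batch would end up with a
            // single sample, the singleton rules above merge it into a neighbouring batch instead.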
            if (this.gapIsLarge(lastDate, currentDate) && result.get(batchDateString).size() > 1) {

                if (nextDate == null) {
                    /*
                     * We're at the last sample, and it's a singleton. We fall through and allow adding it to the end of
                     * the last batch.
                     */
                    BatchInfoPopulationHelperServiceImpl.log
                            .warn("Singleton at the end of the series, combining with the last batch: gap is "
                                    + String.format("%.2f", (currentDate.getTime() - lastDate.getTime())
                                            / (double) (1000 * 60 * 60))
                                    + " hours.");
                    mergedAnySingletons = true;
                } else if (this.gapIsLarge(currentDate, nextDate)) {
                    /*
                     * Then we have a singleton that will be stranded when we go to the next date. Do we combine it
                     * forwards or backwards? We choose the smaller gap.
                     */
                    long backwards = currentDate.getTime() - lastDate.getTime();
                    long forwards = nextDate.getTime() - currentDate.getTime();

                    if (forwards < backwards) {
                        // Start a new batch.
                        BatchInfoPopulationHelperServiceImpl.log
                                .warn("Singleton resolved by waiting for the next batch: gap is "
                                        + String.format("%.2f", (nextDate.getTime() - currentDate.getTime())
                                                / (double) (1000 * 60 * 60))
                                        + " hours.");
                        batchNum++;
                        batchDateString = this.formatBatchName(batchNum, df, currentDate);
                        result.put(batchDateString, new HashSet<Date>());
                        mergedAnySingletons = true;
                    } else {
                        BatchInfoPopulationHelperServiceImpl.log
                                .warn("Singleton resolved by adding to the last batch: gap is "
                                        + String.format("%.2f", (currentDate.getTime() - lastDate.getTime())
                                                / (double) (1000 * 60 * 60))
                                        + " hours.");
                        // don't start a new batch, fall through.
                    }

                } else {
                    batchNum++;
                    batchDateString = this.formatBatchName(batchNum, df, currentDate);
                    result.put(batchDateString, new HashSet<Date>());
                }

            }
            // else we fall through and add the current date to the current batch.

            // express the constraint that we don't allow batches of size 1, even if we would have normally left it in
            // its own batch.
            if (result.get(batchDateString).size() == 1 && this.gapIsLarge(lastDate, currentDate)) {
                mergedAnySingletons = true;
                BatchInfoPopulationHelperServiceImpl.log
                        .warn("Stranded singleton automatically being merged into a larger batch");
            }

            result.get(batchDateString).add(currentDate);
            lastDate = currentDate;
        }

        if (mergedAnySingletons && result.size() == 1) {
            // The implication is that if we didn't have the singleton merging, we would have more than one batch.
            BatchInfoPopulationHelperServiceImpl.log
                    .warn("Singleton merging resulted in all batches being combined");
        }

        return result;

    }

    /**
     * Apply some heuristics to condense the fastq headers to batches. This operates only on strings to make testing
     * easier.
     *
     * @param  headers collection of fastq headers for all samples.
     * @return         map of batch names to the headers (sample-specific) for that batch.
     */
    Map<String, Collection<String>> convertHeadersToBatches(Collection<String> headers) {
        Map<String, Collection<String>> result = new LinkedHashMap<>();

        Map<FastqHeaderData, Collection<String>> batchInfos = new HashMap<>();

        // parse headers; keep track of the platforms, which we use as a fallback if the headers are not clean
        Set<String> platforms = new HashSet<>();
        for (String header : headers) {
            FastqHeaderData batchInfoForSample = parseFASTQHeaderForBatch(header);

            String device = batchInfoForSample.getDevice();
            if (device.startsWith("GPL")) {
                platforms.add(device);
            }

            if (!batchInfos.containsKey(batchInfoForSample)) {
                batchInfos.put(batchInfoForSample, new HashSet<String>());
            }
            batchInfos.get(batchInfoForSample).add(header);
        }

        // key step ...
        batch(batchInfos, headers.size());

        // switch to using string keys for batch identifiers (and check for bad headers)
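        // e.g. (hypothetical values) a batch identifier will look like
        // "Device=D8ZGT8Q1:Run=199:Flowcell=C5GKYACXX:Lane=5", per FastqHeaderData.toString(), with fields
        // possibly dropped by batch() above.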
        boolean anyBadHeaders = false;
        for (FastqHeaderData fhd : batchInfos.keySet()) {

            if (!fhd.hadUsableHeader) {
                anyBadHeaders = true;
            }
            String batchIdentifier = fhd.toString();
            if (!result.containsKey(batchIdentifier)) {
                result.put(batchIdentifier, new HashSet<String>());
            }
            Collection<String> headersInBatch = batchInfos.get(fhd);
            result.get(batchIdentifier).addAll(headersInBatch);
        }

        // If *any* of them are no good (and the platform was constant), we fail noisily
        if (platforms.size() == 1 && anyBadHeaders) {
            throw new RuntimeException(
                    "Batch could not be determined: at least one unusable header and platform is constant");
        }

        // otherwise, having just one batch means that, as far as we can tell, there was only one batch.
        log.info(result.size() + " batches detected");

        return result;

    }

    /*
     * 
     * RNA-seq: See how many batches we have for each level of granularity; pick the best number. This is pretty crude,
     * and involves recreating the map multiple times
     */
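    // For example (hypothetical counts): 12 samples spread across 10 device:run:flowcell:lane combinations would be
    // re-batched at device:run:flowcell resolution, then device:run, and so on, until the average batch size reaches
    // MINIMUM_SAMPLES_PER_RNASEQ_BATCH or the resolution cannot be lowered any further.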
    private void batch(Map<FastqHeaderData, Collection<String>> batchInfos, int numSamples) {

        int numBatches = batchInfos.size();

        /*
         * There are two possible problems. First, there could be no batches at all. Second, there could be too many
         * batches.
         */
        if (numBatches == 1) {
            // no batches - this will get sorted out later, proceed
            return;
        } else if (numBatches == numSamples
                || (double) numSamples / (double) numBatches < MINIMUM_SAMPLES_PER_RNASEQ_BATCH) {
            // too few samples per batch on average. Try to reduce resolution and recount.
            Map<FastqHeaderData, Collection<String>> updatedBatchInfos = dropResolution(batchInfos);

            if (updatedBatchInfos.size() == batchInfos.size()) {
                return;
            }

            batchInfos = updatedBatchInfos;

            batch(batchInfos, numSamples); // recurse
        } else {
            // reasonable number of samples per batch -- proceed. 
            return;
        }

    }

    /*
     * RNAseq: Update the batch info with a lower resolution. This is only effective if we have a usable header for all
     * samples.
     */
    private Map<FastqHeaderData, Collection<String>> dropResolution(
            Map<FastqHeaderData, Collection<String>> batchInfos) {

        Map<FastqHeaderData, Collection<String>> result = new HashMap<>();
        for (FastqHeaderData fhd : batchInfos.keySet()) {

            if (!fhd.hadUseableHeader()) {
                // cannot drop resolution.
                result.put(fhd, batchInfos.get(fhd));
                continue;
            }

            FastqHeaderData updated = fhd.dropResolution();

            if (updated.equals(fhd))
                return batchInfos;

            if (!result.containsKey(updated)) {
                result.put(updated, new HashSet<String>());
            }
            result.get(updated).addAll(batchInfos.get(fhd));
        }

        return result;

    }

    /**
     * We expect something like: @SRR5938435.1.1 D8ZGT8Q1:199:C5GKYACXX:5:1101:1224:1885 length=100
     * We are only interested in the middle section (D8ZGT8Q1:199:C5GKYACXX:5 in the example).
     * 
     * We augment the original header with the GPL id, which is only used if the machine etc. cannot be read from the
     * rest of the header.
     * 
     * Format 1: <platform id>;;;<machine_id>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:<is
     * filtered>:<control number>:<index sequence>; we can use the first four fields.
     * 
     * Format 2: <platform id>;;;<machine_id>:<lane>:<tile>:<x_coord>:<y_coord>; we can use machine and lane.
     * 
     * @param  header FASTQ header (may contain multiple concatenated headers when there is more than one FASTQ file)
     * @return        representation of the batch info, which is going to be a portion of the header string
     */
    FastqHeaderData parseFASTQHeaderForBatch(String header) {

        if (!header.contains(BatchInfoPopulationServiceImpl.MULTIFASTQHEADER_DELIMITER)) {
            throw new UnsupportedOperationException(
                    "Header does not appear to be in the expected format: " + header);
        }

        String[] headers = header.split(BatchInfoPopulationServiceImpl.MULTIFASTQHEADER_DELIMITER);
        FastqHeaderData currentBatch = null;

        String platform = headers[0]; // e.g. GPL134
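        // Illustrative example (hypothetical values): for a header like
        // "GPL11154;;;@SRR5938435.1.1 D8ZGT8Q1:199:C5GKYACXX:5:1101:1224:1885 length=100", assuming ";;;" is the
        // multi-FASTQ delimiter, the middle field splits on ':' into seven parts, giving device=D8ZGT8Q1, run=199,
        // flowCell=C5GKYACXX and lane=5 (format 1 above).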

        for (String field : headers) {
            if (StringUtils.isBlank(field))
                continue;
            if (field.equals(platform))
                continue; // skip the first field.

            FastqHeaderData fqd;
            if (field.equals(FASTQ_HEADER_EXTRACTION_FAILURE_INDICATOR)) {
                // no actual header was extracted for this sample; only the platform is available
                fqd = new FastqHeaderData(platform);
            } else {
                // an actual header, e.g. "@SRR5938435.1.1 D8ZGT8Q1:199:C5GKYACXX:5:1101:1224:1885 length=100"
                String[] fields = field.split("\\s");
                if (fields.length != 3 || !fields[1].contains(":")) {
                    // not a valid header; we expect three whitespace-delimited fields with a ':'-delimited middle part
                    fqd = new FastqHeaderData(platform);
                } else {
                    String[] arr = fields[1].split(":");
                    if (arr.length == 7) {
                        // format 1: device, run, flowcell and lane are usable
                        fqd = new FastqHeaderData(arr[0], arr[1], arr[2], arr[3]);
                    } else if (arr.length == 5) {
                        // format 2: device and lane are the only usable fields
                        fqd = new FastqHeaderData(arr[0], arr[1]);
                    } else {
                        // something else, also not usable; fall back to the platform
                        fqd = new FastqHeaderData(platform);
                    }
                }
            }

            if (currentBatch == null) {
                currentBatch = fqd;
            } else {
                if (currentBatch.equals(fqd)) {
                    continue;
                }

                // from a different run
                currentBatch.add(fqd);

            }

        }
        return currentBatch;
    }

    class FastqHeaderData {

        @Override
        public String toString() {
            String s = null;

            if (this.device != null) {
                s = "Device=" + device;
            }
            if (this.run != null) {
                s = s + ":Run=" + run;
            }
            if (this.flowCell != null) {
                s = s + ":Flowcell=" + flowCell;
            }
            if (this.lane != null) {
                s = s + ":Lane=" + lane;
            }
            return s;
        }

        /**
         * @return a copy of this header data with the finest-grained field dropped (lane, then flow cell, then run);
         *         returns this unchanged if there is nothing left to drop.
         */
        public FastqHeaderData dropResolution() {
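            // For example (hypothetical values): device=D8ZGT8Q1, run=199, flowCell=C5GKYACXX, lane=5 coarsens to
            // (device, run, flowCell), then (device, run), then just (device) on successive calls.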
            if (this.lane != null) {
                return new FastqHeaderData(this.device, this.run, this.flowCell, null);
            } else if (this.flowCell != null) {
                return new FastqHeaderData(this.device, this.run, null, null);
            } else if (this.run != null) {
                return new FastqHeaderData(this.device, null, null, null);
            }
            return this; // might want to return null if we need to signal a stopping condition.
        }

        private String device = null;

        protected String getDevice() {
            return device;
        }

        // assumption here: the header we are adding has the same fields as the original.
        public void add(FastqHeaderData fqd) {

            if (this.equals(fqd))
                return;

            if (this.device != null && !this.device.equals(fqd.getDevice())) {
                this.device = this.device + "/" + fqd.getDevice();
            }

            if (this.run != null && !this.run.equals(fqd.getRun())) {
                this.run = this.run + "/" + fqd.getRun();
            }

            if (this.flowCell != null && !this.flowCell.equals(fqd.getFlowCell())) {
                this.flowCell = this.flowCell + "/" + fqd.getFlowCell();
            }

            if (this.lane != null && !this.lane.equals(fqd.getLane())) {
                this.lane = this.lane + "/" + fqd.getLane();
            }

        }

        protected String getLane() {
            return lane;
        }

        protected String getFlowCell() {
            return flowCell;
        }

        protected String getRun() {
            return run;
        }

        private String lane = null;
        private String flowCell = null;
        private String run = null;
        private boolean hadUsableHeader = false;

        protected boolean hadUseableHeader() {
            return hadUsableHeader;
        }

        FastqHeaderData(String device, String lane) {
            this.device = device;
            this.lane = lane;
            this.hadUsableHeader = true;
        }

        /**
         * Only for use when we don't have a usable device, run etc.
         * 
         * @param platform e.g. GPLXXXX
         */
        FastqHeaderData(String platform) {
            this.device = platform;
            this.hadUsableHeader = false;
        }

        FastqHeaderData(String device, String run, String flowCell, String lane) {
            this(device, lane);
            this.flowCell = flowCell;
            this.run = run;
            this.hadUsableHeader = true;
        }

        private BatchInfoPopulationHelperServiceImpl getOuterType() {
            return BatchInfoPopulationHelperServiceImpl.this;
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + getOuterType().hashCode();
            result = prime * result + ((device == null) ? 0 : device.hashCode());
            result = prime * result + ((flowCell == null) ? 0 : flowCell.hashCode());
            result = prime * result + ((lane == null) ? 0 : lane.hashCode());
            result = prime * result + ((run == null) ? 0 : run.hashCode());
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null) {
                return false;
            }
            if (getClass() != obj.getClass()) {
                return false;
            }
            FastqHeaderData other = (FastqHeaderData) obj;
            if (!getOuterType().equals(other.getOuterType())) {
                return false;
            }

            if (device == null) {
                if (other.device != null) {
                    return false;
                }
            } else if (!device.equals(other.device)) {
                return false;
            }
            if (flowCell == null) {
                if (other.flowCell != null) {
                    return false;
                }
            } else if (!flowCell.equals(other.flowCell)) {
                return false;
            }
            if (lane == null) {
                if (other.lane != null) {
                    return false;
                }
            } else if (!lane.equals(other.lane)) {
                return false;
            }
            if (run == null) {
                if (other.run != null) {
                    return false;
                }
            } else if (!run.equals(other.run)) {
                return false;
            }
            return true;
        }

    }

    /*
     * For RNA-seq, descriptorsToBatch is a map of batch ids to headers;
     * for microarrays, it is a map of batch ids to dates.
     * d2fv is populated by this call to be a map of headers or dates to factor values.
     */
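    // For example, with date-based batches each key of descriptorsToBatch (a name built by formatBatchName from
    // ExperimentalDesignUtils.BATCH_FACTOR_NAME_PREFIX, a zero-padded batch number and the truncated date) becomes
    // one FactorValue carrying a single Characteristic with the batch category.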
    private <T> ExperimentalFactor createExperimentalFactor(ExpressionExperiment ee,
            Map<String, Collection<T>> descriptorsToBatch, Map<T, FactorValue> d2fv) {
        ExperimentalFactor ef = null;
        if (descriptorsToBatch == null || descriptorsToBatch.size() < 2) {
            if (descriptorsToBatch != null) {

                /*
                 * Corner case: we may be unable to tell whether there are actually batches, because of a lack of
                 * information. An example is when we don't have usable FASTQ headers and all the GPL ids are the same.
                 */
                String batchIdentifier = descriptorsToBatch.keySet().iterator().next();
                if (batchIdentifier.startsWith("GPL")) {
                    throw new RuntimeException(
                            "No reliable batch information was available: no usable FASTQ headers and only one GPL ID associated with the experiment.");
                }

                // Otherwise, we trust that either the FASTQ headers or dates are a reasonable representation.
                BatchInfoPopulationHelperServiceImpl.log
                        .info("There is only one 'batch', no factor will be created");

            }

        } else {
            log.info("Persisting information on " + descriptorsToBatch.size() + " batches");

            ef = this.makeFactorForBatch(ee);

            for (String batchId : descriptorsToBatch.keySet()) {
                FactorValue fv = FactorValue.Factory.newInstance();
                fv.setIsBaseline(false); /* we could set true for the first batch, but nobody cares. */
                fv.setValue(batchId);
                Collection<Characteristic> chars = new HashSet<>();
                Characteristic c = Characteristic.Factory.newInstance();
                c.setCategory(ExperimentalDesignUtils.BATCH_FACTOR_CATEGORY_NAME);
                c.setValue(batchId);
                c.setCategoryUri(ExperimentalDesignUtils.BATCH_FACTOR_CATEGORY_URI);
                c.setEvidenceCode(GOEvidenceCode.IIA);

                chars.add(c);
                fv.setCharacteristics(chars);
                fv.setExperimentalFactor(ef);

                /*
                 * persist
                 */
                experimentService.addFactorValue(ee, fv);

                for (T d : descriptorsToBatch.get(batchId)) {
                    d2fv.put(d, fv);
                }
            }
        }
        return ef;
    }

    private ExperimentalFactor makeFactorForBatch(ExpressionExperiment ee) {
        ExperimentalDesign ed = ee.getExperimentalDesign();
        ExperimentalFactor ef = ExperimentalFactor.Factory.newInstance();
        ef.setType(FactorType.CATEGORICAL);
        ef.setCategory(this.getBatchFactorCategory());
        ef.setExperimentalDesign(ed);
        ef.setName(ExperimentalDesignUtils.BATCH_FACTOR_NAME);
        ef.setDescription(
                "Scan date or similar proxy for 'sample processing batch'" + " extracted from the raw data files.");

        ef = this.persistFactor(ee, ef);
        return ef;
    }

    private ExperimentalFactor persistFactor(ExpressionExperiment ee, ExperimentalFactor factor) {
        ExperimentalDesign ed = experimentalDesignService.load(ee.getExperimentalDesign().getId());

        if (ed == null) {
            throw new IllegalStateException("No experimental design for " + ee);
        }

        return this.experimentService.addFactor(ee, factor);

    }

    private String formatBatchName(int batchNum, DateFormat df, Date d) {
        String batchDateString;
        batchDateString = ExperimentalDesignUtils.BATCH_FACTOR_NAME_PREFIX
                + StringUtils.leftPad(Integer.toString(batchNum), 2, "0") + "_"
                + df.format(DateUtils.truncate(d, Calendar.HOUR));
        return batchDateString;
    }

    /**
     * @param  earlierDate earlier date
     * @param  date        later date
     * @return             false if 'date' is considered to be in the same batch as 'earlierDate', true if we should
     *                     treat it as a separate batch.
     */
    private boolean gapIsLarge(Date earlierDate, Date date) {
        return !DateUtils.isSameDay(date, earlierDate)
                && DateUtils
                        .addHours(earlierDate,
                                BatchInfoPopulationHelperServiceImpl.MAX_GAP_BETWEEN_SAMPLES_TO_BE_SAME_BATCH)
                        .before(date);
    }

    private Characteristic getBatchFactorCategory() {
        Characteristic c = Characteristic.Factory.newInstance();
        c.setCategory(ExperimentalDesignUtils.BATCH_FACTOR_CATEGORY_NAME);
        c.setValue(ExperimentalDesignUtils.BATCH_FACTOR_CATEGORY_NAME);
        c.setValueUri(ExperimentalDesignUtils.BATCH_FACTOR_CATEGORY_URI);
        c.setCategoryUri(ExperimentalDesignUtils.BATCH_FACTOR_CATEGORY_URI);
        c.setEvidenceCode(GOEvidenceCode.IIA);
        return c;
    }

}