ubic.gemma.core.loader.expression.arrayDesign.ArrayDesignSequenceAlignmentServiceImpl.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.core.loader.expression.arrayDesign.ArrayDesignSequenceAlignmentServiceImpl.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2006 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.loader.expression.arrayDesign;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.reflect.FieldUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import ubic.basecode.util.StringUtil;
import ubic.gemma.core.analysis.report.ArrayDesignReportService;
import ubic.gemma.core.analysis.sequence.ProbeMapUtils;
import ubic.gemma.core.apps.Blat;
import ubic.gemma.core.apps.ShellDelegatingBlat;
import ubic.gemma.core.externalDb.GoldenPathQuery;
import ubic.gemma.model.common.description.ExternalDatabase;
import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.genome.PhysicalLocation;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.model.genome.biosequence.BioSequence;
import ubic.gemma.model.genome.sequenceAnalysis.BlatResult;
import ubic.gemma.persistence.persister.Persister;
import ubic.gemma.persistence.service.expression.arrayDesign.ArrayDesignService;
import ubic.gemma.persistence.service.genome.biosequence.BioSequenceService;
import ubic.gemma.persistence.util.SequenceBinUtils;

import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;

/**
 * Aligns sequences from array designs to the genome, using blat, and persists the blat results.
 * Note: to avoid having very long transactions, this does not run transactionally at the level of a platform. Thus it
 * is possible for it to die with a platform half-processed. But if we don't do this the transactions are too big and
 * cause various deadlocking problems.
 *
 * @author pavlidis
 */
@Component

public class ArrayDesignSequenceAlignmentServiceImpl implements ArrayDesignSequenceAlignmentService {

    private static final Log log = LogFactory.getLog(ArrayDesignSequenceAlignmentServiceImpl.class.getName());

    private final ArrayDesignReportService arrayDesignReportService;
    private final ArrayDesignService arrayDesignService;
    private final BioSequenceService bioSequenceService;
    private final Persister persisterHelper;

    @Autowired
    public ArrayDesignSequenceAlignmentServiceImpl(ArrayDesignReportService arrayDesignReportService,
            ArrayDesignService arrayDesignService, BioSequenceService bioSequenceService,
            Persister persisterHelper) {
        this.arrayDesignReportService = arrayDesignReportService;
        this.arrayDesignService = arrayDesignService;
        this.bioSequenceService = bioSequenceService;
        this.persisterHelper = persisterHelper;
    }

    /**
     * @param ad platform
     * @return all sequences, across all taxa that might be represented on the array design
     */
    public static Collection<BioSequence> getSequences(ArrayDesign ad) {
        return ArrayDesignSequenceAlignmentServiceImpl.getSequences(ad, null);
    }

    /**
     * @param ad    platform
     * @param taxon (specified in case array has multiple taxa)
     * @return bio sequences
     */
    @SuppressWarnings({ "unused", "WeakerAccess" }) // Possible external use
    public static Collection<BioSequence> getSequences(ArrayDesign ad, Taxon taxon) {

        Collection<CompositeSequence> compositeSequences = ad.getCompositeSequences();
        Collection<BioSequence> sequencesToBlat = new HashSet<>();
        int numWithNoBioSequence = 0;
        int numWithNoSequenceData = 0;
        boolean warned = false;
        for (CompositeSequence cs : compositeSequences) {
            BioSequence bs = cs.getBiologicalCharacteristic();

            if (!warned && (numWithNoBioSequence > 20 || numWithNoSequenceData > 20)) {
                warned = true;
                ArrayDesignSequenceAlignmentServiceImpl.log.warn(
                        "More than 20 composite sequences don't have sequence information, no more warnings...");
            }

            if (bs == null) {
                ++numWithNoBioSequence;
                if (!warned) {
                    ArrayDesignSequenceAlignmentServiceImpl.log.warn(cs + " had no associated biosequence object");
                }
                continue;
            }

            if (bs.getTaxon() == null) {
                warned = true;
                ArrayDesignSequenceAlignmentServiceImpl.log.warn("There is no taxon defined for this biosequence ");
                continue;
            }
            // if the taxon is null that means we want this run for all taxa for that array
            if (taxon != null && !bs.getTaxon().equals(taxon)) {
                continue;
            }

            //noinspection ConstantConditions // Better readability
            if (!warned && (numWithNoBioSequence > 20 || numWithNoSequenceData > 20)) {
                warned = true;
                ArrayDesignSequenceAlignmentServiceImpl.log.warn(
                        "More than 20 composite sequences don't have sequence information, no more warnings...");
            }

            if (StringUtils.isBlank(bs.getSequence())) {
                ++numWithNoSequenceData;
                if (!warned) {
                    ArrayDesignSequenceAlignmentServiceImpl.log
                            .warn(cs + " had " + bs + " but no sequence, skipping");
                }
                continue;
            }
            sequencesToBlat.add(bs);

        }
        if (numWithNoBioSequence > 0 || numWithNoSequenceData > 0) {
            ArrayDesignSequenceAlignmentServiceImpl.log.warn(numWithNoBioSequence
                    + " composite sequences lacked biosequence associations; " + numWithNoSequenceData
                    + " lacked sequence data ( out of " + compositeSequences.size() + " total).");
        }
        return sequencesToBlat;
    }

    @Override
    public Collection<BlatResult> processArrayDesign(ArrayDesign design, boolean sensitive) {
        return this.processArrayDesign(design, sensitive, null);
    }

    @Override
    public Collection<BlatResult> processArrayDesign(ArrayDesign ad, Taxon taxon,
            Collection<BlatResult> rawBlatResults) {

        ArrayDesignSequenceAlignmentServiceImpl.log.info("Looking for old results to remove...");

        ad = arrayDesignService.thaw(ad);

        arrayDesignService.deleteAlignmentData(ad);
        // Blat file processing can only be run on one taxon at a time
        taxon = this.validateTaxaForBlatFile(ad, taxon);

        Collection<BioSequence> sequencesToBlat = ArrayDesignSequenceAlignmentServiceImpl.getSequences(ad);
        sequencesToBlat = bioSequenceService.thaw(sequencesToBlat);

        // if the blat results were loaded from a file, we have to replace the
        // query sequences with the actual ones
        // attached to the array design. We have to do this by name because the
        // sequence name is what the files contain.
        // Note that if there is ambiguity there will be problems!
        Map<String, BioSequence> seqMap = new HashMap<>();
        for (BioSequence bioSequence : sequencesToBlat) {
            seqMap.put(bioSequence.getName(), bioSequence);
        }

        ExternalDatabase searchedDatabase = ShellDelegatingBlat.getSearchedGenome(taxon);

        Collection<BlatResult> toSkip = new HashSet<>();
        for (BlatResult result : rawBlatResults) {

            /*
             * If the sequences don't have ids, replace them with the actual sequences associated with the array design.
             */
            if (result.getQuerySequence().getId() == null) {
                String querySeqName = result.getQuerySequence().getName();
                BioSequence actualSequence = seqMap.get(querySeqName);
                if (actualSequence == null) {
                    ArrayDesignSequenceAlignmentServiceImpl.log
                            .debug("Array design does not contain a sequence with name " + querySeqName);
                    toSkip.add(result);
                    continue;
                }
                result.setQuerySequence(actualSequence);
            } else {
                result.getQuerySequence().setTaxon(taxon);
            }

            result.setSearchedDatabase(searchedDatabase);
            try {
                FieldUtils.writeField(result.getTargetChromosome(), "taxon", taxon, true);
            } catch (IllegalAccessException e) {
                e.printStackTrace();
            }
            result.getTargetChromosome().getSequence().setTaxon(taxon);

        }

        if (toSkip.size() > 0) {
            ArrayDesignSequenceAlignmentServiceImpl.log.warn(
                    toSkip.size() + " blat results were for sequences not on " + ad + "; they will be ignored.");
            rawBlatResults.removeAll(toSkip);
        }

        Map<BioSequence, Collection<BlatResult>> goldenPathAlignments = new HashMap<>();
        this.getGoldenPathAlignments(sequencesToBlat, taxon, goldenPathAlignments);
        for (BioSequence sequence : goldenPathAlignments.keySet()) {
            rawBlatResults.addAll(goldenPathAlignments.get(sequence));
        }

        Collection<BlatResult> results = this.persistBlatResults(rawBlatResults);

        arrayDesignReportService.generateArrayDesignReport(ad.getId());

        return results;
    }

    @Override
    public Taxon validateTaxaForBlatFile(ArrayDesign arrayDesign, Taxon taxon) {

        if (taxon == null) {
            Collection<Taxon> taxaOnArray = arrayDesignService.getTaxa(arrayDesign.getId());
            if (taxaOnArray != null && taxaOnArray.size() == 1 && taxaOnArray.iterator().next() != null) {
                return taxaOnArray.iterator().next();
            }
            throw new IllegalArgumentException((taxaOnArray == null ? "?" : taxaOnArray.size())
                    + " taxon found for " + arrayDesign + " specify which taxon to run");

        }
        return taxon;
    }

    @Override
    public Collection<BlatResult> processArrayDesign(ArrayDesign design) {
        return this.processArrayDesign(design, false, null);
    }

    @Override
    public Collection<BlatResult> processArrayDesign(ArrayDesign design, Blat blat) {
        return this.processArrayDesign(design, false, blat);
    }

    /**
     * If necessary, copy the sequence length information over from the blat result to the given sequence. This is often
     * needed when we get the blat results from golden path.
     *
     * @param sequence that may not have length information
     * @param result   used to get length information
     */
    private void copyLengthInformation(BioSequence sequence, BlatResult result) {
        if (result.getQuerySequence() != null && result.getQuerySequence().getLength() == null) {
            long length = result.getQuerySequence().getLength();
            if (sequence.getLength() == null)
                sequence.setLength(length);
            sequence.setIsApproximateLength(false);
            sequence.setDescription(StringUtil.append(sequence.getDescription(),
                    "Length information from GoldenPath annotations.", " -- "));
            bioSequenceService.update(sequence);
        }
    }

    /**
     * @param taxon whose database will be queried
     * @return Map of biosequences to collections of blat results.
     */
    private Map<BioSequence, Collection<BlatResult>> getAlignments(Collection<BioSequence> sequencesToBlat,
            boolean sensitive, Taxon taxon, Blat blat) {

        Map<BioSequence, Collection<BlatResult>> results = new HashMap<>();
        sequencesToBlat = bioSequenceService.thaw(sequencesToBlat);
        try {

            Collection<BioSequence> needBlat = this.getGoldenPathAlignments(sequencesToBlat, taxon, results);

            if (needBlat.size() > 0) {
                ArrayDesignSequenceAlignmentServiceImpl.log
                        .info("Running blat on " + needBlat.size() + " sequences");
                Map<BioSequence, Collection<BlatResult>> moreResults = blat.blatQuery(needBlat, sensitive, taxon);
                results.putAll(moreResults);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return results;
    }

    /**
     * Check if there are alignment results in the goldenpath database, in which case we do not reanalyze the sequences.
     *
     * @param sequencesToBlat The full set of sequences that need analysis.
     * @param results         Will be stored here.
     * @return the sequences which ARE NOT found in goldenpath and which therefore DO need blat.
     */
    private Collection<BioSequence> getGoldenPathAlignments(Collection<BioSequence> sequencesToBlat, Taxon taxon,
            Map<BioSequence, Collection<BlatResult>> results) {

        GoldenPathQuery gpq = new GoldenPathQuery(taxon);

        Collection<BioSequence> needBlat = new HashSet<>();
        int count = 0;
        int totalFound = 0;
        for (BioSequence sequence : sequencesToBlat) {
            boolean found = false;
            if (sequence.getSequenceDatabaseEntry() != null) {
                Collection<BlatResult> brs = gpq.findAlignments(sequence.getSequenceDatabaseEntry().getAccession());

                if (brs != null && brs.size() > 0) {
                    for (BlatResult result : brs) {
                        this.copyLengthInformation(sequence, result);
                        result.setQuerySequence(sequence);
                    }
                    results.put(sequence, brs);
                    found = true;
                    totalFound++;
                }
            }

            if (++count % 1000 == 0 && totalFound > 0) {
                ArrayDesignSequenceAlignmentServiceImpl.log.info(
                        "Alignments in Golden Path database for " + totalFound + "/" + count + " checked so far.");
            }

            if (!found) {
                needBlat.add(sequence);
            }

        }

        if (totalFound > 0) {
            ArrayDesignSequenceAlignmentServiceImpl.log
                    .info("Found " + totalFound + "/" + count + " alignments in Golden Path database");
        }
        return needBlat;
    }

    /**
     * @param brs, assumed to be from alignments to the genome for the array design (that is, we don't consider aligning
     *             mouse to human)
     */
    @SuppressWarnings("unchecked")
    private Collection<BlatResult> persistBlatResults(Collection<BlatResult> brs) {

        Collection<Integer> seen = new HashSet<>();
        int duplicates = 0;
        for (BlatResult br : brs) {

            Integer hash = ProbeMapUtils.hashBlatResult(br);
            if (seen.contains(hash)) {
                duplicates++;
                continue;
            }
            seen.add(hash);

            assert br.getQuerySequence() != null;
            assert br.getQuerySequence().getName() != null;
            Taxon taxon = br.getQuerySequence().getTaxon();
            assert taxon != null;

            try {
                FieldUtils.writeField(br.getTargetChromosome(), "taxon", taxon, true);
            } catch (IllegalAccessException e) {
                e.printStackTrace();
            }
            br.getTargetChromosome().getSequence().setTaxon(taxon);

            PhysicalLocation pl = br.getTargetAlignedRegion();
            if (pl == null) {
                pl = PhysicalLocation.Factory.newInstance();
                pl.setChromosome(br.getTargetChromosome());
                pl.setNucleotide(br.getTargetStart());
                assert br.getTargetEnd() != null && br.getTargetStart() != null;
                pl.setNucleotideLength(br.getTargetEnd().intValue() - br.getTargetStart().intValue());
                pl.setStrand(br.getStrand());
                br.setTargetAlignedRegion(pl);
                pl.setBin(SequenceBinUtils.binFromRange(br.getTargetStart().intValue(),
                        br.getTargetEnd().intValue()));
            }

        }

        if (duplicates > 0) {
            ArrayDesignSequenceAlignmentServiceImpl.log.info(duplicates + " duplicate BLAT hits skipped");
        }

        return (Collection<BlatResult>) persisterHelper.persist(brs);
    }

    private Collection<BlatResult> processArrayDesign(ArrayDesign ad, boolean sensitive, Blat blat) {

        if (blat == null)
            blat = new ShellDelegatingBlat();

        Collection<BlatResult> allResults = new HashSet<>();

        if (sensitive)
            ArrayDesignSequenceAlignmentServiceImpl.log.info("Running in 'sensitive' mode if possible");

        Collection<Taxon> taxa = arrayDesignService.getTaxa(ad.getId());
        boolean first = true;
        for (Taxon taxon : taxa) {

            Collection<BioSequence> sequencesToBlat = ArrayDesignSequenceAlignmentServiceImpl.getSequences(ad,
                    taxon);

            Map<BioSequence, Collection<BlatResult>> results = this.getAlignments(sequencesToBlat, sensitive, taxon,
                    blat);

            ArrayDesignSequenceAlignmentServiceImpl.log
                    .info("Got BLAT results for " + results.keySet().size() + " query sequences");

            Map<String, BioSequence> nameMap = new HashMap<>();
            for (BioSequence bs : results.keySet()) {
                if (nameMap.containsKey(bs.getName())) {
                    throw new IllegalStateException(
                            "All distinct sequences on the array must have unique names; found " + bs.getName()
                                    + " more than once.");
                }
                nameMap.put(bs.getName(), bs);
            }

            int noResults = 0;
            int count = 0;

            // We only remove the results here, after we have at least one set of blat results.
            if (first) {
                ArrayDesignSequenceAlignmentServiceImpl.log.info("Looking for old results to remove...");
                arrayDesignService.deleteAlignmentData(ad);
            }

            for (BioSequence sequence : sequencesToBlat) {
                if (sequence == null) {
                    ArrayDesignSequenceAlignmentServiceImpl.log.warn("Null sequence!");
                    continue;
                }
                Collection<BlatResult> brs = results.get(nameMap.get(sequence.getName()));
                if (brs == null) {
                    ++noResults;
                    continue;
                }
                for (BlatResult result : brs) {
                    result.setQuerySequence(sequence); // must do this to replace
                    // placeholder instance.
                }
                allResults.addAll(this.persistBlatResults(brs));

                if (++count % 2000 == 0) {
                    ArrayDesignSequenceAlignmentServiceImpl.log.info("Checked results for " + count + " queries, "
                            + allResults.size() + " blat results so far.");
                }

            }

            ArrayDesignSequenceAlignmentServiceImpl.log
                    .info(noResults + "/" + sequencesToBlat.size() + " sequences had no blat results");
            first = false;
        }

        arrayDesignReportService.generateArrayDesignReport(ad.getId());

        return allResults;

    }
}