ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceAlignmentServiceImpl.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceAlignmentServiceImpl.java

Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2006 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.loader.expression.arrayDesign;

import java.io.IOException;
import java.sql.SQLException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import ubic.basecode.util.StringUtil;
import ubic.gemma.analysis.report.ArrayDesignReportService;
import ubic.gemma.apps.Blat;
import ubic.gemma.externalDb.GoldenPathQuery;
import ubic.gemma.model.common.description.ExternalDatabase;
import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
import ubic.gemma.model.expression.arrayDesign.ArrayDesignService;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.genome.PhysicalLocation;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.model.genome.biosequence.BioSequence;
import ubic.gemma.model.genome.biosequence.BioSequenceService;
import ubic.gemma.model.genome.sequenceAnalysis.BlatResult;
import ubic.gemma.model.genome.sequenceAnalysis.BlatResultService;
import ubic.gemma.persistence.Persister;
import ubic.gemma.util.SequenceBinUtils;

/**
 * Aligns sequences from array designs to the genome, using blat, and persists the blat results.
 * <p>
 * Note: to avoid having very long transactions, this does not run transactionally at the level of a platform. Thus it
 * is possible for it to die with a platform half-processed. But if we don't do this the transactions are too big and
 * cause various deadlocking problems.
 * 
 * @author pavlidis
 * @version $Id: ArrayDesignSequenceAlignmentServiceImpl.java,v 1.3 2013/01/02 23:26:40 paul Exp $
 */
@Component
public class ArrayDesignSequenceAlignmentServiceImpl implements ArrayDesignSequenceAlignmentService {

    private static Log log = LogFactory.getLog(ArrayDesignSequenceAlignmentServiceImpl.class.getName());

    @Autowired
    BlatResultService blatResultService;

    @Autowired
    ArrayDesignService arrayDesignService;

    @Autowired
    Persister persisterHelper;

    @Autowired
    BioSequenceService bioSequenceService;

    @Autowired
    ArrayDesignReportService arrayDesignReportService;

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceAlignmentService#processArrayDesign(ubic.gemma.model
     * .expression.arrayDesign.ArrayDesign, boolean)
     */
    @Override
    public Collection<BlatResult> processArrayDesign(ArrayDesign ad, boolean sensitive) {

        Collection<BlatResult> allResults = new HashSet<BlatResult>();

        if (sensitive)
            log.info("Running in 'sensitive' mode if possible");

        Collection<Taxon> taxa = arrayDesignService.getTaxa(ad.getId());
        boolean first = true;
        for (Taxon taxon : taxa) {

            Collection<BioSequence> sequencesToBlat = getSequences(ad, taxon);

            Map<BioSequence, Collection<BlatResult>> results = getAlignments(sequencesToBlat, sensitive, taxon);

            log.info("Got BLAT results for " + results.keySet().size() + " query sequences");

            Map<String, BioSequence> nameMap = new HashMap<String, BioSequence>();
            for (BioSequence bs : results.keySet()) {
                if (nameMap.containsKey(bs.getName())) {
                    throw new IllegalStateException(
                            "All distinct sequences on the array must have unique names; found " + bs.getName()
                                    + " more than once.");
                }
                nameMap.put(bs.getName(), bs);
            }

            int noresults = 0;
            int count = 0;

            // We only delete the results here, after we have at least one set of blat results.
            if (first) {
                log.info("Looking for old results to remove...");
                arrayDesignService.deleteAlignmentData(ad);
            }

            for (BioSequence sequence : sequencesToBlat) {
                if (sequence == null) {
                    log.warn("Null sequence!");
                    continue;
                }
                Collection<BlatResult> brs = results.get(nameMap.get(sequence.getName()));
                if (brs == null) {
                    ++noresults;
                    continue;
                }
                for (BlatResult result : brs) {
                    result.setQuerySequence(sequence); // must do this to replace
                    // placeholder instance.
                }
                allResults.addAll(persistBlatResults(brs));

                if (++count % 2000 == 0) {
                    log.info("Checked results for " + count + " queries, " + allResults.size()
                            + " blat results so far.");
                }

            }

            log.info(noresults + "/" + sequencesToBlat.size() + " sequences had no blat results");
            first = false;
        }

        arrayDesignReportService.generateArrayDesignReport(ad.getId());

        return allResults;

    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceAlignmentService#processArrayDesign(ubic.gemma.model
     * .expression.arrayDesign.ArrayDesign, ubic.gemma.model.genome.Taxon, java.util.Collection)
     */
    @Override
    public Collection<BlatResult> processArrayDesign(ArrayDesign ad, Taxon taxon,
            Collection<BlatResult> rawBlatResults) {

        log.info("Looking for old results to remove...");

        ad = arrayDesignService.thaw(ad);

        arrayDesignService.deleteAlignmentData(ad);
        // Blat file processing can only be run on one taxon at a time
        taxon = validateTaxaForBlatFile(ad, taxon);

        Collection<BioSequence> sequencesToBlat = getSequences(ad);
        sequencesToBlat = bioSequenceService.thaw(sequencesToBlat);

        // if the blat results were loaded from a file, we have to replace the
        // querysequences with the actual ones
        // attached to the array design. We have to do this by name because the
        // sequence name is what the files contain.
        // Note that if there is ambiguity there will be problems!
        Map<String, BioSequence> seqMap = new HashMap<String, BioSequence>();
        for (BioSequence bioSequence : sequencesToBlat) {
            seqMap.put(bioSequence.getName(), bioSequence);
        }

        ExternalDatabase searchedDatabase = Blat.getSearchedGenome(taxon);

        Collection<BlatResult> toSkip = new HashSet<BlatResult>();
        for (BlatResult result : rawBlatResults) {

            /*
             * If the sequences don't have ids, replace them with the actual sequences associated with the array design.
             */
            if (result.getQuerySequence().getId() == null) {
                String querySeqName = result.getQuerySequence().getName();
                BioSequence actualSequence = seqMap.get(querySeqName);
                if (actualSequence == null) {
                    log.debug("Array design does not contain a sequence with name " + querySeqName);
                    toSkip.add(result);
                    continue;
                }
                result.setQuerySequence(actualSequence);
            } else {
                result.getQuerySequence().setTaxon(taxon);
            }

            result.setSearchedDatabase(searchedDatabase);
            result.getTargetChromosome().setTaxon(taxon);
            result.getTargetChromosome().getSequence().setTaxon(taxon);

        }

        if (toSkip.size() > 0) {
            log.warn(toSkip.size() + " blat results were for sequences not on " + ad + "; they will be ignored.");
            rawBlatResults.removeAll(toSkip);
        }

        Map<BioSequence, Collection<BlatResult>> goldenPathAlignments = new HashMap<BioSequence, Collection<BlatResult>>();
        getGoldenPathAlignments(sequencesToBlat, taxon, goldenPathAlignments);
        for (BioSequence sequence : goldenPathAlignments.keySet()) {
            rawBlatResults.addAll(goldenPathAlignments.get(sequence));
        }

        Collection<BlatResult> results = persistBlatResults(rawBlatResults);

        arrayDesignReportService.generateArrayDesignReport(ad.getId());

        return results;
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceAlignmentService#validateTaxaForBlatFile(ubic.gemma
     * .model.expression.arrayDesign.ArrayDesign, ubic.gemma.model.genome.Taxon)
     */
    @Override
    public Taxon validateTaxaForBlatFile(ArrayDesign arrayDesign, Taxon taxon) {

        if (taxon == null) {
            Collection<Taxon> taxaOnArray = arrayDesignService.getTaxa(arrayDesign.getId());
            if (taxaOnArray != null && taxaOnArray.size() == 1 && taxaOnArray.iterator().next() != null) {
                return taxaOnArray.iterator().next();
            }
            throw new IllegalArgumentException((taxaOnArray == null ? "?" : taxaOnArray.size())
                    + " taxon found for " + arrayDesign + " specifiy which taxon to run");

        }
        return taxon;
    }

    /**
     * If necessary, copy the sequence length information over from the blat result to the given sequence. This is often
     * needed when we get the blat results from golden path.
     * 
     * @param sequence that may not have length information
     * @param result used to get length information
     */
    private void copyLengthInformation(BioSequence sequence, BlatResult result) {
        if (result.getQuerySequence() != null && result.getQuerySequence().getLength() == null) {
            long length = result.getQuerySequence().getLength();
            if (sequence.getLength() == null)
                sequence.setLength(length);
            sequence.setIsApproximateLength(false);
            sequence.setDescription(StringUtil.append(sequence.getDescription(),
                    "Length information from GoldenPath annotations.", " -- "));
            bioSequenceService.update(sequence);
        }
    }

    /**
     * @param sequencesToBlat
     * @param taxon whose database will be queried
     * @return Map of biosequences to collections of blat results.
     */
    private Map<BioSequence, Collection<BlatResult>> getAlignments(Collection<BioSequence> sequencesToBlat,
            boolean sensitive, Taxon taxon) {
        Blat blat = new Blat();
        Map<BioSequence, Collection<BlatResult>> results = new HashMap<BioSequence, Collection<BlatResult>>();
        sequencesToBlat = bioSequenceService.thaw(sequencesToBlat);
        try {

            Collection<BioSequence> needBlat = getGoldenPathAlignments(sequencesToBlat, taxon, results);

            if (needBlat.size() > 0) {
                log.info("Running blat on " + needBlat.size() + " sequences");
                Map<BioSequence, Collection<BlatResult>> moreResults = blat.blatQuery(needBlat, sensitive, taxon);
                results.putAll(moreResults);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return results;
    }

    /**
     * Check if there are alignment results in the goldenpath database, in which case we do not reanalyze the sequences.
     * 
     * @param sequencesToBlat The full set of sequences that need analysis.
     * @param taxon
     * @param results Will be stored here.
     * @return the sequences which ARE NOT found in goldenpath and which therefore DO need blat.
     * @throws SQLException
     */
    private Collection<BioSequence> getGoldenPathAlignments(Collection<BioSequence> sequencesToBlat, Taxon taxon,
            Map<BioSequence, Collection<BlatResult>> results) {

        GoldenPathQuery gpq = new GoldenPathQuery(taxon);

        Collection<BioSequence> needBlat = new HashSet<BioSequence>();
        int count = 0;
        int totalFound = 0;
        for (BioSequence sequence : sequencesToBlat) {
            boolean found = false;
            if (sequence.getSequenceDatabaseEntry() != null) {
                Collection<BlatResult> brs = gpq.findAlignments(sequence.getSequenceDatabaseEntry().getAccession());

                if (brs != null && brs.size() > 0) {
                    for (BlatResult result : brs) {
                        copyLengthInformation(sequence, result);
                        result.setQuerySequence(sequence);
                    }
                    results.put(sequence, brs);
                    found = true;
                    totalFound++;
                }
            }

            if (++count % 1000 == 0 && totalFound > 0) {
                log.info("Alignments in Golden Path database for " + totalFound + "/" + count + " checked so far.");
            }

            if (!found) {
                needBlat.add(sequence);
            }

        }

        if (totalFound > 0) {
            log.info("Found " + totalFound + "/" + count + " alignments in Golden Path database");
        }
        return needBlat;
    }

    /**
     * @param ad
     * @return all sequences, across all taxa that might be represented on the array design
     * @see getSequences(ad, taxon) for method to get just the sequences for one taxon.
     */
    public static Collection<BioSequence> getSequences(ArrayDesign ad) {
        return getSequences(ad, null);
    }

    /**
     * @param ad
     * @param taxon (specified in case array has multiple taxa)
     * @return
     */
    public static Collection<BioSequence> getSequences(ArrayDesign ad, Taxon taxon) {

        Collection<CompositeSequence> compositeSequences = ad.getCompositeSequences();
        Collection<BioSequence> sequencesToBlat = new HashSet<BioSequence>();
        int numWithNoBioSequence = 0;
        int numWithNoSequenceData = 0;
        boolean warned = false;
        for (CompositeSequence cs : compositeSequences) {
            BioSequence bs = cs.getBiologicalCharacteristic();

            if (!warned && (numWithNoBioSequence > 20 || numWithNoSequenceData > 20)) {
                warned = true;
                log.warn("More than 20 composite sequences don't have sequence information, no more warnings...");
            }

            if (bs == null) {
                ++numWithNoBioSequence;
                if (!warned) {
                    log.warn(cs + " had no associated biosequence object");
                }
                continue;
            }

            if (bs.getTaxon() == null) {
                warned = true;
                log.warn("There is no taxon defined for this biosequence ");
                continue;
            }
            // if the taxon is null that means we want this run for all taxa for that array
            if (taxon != null && !bs.getTaxon().equals(taxon)) {
                continue;
            }

            if (!warned && (numWithNoBioSequence > 20 || numWithNoSequenceData > 20)) {
                warned = true;
                log.warn("More than 20 composite sequences don't have sequence information, no more warnings...");
            }

            if (StringUtils.isBlank(bs.getSequence())) {
                ++numWithNoSequenceData;
                if (!warned) {
                    log.warn(cs + " had " + bs + " but no sequence, skipping");
                }
                continue;
            }
            sequencesToBlat.add(bs);

        }
        if (numWithNoBioSequence > 0 || numWithNoSequenceData > 0) {
            log.warn(numWithNoBioSequence + " composite sequences lacked biosequence associations; "
                    + numWithNoSequenceData + " lacked sequence data ( out of " + compositeSequences.size()
                    + " total).");
        }
        return sequencesToBlat;
    }

    /**
     * @param sequencesToBlat, assumed to be the ones that were analyzed
     * @param brs, assumed to be from alignments to the genome for the array design (that is, we don't consider aligning
     *        mouse to human)
     * @return
     */
    private Collection<BlatResult> persistBlatResults(Collection<BlatResult> brs) {
        for (BlatResult br : brs) {
            assert br.getQuerySequence() != null;
            assert br.getQuerySequence().getName() != null;
            Taxon taxon = br.getQuerySequence().getTaxon();
            assert taxon != null;
            br.getTargetChromosome().setTaxon(taxon);
            br.getTargetChromosome().getSequence().setTaxon(taxon);

            PhysicalLocation pl = br.getTargetAlignedRegion();
            if (pl == null) {
                pl = PhysicalLocation.Factory.newInstance();
                pl.setChromosome(br.getTargetChromosome());
                pl.setNucleotide(br.getTargetStart());
                assert br.getTargetEnd() != null && br.getTargetStart() != null;
                pl.setNucleotideLength(br.getTargetEnd().intValue() - br.getTargetStart().intValue());
                pl.setStrand(br.getStrand());
                br.setTargetAlignedRegion(pl);

                pl.setBin(SequenceBinUtils.binFromRange(br.getTargetStart().intValue(),
                        br.getTargetEnd().intValue()));
            }

        }
        return (Collection<BlatResult>) persisterHelper.persist(brs);
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * ubic.gemma.loader.expression.arrayDesign.ArrayDesignSequenceAlignmentService#processArrayDesign(ubic.gemma.model
     * .expression.arrayDesign.ArrayDesign)
     */
    @Override
    public Collection<BlatResult> processArrayDesign(ArrayDesign design) {
        return this.processArrayDesign(design, false);
    }
}