ubic.gemma.core.externalDb.GoldenPathSequenceAnalysis.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.core.externalDb.GoldenPathSequenceAnalysis.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2006 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.externalDb;

import org.apache.commons.collections.map.LRUMap;
import org.apache.commons.lang3.StringUtils;
import org.springframework.dao.DataAccessException;
import org.springframework.jdbc.core.ResultSetExtractor;
import ubic.basecode.util.SQLUtils;
import ubic.gemma.core.analysis.sequence.ProbeMapperConfig;
import ubic.gemma.core.analysis.sequence.SequenceManipulation;
import ubic.gemma.core.loader.genome.gene.ncbi.NcbiGeneConverter;
import ubic.gemma.model.association.BioSequence2GeneProduct;
import ubic.gemma.model.common.description.DatabaseEntry;
import ubic.gemma.model.genome.Chromosome;
import ubic.gemma.model.genome.Gene;
import ubic.gemma.model.genome.PhysicalLocation;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.model.genome.biosequence.BioSequence;
import ubic.gemma.model.genome.gene.GeneProduct;
import ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation;
import ubic.gemma.model.genome.sequenceAnalysis.BlatResult;
import ubic.gemma.model.genome.sequenceAnalysis.ThreePrimeDistanceMethod;
import ubic.gemma.persistence.util.SequenceBinUtils;

import java.sql.Blob;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

/**
 * Using the Goldenpath databases for comparing sequence alignments to gene locations.
 *
 * @author pavlidis
 */
@SuppressWarnings({ "WeakerAccess", "unused" }) // Possible external use
public class GoldenPathSequenceAnalysis extends GoldenPath {

    /**
     * If the exon overlap fraction with annotated (known/refseq) exons is less than this value, some additional
     * checking for mRNAs and ESTs may be done.
     */
    private static final double RECHECK_OVERLAP_THRESHOLD = 0.9;

    /**
     * cache results of mRNA queries.
     */
    private final LRUMap cache = new LRUMap(2000);

    public GoldenPathSequenceAnalysis(int port, String databaseName, String host, String user, String password) {
        super(port, databaseName, host, user, password);
    }

    public GoldenPathSequenceAnalysis(String databaseName) {
        super(databaseName);
    }

    public GoldenPathSequenceAnalysis(Taxon taxon) {
        super(taxon);
    }

    /**
     * Given a physical location, identify overlapping genes or predicted genes.
     *
     * @param chromosome The chromosome name (the organism is set by the constructor)
     * @param queryStart The start base of the region to query (the start of the alignment to the genome)
     * @param queryEnd The end base of the region to query (the end of the alignment to the genome)
     * @param starts Locations of alignment block starts in target. (comma-delimited from blat)
     * @param sizes Sizes of alignment blocks (comma-delimited from blat)
     * @param strand Either + or - indicating the strand to look on, or null to search both strands.
     * @param method The constant representing the method to use to locate the 3' distance.
     * @param config configuration
     * @return A list of BioSequence2GeneProduct objects. The distance stored by a ThreePrimeData will be 0 if the
     *         sequence overhangs the found genes (rather than providing a negative distance). If no genes are found,
     *         the result is null; These are transient instances, not from Gemma's database
     */
    public Collection<BlatAssociation> findAssociations(String chromosome, Long queryStart, Long queryEnd,
            String starts, String sizes, String strand, ThreePrimeDistanceMethod method, ProbeMapperConfig config) {

        if (GoldenPath.log.isDebugEnabled())
            GoldenPath.log.debug("Seeking gene overlaps with: chrom=" + chromosome + " start=" + queryStart
                    + " end=" + queryEnd + " strand=" + strand);

        if (queryEnd < queryStart)
            throw new IllegalArgumentException("End must not be less than start");

        /*
         * These are transient instances only
         */
        Collection<GeneProduct> geneProducts = new HashSet<>();

        if (config.isUseRefGene()) {
            // starting with refgene means we can get the correct transcript name etc.
            geneProducts.addAll(this.findRefGenesByLocation(chromosome, queryStart, queryEnd, strand));
        }

        if (config.isUseKnownGene()) {
            // get known genes as well, in case all we got was an intron. Currently does not work with rat (rn6)
            geneProducts.addAll(this.findKnownGenesByLocation(chromosome, queryStart, queryEnd, strand));
        }

        if (geneProducts.size() == 0)
            return null;

        Collection<BlatAssociation> results = new HashSet<>();
        for (GeneProduct geneProduct : geneProducts) {
            if (GoldenPath.log.isDebugEnabled())
                GoldenPath.log.debug(geneProduct);

            BlatAssociation blatAssociation = this.computeLocationInGene(chromosome, queryStart, queryEnd, starts,
                    sizes, geneProduct, method, config);

            /*
             * We check against the actual threshold later. We can't fully check it now because not all the slots are
             * populated yet.
             */
            if (config.getMinimumExonOverlapFraction() > 0.0 && blatAssociation.getOverlap() == 0) {
                GoldenPath.log.debug("Result failed to meet exon overlap threshold (0)");
                continue;
            }

            results.add(blatAssociation);
        }
        return results;
    }

    /**
     * Given a location, find the nearest gene on the same strand, including only "known", "refseq" or "ensembl"
     * transcripts.
     *
     * @param chromosome chromosome
     * @param queryStart start
     * @param queryEnd end
     * @param strand Either '+' or '-'
     * @param maxWindow the number of bases on each side to look, at most, in addition to looking inside the given
     *        region.
     * @return the Gene closest to the given location. This is a transient instance, not from Gemma's database.
     */
    public Gene findClosestGene(String chromosome, Long queryStart, Long queryEnd, String strand, int maxWindow) {
        if (queryEnd < queryStart)
            throw new IllegalArgumentException("End must not be less than start");

        long round = 0L;
        int numRounds = 5;
        int increment = (int) (maxWindow / (double) numRounds);

        // we look in a window at most increment * numRounds.

        while (round < numRounds) {

            long left = queryStart + round * increment;
            long right = queryEnd + round * increment;

            Collection<GeneProduct> geneProducts = this.findRefGenesByLocation(chromosome, left, right, strand);
            geneProducts.addAll(this.findKnownGenesByLocation(chromosome, left, right, strand));

            Gene nearest = null;
            int closestSoFar = Integer.MAX_VALUE;

            for (GeneProduct geneProduct : geneProducts) {
                PhysicalLocation gpl = geneProduct.getPhysicalLocation();
                Long start = gpl.getNucleotide();
                Long end = start + gpl.getNucleotideLength();

                int gap = (int) Math.min(left - end, start - right);

                if (gap < closestSoFar) {
                    closestSoFar = gap;
                    nearest = geneProduct.getGene();
                }
            }

            if (nearest != null)
                return nearest;
            round++;
        }

        return null;

    }

    /**
     * Check to see if there are ESTs that overlap with this region. We provisionally promote the ESTs to the status of
     * genes for this purpose.
     *
     * @param chromosome chromosome
     * @param regionStart the region to be checked
     * @param regionEnd end
     * @param strand the strand
     * @return The ESTs which overlap the query region. (using the all_est table)
     */
    public Collection<Gene> findESTs(final String chromosome, Long regionStart, Long regionEnd, String strand) {

        String searchChrom = SequenceManipulation.blatFormatChromosomeName(chromosome);
        String query = "SELECT est.qName, est.qName, est.tStart, est.tEnd, est.strand, est.blockSizes, est.tStarts  "
                + " FROM all_est as est  WHERE "
                + "((est.tStart > ? AND est.tEnd < ?) OR (est.tStart < ? AND est.tEnd > ?) OR "
                + "(est.tStart > ?  AND est.tStart < ?) OR  (est.tEnd > ? AND  est.tEnd < ? )) and est.tName = ? ";

        query = query + " and " + SequenceBinUtils.addBinToQuery("est", regionStart, regionEnd);

        if (strand != null) {
            query = query + " and est.strand = ?";
        }

        Object[] params;

        if (strand == null)
            params = new Object[] { regionStart, regionEnd, regionStart, regionEnd, regionStart, regionEnd,
                    regionStart, regionEnd, searchChrom };
        else
            params = new Object[] { regionStart, regionEnd, regionStart, regionEnd, regionStart, regionEnd,
                    regionStart, regionEnd, searchChrom, strand };

        return this.queryAndExtract(chromosome, query, params);

    }

    /**
     * Find "Known" genes contained in or overlapping a region. Note that the NCBI symbol may be blank, when the gene is
     * not a refSeq gene.
     *
     * @param chromosome chromosome
     * @param start start
     * @param end end
     * @param strand strand
     * @return This is a collection of transient instances, not from Gemma's database.
     */
    public Collection<GeneProduct> findKnownGenesByLocation(String chromosome, Long start, Long end,
            String strand) {
        String searchChrom = SequenceManipulation.blatFormatChromosomeName(chromosome);

        /*
         * Rat has changed again for rn6, dropped the use of RGD symbols and there is no Known Gene track, so this must
         * be skipped for rat (3/2018)
         */

        /*
         * Many known genes map to refseq genes. We use those gene symbols instead. Use kgXRef only to get the
         * description.
         */
        String query = "SELECT r.name, r.geneName, r.txStart, r.txEnd, r.strand, r.exonStarts, r.exonEnds, CONCAT('Refseq gene: ', kgr.description) "
                + " FROM knownGene as kg INNER JOIN knownToRefSeq kr on kr.name=kg.name inner join kgXref kgr on kgr.kgID=kg.name "
                + " INNER JOIN refFlat r ON r.name=kr.value  WHERE "
                + "((kg.txStart >= ? AND kg.txEnd <= ?) OR (kg.txStart <= ? AND kg.txEnd >= ?) OR "
                + "(kg.txStart >= ?  AND kg.txStart <= ?) OR  (kg.txEnd >= ? AND  kg.txEnd <= ? )) and kg.chrom = ? ";

        if (strand != null) {
            query = query + " AND kg.strand = ? ";
        }

        Collection<GeneProduct> known2refseq = this.findGenesByQuery(start, end, searchChrom, strand, query);
        Collection<GeneProduct> result = new HashSet<>(known2refseq);

        /*
         * Ones that do not map to refseq using a left outer join.
         */
        query = "SELECT kgxr.mRNA, kgxr.geneSymbol, kg.txStart, kg.txEnd, kg.strand, kg.exonStarts, kg.exonEnds, CONCAT('Known gene: ', kgxr.description) "
                + " FROM knownGene as kg INNER JOIN"
                + " kgXref AS kgxr ON kg.name=kgxr.kgID LEFT OUTER JOIN knownToRefSeq kr on kr.name=kg.name WHERE kr.value IS NULL AND "
                + "((kg.txStart >= ? AND kg.txEnd <= ?) OR (kg.txStart <= ? AND kg.txEnd >= ?) OR "
                + "(kg.txStart >= ?  AND kg.txStart <= ?) OR  (kg.txEnd >= ? AND  kg.txEnd <= ? )) and kg.chrom = ? ";

        if (strand != null) {
            query = query + " AND kg.strand = ? ";
        }
        Collection<GeneProduct> knowng = this.findGenesByQuery(start, end, searchChrom, strand, query);
        result.addAll(knowng);

        return result;
    }

    /**
     * Find RefSeq genes contained in or overlapping a region.
     *
     * @param chromosome chromosome
     * @param start start
     * @param strand strand
     * @param end end
     * @return This is a collection of transient instances, not from Gemma's database.
     */
    public Collection<GeneProduct> findRefGenesByLocation(String chromosome, Long start, Long end, String strand) {
        String searchChrom = SequenceManipulation.blatFormatChromosomeName(chromosome);

        /*
         * Use kgXRef only to get the description - sometimes missing thus the outer join.
         */
        String query = "SELECT r.name, r.geneName, r.txStart, r.txEnd, r.strand, r.exonStarts, r.exonEnds, CONCAT('Refseq gene: ', kgXref.description) "
                + "FROM refFlat as r left outer join kgXref on r.geneName = kgXref.geneSymbol " + "WHERE "
                + "((r.txStart >= ? AND r.txEnd <= ?) OR (r.txStart <= ? AND r.txEnd >= ?) OR "
                + "(r.txStart >= ?  AND r.txStart <= ?) OR  (r.txEnd >= ? AND  r.txEnd <= ? )) and r.chrom = ? ";

        if (strand != null) {
            query = query + " AND r.strand = ?  ";
        }
        return this.findGenesByQuery(start, end, searchChrom, strand, query);
    }

    /**
     * Check to see if there are mRNAs that overlap with this region. We promote the mRNAs to the status of genes for
     * this purpose.
     *
     * @param chromosome chromosome
     * @param regionStart the region to be checked
     * @param regionEnd end
     * @param strand the strand
     * @return The mRNAs which overlap the query region.
     */
    public Collection<Gene> findRNAs(final String chromosome, Long regionStart, Long regionEnd, String strand) {

        String searchChrom = SequenceManipulation.blatFormatChromosomeName(chromosome);
        String query = "SELECT mrna.qName, mrna.qName, mrna.tStart, mrna.tEnd, mrna.strand, mrna.blockSizes, mrna.tStarts  "
                + " FROM all_mrna as mrna  WHERE "
                + "((mrna.tStart > ? AND mrna.tEnd < ?) OR (mrna.tStart < ? AND mrna.tEnd > ?) OR "
                + "(mrna.tStart > ?  AND mrna.tStart < ?) OR  (mrna.tEnd > ? AND  mrna.tEnd < ? )) and mrna.tName = ? ";

        query = query + " and " + SequenceBinUtils.addBinToQuery("mrna", regionStart, regionEnd);

        if (strand != null) {
            query = query + " and mrna.strand = ?";
        }

        Object[] params;

        if (strand == null)
            params = new Object[] { regionStart, regionEnd, regionStart, regionEnd, regionStart, regionEnd,
                    regionStart, regionEnd, searchChrom };
        else
            params = new Object[] { regionStart, regionEnd, regionStart, regionEnd, regionStart, regionEnd,
                    regionStart, regionEnd, searchChrom, strand };

        return this.queryAndExtract(chromosome, query, params);

    }

    /**
     * @param identifier A Genbank accession referring to an EST or mRNA. For other types of queries this will not
     *        return any results.
     * @return Set containing Lists of PhysicalLocation representing places GoldenPath says the sequence referred to by
     *         the identifier aligns. If no results are found the Set will be empty.
     */
    public Collection<BlatResult> findSequenceLocations(String identifier) {

        Object[] params = new Object[] { identifier };
        String query;

        /* ESTs */
        query = "SELECT est.tName, est.blockSizes, est.tStarts,est.qStarts, est.strand, est.qSize, est.matches, "
                + "est.misMatches, est.qNumInsert, est.tNumInsert, est.qStart, est.qEnd, est.tStart, est.tEnd, est.repMatches FROM"
                + " all_est AS est WHERE est.qName = ?";
        Set<BlatResult> matchingBlocks = new HashSet<>(this.findLocationsByQuery(query, params));

        /* mRNA */
        query = "SELECT mrna.tName, mrna.blockSizes, mrna.tStarts, mrna.qStarts, mrna.strand, mrna.qSize, mrna.matches, "
                + "mrna.misMatches, mrna.qNumInsert, mrna.tNumInsert, mrna.qStart, mrna.qEnd, mrna.tStart, mrna.tEnd, mrna.repMatches"
                + " FROM all_mrna AS mrna WHERE mrna.qName = ?";
        matchingBlocks.addAll(this.findLocationsByQuery(query, params));

        return matchingBlocks;
    }

    /**
     * Given a physical location, find how close it is to the 3' end of a gene it is in, using default mapping settings.
     *
     * @param br BlatResult holding the parameters needed.
     * @param method The constant representing the method to use to locate the 3' distance.
     * @return a collection of distances
     */
    public Collection<? extends BioSequence2GeneProduct> getThreePrimeDistances(BlatResult br,
            ThreePrimeDistanceMethod method) {
        return this.findAssociations(br.getTargetChromosome().getName(), br.getTargetStart(), br.getTargetEnd(),
                br.getTargetStarts(), br.getBlockSizes(), br.getStrand(), method, new ProbeMapperConfig());
    }

    /**
     * Uses default mapping settings
     *
     * @param identifier identifier
     * @param method the method
     * @return bio seq 2 gene producs
     */
    public Collection<BioSequence2GeneProduct> getThreePrimeDistances(String identifier,
            ThreePrimeDistanceMethod method) {
        Collection<BlatResult> locations = this.findSequenceLocations(identifier);
        Collection<BioSequence2GeneProduct> results = new HashSet<>();
        for (BlatResult br : locations) {
            results.addAll(this.getThreePrimeDistances(br, method));
        }
        return results;
    }

    private Collection<Gene> queryAndExtract(final String chromosome, String query, Object[] params) {
        return this.getJdbcTemplate().query(query, params, new ResultSetExtractor<Collection<Gene>>() {

            @Override
            public Collection<Gene> extractData(ResultSet rs) throws SQLException, DataAccessException {

                Collection<Gene> r = new HashSet<>();
                while (rs.next()) {

                    Gene gene = Gene.Factory.newInstance();

                    gene.setNcbiGeneId(Integer.parseInt(rs.getString(1)));
                    gene.setOfficialSymbol(rs.getString(2));
                    gene.setName(gene.getOfficialSymbol());

                    PhysicalLocation pl = PhysicalLocation.Factory.newInstance();
                    pl.setNucleotide(rs.getLong(3));
                    pl.setNucleotideLength(rs.getInt(4) - rs.getInt(3));
                    pl.setStrand(rs.getString(5));
                    pl.setBin(SequenceBinUtils.binFromRange((int) rs.getLong(3), rs.getInt(4)));

                    Chromosome c = new Chromosome(SequenceManipulation.deBlatFormatChromosomeName(chromosome),
                            GoldenPathSequenceAnalysis.this.getTaxon());
                    pl.setChromosome(c);

                    // note that we aren't setting the chromosome here; we already know that.
                    gene.setPhysicalLocation(pl);
                    r.add(gene);

                    Blob blockSizes = rs.getBlob(6);
                    Blob blockStarts = rs.getBlob(7);

                    GoldenPathSequenceAnalysis.this.setBlocks(gene, blockSizes, blockStarts);

                }
                return r;
            }
        });
    }

    private Collection<PhysicalLocation> blocksToPhysicalLocations(int[] blockSizes, int[] blockStarts,
            Chromosome chromosome) {
        Collection<PhysicalLocation> blocks = new HashSet<>();
        for (int i = 0; i < blockSizes.length; i++) {
            long exonStart = blockStarts[i];
            int exonSize = blockSizes[i];
            PhysicalLocation block = PhysicalLocation.Factory.newInstance();
            block.setChromosome(chromosome);
            block.setNucleotide(exonStart);
            block.setNucleotideLength(exonSize);
            block.setBin(SequenceBinUtils.binFromRange((int) exonStart, (int) (exonStart + exonSize)));
            blocks.add(block);
        }
        return blocks;
    }

    /**
     * Recompute the exonOverlap looking at EST evidence. This lets us be a much less conservative about how we compute
     * exon overlaps.
     *
     * @param chromosome chromosome
     * @param queryStart start
     * @param queryEnd end
     * @param starts starts
     * @param sizes sizes
     * @param exonOverlap Exon overlap we're starting with. We only care to improve on this.
     * @param strand of the region
     * @return The best overlap with any exons from an mRNA in the selected region.
     */
    @SuppressWarnings("unchecked")
    private int checkESTs(String chromosome, Long queryStart, Long queryEnd, String starts, String sizes,
            int exonOverlap, String strand) {

        String key = "EST " + chromosome + "||" + queryStart.toString() + "||" + queryEnd.toString() + strand;

        Collection<Gene> ests;
        if (cache.containsKey(key)) {
            ests = (Collection<Gene>) cache.get(key);
        } else {
            ests = this.findESTs(chromosome, queryStart, queryEnd, strand);
            cache.put(key, ests);
        }

        if (ests.size() > 0) {
            if (GoldenPath.log.isDebugEnabled())
                GoldenPath.log.debug(ests.size() + " ESTs found at chr" + chromosome + ":" + queryStart + "-"
                        + queryEnd + ", trying to improve overlap of  " + exonOverlap);

            int maxOverlap = exonOverlap;
            for (Gene est : ests) {
                int overlap = SequenceManipulation.getGeneExonOverlaps(chromosome, starts, sizes, null, est);
                if (GoldenPath.log.isDebugEnabled())
                    GoldenPath.log.debug("overlap with " + est.getNcbiGeneId() + "=" + overlap);
                if (overlap > maxOverlap) {
                    if (GoldenPath.log.isDebugEnabled())
                        GoldenPath.log.debug("Best EST overlap=" + overlap);
                    maxOverlap = overlap;
                }
            }

            exonOverlap = maxOverlap;
            if (GoldenPath.log.isDebugEnabled())
                GoldenPath.log.debug("Overlap with ESTs is now " + exonOverlap);
        }

        return exonOverlap;
    }

    /**
     * Recompute the exonOverlap looking at mRNAs. This lets us be a little less conservative about how we compute exon
     * overlaps.
     *
     * @param chromosome chromosome
     * @param queryStart start
     * @param queryEnd end
     * @param starts starts
     * @param sizes sizes
     * @param exonOverlap Exon overlap we're starting with. We only care to improve on this.
     * @param strand of the region
     * @param gene gene
     * @return The best overlap with any exons from an mRNA in the selected region.
     */
    @SuppressWarnings("unchecked")
    private int checkRNAs(String chromosome, Long queryStart, Long queryEnd, String starts, String sizes,
            int exonOverlap, String strand, Gene gene) {

        String key = "MRNA " + chromosome + "||" + queryStart.toString() + "||" + queryEnd.toString() + strand;

        Collection<Gene> mRNAs;
        if (cache.containsKey(key)) {
            mRNAs = (Collection<Gene>) cache.get(key);
        } else {
            mRNAs = this.findRNAs(chromosome, queryStart, queryEnd, strand);
            cache.put(key, mRNAs);
        }

        if (mRNAs.size() > 0) {
            if (GoldenPath.log.isDebugEnabled())
                GoldenPath.log.debug(mRNAs.size() + " mRNAs found at chr" + chromosome + ":" + queryStart + "-"
                        + queryEnd + ", trying to improve overlap of  " + exonOverlap);

            int maxOverlap = exonOverlap;
            for (Gene mRNA : mRNAs) {

                if (gene != null
                        && !gene.getOfficialSymbol().equals(this.getGeneForMessage(mRNA.getOfficialSymbol()))) {
                    continue;
                }

                int overlap = SequenceManipulation.getGeneExonOverlaps(chromosome, starts, sizes, null, mRNA);
                if (GoldenPath.log.isDebugEnabled())
                    GoldenPath.log.debug("overlap with " + mRNA.getNcbiGeneId() + "=" + overlap);
                if (overlap > maxOverlap) {
                    if (GoldenPath.log.isDebugEnabled())
                        GoldenPath.log.debug("Best mRNA overlap=" + overlap);
                    maxOverlap = overlap;
                }
            }

            exonOverlap = maxOverlap;
            if (GoldenPath.log.isDebugEnabled())
                GoldenPath.log.debug("Overlap with mRNAs is now " + exonOverlap);
        }

        return exonOverlap;
    }

    /**
     * Given a location and a gene product, compute the distance from the 3' end of the gene product as well as the
     * amount of overlap. If the location has low overlaps with known exons (threshold set by
     * RECHECK_OVERLAP_THRESHOLD), we optionally search for mRNAs in the region. If there are overlapping mRNAs, we use
     * the best overlap value. If the overlap is still not high enough we optionally check ESTs.
     *
     * @param chromosome chromosome
     * @param queryStart start
     * @param queryEnd end
     * @param starts Start locations of alignments of the query (target coordinates)
     * @param sizes Sizes of alignments of the query.
     * @param geneProduct GeneProduct with which the overlap and distance is to be computed.
     * @param method method
     * @param config The useEsts and useRNA options are relevant
     * @return a ThreePrimeData object containing the results.
     */
    private BlatAssociation computeLocationInGene(String chromosome, Long queryStart, Long queryEnd, String starts,
            String sizes, GeneProduct geneProduct, ThreePrimeDistanceMethod method, ProbeMapperConfig config) {

        assert geneProduct != null : "GeneProduct is null";

        BlatAssociation blatAssociation = BlatAssociation.Factory.newInstance();
        blatAssociation.setGeneProduct(geneProduct);
        blatAssociation.setThreePrimeDistanceMeasurementMethod(method);
        PhysicalLocation geneLoc = geneProduct.getPhysicalLocation();

        assert geneLoc != null : "PhysicalLocation for GeneProduct " + geneProduct + " is null";
        assert geneLoc.getNucleotide() != null;

        int geneStart = geneLoc.getNucleotide().intValue();
        int geneEnd = geneLoc.getNucleotide().intValue() + geneLoc.getNucleotideLength();
        int exonOverlap = 0;
        if (starts != null & sizes != null) {
            exonOverlap = SequenceManipulation.getGeneProductExonOverlap(starts, sizes, geneLoc.getStrand(),
                    geneProduct);
            int totalSize = SequenceManipulation.totalSize(sizes);

            if (config.isUseMrnas()
                    && exonOverlap / (double) (totalSize) < GoldenPathSequenceAnalysis.RECHECK_OVERLAP_THRESHOLD) {
                int newOverlap = this.checkRNAs(chromosome, queryStart, queryEnd, starts, sizes, exonOverlap,
                        geneLoc.getStrand(), geneProduct.getGene());

                if (newOverlap > exonOverlap) {
                    GoldenPath.log.debug("mRNA overlap was higher than primary transcript");
                    exonOverlap = newOverlap;
                }
            }

            if (config.isUseEsts()
                    && exonOverlap / (double) (totalSize) < GoldenPathSequenceAnalysis.RECHECK_OVERLAP_THRESHOLD) {
                int newOverlap = this.checkESTs(chromosome, queryStart, queryEnd, starts, sizes, exonOverlap,
                        geneLoc.getStrand());

                if (newOverlap > exonOverlap) {
                    GoldenPath.log.debug("Exon overlap was higher than mrna or  primary transcript");
                    exonOverlap = newOverlap;
                }
            }
            assert exonOverlap <= totalSize;
        }

        blatAssociation.setOverlap(exonOverlap);

        if (method == ThreePrimeDistanceMethod.MIDDLE) {
            int center = SequenceManipulation.findCenter(starts, sizes);
            if (geneLoc.getStrand().equals("+")) {
                // then the 3' end is at the 'end'. : >>>>>>>>>>>>>>>>>>>>>*>>>>> (* is where we might be)
                blatAssociation.setThreePrimeDistance((long) Math.max(0, geneEnd - center));
            } else if (geneProduct.getPhysicalLocation().getStrand().equals("-")) {
                // then the 3' end is at the 'start'. : <<<*<<<<<<<<<<<<<<<<<<<<<<<
                blatAssociation.setThreePrimeDistance((long) Math.max(0, center - geneStart));
            } else {
                throw new IllegalArgumentException("Strand wasn't '+' or '-'");
            }
        } else if (method == ThreePrimeDistanceMethod.RIGHT) {
            if (geneLoc.getStrand().equals("+")) {
                // then the 3' end is at the 'end'. : >>>>>>>>>>>>>>>>>>>>>*>>>>> (* is where we might be)
                blatAssociation.setThreePrimeDistance(Math.max(0, geneEnd - queryEnd));
            } else if (geneProduct.getPhysicalLocation().getStrand().equals("-")) {
                // then the 3' end is at the 'start'. : <<<*<<<<<<<<<<<<<<<<<<<<<<<
                blatAssociation.setThreePrimeDistance(Math.max(0, queryStart - geneStart));
            } else {
                throw new IllegalArgumentException("Strand wasn't '+' or '-'");
            }
        } else if (method == ThreePrimeDistanceMethod.LEFT) {
            throw new UnsupportedOperationException("Left edge measure not supported");
        } else {
            throw new IllegalArgumentException("Unknown method");
        }
        return blatAssociation;
    }

    /**
     * Generic method to retrieve Genes from the GoldenPath database. The query given must have the appropriate form.
     *
     * @param starti start
     * @param endi end
     * @param chromosome chromosome
     * @param query query
     * @return List of GeneProducts. This is a collection of transient instances, not from Gemma's database.
     */
    private Collection<GeneProduct> findGenesByQuery(Long starti, Long endi, final String chromosome, String strand,
            String query) {
        // Cases:
        // 1. gene is contained within the region: txStart > start & txEnd < end;
        // 2. region is contained within the gene: txStart < start & txEnd > end;
        // 3. region overlaps start of gene: txStart > start & txStart < end.
        // 4. region overlaps end of gene: txEnd > start & txEnd < end
        //

        Object[] params;
        if (strand != null) {
            params = new Object[] { starti, endi, starti, endi, starti, endi, starti, endi, chromosome, strand };
        } else {
            params = new Object[] { starti, endi, starti, endi, starti, endi, starti, endi, chromosome };
        }

        return this.getJdbcTemplate().query(query, params, new ResultSetExtractor<Collection<GeneProduct>>() {

            @Override
            public Collection<GeneProduct> extractData(ResultSet rs) throws SQLException, DataAccessException {
                Collection<GeneProduct> r = new HashSet<>();
                while (rs.next()) {

                    GeneProduct product = GeneProduct.Factory.newInstance();

                    String name = rs.getString(1);

                    /*
                     * This happens for a very few cases in kgXref, where the gene is 'abParts'. We have to skip these.
                     */
                    if (StringUtils.isBlank(name)) {
                        continue;
                    }

                    /*
                     * The name is our database identifier (either genbank or ensembl)
                     */
                    DatabaseEntry accession = DatabaseEntry.Factory.newInstance();
                    accession.setAccession(name);
                    if (name.startsWith("ENST")) {
                        accession.setExternalDatabase(NcbiGeneConverter.getEnsembl());
                    } else {
                        accession.setExternalDatabase(NcbiGeneConverter.getGenbank());
                    }

                    product.getAccessions().add(accession);

                    Gene gene = Gene.Factory.newInstance();
                    gene.setOfficialSymbol(rs.getString(2));
                    gene.setName(gene.getOfficialSymbol());
                    Taxon taxon = GoldenPathSequenceAnalysis.this.getTaxon();

                    assert taxon != null;
                    gene.setTaxon(taxon);

                    PhysicalLocation pl = PhysicalLocation.Factory.newInstance();
                    pl.setNucleotide(rs.getLong(3));
                    pl.setNucleotideLength(rs.getInt(4) - rs.getInt(3));
                    pl.setStrand(rs.getString(5));
                    pl.setBin(SequenceBinUtils.binFromRange((int) rs.getLong(3), rs.getInt(4)));
                    PhysicalLocation genePl = PhysicalLocation.Factory.newInstance();
                    genePl.setStrand(pl.getStrand());

                    Chromosome c = new Chromosome(SequenceManipulation.deBlatFormatChromosomeName(chromosome),
                            taxon);
                    pl.setChromosome(c);
                    genePl.setChromosome(c);

                    /*
                     * this only contains the chromosome and strand: the nucleotide positions are only valid for the
                     * gene product
                     */
                    gene.setPhysicalLocation(genePl);

                    product.setName(name);

                    String descriptionFromGP = rs.getString(8);
                    if (StringUtils.isBlank(descriptionFromGP)) {
                        product.setDescription("Imported from GoldenPath");
                    } else {
                        product.setDescription("Imported from Golden Path: " + descriptionFromGP);
                    }
                    product.setPhysicalLocation(pl);
                    product.setGene(gene);

                    Blob exonStarts = rs.getBlob(6);
                    Blob exonEnds = rs.getBlob(7);
                    product.setExons(GoldenPathSequenceAnalysis.this.getExons(c, exonStarts, exonEnds));

                    /*
                     * For microRNAs, we don't get exons, so we just use the whole length for now.
                     */
                    if (product.getExons().size() == 0) {
                        product.getExons().add(pl);
                    }

                    r.add(product);

                }
                return r;
            }

        });

    }

    /**
     * Uses a query that can retrieve BlatResults from GoldenPath. The query must have the appropriate form.
     *
     * @param query query
     * @param params params
     * @return blat results
     */
    private Collection<BlatResult> findLocationsByQuery(final String query, final Object[] params) {

        return this.getJdbcTemplate().query(query, params, new ResultSetExtractor<Collection<BlatResult>>() {
            @Override
            public Collection<BlatResult> extractData(ResultSet rs) throws SQLException, DataAccessException {
                Collection<BlatResult> r = new HashSet<>();
                while (rs.next()) {

                    BlatResult blatResult = BlatResult.Factory.newInstance();

                    Chromosome c = new Chromosome(SequenceManipulation.deBlatFormatChromosomeName(rs.getString(1)),
                            GoldenPathSequenceAnalysis.this.getTaxon());

                    blatResult.setTargetChromosome(c);

                    Blob blockSizes = rs.getBlob(2);
                    Blob targetStarts = rs.getBlob(3);
                    Blob queryStarts = rs.getBlob(4);

                    blatResult.setBlockSizes(SQLUtils.blobToString(blockSizes));
                    blatResult.setTargetStarts(SQLUtils.blobToString(targetStarts));
                    blatResult.setQueryStarts(SQLUtils.blobToString(queryStarts));

                    blatResult.setStrand(rs.getString(5));

                    // need the query size to compute scores.
                    blatResult.setQuerySequence(BioSequence.Factory.newInstance());
                    blatResult.getQuerySequence().setLength(rs.getLong(6));
                    blatResult.getQuerySequence().setName((String) params[0]);

                    blatResult.setMatches(rs.getInt(7));
                    blatResult.setMismatches(rs.getInt(8));
                    blatResult.setQueryGapCount(rs.getInt(9));
                    blatResult.setTargetGapCount(rs.getInt(10));

                    blatResult.setQueryStart(rs.getInt(11));
                    blatResult.setQueryEnd(rs.getInt(12));

                    blatResult.setTargetStart(rs.getLong(13));
                    blatResult.setTargetEnd(rs.getLong(14));

                    blatResult.setRepMatches(rs.getInt(15));

                    r.add(blatResult);
                }
                return r;
            }

        });

    }

    /**
     * Fill in the exon information for a gene, given the raw blobs from the GoldenPath database.
     * Be sure to pass the right Blob arguments!
     *
     * @param exonStarts starts
     * @param exonEnds ends
     * @throws SQLException sql problem
     */
    private Collection<PhysicalLocation> getExons(Chromosome chrom, Blob exonStarts, Blob exonEnds)
            throws SQLException {

        Collection<PhysicalLocation> exons = new HashSet<>();
        if (exonStarts == null || exonEnds == null) {
            return exons;
        }

        String exonStartLocations = SQLUtils.blobToString(exonStarts);
        String exonEndLocations = SQLUtils.blobToString(exonEnds);

        int[] exonStartsInts = SequenceManipulation.blatLocationsToIntArray(exonStartLocations);
        int[] exonEndsInts = SequenceManipulation.blatLocationsToIntArray(exonEndLocations);

        assert exonStartsInts.length == exonEndsInts.length;

        for (int i = 0; i < exonEndsInts.length; i++) {
            int exonStart = exonStartsInts[i];
            int exonEnd = exonEndsInts[i];
            PhysicalLocation exon = PhysicalLocation.Factory.newInstance();

            exon.setChromosome(chrom);
            assert chrom.getTaxon() != null;

            exon.setNucleotide((long) exonStart);
            exon.setNucleotideLength(exonEnd - exonStart);
            exon.setBin(SequenceBinUtils.binFromRange(exonStart, exonEnd));
            exons.add(exon);
        }

        return exons;
    }

    /**
     * Only for refseq genes.
     *
     * @param ncbiId mRNA accession eg. NR_000028
     * @return string
     */
    private String getGeneForMessage(String ncbiId) {

        return this.getJdbcTemplate().query(
                "SELECT rg.name2 FROM all_mrna m INNER JOIN refGene rg ON m.qName = rg.name WHERE m.qName = ? ",
                new Object[] { ncbiId }, new ResultSetExtractor<String>() {

                    @Override
                    public String extractData(ResultSet rs) throws SQLException, DataAccessException {
                        while (rs.next()) {
                            String string = rs.getString(0);
                            if (StringUtils.isNotBlank(string))
                                return string;
                        }
                        return null;
                    }
                });

    }

    /**
     * Handle the format used by the all_mrna and other GoldenPath tables, which go by sizes of blocks and their starts,
     * not the starts and ends.
     * Be sure to pass the right Blob arguments!
     *
     * @param gene gene
     * @param blockSizes sizes
     * @param blockStarts starts
     */
    private void setBlocks(Gene gene, Blob blockSizes, Blob blockStarts) throws SQLException {
        if (blockSizes == null || blockStarts == null)
            return;

        String exonSizes = SQLUtils.blobToString(blockSizes);
        String exonStarts = SQLUtils.blobToString(blockStarts);

        int[] exonSizeInts = SequenceManipulation.blatLocationsToIntArray(exonSizes);
        int[] exonStartInts = SequenceManipulation.blatLocationsToIntArray(exonStarts);

        assert exonSizeInts.length == exonStartInts.length;

        GeneProduct gp = GeneProduct.Factory.newInstance();
        Chromosome chromosome = null;
        if (gene.getPhysicalLocation() != null)
            chromosome = gene.getPhysicalLocation().getChromosome();
        Collection<PhysicalLocation> exons = this.blocksToPhysicalLocations(exonSizeInts, exonStartInts,
                chromosome);
        gp.setExons(exons);
        gp.setName(gene.getNcbiGeneId().toString()); // this isn't right?
        Collection<GeneProduct> products = new HashSet<>();
        products.add(gp);
        gene.setProducts(products);
    }

}