ubic.gemma.core.analysis.sequence.SequenceManipulation.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.core.analysis.sequence.SequenceManipulation.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2006 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.analysis.sequence;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import ubic.gemma.core.loader.expression.arrayDesign.Reporter;
import ubic.gemma.model.genome.Gene;
import ubic.gemma.model.genome.PhysicalLocation;
import ubic.gemma.model.genome.biosequence.BioSequence;
import ubic.gemma.model.genome.gene.GeneProduct;

import java.util.Collection;
import java.util.HashSet;

/**
 * Convenient methods for manipulating BioSequences and PhysicalLocations
 *
 * @author pavlidis
 */
@SuppressWarnings("unused") // Possible external use
public class SequenceManipulation {
    private static final Log log = LogFactory.getLog(SequenceManipulation.class);

    /**
     * Puts "chr" prefix on the chromosome name, if need be.
     *
     * @param chromosome chromosome
     * @return formatted name
     */
    public static String blatFormatChromosomeName(String chromosome) {
        String searchChromosome = chromosome;
        if (!searchChromosome.startsWith("chr"))
            searchChromosome = "chr" + searchChromosome;
        return searchChromosome;
    }

    /**
     * Remove a 3' polyA or 5' polyT tail. The entire tail is removed.
     *
     * @param sequence        sequence
     * @param thresholdLength to trigger removal.
     * @return processed sequence
     */
    @SuppressWarnings("Annotator") // Annotator not working properly with the string concat
    public static String stripPolyAorT(String sequence, int thresholdLength) {
        sequence = sequence.replaceAll("A{" + thresholdLength + ",}$", "");
        return sequence.replaceAll("^T{" + thresholdLength + ",}", "");
    }

    /**
     * Convert a psl-formatted list (comma-delimited) to an int[].
     *
     * @param blatLocations locations
     * @return locations
     */
    public static int[] blatLocationsToIntArray(String blatLocations) {
        if (blatLocations == null)
            return null;
        String[] strings = blatLocations.split(",");
        int[] result = new int[strings.length];
        for (int i = 0; i < strings.length; i++) {
            try {
                result[i] = Integer.parseInt(strings[i]);
            } catch (NumberFormatException e) {
                throw new RuntimeException(
                        "Could not parse integer blat location " + blatLocations + ", from " + strings[i], e);
            }
        }
        return result;
    }

    /**
     * Convert a CompositeSequence's immobilizedCharacteristics into a single sequence, using a simple merge-join
     * strategy.
     *
     * @param sequences sequences
     * @return BioSequence. Not all fields are filled in and must be set by the caller.
     */
    public static BioSequence collapse(Collection<Reporter> sequences) {
        Collection<Reporter> copyOfSequences = SequenceManipulation.copyReporters(sequences);
        BioSequence collapsed = BioSequence.Factory.newInstance();
        collapsed.setSequence("");
        if (SequenceManipulation.log.isDebugEnabled())
            SequenceManipulation.log.debug("Collapsing " + sequences.size() + " sequences");
        while (!copyOfSequences.isEmpty()) {
            Reporter next = SequenceManipulation.findLeftMostProbe(copyOfSequences);
            int ol = SequenceManipulation.rightHandOverlap(collapsed, next.getImmobilizedCharacteristic());
            String nextSeqStr = next.getImmobilizedCharacteristic().getSequence();
            collapsed.setSequence(collapsed.getSequence() + nextSeqStr.substring(ol));
            if (SequenceManipulation.log.isDebugEnabled()) {
                SequenceManipulation.log.debug(
                        "New Seq to add: " + nextSeqStr + " Overlap=" + ol + " Result=" + collapsed.getSequence());
            }
            copyOfSequences.remove(next);
        }
        collapsed.setIsCircular(false);
        collapsed.setIsApproximateLength(false);
        collapsed.setLength((long) collapsed.getSequence().length());
        collapsed.setDescription("Collapsed from " + sequences.size() + " reporter sequences");
        return collapsed;
    }

    /**
     * Removes "chr" prefix from the chromosome name, if it is there.
     *
     * @param chromosome chromosome
     * @return formatted name
     */
    public static String deBlatFormatChromosomeName(String chromosome) {
        String searchChromosome = chromosome;
        if (searchChromosome.startsWith("chr"))
            searchChromosome = searchChromosome.substring(3);
        return searchChromosome;
    }

    /**
     * Find where the center of a query location is in a gene. This is defined as the location of the center base of the
     * query sequence relative to the 3' end of the gene.
     *
     * @param sizes  sizes
     * @param starts starts
     * @return center
     */
    public static int findCenter(String starts, String sizes) {

        int[] startArray = SequenceManipulation.blatLocationsToIntArray(starts);
        int[] sizesArray = SequenceManipulation.blatLocationsToIntArray(sizes);

        return SequenceManipulation.findCenterExonCenterBase(startArray, sizesArray);

    }

    /**
     * Given a gene, find out how much of it overlaps with exons provided as starts and sizes. This could involve more
     * than one exon.
     *
     * @param chromosome, as "chrX" or "X".
     * @param starts      of the locations we are testing.
     * @param sizes       of the locations we are testing.
     * @param strand      to consider. If null, strand is ignored.
     * @param gene        Gene we are testing
     * @return Number of bases which overlap with exons of the gene. A value of zero indicates that the location is
     * entirely within an intron. If multiple GeneProducts are associated with this gene, the best (highest)
     * overlap is reported).
     */
    public static int getGeneExonOverlaps(String chromosome, String starts, String sizes, String strand,
            Gene gene) {
        if (gene == null) {
            SequenceManipulation.log.warn("Null gene");
            return 0;
        }

        if (gene.getPhysicalLocation().getChromosome() != null && !gene.getPhysicalLocation().getChromosome()
                .getName().equals(SequenceManipulation.deBlatFormatChromosomeName(chromosome)))
            return 0;

        int bestOverlap = 0;
        for (GeneProduct geneProduct : gene.getProducts()) {
            int overlap = SequenceManipulation.getGeneProductExonOverlap(starts, sizes, strand, geneProduct);
            if (overlap > bestOverlap) {
                bestOverlap = overlap;
            }
        }
        return bestOverlap;
    }

    /**
     * Compute the overlap of a physical location with a transcript (gene product). This assumes that the chromosome is
     * already matched.
     *
     * @param starts      of the locations we are testing (in the target, so on the same coordinates as the geneProduct
     *                    location is scored)
     * @param sizes       of the locations we are testing.
     * @param strand      the strand to look on. If null, strand is ignored.
     * @param geneProduct GeneProduct we are testing. If strand of PhysicalLocation is null, we ignore strand.
     * @return Total number of bases which overlap with exons of the transcript. A value of zero indicates that the
     * location is entirely within an intron, or the strand is wrong.
     */
    public static int getGeneProductExonOverlap(String starts, String sizes, String strand,
            GeneProduct geneProduct) {

        if (starts == null || sizes == null || geneProduct == null)
            throw new IllegalArgumentException("Null data");

        // If strand is null we don't bother looking at it; if the strands don't match we return 0
        PhysicalLocation gpPhysicalLocation = geneProduct.getPhysicalLocation();
        if (strand != null && gpPhysicalLocation != null && gpPhysicalLocation.getStrand() != null
                && !gpPhysicalLocation.getStrand().equals(strand)) {
            return 0;
        }

        // These are transient instances
        Collection<PhysicalLocation> exons = geneProduct.getExons();

        int[] startArray = SequenceManipulation.blatLocationsToIntArray(starts);
        int[] sizesArray = SequenceManipulation.blatLocationsToIntArray(sizes);

        if (exons.size() == 0) {
            /*
             * simply use the gene product location itself. This was the case if we have ProbeAlignedRegion...otherwise
             * it's not expected
             */
            SequenceManipulation.log.warn("No exons for " + geneProduct);
            return 0;

        }

        // this was happening when data was truncated by the database!
        assert startArray.length == sizesArray.length : startArray.length + " starts and " + sizesArray.length
                + " sizes (expected equal numbers)";

        int totalOverlap = 0;
        int totalLength = 0;
        for (int i = 0; i < sizesArray.length; i++) {
            int start = startArray[i];
            int end = start + sizesArray[i];
            for (PhysicalLocation exonLocation : exons) {
                int exonStart = exonLocation.getNucleotide().intValue();
                int exonEnd = exonLocation.getNucleotide().intValue() + exonLocation.getNucleotideLength();
                totalOverlap += PhysicalLocation.computeOverlap(start, end, exonStart, exonEnd);
            }
            totalLength += end - start;
        }

        if (totalOverlap > totalLength)
            SequenceManipulation.log.warn(
                    "More overlap than length of sequence, trimming because " + totalOverlap + " > " + totalLength);

        return Math.min(totalOverlap, totalLength);
    }

    /**
     * Compute just any overlap the compare sequence has with the target on the right side.
     *
     * @param query  query
     * @param target target
     * @return right overlap
     */
    @SuppressWarnings({ "unused", "WeakerAccess" }) // Possible external use
    public static int rightHandOverlap(BioSequence target, BioSequence query) {

        if (target == null || query == null)
            throw new IllegalArgumentException("Null parameters");

        String targetString = target.getSequence();
        String queryString = query.getSequence();

        if (targetString == null)
            throw new IllegalArgumentException("Target sequence was empty");
        if (queryString == null)
            throw new IllegalArgumentException("Query sequence was empty");

        // match the end of the target with the beginning of the query. We start with the whole thing
        for (int i = 0; i < targetString.length(); i++) {
            String targetSub = targetString.substring(i);

            if (queryString.indexOf(targetSub) == 0) {
                return targetSub.length();
            }
        }

        return 0;
    }

    public static String reverseComplement(String sequence) {
        if (StringUtils.isBlank(sequence)) {
            return sequence;
        }

        StringBuilder buf = new StringBuilder();
        for (int i = sequence.length() - 1; i >= 0; i--) {
            buf.append(SequenceManipulation.complement(sequence.charAt(i)));
        }
        return buf.toString();
    }

    /**
     * See for example http://www.bio-soft.net/sms/iupac.html
     *
     * @param baseLetter letter
     * @return complement letter
     */
    private static char complement(char baseLetter) {

        switch (baseLetter) {
        // basics
        case 'A':
            return 'T';
        case 'T':
            return 'A';
        case 'G':
            return 'C';
        case 'C':
            return 'G';
        case 'U':
            return 'A';

        // purine/pyrimidine
        case 'R':
            return 'Y';
        case 'Y':
            return 'R';

        // complementary pairs.
        case 'S':
            return 'W';
        case 'W':
            return 'S';

        // non-complementary pairs
        case 'K':
            return 'M';
        case 'M':
            return 'K';

        // choice of 3
        case 'B':
            return 'D';
        case 'D':
            return 'B';

        case 'H':
            return 'V';
        case 'V':
            return 'H';

        // special
        case 'N':
            return 'N';
        case '-':
            return '-';
        case ' ':
            return ' ';
        default:
            break;
        }

        throw new IllegalArgumentException("Don't know complement to " + baseLetter);

    }

    /**
     * @param sizes Blat-formatted string of sizes (comma-delimited)
     * @return total size
     */
    public static int totalSize(String sizes) {
        return SequenceManipulation.totalSize(SequenceManipulation.blatLocationsToIntArray(sizes));
    }

    /**
     * Compute the overlap between two physical locations. If both do not have a length the overlap is zero unless they
     * point to exactly the same nucleotide location, in which case the overlap is 1.
     *
     * @param a a
     * @param b b
     * @return overlap
     */
    @SuppressWarnings({ "unused", "WeakerAccess" }) // Possible external use
    public static int computeOverlap(PhysicalLocation a, PhysicalLocation b) {
        if (!a.getChromosome().equals(b.getChromosome())) {
            return 0;
        }

        if (a.getNucleotideLength() == null && b.getNucleotideLength() == null) {
            if (a.getNucleotide().equals(b.getNucleotide())) {
                return 1;
            }
            return 0;
        }

        return PhysicalLocation.computeOverlap(a.getNucleotide(), a.getNucleotide() + a.getNucleotideLength(),
                b.getNucleotide(), b.getNucleotide() + b.getNucleotideLength());

    }

    private static Collection<Reporter> copyReporters(Collection<Reporter> reporters) {
        Collection<Reporter> copyReporters = new HashSet<>();
        for (Reporter next : reporters) {
            assert next.getStartInBioChar() != null : "Null startInBioChar";
            assert next.getImmobilizedCharacteristic() != null : "Null immobilized characteristic";

            Reporter copy = Reporter.Factory.newInstance();
            copy.setImmobilizedCharacteristic(next.getImmobilizedCharacteristic());
            copy.setName(next.getName());
            copy.setStartInBioChar(next.getStartInBioChar());
            copyReporters.add(copy);
        }
        assert copyReporters.size() == reporters.size() : "Sequences did not copy properly";
        return copyReporters;
    }

    /**
     * Find the index of the aligned base in the center exon that is the center of the query.
     */
    private static int findCenterExonCenterBase(int[] startArray, int[] sizesArray) {
        int middle = SequenceManipulation.middleBase(SequenceManipulation.totalSize(sizesArray));
        int totalSize = 0;
        for (int i = 0; i < sizesArray.length; i++) {
            totalSize += sizesArray[i];
            if (totalSize >= middle) {
                totalSize -= sizesArray[i];
                int diff = middle - totalSize;
                return startArray[i] + diff;
            }
        }
        assert false : "Failed to find center!";
        throw new IllegalStateException("Should not be here!");
    }

    /**
     * From a set of Reporters, find the one with the left-most location.
     */
    private static Reporter findLeftMostProbe(Collection<Reporter> copyOfProbes) {
        Long minLocation = (long) Integer.MAX_VALUE;
        Reporter next = null;
        for (Reporter probe : copyOfProbes) {
            Long loc = probe.getStartInBioChar();
            if (loc <= minLocation) {
                minLocation = loc;
                next = probe;
            }
        }
        return next;
    }

    private static int middleBase(int totalSize) {
        return (int) Math.floor(totalSize / 2.0);
    }

    private static int totalSize(int[] sizesArray) {
        int totalSize = 0;
        for (int aSizesArray : sizesArray) {
            totalSize += aSizesArray;
        }
        return totalSize;
    }

}