org.semanticscience.narf.structures.parts.Sequence.java Source code

Introduction

Here is the source code for org.semanticscience.narf.structures.parts.Sequence.java
Source

/**
 * Copyright (c) 2011 William Greenwood and Jose Cruz-Toledo
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package org.semanticscience.narf.structures.parts;

import java.text.CharacterIterator;
import java.text.DecimalFormat;
import java.text.StringCharacterIterator;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.semanticscience.narf.structures.lib.exceptions.InvalidResidueException;
import org.semanticscience.narf.structures.lib.exceptions.InvalidSequenceException;
import org.semanticscience.narf.structures.lib.exceptions.NonConsecutiveNucleotideException;

/**
 * Representation of a nucleotide sequence.
 * <p>
 * Nucleotides are kept in a map keyed by residue position, so every view of
 * the sequence (iteration, string form, sub-sequences) is in ascending
 * position order. Both construction paths reject an empty input, and the
 * nucleotide collection is only exposed through read-only views.
 * 
 * @author "Jose Cruz-Toledo"
 * @author "William Greenwood"
 * @since 1.6
 */
public class Sequence implements Iterable<Nucleotide> {

    /**
     * Accepted residue characters (IUPAC nucleotide codes, X and the gap
     * character), compiled once instead of on every validation.
     */
    private static final Pattern VALID_SEQUENCE = Pattern
            .compile("[AaCcGgTtUuRrYyKkMmSsWwBbDdHhVvNnXx-]+");

    /**
     * Map of nucleotide residue position to the nucleotide in the sequence.
     */
    private final SortedMap<Integer, Nucleotide> nucleotides;

    /**
     * Definition line for a FASTA file. May be <code>null</code> or empty
     * when none was supplied.
     */
    private String definitionLine = null;

    /**
     * Construct a sequence from a string. The first nucleotide is placed at
     * residue position 1 and the definition line is empty.
     * 
     * @param aSequence
     *            a nucleotide sequence
     * @throws InvalidSequenceException
     *             if the given sequence is null, empty or has invalid
     *             characters
     * @throws InvalidResidueException
     *             if any of the nucleotides has an invalid residue
     * @since 1.6
     */
    public Sequence(String aSequence) throws InvalidSequenceException, InvalidResidueException {
        this(aSequence, "");
    }

    /**
     * Construct a sequence with a definition line from a string. The first
     * nucleotide is placed at residue position 1.
     * 
     * @param aSequence
     *            a nucleotide sequence
     * @param aDefinitionLine
     *            the definition line of a FASTA file
     * @throws InvalidSequenceException
     *             if the given sequence is null, empty or has invalid
     *             characters
     * @throws InvalidResidueException
     *             if any of the nucleotides has an invalid residue
     * @since 1.6
     */
    public Sequence(String aSequence, String aDefinitionLine)
            throws InvalidSequenceException, InvalidResidueException {
        this(aSequence, 1, aDefinitionLine);
    }

    /**
     * Construct a sequence starting from a given residue position, with an
     * empty definition line.
     * 
     * @param aSequence
     *            a sequence string
     * @param aStartingPosition
     *            the desired position of the first nucleotide
     * @throws InvalidSequenceException
     *             if the given sequence is null, empty or has invalid
     *             characters
     * @throws InvalidResidueException
     *             if any of the nucleotides has an invalid residue
     * @since 1.6
     */
    public Sequence(String aSequence, int aStartingPosition)
            throws InvalidSequenceException, InvalidResidueException {
        this(aSequence, aStartingPosition, "");
    }

    /**
     * Construct a sequence starting from a given residue position.
     * 
     * @param aSequence
     *            a sequence string
     * @param aStartingPosition
     *            the desired position of the first nucleotide
     * @param aDefinitionLine
     *            a definition line of a FASTA file
     * @throws InvalidSequenceException
     *             if the given sequence is null, empty or has invalid
     *             characters
     * @throws InvalidResidueException
     *             if any of the nucleotides has an invalid residue
     * @since 1.6
     */
    public Sequence(String aSequence, int aStartingPosition, String aDefinitionLine)
            throws InvalidSequenceException, InvalidResidueException {
        this.definitionLine = aDefinitionLine;
        this.nucleotides = makeNucleotides(aSequence, aStartingPosition);
    }

    /**
     * Construct a sequence from a set of nucleotides, with an empty
     * definition line.
     * 
     * @param aNucleotideSet
     *            a set of nucleotides to construct a sequence
     * 
     * @throws NonConsecutiveNucleotideException
     *             if the nucleotide set contains non-consecutive nucleotides
     * @throws java.util.NoSuchElementException
     *             if the nucleotide set is empty
     * @since 1.6
     */
    public Sequence(Set<Nucleotide> aNucleotideSet) throws NonConsecutiveNucleotideException {
        this(aNucleotideSet, "");
    }

    /**
     * Construct a sequence from a set of nucleotides. Each nucleotide is
     * stored under its own residue position; the resulting positions must
     * form an unbroken run of consecutive integers.
     * 
     * @param aNucleotideSet
     *            a set of nucleotides to construct a sequence
     * @param aDefinitionLine
     *            a definition line of a FASTA file
     * @throws NonConsecutiveNucleotideException
     *             if the nucleotide set contains non-consecutive nucleotides
     * @throws java.util.NoSuchElementException
     *             if the nucleotide set is empty
     * @since 1.6
     */
    public Sequence(Set<Nucleotide> aNucleotideSet, String aDefinitionLine)
            throws NonConsecutiveNucleotideException {
        this.definitionLine = aDefinitionLine;
        this.nucleotides = new TreeMap<Integer, Nucleotide>();

        // file every nucleotide under its residue position
        for (Nucleotide nucleotide : aNucleotideSet) {
            this.addNucleotide(nucleotide);
        }

        // The keys come back in ascending order, so the positions are
        // consecutive exactly when each key equals the first key plus its
        // index. firstKey() throws NoSuchElementException for an empty set,
        // which keeps an empty sequence from ever being built. The counter
        // is a long so that positions near the int limits cannot wrap.
        long expectedPos = this.nucleotides.firstKey();
        for (Integer currentPos : this.nucleotides.keySet()) {
            if (currentPos.longValue() != expectedPos) {
                throw new NonConsecutiveNucleotideException("A non consecutive nucleotide was found");
            }
            expectedPos++;
        }
    }

    /**
     * The GC content of the nucleotide sequence, as a percentage.
     * <p>
     * Only the unambiguous residues A, C, G, T and U are counted; ambiguity
     * codes and gap characters contribute to neither the numerator nor the
     * denominator. The same formula applies to DNA, RNA and mixed sequences.
     * 
     * @return <code>100 * (G + C) / (A + C + G + T + U)</code>, or
     *         <code>0.0</code> when the sequence has none of those residues
     */
    public double getGCContent() {
        // build the sequence string once; it is rebuilt on every call
        String sequence = this.getSequenceString();

        double gc = (double) StringUtils.countMatches(sequence, "G")
                + StringUtils.countMatches(sequence, "C");
        double total = gc + StringUtils.countMatches(sequence, "A")
                + StringUtils.countMatches(sequence, "T")
                + StringUtils.countMatches(sequence, "U");

        if (total == 0) {
            // nothing countable (e.g. all N): avoid returning NaN
            return 0.0;
        }
        return (gc / total) * 100;
    }

    /**
     * @return the number of G residues in the sequence string
     */
    public int getGuanineCount() {
        return StringUtils.countMatches(this.getSequenceString(), "G");
    }

    /**
     * @return the number of C residues in the sequence string
     */
    public int getCytosineCount() {
        return StringUtils.countMatches(this.getSequenceString(), "C");
    }

    /**
     * @return the number of A residues in the sequence string
     */
    public int getAdenineCount() {
        return StringUtils.countMatches(this.getSequenceString(), "A");
    }

    /**
     * @return the number of U residues in the sequence string
     */
    public int getUracilCount() {
        return StringUtils.countMatches(this.getSequenceString(), "U");
    }

    /**
     * @return the number of T residues in the sequence string
     */
    public int getThymineCount() {
        return StringUtils.countMatches(this.getSequenceString(), "T");
    }

    /**
     * The length of the nucleotide sequence.
     * 
     * @return the number of nucleotides in the sequence
     */
    public int getLength() {
        return nucleotides.size();
    }

    /**
     * Determine if there is a nucleotide at a given position.
     * 
     * @param aPosition
     *            the nucleotide residue position
     * @return <code>true</code> if a nucleotide is at the position, otherwise
     *         <code>false</code>
     */
    public boolean containsNucleotideAtPosition(int aPosition) {
        return nucleotides.containsKey(aPosition);
    }

    /**
     * Get the nucleotide that is at a given position.
     * 
     * @param aPosition
     *            the nucleotide residue position
     * @return the nucleotide at the given position or <code>null</code> if
     *         there is no nucleotide at that position
     * @since 1.6
     */
    public Nucleotide getNucleotideAtPosition(int aPosition) {
        return nucleotides.get(aPosition);
    }

    /**
     * Determine if every character of a given sequence string is an accepted
     * residue character.
     * 
     * @param aSequence
     *            a string representation of a sequence
     * @return <code>true</code> if the string is non-empty and only accepted
     *         characters are present, otherwise <code>false</code>
     * @since 1.6
     */
    private boolean checkStringSequence(String aSequence) {
        return VALID_SEQUENCE.matcher(aSequence).matches();
    }

    /**
     * Create the set of nucleotides from a nucleic acid sequence string. A
     * mapping is constructed of the residue position of the nucleotides to the
     * nucleotides. The method enables the sequence to commence at an arbitrary
     * sequence position. The string is upper-cased before it is validated.
     * 
     * @param aSequence
     *            sequence string
     * @param aStartingPosition
     *            the residue position of the first nucleotide
     * @return a mapping of the nucleotide residue positions to their respective
     *         nucleotides
     * @throws InvalidSequenceException
     *             if the sequence string is null or empty, or if any of its
     *             characters is not a valid residue identifier
     * @throws InvalidResidueException
     *             if the created nucleotide contains a residue exception
     * @since 1.6
     */
    SortedMap<Integer, Nucleotide> makeNucleotides(String aSequence, int aStartingPosition)
            throws InvalidSequenceException, InvalidResidueException {
        // reject null/empty before touching the string, so a null argument
        // is reported as an invalid sequence rather than a NullPointerException
        if ((aSequence == null) || (aSequence.length() == 0)) {
            throw new InvalidSequenceException("The provided sequence has no characters.");
        }

        // residues are stored upper case
        String upperCased = aSequence.toUpperCase();

        if (!this.checkStringSequence(upperCased)) {
            throw new InvalidSequenceException(
                    "The inputted sequence has a invalid sequence character.\n" + upperCased + "\n");
        }

        SortedMap<Integer, Nucleotide> returnMe = new TreeMap<Integer, Nucleotide>();
        int currentPosition = aStartingPosition;

        for (int i = 0; i < upperCased.length(); i++) {
            String residue = String.valueOf(upperCased.charAt(i));
            returnMe.put(currentPosition, new Nucleotide(currentPosition, residue));
            currentPosition++;
        }
        return returnMe;
    }

    /**
     * Adds a nucleotide to the sequence at the residue position within the
     * nucleotide, replacing whatever was stored at that position.
     * 
     * @param aNucleotide
     *            the nucleotide to be added to the sequence
     * @return <code>true</code> if the residue position was already present
     *         in the sequence, <code>false</code> if it was newly added
     * @since 1.6
     */
    private boolean addNucleotide(Nucleotide aNucleotide) {
        Integer position = aNucleotide.getResiduePosition();
        boolean positionWasPresent = nucleotides.containsKey(position);
        nucleotides.put(position, aNucleotide);
        return positionWasPresent;
    }

    /**
     * Get a FASTA string representation of the sequence which includes the
     * definition line and the sequence. A leading ">" is added to the
     * definition line when it does not already have one, and a placeholder
     * line is written when the definition line is null or empty.
     * 
     * @return a FASTA string of the sequence
     * @since 1.6
     */
    public String getFastaString() {
        String defLine = this.getDefinitionLine();
        StringBuilder fasta = new StringBuilder();

        // null must be tested before length() to avoid a NullPointerException
        if ((defLine == null) || (defLine.length() == 0)) {
            fasta.append(">No definition line available\n");
        } else {
            if (!defLine.startsWith(">")) {
                fasta.append('>');
            }
            fasta.append(defLine).append('\n');
        }

        fasta.append(this.getSequenceString()).append('\n');

        return fasta.toString();
    }

    /**
     * Get the string representation of the nucleotides in the sequence: the
     * upper-cased residue identifiers concatenated in position order.
     * 
     * @return a string representation of the sequence
     * @since 1.6
     */
    public String getSequenceString() {
        StringBuilder returnMe = new StringBuilder(nucleotides.size());

        for (Nucleotide aNuc : nucleotides.values()) {
            returnMe.append(aNuc.getResidueIdentifier().toUpperCase());
        }
        return returnMe.toString();
    }

    /**
     * Get the definition line for a FASTA file.
     * 
     * @return the definition line, possibly <code>null</code> or empty
     * @since 1.6
     */
    public String getDefinitionLine() {
        return definitionLine;
    }

    //TODO: Compute a random sequence given a GC content and a sequence type
    /**
     * Create a random Sequence given a GC content value.
     * <p>
     * NOTE: not implemented yet; this method currently always returns
     * <code>null</code>.
     * 
     * @param aGCContent
     *            the value of the GC content that you want for your output
     * @param sequenceType
     *            accepted values RNA or DNA
     * @param lenght
     *            the length of the output sequence
     * @return currently always <code>null</code>; intended to be a sequence of
     *         random nucleotides of a given GC content
     */
    public static Sequence createRandomSequence(Double aGCContent, String sequenceType, int lenght) {

        return null;
    }

    /**
     * Set the definition line. The stored value always starts with ">": the
     * character is prepended when missing, and a placeholder line is stored
     * when the argument is null or empty.
     * 
     * @param aDefLine
     *            the new definition line, with or without a leading ">"
     */
    public void setDefinitionLine(String aDefLine) {
        if ((aDefLine == null) || (aDefLine.length() == 0)) {
            this.definitionLine = ">no definition available";
        } else if (aDefLine.startsWith(">")) {
            // compare by content; a reference comparison never matches here
            this.definitionLine = aDefLine;
        } else {
            this.definitionLine = ">" + aDefLine;
        }
    }

    /**
     * Get a subsequence of this sequence. Specify the start and end nucleotides
     * of the residues you wish to extract. The returned subsequence includes
     * the start and end positions; when both arguments are the same nucleotide
     * the result holds that single nucleotide.
     * 
     * @param aStartNucleotide
     *            the nucleotide at the first position
     * @param anEndNucleotide
     *            the nucleotide at the end position
     * @return a sequence from <code>aStartNucleotide</code> to
     *         <code>anEndNucleotide</code>, or <code>null</code> if either
     *         nucleotide is not in this sequence or the end nucleotide does
     *         not occur at or after the start nucleotide
     * @throws NonConsecutiveNucleotideException
     *             if any of the nucleotides in the subsequence are not in
     *             consecutive order
     * @since 1.6
     */
    public Sequence getSubsequence(Nucleotide aStartNucleotide, Nucleotide anEndNucleotide)
            throws NonConsecutiveNucleotideException {
        if (!this.containsNucleotide(aStartNucleotide) || !this.containsNucleotide(anEndNucleotide)) {
            return null;
        }

        boolean collecting = false;
        boolean reachedEnd = false;
        Set<Nucleotide> returnMe = new LinkedHashSet<Nucleotide>();

        // Walk the nucleotides in position order: start collecting at the
        // start nucleotide and stop once the end nucleotide has been added.
        // If the end nucleotide lies before the start nucleotide it is never
        // reached while collecting, which is reported as null below.
        for (Nucleotide nucleotide : nucleotides.values()) {
            if (!collecting) {
                if (!nucleotide.equals(aStartNucleotide)) {
                    continue;
                }
                collecting = true;
            }

            returnMe.add(nucleotide);

            if (nucleotide.equals(anEndNucleotide)) {
                reachedEnd = true;
                break;
            }
        }

        if (!reachedEnd) {
            return null;
        }

        return new Sequence(returnMe);
    }

    /**
     * Get all the nucleotides in the sequence in order by their residue
     * position
     * 
     * @return an unmodifiable view of all the nucleotides in the sequence
     * @since 1.6
     */
    public Collection<Nucleotide> getNucleotides() {
        return Collections.unmodifiableCollection(nucleotides.values());
    }

    /**
     * @return the FASTA string of the sequence
     */
    @Override
    public String toString() {
        return this.getFastaString();
    }

    /**
     * Get the first nucleotide in the sequence.
     * 
     * @return the first nucleotide in the sequence
     * @since 1.6
     */
    public Nucleotide getFirstNucleotide() {
        return nucleotides.get(nucleotides.firstKey());
    }

    /**
     * Get the last nucleotide in the sequence.
     * 
     * @return the last nucleotide in the sequence
     * @since 1.6
     */
    public Nucleotide getLastNucleotide() {
        return nucleotides.get(nucleotides.lastKey());
    }

    /**
     * Iterate over the nucleotides in order by their residue position. The
     * iterator is read-only: <code>remove()</code> is not supported, so the
     * sequence cannot be altered through it.
     * 
     * @return a read-only iterator over the nucleotides
     */
    @Override
    public Iterator<Nucleotide> iterator() {
        return this.getNucleotides().iterator();
    }

    /**
     * Determine if the sequence contains the given nucleotide.
     * 
     * @param nucleotide
     *            a nucleotide used for searching
     * @return <code>true</code> if the nucleotide is in the sequence,
     *         otherwise <code>false</code>
     * @since 1.6
     */
    public boolean containsNucleotide(Nucleotide nucleotide) {
        return nucleotides.containsValue(nucleotide);
    }

    /**
     * Hash code derived from the definition line and the nucleotide map,
     * consistent with {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((definitionLine == null) ? 0 : definitionLine.hashCode());
        result = prime * result + ((nucleotides == null) ? 0 : nucleotides.hashCode());
        return result;
    }

    /**
     * Two sequences are equal when they are of the same class and have equal
     * definition lines and equal nucleotide maps.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if ((obj == null) || (getClass() != obj.getClass())) {
            return false;
        }

        Sequence other = (Sequence) obj;

        if (definitionLine == null) {
            if (other.definitionLine != null) {
                return false;
            }
        } else if (!definitionLine.equals(other.definitionLine)) {
            return false;
        }

        if (nucleotides == null) {
            return other.nucleotides == null;
        }
        return nucleotides.equals(other.nucleotides);
    }

}