org.ut.biolab.medsavant.server.db.variants.annotation.AnnotationCursor.java Source code

Introduction

Here is the source code for org.ut.biolab.medsavant.server.db.variants.annotation.AnnotationCursor.java
Source

/**
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * This is free software; you can redistribute it and/or modify it under the
 * terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or (at your option)
 * any later version.
 *
 * This software is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this software; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA, or see the FSF
 * site: http://www.fsf.org.
 */
package org.ut.biolab.medsavant.server.db.variants.annotation;

import java.io.IOException;
import java.sql.SQLException;
import java.util.List;
import java.util.ListIterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.broad.tabix.TabixReader;
import org.ut.biolab.medsavant.server.db.variants.annotation.BatchVariantAnnotator.SimpleVariantRecord;
import org.ut.biolab.medsavant.server.db.variants.VariantManagerUtils;
import org.ut.biolab.medsavant.shared.model.Annotation;
import org.ut.biolab.medsavant.server.serverapi.AnnotationManager;
import org.ut.biolab.medsavant.shared.format.AnnotationFormat;
import org.ut.biolab.medsavant.shared.model.SessionExpiredException;
import org.ut.biolab.medsavant.shared.util.MiscUtils;

/**
 * A cursor over one tabix-indexed annotation file, used to attach annotation
 * values to variant records.
 *
 * The cursor owns a {@link TabixReader} for the annotation's data file; call
 * {@link #cleanup()} once the cursor is no longer needed.
 *
 * @author mfiume
 */
public class AnnotationCursor {

    // the log
    private static final Log LOG = LogFactory.getLog(AnnotationCursor.class);
    // reader for the annotation
    private final TabixReader reader;
    // does the annotation have an alt column
    private final boolean annotationHasAlt;
    // does the annotation have a ref column
    private final boolean annotationHasRef;
    // does this annotation refer to positions or intervals
    // (whether interval end points are inclusive is not modelled by this class)
    private final boolean isInterval;

    // the annotation to apply
    private final Annotation annotation;
    // Default column indices for the required columns.  They are replaced by
    // the positions found in the header line when the file has one.
    private int pos_annot_index_of_chr = 0;
    private int pos_annot_index_of_pos = 1;
    private int pos_annot_index_of_ref = 2;
    private int pos_annot_index_of_alt = 3;
    private int int_annot_index_of_chr = 0;
    private int int_annot_index_of_start = 1;
    private int int_annot_index_of_end = 2;

    // Package-visible state that this class itself neither reads nor writes.
    SimpleVariantRecord lastVariantAnnotated;
    SimpleAnnotationRecord lastAnnotationConsidered;
    String[] lastResult;

    // Not used inside this class; presumably bounds the variant window built by callers.
    public static final int MAX_BASEPAIR_DISTANCE_IN_WINDOW = 20000;

    private final AnnotationFormat annotationFormat;

    /**
     * A file reader and cursor to be used to help in the annotation process.
     *
     * If the first line of the data file starts with the reader's comment
     * character it is treated as a header, and the required columns are
     * located by name; otherwise the default column positions are kept.
     *
     * @param sid The Session ID that requested the annotation
     * @param annotation The annotation that this instance refers to
     * @throws IOException if the annotation file cannot be opened or read
     * @throws SQLException declared for the annotation format lookup
     * @throws SessionExpiredException declared for the annotation format lookup
     * @throws IllegalArgumentException if the file has a header that lacks a
     * required column
     */
    public AnnotationCursor(String sid, Annotation annotation)
            throws IOException, SQLException, SessionExpiredException, IllegalArgumentException {
        // Peek at the first line with a throw-away reader so that the main
        // reader is left untouched.
        TabixReader headerReader = new TabixReader(annotation.getDataPath());
        String header;
        try {
            String firstLine = headerReader.readLine();
            // An empty file has no first line; treat that as "no header".
            header = (firstLine == null) ? "" : firstLine.trim();
        } finally {
            headerReader.cleanup();
        }

        reader = new TabixReader(annotation.getDataPath());

        boolean initialized = false;
        try {
            //If the tabix file has a header, then find the indices for all the
            //required columns
            if (header.startsWith(String.valueOf(reader.getCommentChar()))) {
                locateColumnsFromHeader(header);

                String missingCol = findMissingColumn(annotation.isInterval());
                if (missingCol != null) {
                    throw new IllegalArgumentException("Couldn't locate column " + missingCol + " in annotation "
                            + annotation.getProgram() + " (ref=" + annotation.getReferenceName() + ")");
                }
            }

            this.annotation = annotation;
            annotationFormat = AnnotationManager.getInstance().getAnnotationFormat(sid, annotation.getID());
            annotationHasRef = annotationFormat.hasRef();
            annotationHasAlt = annotationFormat.hasAlt();
            isInterval = annotation.isInterval();
            initialized = true;
        } finally {
            if (!initialized) {
                // Construction failed, so nobody will ever call cleanup() on
                // this cursor: release the reader here.
                cleanupQuietly(reader);
            }
        }
    }

    /**
     * Resets every required column index to -1 and then fills in the ones
     * named in the header line.
     *
     * Column names are compared case-insensitively after removing '#' and
     * surrounding whitespace. "START" is also accepted as the position column
     * of a position annotation.
     *
     * @param header the tab-separated header line
     */
    private void locateColumnsFromHeader(String header) {
        pos_annot_index_of_chr = int_annot_index_of_chr = -1;
        int_annot_index_of_start = -1;
        int_annot_index_of_end = -1;
        pos_annot_index_of_ref = -1;
        pos_annot_index_of_alt = -1;
        pos_annot_index_of_pos = -1;

        String[] names = header.split("\t");
        // Walk from right to left so that, when a name occurs more than once,
        // the left-most occurrence is the one that is kept.
        for (int i = names.length - 1; i >= 0; --i) {
            String name = names[i].replace("#", "").trim();
            if (name.equalsIgnoreCase("CHR") || name.equalsIgnoreCase("CHROM")) {
                pos_annot_index_of_chr = int_annot_index_of_chr = i;
            } else if (name.equalsIgnoreCase("START")) {
                int_annot_index_of_start = i;
                pos_annot_index_of_pos = i;
            } else if (name.equalsIgnoreCase("END")) {
                int_annot_index_of_end = i;
            } else if (name.equalsIgnoreCase("REF")) {
                pos_annot_index_of_ref = i;
            } else if (name.equalsIgnoreCase("ALT")) {
                pos_annot_index_of_alt = i;
            } else if (name.equalsIgnoreCase("POSITION")) {
                pos_annot_index_of_pos = i;
            }
        }
    }

    /**
     * Names the first required column that the header did not provide.
     *
     * @param interval whether the annotation is interval-based
     * @return a display name for the missing column, or null if none is missing
     */
    private String findMissingColumn(boolean interval) {
        if (interval) {
            if (int_annot_index_of_chr == -1) {
                return "Chromosome";
            }
            if (int_annot_index_of_start == -1) {
                return "Start";
            }
            if (int_annot_index_of_end == -1) {
                return "End";
            }
            return null;
        }
        if (pos_annot_index_of_chr == -1) {
            return "Chromosome";
        }
        if (pos_annot_index_of_ref == -1) {
            return "Ref";
        }
        if (pos_annot_index_of_alt == -1) {
            return "Alt";
        }
        if (pos_annot_index_of_pos == -1) {
            return "Position";
        }
        return null;
    }

    /**
     * Releases a reader on a failure path, logging instead of throwing so that
     * the exception that caused the failure is the one that propagates.
     */
    private static void cleanupQuietly(TabixReader tabix) {
        try {
            tabix.cleanup();
        } catch (Exception e) {
            LOG.warn("Unable to release annotation reader", e);
        }
    }

    /**
     * Extracts the non-default (custom) field values from one annotation line.
     *
     * The custom fields are taken to start right after the default columns;
     * columns missing from a short line yield empty strings.
     *
     * @param annotationLine the annotation line, already split into columns
     * @return one value per non-default field of the annotation format
     */
    private String[] getVariantAnnotationString(String[] annotationLine) {
        int numNonDefaultFields = getNumNonDefaultFields();
        String[] result = new String[numNonDefaultFields];

        int numDefaultFields = 2; //chromosome and position are ALWAYS present.

        if (annotationFormat.hasRef()) {
            numDefaultFields++;
        }

        if (annotationFormat.hasAlt()) {
            numDefaultFields++;
        }

        //annotation has an end position
        // NOTE(review): without a header this index keeps its default of 2, so
        // an end column is counted even for position annotations -- confirm
        // that header-less position files really carry an end column.
        if (int_annot_index_of_end != -1) {
            numDefaultFields++;
        }

        for (int i = 0; i < numNonDefaultFields; ++i) {
            int column = numDefaultFields + i;
            result[i] = (column < annotationLine.length) ? annotationLine[column] : "";
        }
        return result;
    }

    /**
     * Annotates the variants in the given window with matching records from
     * this annotation.
     *
     * Precondition: all records have the same chromosome, and are ordered by
     * position in ascending order.
     *
     * @param variantWindow the variants to annotate
     * @param minStart lower bound of the region to read (dense mode only)
     * @param maxEnd upper bound of the region to read (dense mode only)
     * @param annotationIndex passed through to the variant record's annotate
     * call
     * @param isLowDensity if true, issue one query per variant; otherwise read
     * the whole region with a single query
     * @return false if the annotation file has no data for the window's
     * chromosome, true otherwise (including for an empty window)
     * @throws Exception if reading the annotation file or annotating a record
     * fails
     */
    boolean annotateVariants(List<SimpleVariantRecord> variantWindow, long minStart, long maxEnd,
            int annotationIndex, boolean isLowDensity) throws Exception {
        if (variantWindow.isEmpty()) {
            // Nothing to annotate.
            return true;
        }

        String chrom = variantWindow.get(0).chrom;
        if (!canAnnotateThisChromosome(chrom)) {
            //this chromosome can't be annotated.
            return false;
        }
        int tid = reader.chr2tid(chrom);

        if (isLowDensity) {
            annotateEachVariantSeparately(variantWindow, tid, annotationIndex);
        } else {
            annotateWindowInOnePass(variantWindow, tid, minStart, maxEnd, annotationIndex);
        }
        return true;
    }

    /**
     * Annotates variants by separately seeking annotations for each variant.
     * Usually slow unless variants are very sparse.
     */
    private void annotateEachVariantSeparately(List<SimpleVariantRecord> variantWindow, int tid,
            int annotationIndex) throws Exception {
        for (SimpleVariantRecord svr : variantWindow) {
            TabixReader.Iterator annotationIt = reader.query(tid,
                    (int) Math.max(0, svr.start - 1), //this function returns annotations AFTER this position, so we need to have the -1
                    (int) svr.end);
            if (annotationIt == null) {
                continue;
            }
            while (annotationIt.hasNext()) {
                String[] annotationLine = splitAnnotationLine((String) annotationIt.next());
                SimpleAnnotationRecord annotationRecord = new SimpleAnnotationRecord(annotationLine);
                if (annotationRecord.matchesVariant(svr)) {
                    svr.annotate(annotationIndex, getVariantAnnotationString(annotationLine));
                }
            }
        }
    }

    /**
     * Annotates variants by reading in all annotations in the region delimited
     * by the variant window (only uses one seek). Faster if variants are dense.
     *
     * Relies on both the variants and the annotation records being ordered by
     * start position.
     */
    private void annotateWindowInOnePass(List<SimpleVariantRecord> variantWindow, int tid, long minStart,
            long maxEnd, int annotationIndex) throws Exception {
        TabixReader.Iterator annotationIt = reader.query(tid,
                (int) Math.max(0, minStart - 1), //this function returns annotations AFTER this position, so we need to have the -1
                (int) maxEnd);

        if (annotationIt == null) { //no annotations in this range.
            return;
        }

        // Index of the first variant that may still match an annotation.
        int firstCandidate = 0;
        while (annotationIt.hasNext()) {
            String[] annotationLine = splitAnnotationLine((String) annotationIt.next());
            SimpleAnnotationRecord annotationRecord = new SimpleAnnotationRecord(annotationLine);

            ListIterator<SimpleVariantRecord> variantIt = variantWindow.listIterator(firstCandidate);
            boolean inLeadingRun = true;
            while (variantIt.hasNext()) {
                SimpleVariantRecord variantRecord = variantIt.next();

                if (inLeadingRun) {
                    // A variant that ends before this annotation starts cannot
                    // match it, nor any later annotation (their starts are at
                    // least as large), so it is dropped from the window.
                    // Variants AT the annotation's position must stay: another
                    // annotation row for the same position (e.g. a different
                    // alt allele) may follow.
                    if (variantRecord.end < annotationRecord.start) {
                        firstCandidate = variantIt.nextIndex();
                        continue;
                    }
                    inLeadingRun = false;
                }

                if (annotationRecord.matchesVariant(variantRecord)) {
                    //annotate
                    variantRecord.annotate(annotationIndex, getVariantAnnotationString(annotationLine));
                } else if (variantRecord.start > annotationRecord.end) {
                    // Every remaining variant starts even later.
                    break;
                }
            }
        }
    }

    /**
     * Strips line terminators from a raw annotation line and splits it into
     * columns, keeping trailing empty columns.
     */
    private static String[] splitAnnotationLine(String rawLine) {
        return removeNewLinesAndCarriageReturns(rawLine).split(VariantManagerUtils.FIELD_DELIMITER, -1);
    }

    /**
     * Get the number of fields (not including standard ones like chr, pos, ref,
     * alt)
     *
     * @return the number of non default fields
     */
    public int getNumNonDefaultFields() {
        return annotationFormat.getNumNonDefaultFields();
    }

    /**
     * Return whether or not this annotation contains anything for the given
     * chromosome
     *
     * @param chrom The chromosome to look for
     * @return Whether or not this annotation contains anything for the given
     * chromosome
     */
    private boolean canAnnotateThisChromosome(String chrom) {
        return reader.chr2tid(chrom) != -1;
    }

    /**
     * Remove new line and carriage return characters from a line.
     *
     * @param next a line of text, possibly with new lines and carriage returns
     * @return the line without new lines or carriage returns
     */
    private static String removeNewLinesAndCarriageReturns(String next) {
        return next.replace("\n", "").replace("\r", "");
    }

    /**
     * Reports whether any reference name of the annotation contains "chr" or
     * "contig".
     *
     * NOTE(review): this method is not called from this class, and earlier
     * documentation described the opposite result (true when names look like
     * "1" rather than "chr1"). Confirm the intended meaning before using it.
     *
     * @param reader A Tabix reader for the annotation
     * @return true if at least one reference name contains "chr" or "contig"
     */
    private boolean checkIfHomogenizationIsNeeded(TabixReader reader) {
        for (String s : reader.getReferenceNames()) {
            if (s.contains("chr") || s.contains("contig")) {
                return true;
            }
        }
        return false;

    }

    /**
     * Get the annotation that this cursor works for
     *
     * @return The annotation this cursor works for
     */
    Annotation getAnnotation() {
        return annotation;

    }

    /**
     * The location part (chromosome, coordinates and, for position
     * annotations, ref/alt) of one line of the annotation file.
     */
    private class SimpleAnnotationRecord {

        private String chrom;
        private int position;
        private long start;
        private long end;
        private String ref;
        private String alt;

        /**
         * @param line one annotation line, already split into columns
         */
        public SimpleAnnotationRecord(String[] line) {
            setFromLine(line);
        }

        private void setFromLine(String[] line) {
            if (isInterval) {
                setFromLineInterval(line);
            } else {
                setFromLinePosition(line);
            }
        }

        /**
         * Gives chromosome names a "chr" prefix when they do not already start
         * with one (case-insensitively).
         */
        private String withChrPrefix(String rawChrom) {
            if (rawChrom.regionMatches(true, 0, "chr", 0, 3)) {
                return rawChrom;
            }
            return "chr" + MiscUtils.homogenizeSequence(rawChrom);
        }

        // Position annotation: start and end both equal the position; ref and
        // alt are only read when the annotation format declares them.
        private void setFromLinePosition(String[] line) {
            chrom = withChrPrefix(line[pos_annot_index_of_chr]);
            start = end = position = Integer.parseInt(line[pos_annot_index_of_pos]);
            ref = annotationHasRef ? line[pos_annot_index_of_ref] : null;
            alt = annotationHasAlt ? line[pos_annot_index_of_alt] : null;
        }

        // Interval annotation: no ref/alt.
        private void setFromLineInterval(String[] line) {
            chrom = withChrPrefix(line[int_annot_index_of_chr]);
            start = Integer.parseInt(line[int_annot_index_of_start]);
            end = Integer.parseInt(line[int_annot_index_of_end]);
            ref = null;
            alt = null;
        }

        @Override
        public String toString() {
            if (isInterval) {
                return "SimpleAnnotationRecord{" + "chrom=" + chrom + ", start=" + start + ", end=" + end + '}';
            } else {
                return "SimpleAnnotationRecord{" + "chrom=" + chrom + ", position=" + position + ", ref=" + ref
                        + ", alt=" + alt + '}';
            }
        }

        /**
         * Always true: matching deliberately never depends on ref.
         */
        private boolean matchesRef(String ref) {
            return true;
        }

        /**
         * @return true if this record has no alt, or its alt equals the given
         * one
         */
        private boolean matchesAlt(String alt) {
            return this.alt == null || this.alt.equals(alt);
        }

        /**
         * Tests whether this record overlaps the closed range [start, end] on
         * the given chromosome. Currently not used by matchesVariant, which
         * requires identical coordinates instead.
         */
        private boolean intersectsPosition(String chrom, long start, long end) {
            return this.chrom.equals(chrom) && this.start <= end && start <= this.end;
        }

        /**
         * A variant matches when chromosome, start and end are identical and
         * the alt allele is compatible (see matchesAlt). Ref is ignored.
         */
        private boolean matchesVariant(SimpleVariantRecord r) {
            return this.chrom.equals(r.chrom) && (this.start == r.start) && (this.end == r.end)
                    && matchesAlt(r.alt);
        }
    }

    /**
     * Releases the underlying annotation reader.
     *
     * @throws IOException if the reader cannot be released
     */
    public void cleanup() throws IOException {
        reader.cleanup();
    }
}