org.ut.biolab.medsavant.server.db.variants.annotation.BatchVariantAnnotator.java Source code

Java tutorial

Introduction

Here is the source code for org.ut.biolab.medsavant.server.db.variants.annotation.BatchVariantAnnotator.java

Source

/**
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership.
 *
 * This is free software; you can redistribute it and/or modify it under the
 * terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or (at your option)
 * any later version.
 *
 * This software is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this software; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA, or see the FSF
 * site: http://www.fsf.org.
 */
package org.ut.biolab.medsavant.server.db.variants.annotation;

import au.com.bytecode.opencsv.CSVReader;
import au.com.bytecode.opencsv.CSVWriter;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.rmi.RemoteException;
import java.sql.SQLException;
import java.util.LinkedList;
import java.util.List;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.ut.biolab.medsavant.server.IOJob;
import org.ut.biolab.medsavant.server.MedSavantIOController;
import org.ut.biolab.medsavant.server.db.variants.VariantManagerUtils;
import static org.ut.biolab.medsavant.server.db.variants.annotation.AnnotationCursor.MAX_BASEPAIR_DISTANCE_IN_WINDOW;
import org.ut.biolab.medsavant.shared.model.Annotation;
import org.ut.biolab.medsavant.shared.model.MedSavantServerJobProgress;
import org.ut.biolab.medsavant.shared.model.SessionExpiredException;
import org.ut.biolab.medsavant.shared.serverapi.LogManagerAdapter;
import org.ut.biolab.medsavant.shared.util.IOUtils;

/**
 *
 * @author mfiume
 */
public class BatchVariantAnnotator {

    // commons-logging logger shared by this class and its inner job class
    private static final Log LOG = LogFactory.getLog(BatchVariantAnnotator.class);
    // input tab-delimited file to be annotated; it is deleted at the end of
    // performBatchAnnotationInParallel()
    private File inputTDFFile;
    // output file: receives the copied input (no annotations) or the input
    // columns followed by the annotation columns
    private File outputTDFFile;
    // annotations to perform; one AnnotationCursor is created per entry
    private final Annotation[] annotations;
    // the session ID requesting the annotation; passed to the server log and
    // to each AnnotationCursor
    private final String sid;
    // the total number of lines read from the input file
    // NOTE(review): never assigned anywhere in this class -- appears unused
    private int totalNumLinesRead;

    // running count of records written to the output file; incremented by
    // VariantAnnotatorIOJob and exposed through getTotalVariantsWritten()
    private int totalNumVariantsWritten = 0;
    // the total number of warnings raised while annotating
    // NOTE(review): never assigned anywhere in this class -- appears unused
    private int totalNumWarnings;

    // progress handle whose message is updated while annotation proceeds
    private MedSavantServerJobProgress jobProgress;

    /**
     * Reports how many variant records this annotator has written to the
     * output file so far.
     *
     * @return the running count of written variant records
     */
    public int getTotalVariantsWritten() {
        final int written = this.totalNumVariantsWritten;
        return written;
    }

    /**
     * Prepares a set of annotations to be applied to a tab delimited genomic
     * input file, with the result written as a tab delimited file to the given
     * output path. Nothing is read or written until
     * {@link #performBatchAnnotationInParallel()} is called.
     *
     * Assumptions stated by the original author: the input file is tab
     * delimited; the annotation files are in Tabix format; input and
     * annotation files are sorted by chromosome, position, reference,
     * alternate; an interval annotation is not inclusive of its end position.
     *
     * @param jobProgress progress handle that receives status messages
     * @param inputFile the tab delimited file to be annotated
     * @param outputFile the resulting tab delimited file
     * @param annotations the annotations to apply
     * @param sid the session ID that requested the annotation
     * @throws RemoteException declared for callers; the constructor body only
     * stores its arguments
     * @throws SQLException declared for callers; the constructor body only
     * stores its arguments
     */
    public BatchVariantAnnotator(MedSavantServerJobProgress jobProgress, File inputFile, File outputFile,
            Annotation[] annotations, String sid) throws RemoteException, SQLException {
        this.jobProgress = jobProgress;
        this.sid = sid;
        this.annotations = annotations;
        this.outputTDFFile = outputFile;
        this.inputTDFFile = inputFile;
    }

    // minimum increase, in percentage points, between two logged progress lines
    private static final int PROGRESS_STEP_SIZE = 10;

    /**
     * Logs a progress line once the completed percentage has advanced by at
     * least {@link #PROGRESS_STEP_SIZE} points since the last logged value.
     *
     * @param totalNumLinesRead number of lines processed so far
     * @param numLines total number of lines expected
     * @param oldp the percentage that was last logged
     * @return the newly logged percentage, or {@code oldp} when nothing was
     * logged
     */
    private int logProgress(int totalNumLinesRead, int numLines, int oldp) {
        if (numLines <= 0) {
            // Without a positive total the ratio is NaN or infinite, which
            // would round to a meaningless percentage (up to Integer.MAX_VALUE).
            return oldp;
        }

        int percent = Math.round(totalNumLinesRead / (float) numLines * 100);
        if (percent - oldp < PROGRESS_STEP_SIZE) {
            return oldp;
        }

        LOG.info("\t" + percent + "% (" + totalNumLinesRead + " of " + numLines + " lines)");
        return percent;
    }

    /**
     * Performs the prepared batch annotation.
     *
     * When no annotations were requested, the input file is copied to the
     * output file. Otherwise one AnnotationCursor is created per annotation,
     * the input lines are counted, and a VariantAnnotatorIOJob is submitted to
     * read the input, annotate it and write the output. All file work is
     * submitted through MedSavantIOController.requestIO.
     *
     * The input file is deleted when this method exits, whether or not the
     * annotation succeeded.
     *
     * If the work is interrupted, the error is logged, the thread's interrupt
     * flag is restored and the method returns normally; the output file may
     * then be incomplete.
     *
     * @throws IOException if the input file cannot be parsed or a file
     * operation (including closing the reader/writer) fails
     * @throws SQLException declared for the server-log and cursor calls made
     * here
     * @throws SessionExpiredException declared for the server-log and cursor
     * calls made here
     * @throws IllegalArgumentException declared by the original signature
     */
    public void performBatchAnnotationInParallel()
            throws IOException, SQLException, SessionExpiredException, IllegalArgumentException {

        final String inputPath = inputTDFFile.getAbsolutePath();
        final String startedMessage = "Annotation of " + inputPath + " was started. " + annotations.length
                + " annotation(s) will be performed.";

        org.ut.biolab.medsavant.server.serverapi.LogManager.getInstance().addServerLog(sid,
                LogManagerAdapter.LogType.INFO, startedMessage);
        LOG.info(startedMessage);

        CSVReader recordReader = null;
        CSVWriter recordWriter = null;
        try {
            // no annotations to perform, copy input to output
            if (annotations.length == 0) {
                jobProgress.setMessage("No annotations to perform, processing intermediate files");
                MedSavantIOController.requestIO(new IOJob("Copy File") {
                    @Override
                    protected void doIO() throws IOException {
                        IOUtils.copyFile(inputTDFFile, outputTDFFile);
                    }
                });

                return;
            }

            // otherwise, perform annotations
            final String performingMessage = "Performing " + annotations.length + " annotations";
            jobProgress.setMessage(performingMessage);
            LOG.info(performingMessage);

            // the number of columns in the input file
            int numFieldsInInputFile = getNumFieldsInTDF(inputTDFFile);
            if (numFieldsInInputFile == 0) {
                org.ut.biolab.medsavant.server.serverapi.LogManager.getInstance().addServerLog(sid,
                        LogManagerAdapter.LogType.ERROR,
                        "Error parsing input file " + inputPath + " . Is it tab delimited?");
                throw new IOException("Error parsing input file. Is it tab delimited?");
            }

            // the output has the input columns plus every cursor's annotation columns
            int numFieldsInOutputFile = numFieldsInInputFile;

            // create cursors for all annotations; they are cleaned up by
            // VariantAnnotatorIOJob.finish()
            AnnotationCursor[] cursors = new AnnotationCursor[annotations.length];
            for (int i = 0; i < annotations.length; i++) {
                AnnotationCursor cursor = new AnnotationCursor(sid, annotations[i]);
                numFieldsInOutputFile += cursor.getNumNonDefaultFields();
                cursors[i] = cursor;
            }

            // single-element array so the anonymous job can hand the count back
            final int[] lineCount = new int[1];

            MedSavantIOController.requestIO(new IOJob("Line counter") {
                @Override
                protected void doIO() throws IOException {
                    jobProgress.setMessage("Counting number of variants to annotate");
                    int counted = 0;
                    BufferedReader reader = null;
                    try {
                        reader = new BufferedReader(new FileReader(inputTDFFile));
                        while (reader.readLine() != null) {
                            counted++;
                        }
                    } finally {
                        if (reader != null) {
                            reader.close();
                        }
                    }
                    lineCount[0] = counted;
                }
            });

            // open the input and output files
            recordReader = new CSVReader(new FileReader(inputTDFFile),
                    VariantManagerUtils.FIELD_DELIMITER.charAt(0), CSVWriter.DEFAULT_QUOTE_CHARACTER, '\\');
            recordWriter = new CSVWriter(new FileWriter(outputTDFFile),
                    VariantManagerUtils.FIELD_DELIMITER.charAt(0), CSVWriter.DEFAULT_QUOTE_CHARACTER, "\r\n");

            // read, annotate and write the input, window by window
            MedSavantIOController.requestIO(new VariantAnnotatorIOJob(cursors, recordReader, recordWriter,
                    lineCount[0], numFieldsInOutputFile));

            // report success
            final String completedMessage = "Annotation of " + inputPath + " completed. " + annotations.length
                    + " annotations were performed.";
            org.ut.biolab.medsavant.server.serverapi.LogManager.getInstance().addServerLog(sid,
                    LogManagerAdapter.LogType.INFO, completedMessage);
            LOG.info(completedMessage);
        } catch (InterruptedException ie) {

            org.ut.biolab.medsavant.server.serverapi.LogManager.getInstance().addServerLog(sid,
                    LogManagerAdapter.LogType.ERROR, "Error performing annotation(s). " + ie.getLocalizedMessage());

            LOG.error("performBatchAnnotationInParallell interrupted: " + ie, ie);

            // Restore the interrupt status so code further up the stack can
            // still observe that this thread was asked to stop.
            Thread.currentThread().interrupt();
        } finally {
            // Clean up. Each step runs even if the previous one throws, so a
            // failing close can neither leak the other stream nor leave the
            // input file behind. The writer goes first so buffered output is
            // flushed.
            try {
                if (recordWriter != null) {
                    recordWriter.close();
                }
            } finally {
                try {
                    if (recordReader != null) {
                        recordReader.close();
                    }
                } finally {
                    // no longer need input TDF file
                    if (!inputTDFFile.delete()) {
                        LOG.warn("Could not delete input file " + inputPath);
                    }
                }
            }
        }
    }

    /**
     * Reads the next record from the given reader and strips any new line or
     * carriage return characters from its last field.
     *
     * @param recordReader the reader to read from
     * @return the record tokenized into a string array, or {@code null} when
     * the reader has no more records
     * @throws IOException if the underlying read fails
     */
    private static String[] readNext(CSVReader recordReader) throws IOException {
        final String[] fields = recordReader.readNext();

        // only a record with content has a trailing field to clean up
        if (fields != null) {
            final int last = fields.length - 1;
            fields[last] = removeNewLinesAndCarriageReturns(fields[last]);
        }

        return fields;
    }

    /**
     * Strips every new line and carriage return character from the given
     * text, wherever it occurs.
     *
     * @param line a line of text, possibly with new lines and carriage returns
     * @return the text without any new line or carriage return characters
     */
    private static String removeNewLinesAndCarriageReturns(String line) {
        // literal replacement; no regular expression is needed for fixed characters
        return line.replace("\n", "").replace("\r", "");
    }

    /**
     * Helper functions
     */
    /**
     * Infers the number of columns in the given TDF file by looking at the
     * first line only. The file is always closed again, including when
     * reading fails or the file turns out to be empty.
     *
     * @param file The TDF file whose columns are to be counted
     * @return The number of columns in the first line in the TDF file
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading fails or the file has no first line
     */
    private static int getNumFieldsInTDF(File file) throws FileNotFoundException, IOException {

        // resolve the delimiter before opening the file so nothing can fail
        // between opening the stream and entering the try block
        final char delimiter = VariantManagerUtils.FIELD_DELIMITER.charAt(0);

        CSVReader reader = new CSVReader(new FileReader(file), delimiter, CSVWriter.DEFAULT_QUOTE_CHARACTER,
                '\\');
        try {
            // read the first line
            String[] firstLine = reader.readNext();
            if (firstLine == null) {
                throw new IOException("Error determining number of columns in the annotation file.");
            }
            return firstLine.length;
        } finally {
            // clean up on every path, not just the successful one
            reader.close();
        }
    }

    /**
     * Tells whether the given string is exactly one of the upper-case bases
     * A, C, G or T.
     *
     * @param base the string to test; must not be null
     * @return true if base is "A", "C", "G" or "T"
     */
    private boolean isAStandardSingleNucleotide(String base) {
        return base.length() == 1 && "ACGT".indexOf(base.charAt(0)) >= 0;
    }

    /**
     * Basic data structure for variants stored in MedSavant database. Variants
     * are imported from VCF4 files and stored in .avinput format within the
     * database tables. To annotate these variants, they must be converted (via
     * this class) to a format compatible with Annovar annotations.
     *
     * This class should ONLY be used for variants read from database tables,
     * for the purposes of annotation.
     *
     * SimpleAnnotationRecord should be used to store annotations.
     *
     * Note that {@link #alt} holds the converted allele once the record has
     * been constructed, while {@link #line} keeps the original, unmodified
     * fields.
     *
     * @see SimpleAnnotationRecord
     */
    static class SimpleVariantRecord {

        // value written for an annotation field that is missing or blank
        public static final String NULL_VALUE = "\\N";
        public static final int INITIAL_NUMBER_OF_ANNOTATIONS = 10;
        // indexes of various columns in the input file
        // (there are a few) columns at the start pertaining to import / file ids
        public static final int VARIANT_INDEX_OF_CHR = 4;
        public static final int VARIANT_INDEX_OF_START = 5;
        public static final int VARIANT_INDEX_OF_END = 6;
        public static final int VARIANT_INDEX_OF_REF = 8;
        public static final int VARIANT_INDEX_OF_ALT = 9;
        // basic variant fields
        public String chrom;
        public long start;
        public long end;
        public String ref;
        public String alt;

        // the original tokenized input line
        public final String[] line;

        // one row of annotation values per annotation; a null row means the
        // annotation produced nothing for this variant
        public String[][] annotations = null;

        /**
         * Builds the output record: the original input fields followed by the
         * values of every annotation, in annotation order. Each annotation
         * contributes exactly as many columns as its cursor reports through
         * getNumNonDefaultFields(); a value that is missing, null or blank is
         * written as {@link #NULL_VALUE}.
         *
         * @param cursors the cursors, indexed like the annotations of this
         * record
         * @return a new array holding the input fields plus the annotation
         * values
         */
        public String[] getLineWithAnnotations(AnnotationCursor[] cursors) {
            // ask every cursor for its column count once
            int[] widths = new int[annotations.length];
            int totalWidth = 0;
            for (int i = 0; i < annotations.length; ++i) {
                widths[i] = cursors[i].getNumNonDefaultFields();
                totalWidth += widths[i];
            }

            String[] result = new String[line.length + totalWidth];
            System.arraycopy(line, 0, result, 0, line.length);

            int k = line.length;
            for (int i = 0; i < annotations.length; ++i) {
                String[] values = annotations[i];
                for (int j = 0; j < widths[i]; ++j) {
                    // a null row, a short row and a null entry all count as "no value"
                    String value = (values != null && j < values.length) ? values[j] : null;
                    if (value != null && value.trim().length() > 0) {
                        result[k++] = value;
                    } else {
                        result[k++] = NULL_VALUE;
                    }
                }
            }

            return result;
        }

        /**
         * Stores the values produced by one annotation for this variant.
         *
         * @param annotationIndex index of the annotation
         * @param annotation the annotation values, or null if there are none
         */
        public void annotate(int annotationIndex, String[] annotation) {
            annotations[annotationIndex] = annotation;
        }

        /**
         * Create a SimpleVariantRecord from an array, which has been tokenized
         * from a TDF file line
         *
         * @param line The array from which to make this record
         * @param numAnnotations the number of annotations that will be applied
         * @throws IllegalArgumentException if the line has too few fields or
         * its start/end positions are not integers
         */
        public SimpleVariantRecord(String[] line, int numAnnotations) throws IllegalArgumentException {
            this.line = line;
            annotations = new String[numAnnotations][];
            setFromLine(line);
        }

        /**
         * Set up variables based on the content of line
         *
         * @param line The contents
         * @throws IllegalArgumentException if the line has too few fields; a
         * NumberFormatException (a subclass) if start or end is not an integer
         */
        private void setFromLine(String[] line) throws IllegalArgumentException {
            // Report a short line as the declared exception type rather than
            // letting an ArrayIndexOutOfBoundsException escape.
            if (line == null || line.length <= VARIANT_INDEX_OF_ALT) {
                throw new IllegalArgumentException("Variant line has " + (line == null ? 0 : line.length)
                        + " field(s), but at least " + (VARIANT_INDEX_OF_ALT + 1) + " are required");
            }

            chrom = line[VARIANT_INDEX_OF_CHR];
            String startString = line[VARIANT_INDEX_OF_START];
            String endString = line[VARIANT_INDEX_OF_END];
            try {
                start = Long.parseLong(startString);
                end = Long.parseLong(endString);
            } catch (NumberFormatException nex) {
                NumberFormatException detailed = new NumberFormatException(
                        "Start or End Position is not an integer. Strings were '" + startString + "', '"
                                + endString + "' Message: " + nex.getMessage() + "\n");
                // NumberFormatException has no cause constructor
                detailed.initCause(nex);
                throw detailed;
            }
            ref = line[VARIANT_INDEX_OF_REF];
            alt = line[VARIANT_INDEX_OF_ALT];

            //Convert .avinput format into the format that annovar annotations are stored in,
            //to enable direct comparisons.
            if (start == end && ref.equals("-")) { //insertion
                alt = "0" + alt;
            } else if (alt.equals("-")) { //deletion
                alt = Long.toString(end - start + 1);
            } else if (end > start || (start == end && alt.length() > 1)) { //block substitution
                alt = Long.toString(end - start + 1) + alt;
            }
        }

        /**
         * Basic to string
         *
         * @return String representation of this object
         */
        @Override
        public String toString() {
            return "SimpleVariantRecord{" + "chrom=" + chrom + ", start=" + start + ", ref=" + ref + ", alt=" + alt
                    + '}';
        }

        /**
         * Two records are equal when chrom, start, end, ref and (converted)
         * alt all match.
         */
        @Override
        public boolean equals(Object obj) {
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            final SimpleVariantRecord other = (SimpleVariantRecord) obj;
            return this.start == other.start
                    && this.end == other.end
                    && sameText(this.chrom, other.chrom)
                    && sameText(this.ref, other.ref)
                    && sameText(this.alt, other.alt);
        }

        // null-safe string equality
        private static boolean sameText(String a, String b) {
            return (a == null) ? (b == null) : a.equals(b);
        }

        @Override
        public int hashCode() {
            int hash = 5;
            hash = 79 * hash + (this.chrom != null ? this.chrom.hashCode() : 0);
            // low and high 32 bits of each position (same values as before)
            hash = 79 * hash + (int) this.start + (int) (this.start >> 32);
            hash = 79 * hash + (int) this.end + (int) (this.end >> 32);
            hash = 79 * hash + (this.ref != null ? this.ref.hashCode() : 0);
            hash = 79 * hash + (this.alt != null ? this.alt.hashCode() : 0);
            return hash;
        }
    }

    class VariantAnnotatorIOJob extends IOJob {

        private final int NUM_VARIANTS_BEFORE_MSG = 10000;
        private String[] inputLine;
        // container for the components of the output line (original line, plus annotations)
        private String[][] outputAnnotations;
        private final AnnotationCursor[] cursors;
        private final CSVReader recordReader;
        private final CSVWriter recordWriter;
        //private SimpleVariantRecord[] variantWindow;

        private List<SimpleVariantRecord> variantWindow;
        private long minStart = Long.MAX_VALUE;
        private long maxStart = Long.MIN_VALUE;
        private int numFieldsInOutputFile = 0;
        private final int numLines;

        private long variantsAnnotated = 0;
        private long prevVariantsAnnotated = 0;
        private int vps = -1;
        private long startTime = 0;
        private String vstr = "";

        private int numLd = 0;
        private int numHd = 0;
        //If the variant density in the variantWindow is <= than this, use unoptimized annotation
        //(seek to each annotation separately)
        private static final double DENSITY_THRESHOLD = 10.0f / MAX_BASEPAIR_DISTANCE_IN_WINDOW;

        public VariantAnnotatorIOJob(AnnotationCursor[] cursors, CSVReader recordReader, CSVWriter recordWriter,
                int numLines, int numFieldsInOutputFile) {
            super("Variant annotator");
            LOG.info("Annotation started...");
            jobProgress.setMessage("Starting annotation...");

            this.cursors = cursors;
            this.recordReader = recordReader;
            this.recordWriter = recordWriter;
            this.numLines = numLines;
            this.numFieldsInOutputFile = numFieldsInOutputFile;
            variantWindow = new LinkedList<SimpleVariantRecord>(); //profile with arraylist also.
            startTime = System.currentTimeMillis();
            //variantWindow = new SimpleVariantRecord[VARIANT_WINDOW_SIZE];            
        }

        boolean isLowDensity(List<SimpleVariantRecord> variantWindow) {
            return (variantWindow.size() / (double) MAX_BASEPAIR_DISTANCE_IN_WINDOW) <= DENSITY_THRESHOLD;
        }

        @Override
        protected boolean continueIO() throws IOException {
            inputLine = readNext(recordReader); //read next input line from variant file.
            return inputLine != null;
        }

        private void annotateWindow() throws IllegalArgumentException {
            try {
                if (variantWindow.isEmpty()) {
                    return;
                }

                int ofs = 0;

                boolean ld = isLowDensity(variantWindow);
                if (ld) {
                    numLd++;
                } else {
                    numHd++;
                }
                // perform each annotation, in turn
                for (int annotationIndex = 0; annotationIndex < annotations.length; annotationIndex++) {
                    // get the annotation for this line
                    if (!cursors[annotationIndex].annotateVariants(variantWindow, minStart, maxStart,
                            annotationIndex, ld)) {
                        org.ut.biolab.medsavant.server.serverapi.LogManager.getInstance().addServerLog(sid,
                                LogManagerAdapter.LogType.WARNING,
                                inputTDFFile.getName() + "Cannot annotate chromosome " + variantWindow.get(0).chrom
                                        + " with annotation " + annotations[annotationIndex]);
                    }
                    /*  
                     // add it to the output buffer                    
                     for (int k = 0; k < annotationsForThisLine.length; ++k) { //for each row
                     for (int l = 0; l < cursors[annotationIndex].getNumNonDefaultFields(); ++l) {
                     if (annotationsForThisLine[k] == null) {
                     outputAnnotations[k][l + ofs] = "";
                     } else {
                     outputAnnotations[k][l + ofs] = annotationsForThisLine[k][l];
                     }
                     }
                     }
                     ofs += cursors[annotationIndex].getNumNonDefaultFields();*/
                }

                for (SimpleVariantRecord svr : variantWindow) {
                    recordWriter.writeNext(svr.getLineWithAnnotations(cursors));
                    totalNumVariantsWritten++;
                    //recordWriter.writeNext((String[]) ArrayUtils.addAll(svr.line, outputAnnotations[i]));
                }

                /*for (int i = 0; i < variantWindow.size(); ++i) {
                 recordWriter.writeNext((String[]) ArrayUtils.addAll(inputLines[i], outputAnnotations[i]));
                 }*/
                //LOG.info(variantsAnnotated + " annotated so far...");
                long previous = variantsAnnotated;
                variantsAnnotated += variantWindow.size();

                SimpleVariantRecord lv = variantWindow.get(variantWindow.size() - 1);

                String s = inputTDFFile.getName() + ": Annotated " + variantsAnnotated
                        + " variants so far (#sparse/dense regions=" + numLd + "/" + numHd + ").  Chrom=" + lv.chrom
                        + " Position=" + lv.start;

                if ((Math.floor(variantsAnnotated / (double) NUM_VARIANTS_BEFORE_MSG)
                        - Math.floor(previous / (double) NUM_VARIANTS_BEFORE_MSG)) >= 1) {
                    //                    String s = inputTDFFile.getName() + ": Annotated " + variantsAnnotated + " variants so far.  Last variant considered: Chrom=" + lv.chrom + " Position=" + lv.start;                                       

                    if (prevVariantsAnnotated > 0) {
                        vps = (int) Math.round((variantsAnnotated - prevVariantsAnnotated)
                                / ((System.currentTimeMillis() - startTime) / 1000.0));
                    }

                    if (vps > 0) {
                        vstr = " (" + vps + " variants/sec)";
                    }
                    prevVariantsAnnotated = variantsAnnotated;
                    startTime = System.currentTimeMillis();
                    org.ut.biolab.medsavant.server.serverapi.LogManager.getInstance().addServerLog(sid,
                            LogManagerAdapter.LogType.INFO, s + vstr);
                    //LOG.info(s+vstr);
                }

                jobProgress.setMessage(s + vstr);

            } catch (Exception ex) {
                LOG.error("Couldn't communicate progress message to user: " + ex);
                ex.printStackTrace();
            }
        }

        @Override
        protected void finish() throws IOException {
            annotateWindow();
            for (AnnotationCursor cursor : cursors) {
                cursor.cleanup();
            }
            jobProgress.setMessage("Finished.");
        }

        @Override
        protected void doIO() throws IOException {
            try {
                //TODO: Log progress
                boolean chromMismatch = false;
                SimpleVariantRecord nextInputRecord = new SimpleVariantRecord(inputLine, cursors.length);

                if (!variantWindow.isEmpty()) {
                    SimpleVariantRecord firstInWindow = variantWindow.get(0);
                    if (((nextInputRecord.start
                            - firstInWindow.start) > AnnotationCursor.MAX_BASEPAIR_DISTANCE_IN_WINDOW)
                            || !nextInputRecord.chrom.equals(firstInWindow.chrom)) {
                        annotateWindow();
                        variantWindow.clear();
                        minStart = Long.MAX_VALUE;
                        maxStart = Long.MIN_VALUE;
                    }
                }

                variantWindow.add(nextInputRecord);
                minStart = Math.min(nextInputRecord.start, minStart);
                maxStart = Math.max(nextInputRecord.start, maxStart);

            } catch (IllegalArgumentException iex) {
                LOG.error(iex);
            }

        }
    }
}