edu.cornell.med.icb.goby.modes.AnnotationPenaltyMode.java Source code

Java tutorial

Introduction

Here is the source code for edu.cornell.med.icb.goby.modes.AnnotationPenaltyMode.java

Source

/*
 * Copyright (C) 2009-2010 Institute for Computational Biomedicine,
 *                    Weill Medical College of Cornell University
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.modes;

import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import edu.cornell.med.icb.goby.algorithmic.data.Annotation;
import edu.cornell.med.icb.goby.algorithmic.data.Segment;
import it.unimi.dsi.fastutil.objects.Object2ObjectMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.fastutil.objects.ObjectList;
import org.apache.commons.io.IOUtils;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import java.util.Set;

/**
 * Write annotations corresponding to consensus peaks found in each sequence of count archives.
 *
 * @author Jaaved Mohammed
 */
public class AnnotationPenaltyMode extends AbstractGobyMode {
    /**
     * The mode name.
     */
    private static final String MODE_NAME = "annotation-penalty";

    /**
     * The mode description help text.
     */
    private static final String MODE_DESCRIPTION = "Calculates the distances between two collections of annotations.";

    /**
     * Constants
     * These penalty scores are designed to be small enough for whole genome sequences (mouse)
     * They may need to be further shrunk to reduce overflow error for larger genomes.
     */
    private static final double NON_OVERLAP_COST = 0.02;
    private static final double OVERLAP_COST = 0.01;
    private String geneAnnotFile;
    private String proposalAnnotFile;

    /**
     * The total penalty score, resets to zero at each execution of the mode.
     */
    private static double score;
    /**
     * Keeps track of the last position processed until to prevent duplication.
     */
    private static int lastPos;

    @Override
    public String getModeName() {
        return MODE_NAME;
    }

    @Override
    public String getModeDescription() {
        return MODE_DESCRIPTION;
    }

    public double getScore() {
        return score;
    }

    /**
     * Configure.
     *
     * @param args command line arguments
     * @return this object for chaining
     * @throws java.io.IOException error parsing
     * @throws com.martiansoftware.jsap.JSAPException
     *                             error parsing
     */
    @Override
    public AbstractCommandLineMode configure(final String[] args) throws IOException, JSAPException {
        final JSAPResult jsapResult = parseJsapArguments(args);

        geneAnnotFile = jsapResult.getString("gene-annotation");
        proposalAnnotFile = jsapResult.getString("proposal-annotation");

        return this;
    }

    /**
     * Run the mode.
     *
     * @throws java.io.IOException error reading / writing
     */
    @Override
    public void execute() throws IOException {
        score = 0d;
        lastPos = -1;

        System.out.println("Reading official gene annotations from " + geneAnnotFile);
        final Object2ObjectMap<String, ObjectList<Annotation>> allGeneAnnots = readAnnotations(geneAnnotFile);
        System.out.println("Reading official proposal annotations from " + proposalAnnotFile);
        final Object2ObjectMap<String, ObjectList<Annotation>> allProposalAnnots = readAnnotations(
                proposalAnnotFile);

        //Find the references (chromosomes) common in both annotation files
        final Set<String> allGeneChroms = allGeneAnnots.keySet();
        final Set<String> allProposalChroms = allProposalAnnots.keySet();
        final Set<String> intersect = allGeneChroms;
        intersect.retainAll(allProposalChroms);
        final Iterator<String> commonChromsIt = intersect.iterator();

        //Find the overlapping differences between the official gene annotation and the proposed one.
        while (commonChromsIt.hasNext()) {
            lastPos = 0;
            final String chromosome = commonChromsIt.next();

            final ObjectList<Annotation> geneAnnotations = allGeneAnnots.get(chromosome);
            final ObjectList<Annotation> proposalAnnotations = allProposalAnnots.get(chromosome);
            //Sort to allow fast comparison
            Collections.sort(geneAnnotations);
            Collections.sort(proposalAnnotations);

            final ObjectIterator<Annotation> geneAnnotIt = geneAnnotations.iterator();
            final ObjectIterator<Annotation> propAnnotIt = proposalAnnotations.iterator();

            Annotation geneAnnot = getNextAnnotation(geneAnnotIt);
            Annotation propAnnot = getNextAnnotation(propAnnotIt);

            while ((geneAnnot != null) || (propAnnot != null)) {
                if ((geneAnnot != null) && (propAnnot != null)) {
                    //check to see if they overlap
                    if (!geneAnnot.overlap(propAnnot)) {
                        //determine which sortedPositionIterator to increment
                        if (geneAnnot.getStart() > propAnnot.getStart()) {
                            addPenalty(propAnnot);
                            propAnnot = getNextAnnotation(propAnnotIt);
                        } else {
                            addPenalty(geneAnnot);
                            geneAnnot = getNextAnnotation(geneAnnotIt);
                        }
                    } else {
                        //They do overlap
                        //Need to determine which annotation is the shorter sequence and advance it's sortedPositionIterator
                        addPenalty(geneAnnot, propAnnot);
                        if (geneAnnot.getEnd() < propAnnot.getEnd()) {
                            geneAnnot = getNextAnnotation(geneAnnotIt);
                        } else {
                            propAnnot = getNextAnnotation(propAnnotIt);
                        }
                    }
                } else if (geneAnnot == null) {
                    //simply add the cost of all remaining propAnnot
                    addPenalty(propAnnot);
                    propAnnot = getNextAnnotation(propAnnotIt);
                } else if (propAnnot == null) {
                    //simply add the cost of all remaining geneAnnot
                    addPenalty(geneAnnot);
                    geneAnnot = getNextAnnotation(geneAnnotIt);
                }
            }
        }
        System.out.println("Score = " + score);
    }

    private static void addPenalty(final Annotation annot) {
        score += (annot.getLength() * NON_OVERLAP_COST);
        lastPos = annot.getEnd();
    }

    private static void addPenalty(final Segment segment) {
        if (segment.getEnd() < lastPos) {
            return;
        }

        score += (segment.getLength() * NON_OVERLAP_COST);
        lastPos = segment.getEnd();
    }

    private static void addPenalty(final Segment geneSeg, final Segment propSeg) {
        final int stop = Math.min(geneSeg.getEnd(), propSeg.getEnd());
        if (stop < lastPos) {
            return;
        }

        int overlapLen = 0;
        int nonOverlapLen = 0;
        for (int i = lastPos; i < stop; i++) {
            if ((geneSeg.getStart() <= i && i <= geneSeg.getEnd())
                    && (propSeg.getStart() <= i && i <= propSeg.getEnd())) {
                overlapLen++;
            } else {
                nonOverlapLen++;
            }
        }

        score += ((nonOverlapLen * NON_OVERLAP_COST) + (overlapLen * OVERLAP_COST));
        lastPos = stop;
    }

    private static void addPenalty(final Annotation geneAnnot, final Annotation propAnnot) {
        final ObjectIterator<Segment> geneSegIt = geneAnnot.getSegments().iterator();
        final ObjectIterator<Segment> propSegIt = propAnnot.getSegments().iterator();
        Segment geneSeg = getNextSegment(geneSegIt);
        Segment propSeg = getNextSegment(propSegIt);

        while ((geneSeg != null) || (propSeg != null)) {
            if ((geneSeg != null) && (propSeg != null)) {
                //check to see if they overlap
                if (!geneSeg.overlap(propSeg)) {
                    //determine which sortedPositionIterator to increment
                    if (geneSeg.getStart() > propSeg.getStart()) {
                        addPenalty(propSeg);
                        propSeg = getNextSegment(propSegIt);
                    } else {
                        addPenalty(geneSeg);
                        geneSeg = getNextSegment(geneSegIt);
                    }
                } else {
                    //They do overlap
                    //Need to determine which annotation is the shorter sequence and advance it's sortedPositionIterator
                    addPenalty(geneSeg, propSeg);
                    if (geneSeg.getEnd() < propSeg.getEnd()) {
                        geneSeg = getNextSegment(geneSegIt);
                    } else {
                        propSeg = getNextSegment(propSegIt);
                    }
                }
            } else if (geneSeg == null) {
                //simply add the cost of all remaining propSeg
                addPenalty(propSeg);
                propSeg = getNextSegment(propSegIt);
            } else if (propSeg == null) {
                //simply add the cost of all remaining geneSeg
                addPenalty(geneSeg);
                geneSeg = getNextSegment(geneSegIt);
            }
        }
    }

    private static Annotation getNextAnnotation(final ObjectIterator<Annotation> iterator) {
        Annotation annotation = null;
        if (iterator.hasNext()) {
            annotation = iterator.next();
            annotation.sortSegments();
        }
        return annotation;
    }

    private static Segment getNextSegment(final ObjectIterator<Segment> iterator) {
        Segment segment = null;
        if (iterator.hasNext()) {
            segment = iterator.next();
        }
        return segment;
    }

    /**
     * Read tab delimited annotation file including 6 columns : chromosome, strand, transcriptID,
     * segmentID, start, end each row is a segment, read each chromosome at a time.
     *
     * @param annotFile
     * @return
     * @throws IOException
     */
    public Object2ObjectMap<String, ObjectList<Annotation>> readAnnotations(final String annotFile)
            throws IOException {

        BufferedReader reader = null;
        final Object2ObjectMap<String, Annotation> annots = new Object2ObjectOpenHashMap<String, Annotation>();
        try {
            reader = new BufferedReader(new FileReader(annotFile));
            String line;
            final String header = reader.readLine();

            while ((line = reader.readLine()) != null) {
                if (!line.startsWith("#")) {
                    final String[] linearray = line.trim().split("\t");
                    final String chromosome = linearray[0];
                    //           if(!chromosome.equalsIgnoreCase(chroName)) continue;
                    final String strand = linearray[1];
                    final String transcriptID = linearray[2];
                    final String exonID = linearray[3];
                    final int segmentStart = Integer.parseInt(linearray[4]);
                    final int segmentEnd = Integer.parseInt(linearray[5]);
                    final Segment segment = new Segment(segmentStart, segmentEnd, exonID, strand);
                    if (annots.containsKey(transcriptID)) {
                        annots.get(transcriptID).addSegment(segment);
                    } else {
                        final Annotation annot = new Annotation(transcriptID, chromosome, strand);
                        annot.addSegment(segment);
                        annots.put(transcriptID, annot);
                    }
                }
            }
        } finally {
            IOUtils.closeQuietly(reader);
        }

        //organize the Annotations to chromosome
        final Object2ObjectMap<String, ObjectList<Annotation>> allAnnots = new Object2ObjectOpenHashMap<String, ObjectList<Annotation>>();
        for (final Object2ObjectMap.Entry<String, Annotation> entry : annots.object2ObjectEntrySet()) {
            final Annotation annotation = entry.getValue();
            annotation.sortSegments();
            final String chromosome = annotation.getChromosome();
            if (allAnnots.containsKey(chromosome)) {
                allAnnots.get(chromosome).add(annotation);
            } else {
                final ObjectList<Annotation> annotations = new ObjectArrayList<Annotation>();
                annotations.add(annotation);
                allAnnots.put(chromosome, annotations);
            }
        }
        return allAnnots;
    }

    /**
     * Main method.
     *
     * @param args command line args.
     * @throws com.martiansoftware.jsap.JSAPException
     *                             error parsing
     * @throws java.io.IOException error parsing or executing.
     */
    public static void main(final String[] args) throws JSAPException, IOException {
        new AnnotationPenaltyMode().configure(args).execute();
    }
}