edu.cornell.med.icb.goby.modes.CompactAlignmentToAnnotationCountsMode.java Source code

Java tutorial

Introduction

Here is the source code for edu.cornell.med.icb.goby.modes.CompactAlignmentToAnnotationCountsMode.java

Source

/*
 * Copyright (C) 2009-2010 Institute for Computational Biomedicine,
 *                    Weill Medical College of Cornell University
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.modes;

import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import edu.cornell.med.icb.goby.algorithmic.algorithm.AnnotationCountInterface;
import edu.cornell.med.icb.goby.algorithmic.algorithm.AnnotationCountIterateAlignments;
import edu.cornell.med.icb.goby.algorithmic.data.Annotation;
import edu.cornell.med.icb.goby.algorithmic.data.GroupComparison;
import edu.cornell.med.icb.goby.algorithmic.data.Segment;
import edu.cornell.med.icb.goby.algorithmic.data.WeightsInfo;
import edu.cornell.med.icb.goby.algorithmic.data.xml.AnnotationLength;
import edu.cornell.med.icb.goby.algorithmic.data.xml.InfoOutput;
import edu.cornell.med.icb.goby.algorithmic.data.xml.SampleTotalCount;
import edu.cornell.med.icb.goby.alignments.*;
import edu.cornell.med.icb.goby.exception.GobyRuntimeException;
import edu.cornell.med.icb.goby.stats.DifferentialExpressionAnalysis;
import edu.cornell.med.icb.goby.stats.DifferentialExpressionCalculator;
import edu.cornell.med.icb.goby.stats.DifferentialExpressionResults;
import edu.cornell.med.icb.goby.stats.NormalizationMethod;
import edu.cornell.med.icb.goby.util.Timer;
import edu.cornell.med.icb.goby.util.dynoptions.DynamicOptionRegistry;
import edu.cornell.med.icb.identifier.DoubleIndexedIdentifier;
import edu.rit.pj.IntegerForLoop;
import edu.rit.pj.ParallelRegion;
import edu.rit.pj.ParallelTeam;
import it.unimi.dsi.fastutil.ints.*;
import it.unimi.dsi.fastutil.objects.*;
import it.unimi.dsi.lang.MutableString;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Marshaller;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Map;

/**
 * Reads Goby alignments and genome annotations and output read counts that overlap with
 * the annotation segments.
 *
 * @author Fabien Campagne
 */
public class CompactAlignmentToAnnotationCountsMode extends AbstractGobyMode {
    /**
     * Used to log debug and informational messages.
     */
    private static final Log LOG = LogFactory.getLog(CompactAlignmentToAnnotationCountsMode.class);

    /**
     * The mode name.
     */
    private static final String MODE_NAME = "alignment-to-annotation-counts";

    /**
     * The mode description help text.
     */
    private static final String MODE_DESCRIPTION = "Converts alignment to counts for "
            + "annotations (e.g., gene annotations or exons).";

    /**
     * The natural log of the number two.
     */
    private static final double LOG_2 = Math.log(2);

    /**
     * The output file.
     */
    private String outputFilename;
    /**
     * The annotation file.
     */
    private String annotationFile;

    private boolean filterByReferenceNames;
    private ObjectSet<String> includeReferenceNames;
    private ObjectOpenHashSet<String> includeAnnotationTypes;
    private String[] inputFilenames;

    private boolean writeAnnotationCounts = true;
    private boolean omitNonInformativeColumns;
    private String statsFilename;
    private ParallelTeam team;
    private boolean parallel;
    private boolean doComparison;
    private final DifferentialExpressionCalculator deCalculator = new DifferentialExpressionCalculator();
    private final DifferentialExpressionAnalysis deAnalyzer = new DifferentialExpressionAnalysis();
    /**
     * The set of normalization methods to use for the comparison.
     */

    private ObjectArraySet<NormalizationMethod> normalizationMethods;

    WeightParameters weightParams;
    private String includeReferenceNameCommas;
    private boolean filterAmbiguousReads;
    /**
     * When not null, filename for an xml formatted output file. This file will contain information
     * needed by stats mode.
     */
    private String infoOutputFilename;
    private boolean removeSharedSegments;
    /**
     * List of comparisons to perform.
     */
    private ArrayList<GroupComparison> groupComparisonsList;

    @Override
    public String getModeName() {
        return MODE_NAME;
    }

    @Override
    public String getModeDescription() {
        return MODE_DESCRIPTION;
    }

    /**
     * Configure.
     *
     * @param args command line arguments
     * @return this object for chaining
     * @throws IOException   error parsing
     * @throws JSAPException error parsing
     */
    @Override
    public AbstractCommandLineMode configure(final String[] args) throws IOException, JSAPException {
        final JSAPResult jsapResult = parseJsapArguments(args);
        parseGenomicRange(jsapResult);

        parallel = jsapResult.getBoolean("parallel", false);
        writeAnnotationCounts = jsapResult.getBoolean("write-annotation-counts");
        omitNonInformativeColumns = jsapResult.getBoolean("omit-non-informative-columns");
        inputFilenames = jsapResult.getStringArray("input");
        outputFilename = jsapResult.getString("output");
        filterAmbiguousReads = jsapResult.getBoolean("filter-ambiguous-reads");
        removeSharedSegments = jsapResult.getBoolean("remove-shared-segments");

        if (filterAmbiguousReads) {
            System.out.println("Ambiguous reads will not be considered when estimating count statistics.");
        }
        statsFilename = jsapResult.getString("stats");
        final String groupsDefinition = jsapResult.getString("groups");
        deAnalyzer.parseGroupsDefinition(groupsDefinition, deCalculator, inputFilenames);
        final String compare = jsapResult.getString("compare");
        if (compare == null) {
            doComparison = false;
        } else {
            doComparison = true;
        }
        if (doComparison) {
            groupComparisonsList = deAnalyzer.parseCompare(compare);
        }
        deAnalyzer.setRunInParallel(parallel);
        includeReferenceNameCommas = jsapResult.getString("include-reference-names");
        includeReferenceNames = new ObjectOpenHashSet<String>();
        if (includeReferenceNameCommas != null) {
            includeReferenceNames.addAll(Arrays.asList(includeReferenceNameCommas.split("[,]")));
            System.out.println("Will write counts for the following sequences:");
            for (final String name : includeReferenceNames) {
                System.out.println(name);
            }
            filterByReferenceNames = true;
        }
        parseAnnotations(jsapResult);
        normalizationMethods = deAnalyzer.parseNormalization(jsapResult);
        parseEval(jsapResult, deAnalyzer);
        infoOutputFilename = jsapResult.getString("info-output");

        weightParams = configureWeights(jsapResult);
        return this;
    }

    private void parseGenomicRange(JSAPResult jsapResult) {
        String startOffsetArgument = jsapResult.getString("start-position");
        String endOffsetArgument = jsapResult.getString("end-position");
        if (startOffsetArgument != null && endOffsetArgument == null
                || endOffsetArgument != null && startOffsetArgument == null) {
            System.err.println(
                    "Start (-s/--start-position) and end offset (-e/--end-position) arguments must be specified together or not at all.");
            System.exit(1);
        }
        if (startOffsetArgument != null) {
            genomicRange = new GenomicRange();
            final String[] startTokens = startOffsetArgument.split("[,]");
            final String[] endTokens = endOffsetArgument.split("[,]");
            genomicRange.startPosition = Integer.parseInt(startTokens[1]);
            genomicRange.endPosition = Integer.parseInt(endTokens[1]);

            genomicRange.startChromosome = startTokens[0];
            genomicRange.endChromosome = endTokens[0];
        }

    }

    protected static WeightParameters configureWeights(final JSAPResult jsapResult) {
        final WeightParameters params = new WeightParameters();
        params.weightId = jsapResult.getString("use-weights");
        if (params.weightId == null || params.weightId.equals("false")) {
            params.useWeights = false;
        } else {
            params.useWeights = true;
        }

        params.formulaChoice = jsapResult.getString("adjust-gc-bias");
        params.adjustGcBias = !(params.formulaChoice == null || "false".equals(params.formulaChoice));
        if (params.formulaChoice != null) {
            params.formulaChoice = params.formulaChoice.toUpperCase();

        }
        if (params.adjustGcBias && !params.useWeights) {
            System.err.println("Cannot adjust bias when use-weights is false");
            System.exit(1);
        }
        if (params.useWeights) {
            if (!params.adjustGcBias) {
                System.out.println("Estimating expression as sum of weights");

            } else {
                System.out.println("Estimating expression with gc bias adjustment formula=" + params.formulaChoice);

            }
        }
        return params;
    }

    public static void parseEval(final JSAPResult jsapResult, final DifferentialExpressionAnalysis deAnalyzer) {
        final String evalString = jsapResult.getString("eval");

        final String[] evalArray = evalString.split(",");
        final ObjectSet<String> evalSet = new ObjectOpenHashSet<String>();
        for (final String evalName : evalArray) {
            evalSet.add(evalName.trim().toLowerCase().intern());
        }
        deAnalyzer.setEvalNames(evalSet);
    }

    private void parseAnnotations(final JSAPResult jsapResult) {
        annotationFile = jsapResult.getString("annotation");
        final String includeAnnotationTypeCommas = jsapResult.getString("include-annotation-types");
        includeAnnotationTypes = new ObjectOpenHashSet<String>();
        if (includeAnnotationTypeCommas != null) {
            includeAnnotationTypes.addAll(Arrays.asList(includeAnnotationTypeCommas.split("[,]")));
            for (final String name : includeAnnotationTypes) {
                if (name.equals("gene") || name.equals("other") || name.equals("exon")) {
                    continue;
                } else {
                    System.out.println("Please enter a valid annotation type. "
                            + "Valid annotation types include gene, exon, and or other.");
                    System.exit(1);
                }
                System.out.println("Will write counts for the following annotation types:");
                System.out.println(name);
            }
        }
    }

    class BasenameParallelRegion extends ParallelRegion {
        private final Object2ObjectMap<String, ObjectList<Annotation>> allAnnots;
        private final String[] inputFiles;
        private final BufferedWriter writer;
        private final GenomicRange genomicRange;

        BasenameParallelRegion(final Object2ObjectMap<String, ObjectList<Annotation>> allAnnots,
                final String[] inputFiles, final BufferedWriter writer, final GenomicRange genomicRange) {
            super();
            this.allAnnots = allAnnots;
            this.inputFiles = inputFiles;
            this.writer = writer;
            this.genomicRange = genomicRange;
        }

        @Override
        public void run() throws Exception {
            execute(0, inputFiles.length - 1 /* end index must be inclusive. This is counter-intuitive */,
                    new IntegerForLoop() {

                        @Override
                        public void run(final int startIndex, final int endIndex) {
                            //   System.out.println(String.format("executing start= %d end=%d ",startIndex, endIndex));
                            for (int i = startIndex; i <= endIndex; ++i) {
                                if (i >= 0 && i < inputFilenames.length) {
                                    final String inputBasename = AlignmentReaderImpl.getBasename(inputFiles[i]);
                                    try {
                                        processOneBasename(allAnnots, writer, inputFiles[i], inputBasename,
                                                genomicRange);
                                        Runtime.getRuntime().gc();
                                    } catch (IOException e) {
                                        throw new GobyRuntimeException(e);
                                    } finally {
                                        IOUtils.closeQuietly(writer);
                                    }
                                }
                            }
                        }
                    });
        }

    }

    protected synchronized ParallelTeam getParallelTeam() {
        if (team == null) {
            if (!parallel) {
                // 1 thread only is sequential
                team = new ParallelTeam(1);
            } else {
                // as many threads as configured with -Dpj.nt or default.
                team = new ParallelTeam();
            }
        }
        LOG.info("Executing on " + team.getThreadCount() + " threads.");
        return team;
    }

    GenomicRange genomicRange = null;

    /**
     * Run the mode.
     *
     * @throws java.io.IOException error reading / writing
     */
    @Override
    public void execute() throws IOException {
        System.out.println("Reading annotations from " + annotationFile);
        if (genomicRange != null) {
            // resolve chromsome ids against the alignment headers:

            ConcatAlignmentReader concat = new ConcatAlignmentReader(
                    AlignmentReaderImpl.getBasenames(inputFilenames));
            concat.readHeader();

            genomicRange.setTargetIds(concat.getTargetIdentifiers());
            concat.close();
        }
        final Object2ObjectMap<String, ObjectList<Annotation>> allAnnots = filterAnnotations(
                removeNonConstitutiveSegments(readAnnotations(annotationFile)), genomicRange);

        final Timer timer = new Timer();
        timer.start();
        BufferedWriter writer = null;
        try {

            if (outputFilename != null) {
                writer = new BufferedWriter(new FileWriter(outputFilename));
                writer.write(
                        "basename\tmain-id\tsecondary-id\ttype\tchro\tstrand\tlength\tstart\tend\tin-count\tover-count\tRPKM\tlog2(RPKM+1)\texpression\tnum-exons\n");
            }

            final BasenameParallelRegion region = new BasenameParallelRegion(allAnnots, inputFilenames, writer,
                    genomicRange);

            try {
                getParallelTeam().execute(region);
            } catch (Exception e) {
                LOG.error("An exception occurred.", e);
            }
            Runtime.getRuntime().gc();
            Runtime.getRuntime().gc();
            if (doComparison) {
                final DifferentialExpressionResults results = deAnalyzer
                        .evaluateDifferentialExpressionStatistics(deCalculator, doComparison, normalizationMethods);
                PrintWriter statsWriter = null;
                try {
                    statsWriter = new PrintWriter(statsFilename);
                    results.write(statsWriter, '\t', deCalculator);
                } finally {
                    IOUtils.closeQuietly(statsWriter);
                }
            }

            writeInfoOutput();
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        } finally {
            IOUtils.closeQuietly(writer);
        }
        timer.stop();
        System.out.println("time spent  " + timer.toString());

    }

    /**
     * Remove segments of an annotation when they overlap with other annotations. This keep only segments that uniquely
     * tag a gene/transcript. Single base overlaps are sufficient to trigger the exclusion of an entire segment (secondary id).
     *
     * @param annotations annotations read from disk
     * @return
     */
    private Object2ObjectMap<String, ObjectList<Annotation>> removeNonConstitutiveSegments(
            final Object2ObjectMap<String, ObjectList<Annotation>> annotations) {
        if (!removeSharedSegments) {
            return annotations;
        }
        final Int2ObjectMap<ObjectSet<String>> positionMap = new Int2ObjectOpenHashMap<ObjectSet<String>>();
        positionMap.defaultReturnValue(new ObjectOpenHashSet<String>());

        for (final String chromosome : annotations.keySet()) {
            positionMap.clear();
            for (final Annotation annotation : annotations.get(chromosome)) {
                for (final Segment element : annotation.getSegments()) {
                    for (int i = element.getStart(); i < element.getEnd(); i++) {
                        final ObjectSet<String> setOfGenes = positionMap.get(i);
                        setOfGenes.add(annotation.getId());
                        if (setOfGenes.size() > 1) {
                            annotation.remove(element);
                            if (annotation.getSegments().isEmpty()) {
                                // no more segments. Remove the annotation entirely.
                                annotations.get(chromosome).remove(annotation);
                            }
                            break;
                        }
                    }
                }
            }
        }
        return annotations;
    }

    InfoOutput infoOutInstance = new InfoOutput();

    private void writeInfoOutput() throws JAXBException, IOException {
        if (infoOutputFilename != null) {

            final JAXBContext jc = JAXBContext.newInstance(InfoOutput.class);

            final Marshaller m = jc.createMarshaller();

            for (String sampleId : deCalculator.samples()) {
                SampleTotalCount stc = new SampleTotalCount();
                stc.sampleId = sampleId;
                stc.totalCount = deCalculator.getNumAlignedInSample(sampleId);
                System.out.println(stc);
                infoOutInstance.totalCounts.add(stc);
            }
            // lengths have been added when filtering annotations.
            /*for (MutableString elementId : deCalculator.getElementIds()) {
            AnnotationLength ae = new AnnotationLength();
            ae.length = deCalculator.getElementLength(elementId);
            ae.id = elementId.toString();
            infoOutInstance.lengths.add(ae);
            } */
            FileWriter fileWriter = new FileWriter(infoOutputFilename);
            m.marshal(infoOutInstance, fileWriter);
            fileWriter.close();

        }
    }

    // Remove annotations that do not fully map within the genomic range.
    private Object2ObjectMap<String, ObjectList<Annotation>> filterAnnotations(
            Object2ObjectMap<String, ObjectList<Annotation>> map, GenomicRange genomicRange) {
        if (genomicRange == null) {
            return map;
        }
        Object2ObjectMap<String, ObjectList<Annotation>> filtered = new Object2ObjectArrayMap<String, ObjectList<Annotation>>();
        for (Map.Entry<String, ObjectList<Annotation>> entry : map.entrySet()) {
            String key = entry.getKey();
            String chromosome = key;

            for (final Annotation value : entry.getValue()) {

                chromosome = value.getChromosome();
                // convert to zero-based coordinates:
                int segmentStart = value.getStart() - 1;
                int segmentEnd = value.getEnd() - 1;
                if (genomicRange.fullyContains(chromosome, segmentStart, segmentEnd)) {

                    ObjectList<Annotation> chromosomeList = filtered.get(key);
                    if (chromosomeList == null) {
                        chromosomeList = new ObjectArrayList<Annotation>();
                        filtered.put(key, chromosomeList);
                    }
                    chromosomeList.add(value);
                }
                // Even when the element is out of GR, we keep  the element length, used to write complete info-output:
                AnnotationLength annotationLength = new AnnotationLength();
                annotationLength.id = value.getId();
                annotationLength.length = value.getLength();
                infoOutInstance.lengths.add(annotationLength);
            }
        }
        return filtered;
    }

    private void processOneBasename(final Object2ObjectMap<String, ObjectList<Annotation>> allAnnots,
            BufferedWriter writer, final String inputFile, final String inputBasename, GenomicRange range)
            throws IOException {

        WeightsInfo weights = null;
        if (weightParams.useWeights) {
            weights = loadWeights(inputBasename, weightParams.useWeights, weightParams.weightId);
            if (weights != null) {
                System.err.println(
                        "Weights have been provided and loaded and will be used to reweight transcript counts.");
            }
        }
        final AlignmentReaderFactory factory = filterAmbiguousReads ? new NonAmbiguousAlignmentReaderFactory()
                : new DefaultAlignmentReaderFactory();
        final int numberOfReferences;
        final DoubleIndexedIdentifier referenceIds;
        {
            final AlignmentReader reader = factory.createReader(inputBasename);
            reader.readHeader();
            numberOfReferences = reader.getNumberOfTargets();

            referenceIds = new DoubleIndexedIdentifier(reader.getTargetIdentifiers());
            reader.close();
        }
        System.out.println(String.format("Alignment contains %d reference sequences", numberOfReferences));
        if (genomicRange != null) {
            genomicRange.resolveChromosomeIndices(referenceIds);
        }
        final AnnotationCountIterateAlignments iterateAlignment = new AnnotationCountIterateAlignments();
        iterateAlignment.setWeightInfo(weightParams, weights);
        iterateAlignment.parseIncludeReferenceArgument(includeReferenceNameCommas);

        // Iterate through the alignment and retrieve algs:
        System.out.println("Loading alignment " + inputBasename + "..");
        iterateAlignment.setAlignmentReaderFactory(factory);
        iterateAlignment.iterate(genomicRange, inputBasename);

        final long numAlignedReadsInSample = iterateAlignment.getNumAlignedReadsInSample();
        final AnnotationCountInterface[] algs = iterateAlignment.getAlgs();
        final IntSortedSet referencesToProcess = iterateAlignment.getReferencesSelected();

        final String sampleId = FilenameUtils.getBaseName(inputBasename);

        deCalculator.setNumAlignedInSample(sampleId, numAlignedReadsInSample);

        if (outputFilename == null) {
            // output filename was not provided on the command line. We make one output per input basename
            if (writeAnnotationCounts) {
                final String outputFileTmp = FilenameUtils.removeExtension(inputFile) + ".ann-counts.tsv";
                writer = new BufferedWriter(new FileWriter(outputFileTmp));
                writer.write(
                        "basename\tmain-id\tsecondary-id\ttype\tchro\tstrand\tlength\tstart\tend\tin-count\tover-count\tRPKM\tlog2(RPKM+1)\texpression\tnum-exons\n");
            }
        }

        writeAnnotationCounts(allAnnots, writer, inputBasename, referenceIds, algs, referencesToProcess);

        if (outputFilename == null) {
            // output filename was not provided on the command line. We close each basename output.
            IOUtils.closeQuietly(writer);
        }

    }

    public static WeightsInfo loadWeights(final String inputBasename, final boolean useWeights, final String id) {
        WeightsInfo weights = null;
        if (useWeights) {
            try {
                weights = WeightsInfo.loadForBasename(inputBasename, id);
            } catch (Exception e) {
                LOG.warn("Cannot load weights file for " + inputBasename, e);
                LOG.warn("Using weight=1.0 for each read.");
                return new WeightsInfo() {
                    @Override
                    public float getWeight(final int readIndex) {
                        return 1f;
                    }
                };
            }
        }
        return weights;
    }

    private void writeAnnotationCounts(final Object2ObjectMap<String, ObjectList<Annotation>> allAnnots,
            final BufferedWriter writer, final String inputBasename, final DoubleIndexedIdentifier referenceIds,
            final AnnotationCountInterface[] algs, final IntSet referencesToProcess) throws IOException {

        // collect all element ids:
        if (doComparison) {
            int numberOfElements = 0;
            int numberOfGenes = 0;
            int numberOfExons = 0;
            int numberOfIntrons = 0;
            for (final int referenceIndex : referencesToProcess) {
                final String chromosomeName = referenceIds.getId(referenceIndex).toString();

                if (!allAnnots.containsKey(chromosomeName)) {
                    continue;
                }
                final ObjectList<Annotation> annots = allAnnots.get(chromosomeName);

                for (final Annotation annot : annots) {
                    final String geneID = annot.getId();
                    final int numExons = annot.getSegments().size();
                    final int numberIntrons = numExons - 1;

                    if (includeAnnotationTypes.contains("gene")) {
                        final int index = deCalculator.defineElement(geneID,
                                DifferentialExpressionCalculator.ElementType.GENE);
                        deCalculator.defineElementLength(index, annot.getLength());
                        numberOfGenes++;
                        numberOfElements++;
                    }

                    if (includeAnnotationTypes.contains("exon")) {

                        for (int i = 0; i < numExons; i++) {
                            final Segment exonSegment = annot.getSegments().get(i);
                            final String exonID = exonSegment.getId();
                            final int index = deCalculator.defineElement(exonID,
                                    DifferentialExpressionCalculator.ElementType.EXON);
                            deCalculator.defineElementLength(index, annot.getLength());
                            numberOfExons++;
                            numberOfElements++;
                        }
                    }

                    if (includeAnnotationTypes.contains("other")) {
                        for (int i = 0; i < numExons; i++) {
                            if (i < numberIntrons) {
                                final Segment segment = annot.getSegments().get(i);
                                final int intronStart = segment.getEnd() + 1;
                                final Segment intronSegment = annot.getSegments().get(i + 1);
                                final int intronEnd = intronSegment.getStart() - 1;
                                final int intronLength = intronEnd - intronStart + 1;
                                final String intronID = segment.getId() + "-" + intronSegment.getId();
                                final int index = deCalculator.defineElement(intronID,
                                        DifferentialExpressionCalculator.ElementType.OTHER);
                                deCalculator.defineElementLength(index, intronLength);
                                numberOfIntrons++;
                                numberOfElements++;
                            }
                        }
                    }
                }
            }
            LOG.info(String.format("%d Genes %d exons %d other total %d ", numberOfGenes, numberOfExons,
                    numberOfIntrons, numberOfElements));
            deCalculator.reserve(numberOfElements, inputFilenames.length);
        }

        int numberOfAnnotationCountsWritten = 0;
        for (final int referenceIndex : referencesToProcess) {
            final String chromosomeName = referenceIds.getId(referenceIndex).toString();
            System.out.println("Writing annotation counts for reference " + chromosomeName);

            if (!allAnnots.containsKey(chromosomeName)) {
                continue;
            }

            final ObjectList<Annotation> annots = allAnnots.get(chromosomeName);
            algs[referenceIndex].sortReads();
            algs[referenceIndex].accumulate();
            algs[referenceIndex].baseCount();
            if (doComparison) {
                for (final Annotation annot : annots) {
                    final String geneID = annot.getId();
                    deCalculator.defineElement(geneID);
                }
            }
            // get just the filename (strip the path, not the extension)
            final String basename = FilenameUtils.getName(inputBasename);
            final String sampleId = inputBasename;
            for (final Annotation annot : annots) {
                final String geneID = annot.getId();

                if (includeAnnotationTypes.contains("gene")) {
                    final int geneStart = annot.getStart();
                    final int geneEnd = annot.getEnd();
                    final int geneLength = geneEnd - geneStart + 1;
                    final float geneDepth = algs[referenceIndex].averageReadsPerPosition(geneStart, geneEnd);
                    final double geneOverlapReads = algs[referenceIndex]
                            .countReadsPartiallyOverlappingWithInterval(geneStart, geneEnd);
                    final double geneInsideReads = algs[referenceIndex].countReadsStriclyWithinInterval(geneStart,
                            geneEnd);
                    final double geneExpression = algs[referenceIndex].geneExpressionCount(annot);

                    final int numExons = annot.getSegments().size();

                    final double geneRPKM = deCalculator.calculateNormalized(geneOverlapReads, annot.getLength(),
                            deCalculator.getNumAlignedInSample(sampleId));
                    if (writeAnnotationCounts) {
                        writer.write(String.format("%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%g\t%g\t%g\t%g\t%g\t%d%n",
                                basename, geneID, "", "gene", annot.getChromosome(), annot.getStrand(), geneLength,
                                geneStart, geneEnd, geneInsideReads, geneOverlapReads, geneRPKM, log2(geneRPKM),
                                geneExpression, numExons));

                    }
                    numberOfAnnotationCountsWritten++;
                    if (doComparison) {
                        deCalculator.observe(basename, geneID, geneExpression);
                    }
                }
                final int numberExons = annot.getSegments().size();
                final int numberIntrons = numberExons - 1;
                // skip unnecessary computation if we don't need exon or intron info:
                if (includeAnnotationTypes.contains("exon") || includeAnnotationTypes.contains("other")) {
                    for (int i = 0; i < numberExons; i++) {
                        final Segment segment = annot.getSegments().get(i);
                        final int exonStart = segment.getStart();
                        final int exonEnd = segment.getEnd();

                        final String exonStrand = segment.getStrand();
                        final int exonLength = segment.getLength();
                        final String exonID = segment.getId();
                        final float exonDepth = algs[referenceIndex].averageReadsPerPosition(exonStart, exonEnd);
                        final double exonOverlapReads = algs[referenceIndex]
                                .countReadsPartiallyOverlappingWithInterval(exonStart, exonEnd);
                        final double exonInsideReads = algs[referenceIndex]
                                .countReadsStriclyWithinInterval(exonStart, exonEnd);
                        final double exonRPKM = deCalculator.calculateNormalized(exonOverlapReads,
                                segment.getLength(), deCalculator.getNumAlignedInSample(sampleId));
                        if (includeAnnotationTypes.contains("exon")) {
                            if (writeAnnotationCounts) {
                                writer.write(String.format("%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%g\t%g\t%g\t%g%n",
                                        basename, geneID, exonID, "exon", annot.getChromosome(), exonStrand,
                                        exonLength, exonStart, exonEnd, exonInsideReads, exonOverlapReads, exonRPKM,
                                        log2(exonRPKM)));
                            }
                            numberOfAnnotationCountsWritten++;
                            if (doComparison && includeAnnotationTypes.contains("exon")) {
                                deCalculator.observe(basename, exonID, exonOverlapReads);
                            }
                        }
                        if (i < numberIntrons) {
                            final int intronStart = segment.getEnd() + 1;
                            final Segment intronSegment = annot.getSegments().get(i + 1);
                            final int intronEnd = intronSegment.getStart() - 1;
                            final int intronLength = intronEnd - intronStart + 1;
                            final String intronID = segment.getId() + "-" + intronSegment.getId();
                            final float intronDepth = algs[referenceIndex].averageReadsPerPosition(intronStart,
                                    intronEnd);
                            final double intronOverlapReads = algs[referenceIndex]
                                    .countReadsPartiallyOverlappingWithInterval(intronStart, intronEnd);
                            final double intronInsideReads = algs[referenceIndex]
                                    .countReadsStriclyWithinInterval(intronStart, intronEnd);
                            final double intronRPKM = deCalculator.calculateNormalized(intronOverlapReads,
                                    intronSegment.getLength(), deCalculator.getNumAlignedInSample(sampleId));
                            if (intronLength > 0) {
                                if (includeAnnotationTypes.contains("intron")) {
                                    if (writeAnnotationCounts) {
                                        writer.write(String.format(
                                                "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%g\t%g\t%e\t%g%n", basename,
                                                geneID, intronID, "other", annot.getChromosome(), exonStrand,
                                                exonLength, exonStart, exonEnd, intronInsideReads,
                                                intronOverlapReads, intronRPKM, log2(intronRPKM)));
                                    }
                                    numberOfAnnotationCountsWritten++;
                                    if (doComparison && includeAnnotationTypes.contains("other")) {
                                        deCalculator.observe(basename, intronID, intronOverlapReads);
                                    }
                                }
                            }
                        }
                    }
                }
            }
            algs[referenceIndex] = null;
        }

        LOG.info("Wrote " + numberOfAnnotationCountsWritten + " entries");
        if (numberOfAnnotationCountsWritten == 0) {
            LOG.warn("No entries were written.  This may be due to the fact that names "
                    + "in the reference dataset used do not match those in the annotation file.  "
                    + "For example, ENSEMBL names chromosomes \"1\",\"2\",\"3\" whereas UCSC "
                    + "names the same chromosomes \"chr1\",\"chr2\",\"chr3\". In these "
                    + "cases you will need to adjust the names in the annotation file being used "
                    + "so they match the names used in the reference dataset.");
        }
    }

    /**
     * Calculate the log2 of x+1.
     *
     * @param x
     * @return log2(x+1)=Math.log1p(x)/Math.log(2)
     */
    private double log2(final double x) {
        return Math.log1p(x) / LOG_2;
    }

    /**
     * Read tab delimited annotation file including 6 columns : chromosome, strand, transcriptID,
     * segmentID, start, end each row is a segment, read each chromosome at a time.
     *
     * @param annotFile
     * @return
     * @throws IOException
     */
    public static Object2ObjectMap<String, ObjectList<Annotation>> readAnnotations(final String annotFile)
            throws IOException {
        return readAnnotations(new FileReader(annotFile));
    }

    /**
     * Read tab delimited annotation file including 6 columns : chromosome, strand, transcriptID,
     * segmentID, start, end each row is a segment, read each chromosome at a time.
     *
     * @param annotReader reader over an annotation file.
     * @return
     * @throws IOException
     */
    public static Object2ObjectMap<String, ObjectList<Annotation>> readAnnotations(final Reader annotReader)
            throws IOException {

        BufferedReader reader = null;
        final Object2ObjectMap<String, Annotation> annots = new Object2ObjectAVLTreeMap<String, Annotation>();
        try {
            reader = new BufferedReader(annotReader);
            String line;
            final String header = reader.readLine();

            while ((line = reader.readLine()) != null) {
                if (!line.startsWith("#")) {
                    final String[] linearray = line.trim().split("\t");
                    if (linearray.length < 6) {
                        LOG.warn("Annotation file, encountered truncated line, ignoring: " + line);
                        continue;
                    }
                    final String chromosome = linearray[0];
                    //           if(!chromosome.equalsIgnoreCase(chroName)) continue;
                    final String strand = linearray[1];
                    final String transcriptID = linearray[2];
                    final String exonID = linearray[3];
                    final int segmentStart = Integer.parseInt(linearray[4]);
                    final int segmentEnd = Integer.parseInt(linearray[5]);
                    final Segment segment = new Segment(segmentStart, segmentEnd, exonID, strand);
                    boolean ignoreElement = false;

                    // annotation segment must be within the genomic range to be considered. Consider all segments when
                    // we are processing the entire alignment (genomicRange==null)
                    if (annots.containsKey(transcriptID)) {
                        annots.get(transcriptID).addSegment(segment);
                    } else {
                        final Annotation annot = new Annotation(transcriptID, chromosome, strand);
                        annot.addSegment(segment);
                        annots.put(transcriptID, annot);
                    }

                }
            }
        } finally {
            IOUtils.closeQuietly(reader);
        }

        //Group annotations by chromosome (key of the map returned)
        final Object2ObjectMap<String, ObjectList<Annotation>> allAnnots = new Object2ObjectOpenHashMap<String, ObjectList<Annotation>>();
        for (final Object2ObjectMap.Entry<String, Annotation> entry : annots.object2ObjectEntrySet()) {
            final Annotation annotation = entry.getValue();
            annotation.sortSegments();
            final String chromosome = annotation.getChromosome();
            if (allAnnots.containsKey(chromosome)) {
                allAnnots.get(chromosome).add(annotation);
            } else {
                final ObjectList<Annotation> annotations = new ObjectArrayList<Annotation>();
                annotations.add(annotation);
                allAnnots.put(chromosome, annotations);
            }
        }
        return allAnnots;
    }

    /**
     * Main method.
     *
     * @param args command line args.
     * @throws com.martiansoftware.jsap.JSAPException
     *                             error parsing
     * @throws java.io.IOException error parsing or executing.
     */
    public static void main(final String[] args) throws JSAPException, IOException {
        new CompactAlignmentToAnnotationCountsMode().configure(args).execute();
    }

}