Java tutorial: CountsArchiveToUnionPeaksAnnotationMode
/*
 * Copyright (C) 2009-2010 Institute for Computational Biomedicine,
 *                         Weill Medical College of Cornell University
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.modes;

import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import edu.cornell.med.icb.goby.algorithmic.data.Annotation;
import edu.cornell.med.icb.goby.algorithmic.data.Segment;
import edu.cornell.med.icb.goby.alignments.AlignmentReaderImpl;
import edu.cornell.med.icb.goby.alignments.ConcatAlignmentReader;
import edu.cornell.med.icb.goby.counts.AnyTransitionCountsIterator;
import edu.cornell.med.icb.goby.counts.CountsArchiveReader;
import edu.cornell.med.icb.goby.counts.CountsReaderI;
import edu.cornell.med.icb.goby.counts.Peak;
import edu.cornell.med.icb.goby.counts.PeakAggregator;
import edu.cornell.med.icb.identifier.DoubleIndexedIdentifier;
import edu.cornell.med.icb.identifier.IndexedIdentifier;
import it.unimi.dsi.fastutil.objects.Object2ObjectMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectList;
import it.unimi.dsi.fastutil.objects.ObjectListIterator;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectSet;
import org.apache.commons.io.IOUtils;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.Collections;
import java.util.Set;

/**
 * Write annotations corresponding to a consolidation of peaks across each sequence in
 * the count archives.
 *
 * @author Jaaved Mohammed
 */
public class CountsArchiveToUnionPeaksAnnotationMode extends AbstractGobyMode {
    /** The mode name. */
    private static final String MODE_NAME = "count-archive-to-peak-union-annotations";

    /** The mode description help text. */
    private static final String MODE_DESCRIPTION = "Write annotations corresponding "
            + "to the union of peaks across each sequence in the count archives.";

    /** The input files. Must reduce to alignment basenames. */
    private String[] inputFiles;

    /** The output file. */
    private String outputFile;

    private boolean filterByReferenceNames;
    private ObjectSet<String> includeReferenceNames;
    private String alternativeCountsName;
    private int detectionThreshold;

    @Override
    public String getModeName() {
        return MODE_NAME;
    }

    @Override
    public String getModeDescription() {
        return MODE_DESCRIPTION;
    }
    /**
     * Configure the mode from command line arguments.
     *
     * @param args command line arguments
     * @return this object for chaining
     * @throws IOException error parsing
     * @throws JSAPException error parsing
     */
    @Override
    public AbstractCommandLineMode configure(final String[] args) throws IOException, JSAPException {
        final JSAPResult jsapResult = parseJsapArguments(args);
        inputFiles = AlignmentReaderImpl.getBasenames(jsapResult.getStringArray("input"));
        outputFile = jsapResult.getString("output");

        final String includeReferenceNameCommas = jsapResult.getString("include-reference-names");
        includeReferenceNames = new ObjectOpenHashSet<String>();
        if (includeReferenceNameCommas != null) {
            includeReferenceNames.addAll(Arrays.asList(includeReferenceNameCommas.split("[,]")));
            System.out.println("Will write counts for the following sequences:");
            for (final String name : includeReferenceNames) {
                System.out.println(name);
            }
            filterByReferenceNames = true;
        }
        alternativeCountsName = jsapResult.getString("alternative-count-archive");
        detectionThreshold = jsapResult.getInt("threshold");
        System.out.println("Peak Detection Threshold is " + detectionThreshold);
        return this;
    }

    /**
     * Run the mode.
     *
     * @throws java.io.IOException error reading / writing
     */
    @Override
    public void execute() throws IOException {
        final String[] basenames = AlignmentReaderImpl.getBasenames(inputFiles);
        // TODO: Determine whether adjustQueryIndices should default to true.
        final ConcatAlignmentReader reader = new ConcatAlignmentReader(basenames);
        reader.readHeader();
        final int numberOfReferences = reader.getNumberOfTargets();
        final IndexedIdentifier referenceIds = reader.getTargetIdentifiers();
        final DoubleIndexedIdentifier backwards = new DoubleIndexedIdentifier(referenceIds);
        reader.close();

        // Read the counts archives:
        final CountsArchiveReader[] countArchiveReaders = new CountsArchiveReader[inputFiles.length];
        int i = 0;
        for (final String inputFile : basenames) {
            countArchiveReaders[i++] = new CountsArchiveReader(inputFile, alternativeCountsName);
        }

        // For each reference sequence, generate annotations for the union of peaks across all
        // the input samples. More precisely, we generate an annotation across the widest peak
        // that can be called with the sum of counts across all the input samples.
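        // Data flow, as implemented below: each CountsArchiveReader yields one CountsReaderI
        // per reference sequence; AnyTransitionCountsIterator merges their count transitions
        // into a single position-sorted stream; PeakAggregator scans that stream and emits a
        // Peak wherever the summed count exceeds the detection threshold. Each Peak is stored
        // as a Segment keyed by its reference name.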
        final Object2ObjectMap<String, ObjectList<Segment>> allSegments =
                new Object2ObjectOpenHashMap<String, ObjectList<Segment>>();
        for (int referenceIndex = 0; referenceIndex < numberOfReferences; referenceIndex++) {
            final String referenceId = backwards.getId(referenceIndex).toString();
            boolean processThisSequence = true;
            if (filterByReferenceNames && !includeReferenceNames.contains(referenceId)) {
                processThisSequence = false;
            }
            if (processThisSequence) {
                System.out.println("Processing reference " + referenceId);
                final CountsReaderI[] readers = new CountsReaderI[countArchiveReaders.length];
                int readerIndex = 0;
                for (final CountsArchiveReader archive : countArchiveReaders) {
                    if (archive.getIdentifier(referenceIndex) != null) {
                        readers[readerIndex++] = archive.getCountReader(referenceIndex);
                        if (allSegments.get(referenceId) == null) {
                            allSegments.put(referenceId, new ObjectArrayList<Segment>());
                        }
                    }
                }
                // Define one sorted position iterator over all input files. Truncate the
                // array in case some archive did not contain this reference (a trailing
                // null reader would otherwise be passed along).
                final AnyTransitionCountsIterator iterator =
                        new AnyTransitionCountsIterator(Arrays.copyOf(readers, readerIndex));
                // Start collecting (possibly overlapping) peaks across all input files.
                final PeakAggregator peakAggregator = new PeakAggregator(iterator);
                peakAggregator.setPeakDetectionThreshold(detectionThreshold);
                while (peakAggregator.hasNext()) {
                    final Peak peak = peakAggregator.next();
                    final ObjectList<Segment> segmentsList = allSegments.get(referenceId);
                    segmentsList.add(new Segment(peak.start, peak.start + peak.length, "id", "either"));
                }
            }
        }

        // Consolidate all overlapping peaks across all input files by taking the
        // union of overlapping peaks.
        System.out.println("Consolidating overlapping peaks over all counts files.");
        final Set<String> chromosomes = allSegments.keySet();
        ObjectList<Segment> consensusList = null;
        for (final String chromosome : chromosomes) {
            consensusList = new ObjectArrayList<Segment>();
            final ObjectList<Segment> segmentsList = allSegments.get(chromosome);
            // Sort the segments to make consolidation by union easier.
            Collections.sort(segmentsList);
            System.out.println(String.format("Reference %s: Starting with %d segments.",
                    chromosome, segmentsList.size()));
            ObjectListIterator<Segment> segIterator = segmentsList.listIterator();
            Segment segment = null;
            Segment nextSeg = null;
            if (segIterator.hasNext()) {
                segment = segIterator.next();
            }
            while (segment != null) {
                if (segIterator.hasNext()) {
                    nextSeg = segIterator.next();
                    while (nextSeg != null && segment.overlap(nextSeg)) {
                        // Merge the two overlapping segments into one that spans both.
                        segment = segment.merge(nextSeg);
                        nextSeg = segIterator.hasNext() ? segIterator.next() : null;
                    }
                }
                consensusList.add(segment);
                segment = nextSeg;
                nextSeg = null;
            }
            Collections.sort(consensusList);
            segmentsList.clear();
            segmentsList.addAll(consensusList);
            System.out.println(String.format("Reference %s: Finished with %d segments.",
                    chromosome, segmentsList.size()));
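            // At this point consensusList holds pairwise-disjoint segments: because the
            // list was sorted by start position, a single left-to-right pass merges every
            // run of mutually overlapping peaks into one widest segment. (A standalone
            // sketch of this sort-then-merge interval union follows the class below.)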
            // Transform the segments back to annotations for easy writing to file.
            final ObjectList<Annotation> annotationList = new ObjectArrayList<Annotation>();
            segIterator = segmentsList.listIterator();
            while (segIterator.hasNext()) {
                segment = segIterator.next();
                final Annotation annot = new Annotation(
                        chromosome + "." + segment.getStart() + "." + segment.getLength(),
                        chromosome);
                annot.addSegment(segment);
                annotationList.add(annot);
            }
            // Write the list of annotations to file.
            writeAnnotations(outputFile, annotationList, true);
        }
    }

    public static void writeAnnotations(final String outputFileName,
                                        final ObjectList<Annotation> annotationList,
                                        final boolean append) {
        final File outputFile = new File(outputFileName);
        PrintWriter writer = null;
        try {
            if (!outputFile.exists()) {
                // First write: create the file and emit the column header.
                writer = new PrintWriter(outputFile);
                writer.write("Chromosome_Name\tStrand\tPrimary_ID\tSecondary_ID\t"
                        + "Transcript_Start\tTranscript_End\n");
            } else {
                writer = new PrintWriter(new FileOutputStream(outputFile, append));
            }
            final ObjectListIterator<Annotation> annotIterator = annotationList.listIterator();
            while (annotIterator.hasNext()) {
                final Annotation annotation = annotIterator.next();
                annotation.write(writer);
            }
        } catch (final FileNotFoundException fnfe) {
            System.err.println("Caught exception in writeAnnotations: " + fnfe.getMessage());
            System.exit(1);
        } finally {
            IOUtils.closeQuietly(writer);
        }
    }

    /**
     * Main method.
     *
     * @param args command line args.
     * @throws com.martiansoftware.jsap.JSAPException error parsing
     * @throws java.io.IOException error parsing or executing.
     */
    public static void main(final String[] args) throws JSAPException, IOException {
        new CountsArchiveToUnionPeaksAnnotationMode().configure(args).execute();
    }
}
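The consolidation step in execute() is a textbook sort-then-merge interval union. The
following standalone sketch shows the same idea on plain int intervals; the class and
method names here are hypothetical illustrations, not part of the Goby API:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

public class IntervalUnionSketch {
    /** Merge overlapping [start, end) intervals, mirroring the merge loop in execute(). */
    static List<int[]> union(final List<int[]> intervals) {
        final List<int[]> merged = new ArrayList<int[]>();
        if (intervals.isEmpty()) {
            return merged;
        }
        // Sort by start position, as Collections.sort(segmentsList) does for Segments.
        intervals.sort(Comparator.comparingInt((int[] interval) -> interval[0]));
        int[] current = intervals.get(0);
        for (final int[] next : intervals.subList(1, intervals.size())) {
            if (next[0] <= current[1]) {
                // Overlap: widen the current interval (the Segment.merge step above).
                current = new int[]{current[0], Math.max(current[1], next[1])};
            } else {
                merged.add(current);
                current = next;
            }
        }
        merged.add(current);
        return merged;
    }

    public static void main(final String[] args) {
        final List<int[]> peaks = new ArrayList<int[]>(Arrays.asList(
                new int[]{10, 20}, new int[]{15, 30}, new int[]{40, 50}));
        for (final int[] interval : union(peaks)) {
            // Prints "10-30" then "40-50": the first two peaks overlap and are unioned.
            System.out.println(interval[0] + "-" + interval[1]);
        }
    }
}

To run the mode itself, invoke its main method (or the goby launcher from your Goby
distribution) with the options parsed in configure(): input (one or more alignment
basenames), output, threshold, and optionally include-reference-names and
alternative-count-archive. The exact option syntax is defined by the mode's JSAP
descriptor, so check your Goby installation's help output for this mode.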