edu.cornell.med.icb.goby.modes.CountsArchiveToBedGraphMode.java Source code

Introduction

Here is the source code for edu.cornell.med.icb.goby.modes.CountsArchiveToBedGraphMode.java
Source

/*
 * Copyright (C) 2009-2010 Institute for Computational Biomedicine,
 *                    Weill Medical College of Cornell University
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.modes;

import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import edu.cornell.med.icb.goby.alignments.AlignmentReaderImpl;
import edu.cornell.med.icb.goby.counts.CountsArchiveReader;
import edu.cornell.med.icb.goby.counts.CountsReader;
import edu.cornell.med.icb.identifier.DoubleIndexedIdentifier;
import edu.cornell.med.icb.identifier.IndexedIdentifier;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectSet;
import org.apache.commons.io.IOUtils;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.zip.GZIPOutputStream;

/**
 * Converts a full genome counts archive to the BedGraph format.
 * The  <a href="http://genome.ucsc.edu/goldenPath/help/bedgraph.html">BedGraph Track Format</a> can
 * be imported in the UCSC genome browser to visualize counts in the context of genome annotations.
 * This format is less lossly than Wiggle (unless you do resolution = 1 for Wiggle) but makes
 * considerably larger files.
 *
 * @author Fabien Campagne
 */
public class CountsArchiveToBedGraphMode extends AbstractGobyMode {
    /**
     * The mode name.
     */
    private static final String MODE_NAME = "counts-to-bedgraph";

    /**
     * The mode description help text.
     */
    private static final String MODE_DESCRIPTION = "Converts a full genome counts archive to "
            + "the bedgraph format.  The bedgraph format can be imported in the UCSC genome "
            + "browser to visualize counts in the context of genome annotations."
            + "(See http://genome.ucsc.edu/goldenPath/help/bedgraph.html)";

    /**
     * The input file.
     */
    private String inputBasename;

    /**
     * The output file.
     */
    private String outputFile;

    /**
     * If true only a subset of references will be processed defined by the set
     * {@link #includeReferenceNames}.
     */
    private boolean filterByReferenceNames;

    /**
     * The set of reference names to process if the user chose to filter.
     */
    private ObjectSet<String> includeReferenceNames = new ObjectOpenHashSet<String>();

    /**
     * The name to embed in the bedgraph file.
     */
    private String label;

    /**
     * Use to switch from the default "counts" archive to another count archive within the
     * same basename.
     */
    private String alternativeCountArchiveExtension;

    @Override
    public String getModeName() {
        return MODE_NAME;
    }

    @Override
    public String getModeDescription() {
        return MODE_DESCRIPTION;
    }

    /**
     * Configure.
     *
     * @param args command line arguments
     * @return this object for chaining
     * @throws java.io.IOException error parsing
     * @throws com.martiansoftware.jsap.JSAPException
     *                             error parsing
     */
    @Override
    public AbstractCommandLineMode configure(final String[] args) throws IOException, JSAPException {
        final JSAPResult jsapResult = parseJsapArguments(args);

        inputBasename = AlignmentReaderImpl.getBasename(jsapResult.getString("input"));
        outputFile = jsapResult.getString("output");
        alternativeCountArchiveExtension = jsapResult.getString("alternative-count-archive");
        label = jsapResult.getString("label");
        if (label == null) {
            label = new File(inputBasename + "-" + alternativeCountArchiveExtension).getName();
            System.out.println("Setting label from basename: " + label);
        }
        final String includeReferenceNameCommas = jsapResult.getString("include-reference-names");
        if (includeReferenceNameCommas != null) {
            includeReferenceNames = new ObjectOpenHashSet<String>();
            includeReferenceNames.addAll(Arrays.asList(includeReferenceNameCommas.split("[,]")));
            System.out.println("Will write bedgraph for the following sequences:");
            for (final String name : includeReferenceNames) {
                System.out.println(name);
            }
            filterByReferenceNames = true;
        }

        // default output file has extension ".bed"
        if (outputFile == null) {
            outputFile = inputBasename
                    + (includeReferenceNameCommas != null ? ("-" + includeReferenceNameCommas) : "-all") + ".bed";
        }

        return this;
    }

    /**
     * Run the map2text mode.
     *
     * @throws java.io.IOException error reading / writing
     */
    @Override
    public void execute() throws IOException {
        PrintWriter writer = null;
        try {
            writer = new PrintWriter(
                    new GZIPOutputStream(new FastBufferedOutputStream(new FileOutputStream(outputFile + ".gz"))));
            writer.write("track type=bedGraph name=" + label + " visibility=full viewLimits=1:200\n");
            final AlignmentReaderImpl alignment = new AlignmentReaderImpl(inputBasename);
            alignment.readHeader();
            alignment.close();
            final IndexedIdentifier referenceIds = alignment.getTargetIdentifiers();
            final DoubleIndexedIdentifier backwards = new DoubleIndexedIdentifier(referenceIds);
            final CountsArchiveReader reader = new CountsArchiveReader(inputBasename,
                    alternativeCountArchiveExtension);

            for (int referenceIndex = 0; referenceIndex < reader.getNumberOfIndices(); referenceIndex++) {
                String referenceId = backwards.getId(referenceIndex).toString();
                boolean processThisSequence = true;
                // prepare reference ID for UCSC genome browser.
                if ("MT".equalsIgnoreCase(referenceId)) {
                    // patch chromosome name for UCSC genome browser:
                    referenceId = "M";
                }

                // ignore c22_H2, c5_H2, and other contigs but not things like chr1 (mm9)
                if (referenceId.startsWith("c") && !referenceId.startsWith("chr")) {
                    processThisSequence = false;
                }

                // ignore NT_*
                if (referenceId.startsWith("NT_")) {
                    processThisSequence = false;
                }

                if (filterByReferenceNames && !includeReferenceNames.contains(referenceId)) {
                    processThisSequence = false;
                }

                if (processThisSequence) {
                    // prepend the reference id with "chr" if it doesn't use that already
                    final String chromosome;
                    if (referenceId.startsWith("chr")) {
                        chromosome = referenceId;
                    } else {
                        chromosome = "chr" + referenceId;
                    }

                    long sumCount = 0;
                    int numCounts = 0;

                    final CountsReader counts = reader.getCountReader(referenceIndex);
                    int writePosition;
                    while (counts.hasNextTransition()) {
                        counts.nextTransition();
                        final int length = counts.getLength();

                        final int count = counts.getCount();
                        final int position = counts.getPosition();
                        writePosition = position + 1;
                        if (count != 0) {
                            writer.printf("%s %d %d %d\n", chromosome, writePosition, writePosition + length,
                                    count);
                        }
                        sumCount += count;
                        numCounts++;
                    }
                    final double averageCount = sumCount / (double) numCounts;
                    System.out.println("average count for sequence " + referenceId + " " + averageCount);
                }
            }
        } finally {
            IOUtils.closeQuietly(writer);
        }
    }

    /**
     * Main method.
     *
     * @param args command line args.
     * @throws com.martiansoftware.jsap.JSAPException error parsing
     * @throws java.io.IOException error parsing or executing.
     */

    public static void main(final String[] args) throws JSAPException, IOException {
        new CountsArchiveToBedGraphMode().configure(args).execute();
    }
}