edu.cornell.med.icb.goby.modes.ReadQualityStatsMode.java Source code

Java tutorial

Introduction

Here is the source code for edu.cornell.med.icb.goby.modes.ReadQualityStatsMode.java

Source

/*
 * Copyright (C) 2009-2010 Institute for Computational Biomedicine,
 *                    Weill Medical College of Cornell University
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.modes;

import com.google.protobuf.ByteString;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import edu.cornell.med.icb.goby.reads.Reads;
import edu.cornell.med.icb.goby.reads.ReadsReader;
import it.unimi.dsi.fastutil.bytes.ByteArrayList;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
import it.unimi.dsi.logging.ProgressLogger;
import org.apache.commons.io.IOUtils;
import org.apache.commons.math.random.MersenneTwister;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;

/**
 * Evaluate statistics for read qualities.
 *
 * @author Fabien Campagne
 */
public class ReadQualityStatsMode extends AbstractGobyMode {

    private static final org.apache.log4j.Logger LOG = org.apache.log4j.Logger
            .getLogger(ReadQualityStatsMode.class);
    /**
     * The mode name.
     */
    private static final String MODE_NAME = "read-quality-stats";

    /**
     * The mode description help text.
     */
    private static final String MODE_DESCRIPTION = "Calculate statistics for quality scores in a compact reads file.";

    /**
     * The output file.
     */
    private File outputFile;

    /**
     * The basename of the compact read files.
     */
    private final List<File> inputFiles = new LinkedList<File>();
    private double sampleFraction = 0.01;

    @Override
    public String getModeName() {
        return MODE_NAME;
    }

    @Override
    public String getModeDescription() {
        return MODE_DESCRIPTION;
    }

    enum OutputFormat {
        TSV,
    }

    private OutputFormat outputFormat = OutputFormat.TSV;

    /**
     * Number of samples ignored because of sampleFraction value < 1.0d.
     */
    private long numberOfSkippedReads;

    /**
     * Number of samples observed because of sampleFraction value < 1.0d. If sampleFraction == 1.0d
     * this should be the total number of samples in the file.
     */
    private long numberOfObservedReads;

    /**
     * Configure.
     *
     * @param args command line arguments
     * @return this object for chaining
     * @throws java.io.IOException error parsing
     * @throws com.martiansoftware.jsap.JSAPException
     *                             error parsing
     */
    @Override
    public AbstractCommandLineMode configure(final String[] args) throws IOException, JSAPException {
        final JSAPResult jsapResult = parseJsapArguments(args);

        final File[] inputFilesArray = jsapResult.getFileArray("input");
        inputFiles.addAll(Arrays.asList(inputFilesArray));
        outputFile = jsapResult.getFile("output");
        outputFormat = OutputFormat.valueOf(jsapResult.getString("format").toUpperCase());
        sampleFraction = jsapResult.getDouble("sample-fraction");
        return this;
    }

    /**
     * The percentage of reads to process. 0.01 means 1% of reads,
     * 1.0 means 100% of reads. The default of 0.01 should work fine
     * for most files but if you are dealing with a very small file
     * you should set this to 1.0.
     *
     * @return The percentage of reads to process
     */
    public double getSampleFraction() {
        return sampleFraction;
    }

    /**
     * The precentage of reads to process. 0.01 means 1% of reads,
     * 1.0 means 100% of reads. The default of 0.01 should work fine
     * for most files but if you are dealing with a very small file
     * you should set this to 1.0.
     *
     * @param sampleFraction The percentage of reads to process
     */
    public void setSampleFraction(final double sampleFraction) {
        this.sampleFraction = sampleFraction;
    }

    /**
     * Get the list of files (reads/alignments) to process.
     *
     * @return The list of files.
     */
    public List<File> getInputFiles() {
        return inputFiles;
    }

    /**
     * Add the specified file to the list of files to process.
     *
     * @param inputFile The file to process
     */
    public void addInputFile(final File inputFile) {
        inputFiles.add(inputFile);
    }

    /**
     * Get the output file.
     *
     * @return the output file
     */
    public File getOutputFile() {
        return outputFile;
    }

    /**
     * Set the output file.
     *
     * @param outputFile the output file
     */
    public void setOutputFile(final File outputFile) {
        this.outputFile = outputFile;
    }

    /**
     * Number of reads skipped because of sampleFraction value < 1.0d.
     * @return the number of skipped samples.
     */
    public long getNumberOfSkippedReads() {
        return numberOfSkippedReads;
    }

    /**
     * Number of reads observed because of sampleFraction value < 1.0d. If sampleFraction == 1.0d
     * this should be the total number of reads in the file.
     * @return the number of observed samples
     */
    public long getNumberOfObservedReads() {
        return numberOfObservedReads;
    }

    /**
     * Display the alignments as text files.
     *
     * @throws java.io.IOException error reading / writing
     */
    @Override
    public void execute() throws IOException {
        PrintStream writer = null;
        try {
            numberOfSkippedReads = 0;
            numberOfObservedReads = 0;
            writer = outputFile == null ? System.out : new PrintStream(new FileOutputStream(outputFile));
            final Int2ObjectMap<ReadQualityStats> qualityStats = new Int2ObjectOpenHashMap<ReadQualityStats>();
            final ProgressLogger progress = new ProgressLogger(LOG);
            progress.start();
            final MersenneTwister random = new MersenneTwister(37);
            final boolean doSample = sampleFraction < 1.0d;
            writer.println("basename\treadIndex\t25%-percentile\tmedian\taverageQuality\t75%-percentile");
            for (final File filename : inputFiles) {
                final ReadsReader reader = new ReadsReader(filename);
                final String basename = ReadsReader.getBasename(filename.toString());
                for (final Reads.ReadEntry entry : reader) {
                    if (!doSample || random.nextDouble() < sampleFraction) {
                        final ByteString qualityScores = entry.getQualityScores();
                        final int size = qualityScores.size();
                        for (int readIndex = 0; readIndex < size; readIndex++) {
                            final byte code = qualityScores.byteAt(readIndex);
                            ReadQualityStats stats = qualityStats.get(readIndex);
                            if (stats == null) {
                                stats = new ReadQualityStats(1.0d);
                                qualityStats.put(readIndex, stats);
                                stats.readIndex = readIndex;
                            }
                            stats.observe(code);
                        }
                        numberOfObservedReads++;
                    } else {
                        numberOfSkippedReads++;
                    }
                    progress.lightUpdate();
                }

                for (final ReadQualityStats stat : qualityStats.values()) {
                    if (!stat.sampleIsEmpty()) {
                        stat.evaluatePercentiles();
                        writer.printf("%s\t%d\t%d\t%d\t%f\t%d%n", basename, stat.readIndex, stat.percentile(25),
                                stat.percentile(50), stat.averageQuality / stat.observedCount, stat.percentile(75));
                    }
                }
            }
            progress.updateAndDisplay();
            progress.stop();

        } finally {
            if (writer != System.out) {
                IOUtils.closeQuietly(writer);
            }
        }
    }

    /**
     * Main method.
     *
     * @param args command line args.
     * @throws com.martiansoftware.jsap.JSAPException
     *                             error parsing
     * @throws java.io.IOException error parsing or executing.
     */

    public static void main(final String[] args) throws JSAPException, IOException {
        new ReadQualityStatsMode().configure(args).execute();
    }

    private static class ReadQualityStats {
        private final MersenneTwister random = new MersenneTwister();
        private int readIndex;
        // byte minValue;
        // byte maxValue;
        private final ByteArrayList sample = new ByteArrayList();
        private double averageQuality;
        private int observedCount;
        private final double sampleFraction;
        private final boolean doSample;

        private ReadQualityStats(final double sampleFraction) {
            this.sampleFraction = sampleFraction;
            doSample = sampleFraction < 1.0;
        }

        void observe(final byte b) {
            averageQuality += b;
            observedCount += 1;
            if (!doSample || random.nextDouble() < sampleFraction) {
                sample.add(b);
            }
        }

        public void evaluatePercentiles() {
            Collections.sort(sample);
        }

        public byte percentile(final double percent) {
            final int index = (int) ((double) sample.size() * percent / 100d);
            return sample.get(index);
        }

        public boolean sampleIsEmpty() {
            return sample.isEmpty();
        }
    }
}