edu.cornell.med.icb.goby.modes.SampleQualityScoresMode.java Source code

Java tutorial

Introduction

Here is the source code for edu.cornell.med.icb.goby.modes.SampleQualityScoresMode.java

Source

/*
 * Copyright (C) 2009-2011 Institute for Computational Biomedicine,
 *                    Weill Medical College of Cornell University
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.modes;

import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import edu.cornell.med.icb.goby.readers.FastXEntry;
import edu.cornell.med.icb.goby.readers.FastXReader;
import edu.cornell.med.icb.goby.readers.sam.SAMRecordIterable;
import edu.cornell.med.icb.goby.reads.Reads;
import edu.cornell.med.icb.goby.reads.ReadsReader;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.lang.MutableString;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Read the first n (defaults to 10,000) entries of a compact-reads or FASTQ file and report the minimum,
 * maximum, and average quality score value and provide a guess at the quality encoding scheme
 * (Phred, Sanger, Illumina/Solexa). It's worth noting that compact-reads files, if created correctly,
 * should always report Phred.
 *
 * @author Kevin C. Dorff
 *         Date: Sept 13, 2011
 */
public class SampleQualityScoresMode extends AbstractGobyMode {
    /**
     * Used to log informational and debug messages.
     */
    private static final Log LOG = LogFactory.getLog(SampleQualityScoresMode.class);

    /**
     * The mode name.
     */
    private static final String MODE_NAME = "sample-quality-scores";

    /**
     * The mode description help text.
     */
    private static final String MODE_DESCRIPTION = "Read the first n (defaults to 10,000) entries of a compact-reads, fastq, bam, sam, or sam.gz file "
            + "and report the minimum, maximum, and average quality score value and provide a guess at the quality "
            + "encoding scheme (Phred, Sanger, Illumina/Solexa). It's worth noting that compact-reads files, if "
            + "created correctly, should always report Phred. ";

    /**
     * The reads files to process.
     */
    private final List<String> inputFilenames = new ArrayList<String>();

    /**
     * The CURRENT min found qual score (only the last file processed).
     * See avgQualScores and likelyEncodings file results.
     */
    private int minQualScore;

    /**
     * The CURRENT max found qual score (only the last file processed).
     * See avgQualScores and likelyEncodings file results.
     */
    private int maxQualScore;

    /**
     * The CURRENT sum of all of the quality scores, so we can perform an average (only the last file processed).
     * See avgQualScores and likelyEncodings file results.
     */
    private long sumQualScores;

    /**
     * The CURRENT number of qual scores sampled, so we can perform an average (only the last file processed).
     * See avgQualScores and likelyEncodings file results.
     */
    private long numQualScoresSampled;

    /**
     * The average qual scores for each processed file.
     */
    private final IntList avgQualScores = new IntArrayList();

    /**
     * The likely encodings for each processed file.
     */
    private final List<String> likelyEncodings = new ArrayList<String>();

    /**
     * The number of read entries from each input file to to process.
     */
    private int numberOfReadEntriesToProcess = 10000;

    /**
     * If quality scores were found in the current input file.
     */
    boolean qualityScoresFound;

    /**
     * Returns the mode name defined by subclasses.
     *
     * @return The name of the mode
     */
    @Override
    public String getModeName() {
        return MODE_NAME;
    }

    /**
     * Returns the mode description defined by subclasses.
     *
     * @return A description of the mode
     */
    @Override
    public String getModeDescription() {
        return MODE_DESCRIPTION;
    }

    /**
     * Clear the list of files to process.
     *
     * @param inputFilename The list of files.
     */
    public void addInputFilename(final String inputFilename) {
        inputFilenames.add(inputFilename);
    }

    /**
     * Clear the list of files to process.
     */
    public void clearInputFilenames() {
        inputFilenames.clear();
    }

    /**
     * The list of input filenames.
     * @return list of input filenames.
     */
    public List<String> getInputFilenames() {
        return inputFilenames;
    }

    /**
     * Get the number of read entries to process per input file.
     *
     * @return  the number of read entries to process.
     */
    public int getNumberOfReadEntriesToProcess() {
        return numberOfReadEntriesToProcess;
    }

    /**
     * Set the number of read entries to process per input file.
     * Set to <= 0 to process the entire file.
     *
     * @param numberOfReadEntriesToProcess the number of read entries to process.
     */
    public void setNumberOfReadEntriesToProcess(final int numberOfReadEntriesToProcess) {
        this.numberOfReadEntriesToProcess = numberOfReadEntriesToProcess;
    }

    /**
     * The average quality scores, one for each input file.
     * @return the list of average quality scores.
     */
    public IntList getAvgQualScores() {
        return avgQualScores;
    }

    /**
     * The names of the likely quality encoding schemes, one for each input file.
     * @return the list of likely quality encoding scheme names.
     */
    public List<String> getLikelyEncodings() {
        return likelyEncodings;
    }

    /**
     * Configure.
     *
     * @param args command line arguments
     * @return this object for chaining
     * @throws java.io.IOException   error parsing
     * @throws com.martiansoftware.jsap.JSAPException error parsing
     */
    @Override
    public AbstractCommandLineMode configure(final String[] args) throws IOException, JSAPException {
        final JSAPResult jsapResult = parseJsapArguments(args);
        inputFilenames.clear();
        Collections.addAll(inputFilenames, jsapResult.getStringArray("input"));
        numberOfReadEntriesToProcess = jsapResult.getInt("number-of-reads");
        return this;
    }

    /**
     * Perform the conversion fasta -> compact-reads on one or more files.
     *
     * @throws java.io.IOException if the input/output files cannot be read/written
     */
    @Override
    public void execute() throws IOException {
        try {
            for (final String inputFilename : inputFilenames) {
                processingStart(inputFilename);
                final int numEntriesSampled;
                final String compareName = inputFilename.toLowerCase();
                if (compareName.endsWith(".compact-reads")) {
                    numEntriesSampled = processCompactReadsFile(inputFilename);
                } else if (compareName.endsWith(".bam") || compareName.endsWith(".sam")
                        || compareName.endsWith(".sam.gz")) {
                    numEntriesSampled = processSamReadsFile(inputFilename);
                } else {
                    numEntriesSampled = processFastqFile(inputFilename);
                }
                processingEnd(numEntriesSampled);
            }
        } catch (Exception e) {
            LOG.error("Error processing", e);
        }
    }

    /**
     * Before each file, reset the state, report the filename about to be processed.
     * @param inputFilename the filename to be processed.
     */
    private void processingStart(final String inputFilename) {
        System.out.printf("Processing %s%n", inputFilename);
        numQualScoresSampled = 0;
        sumQualScores = 0;
        minQualScore = Integer.MAX_VALUE;
        maxQualScore = Integer.MIN_VALUE;
        qualityScoresFound = false;
    }

    /**
     * After each file, report the details for the file.
     * @param numEntries the number of read entries processed.
     */
    private void processingEnd(final int numEntries) {
        System.out.printf("Processed %d read entries.%n", numEntries);
        final int avgQualScore;
        if (numQualScoresSampled > 0) {
            avgQualScore = (int) (sumQualScores / numQualScoresSampled);
        } else {
            avgQualScore = 0;
        }
        final String likelyEncoding;
        if (!qualityScoresFound) {
            likelyEncoding = "fasta";
        } else if (avgQualScore <= 41) {
            likelyEncoding = "Phred";
        } else if (avgQualScore <= 83) {
            likelyEncoding = "Sanger";
        } else {
            likelyEncoding = "Illumina/Solexa";
        }

        System.out.printf("Min quality score: %d%n", minQualScore);
        System.out.printf("Max quality score: %d%n", maxQualScore);
        System.out.printf("Avg quality score: %d%n", avgQualScore);
        System.out.printf("Probable quality encoding scheme: %s%n", likelyEncoding);
        likelyEncodings.add(likelyEncoding);
        avgQualScores.add(avgQualScore);
    }

    /**
     * Process ONE compact-reads file.
     * @param inputFilename the filename to process
     * @return the number of read entries processed
     * @throws IOException error reading file
     */
    private int processCompactReadsFile(final String inputFilename) throws IOException {
        // Create directory for output file if it doesn't already exist
        int i = 0;
        for (final Reads.ReadEntry entry : new ReadsReader(inputFilename)) {
            final byte[] qualityScores = entry.getQualityScores().toByteArray();
            final boolean hasQualityScores = entry.hasQualityScores() && !ArrayUtils.isEmpty(qualityScores);
            if (hasQualityScores) {
                qualityScoresFound = true;
                for (final int qualScore : qualityScores) {
                    minQualScore = Math.min(qualScore, minQualScore);
                    maxQualScore = Math.max(qualScore, maxQualScore);
                    numQualScoresSampled++;
                    sumQualScores += Math.abs(qualScore);
                }
                if (++i == numberOfReadEntriesToProcess) {
                    break;
                }
            }
        }
        return i;
    }

    /**
     * Process ONE FASTQ file.
     * @param inputFilename the filename to process
     * @return the number of read entries processed
     * @throws IOException error reading file
     */
    private int processFastqFile(final String inputFilename) throws IOException {
        // Create directory for output file if it doesn't already exist
        int i = 0;
        for (final FastXEntry entry : new FastXReader(inputFilename)) {
            final MutableString quality = entry.getQuality();
            if (quality.length() != 0) {
                qualityScoresFound = true;
                for (final int qualScore : quality.array()) {
                    minQualScore = Math.min(qualScore, minQualScore);
                    maxQualScore = Math.max(qualScore, maxQualScore);
                    numQualScoresSampled++;
                    sumQualScores += Math.abs(qualScore);
                }
                if (++i == numberOfReadEntriesToProcess) {
                    break;
                }
            }
        }
        return i;
    }

    /**
     * Process ONE sam, sam.gz, or bam file.
     * @param inputFilename the filename to process
     * @return the number of read entries processed
     * @throws IOException error reading file
     */
    private int processSamReadsFile(final String inputFilename) throws IOException {
        // Create directory for output file if it doesn't already exist
        int i = 0;
        SAMFileReader.setDefaultValidationStringency(SAMFileReader.ValidationStringency.SILENT);
        final SAMFileReader parser = new SAMFileReader(new FileInputStream(inputFilename));
        for (final SAMRecord samRecord : new SAMRecordIterable(parser.iterator())) {
            final String quality = samRecord.getBaseQualityString();
            if (quality != null && quality.length() != 0) {
                qualityScoresFound = true;
                for (final int qualScore : quality.toCharArray()) {
                    //for (final int qualScore : samRecord.getBaseQualities()) {
                    minQualScore = Math.min(qualScore, minQualScore);
                    maxQualScore = Math.max(qualScore, maxQualScore);
                    numQualScoresSampled++;
                    sumQualScores += Math.abs(qualScore);
                }
                if (++i == numberOfReadEntriesToProcess) {
                    break;
                }
            }
        }
        return i;
    }

    /**
     * Main entry point.
     * @param args command line arguments
     * @throws IOException error with IO
     * @throws JSAPException error with command line processing
     */
    public static void main(final String[] args) throws IOException, JSAPException {
        new SampleQualityScoresMode().configure(args).execute();
    }
}