edu.cornell.med.icb.goby.modes.CompactToFastaMode.java Source code

Java tutorial

Introduction

Here is the source code for edu.cornell.med.icb.goby.modes.CompactToFastaMode.java

Source

/*
 * Copyright (C) 2009-2010 Institute for Computational Biomedicine,
 *                    Weill Medical College of Cornell University
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.modes;

import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import edu.cornell.med.icb.goby.reads.ColorSpaceConverter;
import edu.cornell.med.icb.goby.reads.QualityEncoding;
import edu.cornell.med.icb.goby.reads.ReadSet;
import edu.cornell.med.icb.goby.reads.Reads;
import edu.cornell.med.icb.goby.reads.ReadsReader;
import it.unimi.dsi.fastutil.ints.Int2IntMap;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.ArrayUtils;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Locale;

/**
 * Converts a Compact file to <a href="http://en.wikipedia.org/wiki/FASTA_format">FASTA</a>
 * or <a href="http://en.wikipedia.org/wiki/FASTQ_format">FASTQ</a> format.
 *
 * @author Fabien Campagne
 *         Date: May 4 2009
 *         Time: 12:28 PM
 */
public class CompactToFastaMode extends AbstractGobyMode {
    /**
     * The mode name.
     */
    private static final String MODE_NAME = "compact-to-fasta";
    /**
     * The mode description help text.
     */
    private static final String MODE_DESCRIPTION = "Converts a Compact file to Fasta format.";

    /**
     * Constant quality score to use when writing a FASTQ file from a source file that
     * contains no quality score values.
     */
    private static final byte FAKE_QUALITY_SCORE = 40;
    private int smallestQueryIndex = Integer.MAX_VALUE;

    private int largestQueryIndex = Integer.MIN_VALUE;
    private boolean processPairs;
    private boolean hasStartOrEndPosition;
    private long startPosition;
    private long endPosition;
    private int fastaLineLength = 50;

    public int getSmallestQueryIndex() {
        return smallestQueryIndex;
    }

    public int getLargestQueryIndex() {
        return largestQueryIndex;
    }

    /**
     * Output formats supported by Goby.
     */
    public enum OutputFormat {
        /**
         * <a href="http://en.wikipedia.org/wiki/FASTA_format">FASTA</a>.
         */
        FASTA,
        /**
         * <a href="http://en.wikipedia.org/wiki/FASTQ_format">FASTQ</a>.
         */
        FASTQ
    }

    private String inputFilename;
    private String outputFilename;
    private String outputPairFilename;
    private boolean indexToHeader;
    private boolean identifierToHeader;
    private boolean referenceConversion;
    private String alphabet;
    private File readIndexFilterFile;
    private int numberOfFilteredSequences;

    private int numberOfSequences;
    private final Int2IntMap queryLengths = new Int2IntOpenHashMap();
    private boolean hashOutputFilename;

    private static final char[] FAKE_NT_ALPHABET = { 'A', 'C', 'G', 'T', 'N' };

    private boolean outputColorMode;
    private boolean outputFakeNtMode;
    private int trimAdaptorLength;

    // TODO: generate colorspace results using BWA without using Stu's ouputFakeQualityMode HACK !
    private boolean outputFakeQualityMode;

    /**
     * Output format for converted file.
     */
    private OutputFormat outputFormat = OutputFormat.FASTA;

    /**
     * Quality encoding for FASTQ output only.
     */
    private QualityEncoding qualityEncoding = QualityEncoding.ILLUMINA;

    /**
     * Select the desired output format.
     *
     * @param format The output format to convert to
     */
    public void setOutputFormat(final OutputFormat format) {
        outputFormat = format;
    }

    /**
     * Set the type of encoding to write quality scores.  Applies to
     * {@link OutputFormat#FASTQ} only.
     *
     * @param qualityEncoding The encoding format to use.
     */
    public void setQualityEncoding(final QualityEncoding qualityEncoding) {
        this.qualityEncoding = qualityEncoding;
    }

    /**
     * Fet the type of encoding used to write quality scores.  Applies to
     * {@link OutputFormat#FASTQ} only.
     *
     * @return The encoding format in use.
     */
    public QualityEncoding getQualityEncoding() {
        return qualityEncoding;
    }

    public OutputFormat getOutputFormat() {
        return outputFormat;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public String getModeName() {
        return MODE_NAME;
    }

    public void setStartPosition(long startPosition) {
        this.startPosition = startPosition;
        hasStartOrEndPosition = true;
    }

    public void setEndPosition(long endPosition) {
        this.endPosition = endPosition;
        hasStartOrEndPosition = true;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public String getModeDescription() {
        return MODE_DESCRIPTION;
    }

    public int getNumberOfSequences() {
        return numberOfSequences;
    }

    public int getMinSequenceLength() {
        int minLength = Integer.MAX_VALUE;
        for (final int length : queryLengths.values()) {
            minLength = Math.min(minLength, length);
        }
        return minLength;
    }

    /**
     * Configure.
     *
     * @param args command line arguments
     * @return this object for chaining
     * @throws IOException   error parsing
     * @throws JSAPException error parsing
     */
    @Override
    public AbstractCommandLineMode configure(final String[] args) throws IOException, JSAPException {
        final JSAPResult jsapResult = parseJsapArguments(args);

        inputFilename = jsapResult.getString("input");
        outputFilename = jsapResult.getString("output");
        outputPairFilename = jsapResult.getString("pair-output");
        if (outputPairFilename != null) {
            processPairs = true;
        }
        outputFormat = OutputFormat.valueOf(jsapResult.getString("output-format").toUpperCase(Locale.getDefault()));
        alphabet = jsapResult.getString("alphabet");
        indexToHeader = jsapResult.getBoolean("index-to-header");
        identifierToHeader = jsapResult.getBoolean("identifier-to-header");
        outputColorMode = jsapResult.getBoolean("output-color-space");
        outputFakeNtMode = jsapResult.getBoolean("output-fake-nt");
        trimAdaptorLength = jsapResult.getInt("trim-adaptor-length");
        outputFakeQualityMode = jsapResult.getBoolean("output-fake-quality");
        referenceConversion = jsapResult.getBoolean("reference");
        readIndexFilterFile = jsapResult.getFile("read-index-filter");
        fastaLineLength = jsapResult.getInt("fasta-line-length");
        qualityEncoding = QualityEncoding
                .valueOf(jsapResult.getString("quality-encoding").toUpperCase(Locale.getDefault()));

        if (jsapResult.contains("start-position") || jsapResult.contains("end-position")) {
            hasStartOrEndPosition = true;
            startPosition = jsapResult.getLong("start-position", 0L);
            endPosition = jsapResult.getLong("end-position", Long.MAX_VALUE);
            if (startPosition == 0 && endPosition == 0) {
                // whole file.
                hasStartOrEndPosition = false;
            }
        }

        return this;
    }

    public void setTrimAdaptorLength(final int trimAdaptorLength) {
        this.trimAdaptorLength = trimAdaptorLength;
    }

    public void setOutputColorMode(final boolean outputColorMode) {
        this.outputColorMode = outputColorMode;
    }

    public void setOutputFakeNtMode(final boolean outputFakeNtMode) {
        this.outputFakeNtMode = outputFakeNtMode;
    }

    public void setOutputFakeQualityMode(final boolean outputFakeQualityMode) {
        this.outputFakeQualityMode = outputFakeQualityMode;
    }

    public void setIndexToHeader(final boolean indexToHeader) {
        this.indexToHeader = indexToHeader;
    }

    public void setReadIndexFilterFile(final File readIndexFilterFile) {
        this.readIndexFilterFile = readIndexFilterFile;
    }

    public void setAlphabet(final String alphabet) {
        this.alphabet = alphabet;
    }

    public void setReferenceConversion(final boolean referenceConversion) {
        this.referenceConversion = referenceConversion;
    }

    public void setInputFilename(final String inputFilename) {
        this.inputFilename = inputFilename;
    }

    public void setOutputFilename(final String outputFilename) {
        this.outputFilename = outputFilename;
    }

    @Override
    public void execute() throws IOException {
        // output file extension is based on the output format type
        final String outputExtension = "." + outputFormat.name().toLowerCase(Locale.getDefault());
        if (outputFilename == null) {
            outputFilename = FilenameUtils.removeExtension(inputFilename) + (hashOutputFilename ? hash() : "")
                    + outputExtension;
        } else if (hashOutputFilename) {
            outputFilename = FilenameUtils.removeExtension(outputFilename) + (hashOutputFilename ? hash() : "")
                    + outputExtension;
        }

        ReadSet readIndexFilter = new ReadSet();
        if (readIndexFilterFile == null) {
            readIndexFilter = null;
        } else {
            readIndexFilter.load(readIndexFilterFile);
        }

        final ProgressLogger progress = new ProgressLogger();
        progress.start();
        progress.displayFreeMemory = true;

        // Only sanger and illumina encoding are supported at this time
        if (qualityEncoding == QualityEncoding.SOLEXA) {
            throw new UnsupportedOperationException(
                    "SOLEXA encoding is not supported " + "at this time for lack of clear documentation.");
        } else if (qualityEncoding != QualityEncoding.SANGER && qualityEncoding != QualityEncoding.ILLUMINA) {
            throw new UnsupportedOperationException("Unknown encoding: " + qualityEncoding);
        }

        ReadsReader reader = null;
        Writer writer = null;
        OutputStreamWriter pairWriter = null;
        final int newEntryCharacter;
        switch (outputFormat) {
        case FASTQ:
            newEntryCharacter = '@';
            break;
        case FASTA:
        default:
            newEntryCharacter = '>';
            break;
        }
        try {
            writer = new OutputStreamWriter(new FastBufferedOutputStream(new FileOutputStream(outputFilename)));
            pairWriter = processPairs
                    ? new OutputStreamWriter(new FastBufferedOutputStream(new FileOutputStream(outputPairFilename)))
                    : null;

            final MutableString colorSpaceBuffer = new MutableString();
            final MutableString sequence = new MutableString();
            final MutableString sequencePair = new MutableString();

            if (hasStartOrEndPosition) {
                reader = new ReadsReader(startPosition, endPosition, inputFilename);
            } else {
                reader = new ReadsReader(inputFilename);
            }

            for (final Reads.ReadEntry readEntry : reader) {
                if (readIndexFilter == null || readIndexFilter.contains(readEntry.getReadIndex())) {
                    final String description;

                    if (indexToHeader) {
                        description = Integer.toString(readEntry.getReadIndex());
                    } else if (identifierToHeader && readEntry.hasReadIdentifier()) {
                        description = readEntry.getReadIdentifier();
                    } else if (readEntry.hasDescription()) {
                        description = readEntry.getDescription();
                    } else {
                        description = Integer.toString(readEntry.getReadIndex());
                    }

                    writer.write(newEntryCharacter);
                    writer.write(description);
                    writer.write('\n');
                    final boolean processPairInThisRead = processPairs && readEntry.hasSequencePair();
                    if (processPairInThisRead) {
                        pairWriter.write(newEntryCharacter);
                        pairWriter.write(description);
                        pairWriter.write('\n');
                    }
                    observeReadIndex(readEntry.getReadIndex());
                    ReadsReader.decodeSequence(readEntry, sequence);

                    if (processPairInThisRead) {
                        ReadsReader.decodeSequence(readEntry, sequencePair, true);
                    }

                    if (queryLengths != null) {
                        queryLengths.put(readEntry.getReadIndex(), sequence.length());
                    }

                    MutableString transformedSequence = sequence;
                    MutableString transformedSequencePair = sequencePair;
                    if (outputColorMode) {
                        ColorSpaceConverter.convert(transformedSequence, colorSpaceBuffer, referenceConversion);
                        transformedSequence = colorSpaceBuffer;
                        if (processPairInThisRead) {
                            ColorSpaceConverter.convert(transformedSequencePair, colorSpaceBuffer,
                                    referenceConversion);
                            transformedSequencePair = colorSpaceBuffer;
                        }
                    }
                    if (outputFakeNtMode) {
                        for (int i = 0; i < transformedSequence.length(); i++) {
                            transformedSequence.charAt(i, getFakeNtCharacter(transformedSequence.charAt(i)));
                        }
                        if (processPairInThisRead) {
                            for (int i = 0; i < transformedSequencePair.length(); i++) {
                                transformedSequencePair.charAt(i,
                                        getFakeNtCharacter(transformedSequencePair.charAt(i)));
                            }
                        }
                    }
                    if (trimAdaptorLength > 0) {
                        transformedSequence = transformedSequence.substring(trimAdaptorLength);
                        if (processPairInThisRead) {
                            transformedSequencePair = transformedSequencePair.substring(trimAdaptorLength);
                        }
                    }
                    // filter unrecognized bases from output
                    if (alphabet != null) {
                        for (int i = 0; i < transformedSequence.length(); i++) {
                            if (alphabet.indexOf(transformedSequence.charAt(i)) == -1) {
                                transformedSequence.charAt(i, 'N');
                            }
                        }
                    }
                    if (processPairInThisRead) {
                        if (alphabet != null) {

                            for (int i = 0; i < transformedSequencePair.length(); i++) {
                                if (alphabet.indexOf(transformedSequencePair.charAt(i)) == -1) {
                                    transformedSequencePair.charAt(i, 'N');
                                }
                            }
                        }
                    }
                    writeSequence(writer, transformedSequence, fastaLineLength);
                    if (processPairInThisRead) {
                        writeSequence(pairWriter, transformedSequencePair, fastaLineLength);
                    }
                    if (outputFormat == OutputFormat.FASTQ) {
                        final int readLength = transformedSequence.length();
                        final byte[] qualityScores = readEntry.getQualityScores().toByteArray();
                        final boolean hasQualityScores = readEntry.hasQualityScores()
                                && !ArrayUtils.isEmpty(qualityScores);
                        writeQualityScores(writer, hasQualityScores, qualityScores, outputFakeQualityMode,
                                readLength);

                        if (processPairInThisRead) {
                            final int readLengthPair = transformedSequencePair.length();
                            final byte[] qualityScoresPair = readEntry.getQualityScoresPair().toByteArray();
                            final boolean hasPairQualityScores = readEntry.hasQualityScoresPair()
                                    && !ArrayUtils.isEmpty(qualityScoresPair);
                            writeQualityScores(pairWriter, hasPairQualityScores, qualityScoresPair,
                                    outputFakeQualityMode, readLengthPair);

                        }
                    }
                    ++numberOfFilteredSequences;

                    progress.lightUpdate();
                    ++numberOfSequences;
                }

            }
        } finally {
            IOUtils.closeQuietly(writer);
            if (processPairs) {
                IOUtils.closeQuietly(pairWriter);
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) { // NOPMD
                    // silently ignore
                }
            }
        }

        progress.stop();
    }

    private String getPair2(final String outputFilename) {
        return null; //To change body of created methods use File | Settings | File Templates.
    }

    // determine the values of smallestQueryIndex and largestQueryIndex over the input files.
    private void observeReadIndex(final int readIndex) {
        smallestQueryIndex = Math.min(smallestQueryIndex, readIndex);
        largestQueryIndex = Math.max(largestQueryIndex, readIndex);
    }

    /**
     * Return a fake nucleotide corresponding to a color space character, or leave unchanged if
     * input is not color space.
     *
     * @param c The character to get the fake nucleotide for
     * @return he fake nucleotide represented by the input
     */
    private char getFakeNtCharacter(final char c) {
        final int i = Character.getNumericValue(c);
        return i >= 0 && i < FAKE_NT_ALPHABET.length ? FAKE_NT_ALPHABET[i] : c;
    }

    private String hash() {
        if (readIndexFilterFile == null) {
            return "";
        } else {
            return Integer.toHexString(readIndexFilterFile.hashCode() ^ inputFilename.hashCode());
        }
    }

    /**
     * Write the sequence at 60 chars per line.
     *
     * @param writer   the writer
     * @param sequence the sequence
     * @throws IOException error reading
     */
    public static void writeSequence(final Writer writer, final MutableString sequence) throws IOException {
        writeSequence(writer, sequence, 60);
    }

    public static void writeSequence(final Writer writer, final MutableString sequence, final int fastaLineLength)
            throws IOException {
        final int length = sequence.length();
        for (int i = 0; i < length; i++) {
            if (i != 0 && (i % fastaLineLength == 0)) {
                writer.write('\n');
            }
            writer.write(sequence.charAt(i));
        }
        writer.write('\n');

    }

    /**
     * Write quality scores, or fake some anyway if the input does not have any or
     * fakeQualityScores is true.
     *
     * @param writer            Where to write quality scores.
     * @param hasScores         If the compact-reads file has quality scores.
     * @param qualityScores     The quality scores (Phred unit)
     * @param fakeQualityScores Whether quality scores should be faked.
     * @param readLength        The read lenth, used when faking.
     * @throws IOException If an error occured.
     */
    private void writeQualityScores(final Writer writer, final boolean hasScores, final byte[] qualityScores,
            final boolean fakeQualityScores, final int readLength) throws IOException {
        // in theory the quality length and the read length should be equal
        // however in practice this may not be the case. Either way, we should
        // write readLength number of quality scores to the output. Writing too
        // many or too few seems to confuse some aligners.
        writer.write('+');
        writer.write('\n');
        for (int i = 0; i < readLength; i++) {
            if (i != 0 && (i % fastaLineLength == 0)) {
                writer.write('\n');
            }

            // write the score if there is one otherwise write the default "fake" score
            if (!fakeQualityScores && hasScores && i < qualityScores.length) {
                writer.write(qualityEncoding.phredQualityScoreToAsciiEncoding(qualityScores[i]));
            } else {
                writer.write(qualityEncoding.phredQualityScoreToAsciiEncoding(FAKE_QUALITY_SCORE));
            }
        }
        writer.write('\n');
    }

    public String getOutputFilename() {
        return outputFilename;
    }

    public int getNumberOfFilteredSequences() {
        return numberOfFilteredSequences;
    }

    public void setHashOutputFilename(final boolean hash) {
        hashOutputFilename = hash;
    }

    public static void main(final String[] args) throws IOException, JSAPException {
        new CompactToFastaMode().configure(args).execute();
    }
}