edu.cornell.med.icb.goby.modes.ReformatCompactReadsMode.java Source code

Introduction

Here is the source code for edu.cornell.med.icb.goby.modes.ReformatCompactReadsMode.java
Source

/*
 * Copyright (C) 2009-2010 Institute for Computational Biomedicine,
 *                    Weill Medical College of Cornell University
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.modes;

import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import edu.cornell.med.icb.goby.reads.*;
import edu.cornell.med.icb.goby.util.FileExtensionHelper;
import it.unimi.dsi.fastutil.bytes.ByteArrayList;
import it.unimi.dsi.fastutil.chars.CharArraySet;
import it.unimi.dsi.fastutil.chars.CharSet;
import it.unimi.dsi.fastutil.ints.IntArraySet;
import it.unimi.dsi.fastutil.ints.IntSet;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectSet;
import it.unimi.dsi.lang.MutableString;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang.StringUtils;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Date;
import java.util.Random;

/**
 * Reformat a compact file, possibly dropping identifiers, or descriptions, or splitting the file.
 * When a compact-reads file is split, reads in each split are renumbered (their readIndex
 * is changed), starting at zero for the first sequence of each split. This ensures that indices
 * are correctly concatenated back together.
 *
 * @author Fabien Campagne
 *         Date: Apr 28, 2009
 *         Time: 6:03:56 PM
 */
public class ReformatCompactReadsMode extends AbstractGobyMode {
    private String[] inputFilenames;

    private boolean pushDescription;
    private boolean pushIdentifier;
    private String outputFile;
    private boolean mutateSequences;
    private int numberOfMismatches;
    private CharSet bases;
    private File readIndexFilterFile;

    /**
     * The mode name.
     */
    private static final String MODE_NAME = "reformat-compact-reads";

    /**
     * The mode description help text.
     */
    private static final String MODE_DESCRIPTION = "This mode is used to reformat compact files, possibly dropping "
            + "identifiers, or descriptions, splitting the file in several pieces, or even trimming the sequence or "
            + "introducing mutations. When a compact-reads file is split, reads in each split are renumbered "
            + "(their read index is changed), starting at zero for the first sequence of each split. "
            + "This ensures that indices are correctly concatenated back together. ";

    private int sequencePerChunk = 10000;
    private boolean excludeSequences;
    private int sequencePerOutput = Integer.MAX_VALUE;
    /**
     * Any sequences shorter than this length will be excluded from the output.
     */
    private int minReadLength;
    /**
     * Any sequences longer than this length will be excluded from the output.
     */
    private int maxReadLength = Integer.MAX_VALUE;
    /**
     * Any sequences longer than this length will be cut to this length in the output.
     */
    private int trimReadLength = Integer.MAX_VALUE;
    private final ObjectSet<String> outputFilenames = new ObjectOpenHashSet<String>();

    /**
     * An "unset" value for startPosition and endPosition.
     */
    private boolean hasStartOrEndPosition;

    /**
     * The start position for the reformat.
     */
    private long startPosition;

    /**
     * The end position for the reformat.
     */
    private long endPosition = Long.MAX_VALUE;
    /**
     * The number of bases to trim at the start of the sequence.
     */
    private int trimReadStartLength = -1;
    /**
     * The writer to use to output the reformatted output, or null to create one on the fly.
     */
    private ReadsWriter writer;

    @Override
    public String getModeName() {
        return MODE_NAME;
    }

    @Override
    public String getModeDescription() {
        return MODE_DESCRIPTION;
    }

    /**
     * Configure.
     *
     * @param args command line arguments
     * @return this object for chaining
     * @throws IOException   error parsing
     * @throws JSAPException error parsing
     */
    @Override
    public AbstractCommandLineMode configure(final String[] args) throws IOException, JSAPException {
        final JSAPResult jsapResult = parseJsapArguments(args);
        inputFilenames = jsapResult.getStringArray("input");
        pushDescription = jsapResult.getBoolean("include-descriptions");
        pushIdentifier = jsapResult.getBoolean("include-identifiers");
        excludeSequences = jsapResult.getBoolean("exclude-sequences");
        outputFile = jsapResult.getString("output");
        sequencePerChunk = jsapResult.getInt("sequence-per-chunk");
        sequencePerOutput = jsapResult.getInt("sequence-per-output", sequencePerOutput);
        minReadLength = jsapResult.getInt("minimum-read-length", minReadLength);
        maxReadLength = jsapResult.getInt("maximum-read-length", maxReadLength);
        trimReadLength = jsapResult.getInt("trim-read-length", trimReadLength);
        trimReadStartLength = jsapResult.getInt("trim-read-start", trimReadLength);
        mutateSequences = jsapResult.getBoolean("mutate-sequences");
        numberOfMismatches = jsapResult.getInt("mismatch-number");
        readIndexFilterFile = jsapResult.getFile("read-index-filter");

        if (jsapResult.contains("start-position") || jsapResult.contains("end-position")) {
            hasStartOrEndPosition = true;
            startPosition = jsapResult.getLong("start-position", 0L);
            endPosition = jsapResult.getLong("end-position", Long.MAX_VALUE);
            if (startPosition == 0 && endPosition == 0) {
                endPosition = Long.MAX_VALUE;
            }
        }

        if (startPosition < 0L) {
            throw new JSAPException("Start position must not be less than zero");
        }
        if (endPosition < 0L) {
            throw new JSAPException("End position must not be less than zero");
        }
        if (startPosition > endPosition) {
            throw new JSAPException("Start position must not be greater than the end position");
        }

        bases = new CharArraySet();
        return this;
    }

    public void setReadIndexFilterFile(final File readIndexFilterFile) {
        this.readIndexFilterFile = readIndexFilterFile;
    }

    /**
     * Reformat compact reads.
     *
     * @throws IOException
     */
    @Override
    public void execute() throws IOException {
        final int numToProcess = inputFilenames.length;
        int numProcessed = 0;
        final MutableString sequence = new MutableString();
        final MutableString sequencePair = new MutableString();
        ReadSet readIndexFilter = new ReadSet();
        if (readIndexFilterFile == null) {
            readIndexFilter = null;
        } else {
            readIndexFilter.load(readIndexFilterFile);
        }

        for (final String inputFilename : inputFilenames) {
            int splitIndex = 0;
            final String outputBasename;
            final String outputFilename;

            // if there is only one input file to process
            if (numToProcess == 1 && StringUtils.isNotBlank(outputFile)) {
                outputBasename = outputFile;

                boolean hasCompactExtension = false;
                for (final String extension : FileExtensionHelper.COMPACT_READS_FILE_EXTS) {
                    if (outputFile.endsWith(extension)) {
                        hasCompactExtension = true;
                        break;
                    }
                }

                // if it already has a compact-reads extension leave it alone
                // otherwise build it based on the basename
                if (hasCompactExtension) {
                    outputFilename = outputFile;
                } else {
                    outputFilename = getOutputFilename(outputBasename, ++splitIndex);
                }
            } else {
                // there are a bunch of files to process so number them uniquely based on the name
                outputBasename = stripCompactReadExtensions(inputFilename);
                outputFilename = getOutputFilename(outputBasename, ++splitIndex);
            }

            if (inputFilename.equals(outputFilename)) {
                System.err.println("input cannot equal the output name");
                System.exit(2);
            }
            System.out.printf("Converting [%d/%d] %s to %s%n", ++numProcessed, numToProcess, inputFilename,
                    outputFilename);

            outputFilenames.add(outputFilename);
            // if the instance already has a writer, we write there:
            ReadsWriter writer = this.writer != null ? this.writer
                    : new ReadsWriterImpl(new FileOutputStream(outputFilename));
            writer.setNumEntriesPerChunk(sequencePerChunk);

            final ReadsReader readsReader;
            final FastBufferedInputStream inputFileStream = new FastBufferedInputStream(
                    new FileInputStream(inputFilename));

            if (hasStartOrEndPosition) {
                readsReader = new ReadsReader(startPosition, endPosition, inputFileStream);
            } else {
                readsReader = new ReadsReader(inputFileStream);
            }

            int entriesInOutputFile = 0;
            //DistinctIntValueCounterBitSet allEntries = new DistinctIntValueCounterBitSet();
            int numReadsKept = 0;
            for (final Reads.ReadEntry entry : readsReader) {
                //    allEntries.observe(entry.getReadIndex());
                if (readIndexFilter == null || readIndexFilter.contains(entry.getReadIndex())) {

                    final int readLength = entry.getReadLength();
                    if (readLength < minReadLength || readLength > maxReadLength) {
                        continue;
                    }
                    //transfer meta-data:
                    for (int i = 0; i < entry.getMetaDataCount(); i++) {
                        final Reads.MetaData metaData = entry.getMetaData(i);
                        writer.appendMetaData(metaData.getKey(), metaData.getValue());
                    }
                    if (pushDescription && entry.hasDescription()) {
                        writer.setDescription(entry.getDescription());
                    }

                    if (pushIdentifier && entry.hasReadIdentifier()) {
                        writer.setIdentifier(entry.getReadIdentifier());
                    }
                    if (entry.hasSequence()) {
                        ReadsReader.decodeSequence(entry, sequence);
                    }
                    if (entry.hasSequencePair()) {
                        ReadsReader.decodeSequence(entry, sequencePair, true);
                    }
                    final ByteArrayList qualScores = ByteArrayList.wrap(entry.getQualityScores().toByteArray());
                    processSequenceAndQualityScores(sequence, qualScores);
                    if (entry.hasSequence() && !excludeSequences) {
                        writer.setSequence(sequence);
                    }
                    if (entry.hasQualityScores()) {
                        writer.setQualityScores(qualScores.toByteArray());
                    }
                    if (entry.hasSequencePair()) {
                        final ByteArrayList pairQualScores = ByteArrayList
                                .wrap(entry.getQualityScores().toByteArray());
                        processSequenceAndQualityScores(sequencePair, pairQualScores);

                        if (entry.hasSequencePair() && !excludeSequences) {
                            writer.setPairSequence(sequencePair);
                        }
                        if (entry.hasQualityScoresPair()) {
                            writer.setQualityScoresPair(pairQualScores.toByteArray());
                        }
                    }
                    // Important: preserve the read index in the input entry:
                    writer.appendEntry(entry.getReadIndex());
                    numReadsKept++;
                    //writer.appendEntry();
                    entriesInOutputFile++;

                    if (entriesInOutputFile > sequencePerOutput) {
                        writer.close();

                        final String newOutputFilename = getOutputFilename(outputBasename, ++splitIndex);
                        outputFilenames.add(newOutputFilename);
                        System.out.printf("Splitting into %s%n", newOutputFilename);
                        writer = new ReadsWriterImpl(new FileOutputStream(newOutputFilename));
                        writer.setNumEntriesPerChunk(sequencePerChunk);
                        entriesInOutputFile = 0;
                    }
                }
            }
            /*  float rate = allEntries.count();
            rate -= numReadsKept;
            rate /= allEntries.count();
            if (readIndexFilter != null) System.out.printf("Percent reads redundant= %f3.2%% %n", rate);
            */
            writer.close();
            writer.printStats(System.out);
        }
    }

    private void processSequenceAndQualityScores(final MutableString sequence, final ByteArrayList qualityScores) {

        assert qualityScores.size() == 0
                || sequence.length() == qualityScores.size() : " sequence and quality score length must match";
        final int length = sequence.length();
        final int qualScoreSize = qualityScores.size();
        if (trimReadStartLength < 0) {
            trimReadStartLength = 0;
        }
        final int newLength = Math.min(length, trimReadLength) - trimReadStartLength;
        if (length == newLength) {
            // no trimming needed.
            return;
        }
        int j = 0;
        if (trimReadStartLength > 0) {
            for (int i = trimReadStartLength; i < length; i++) {
                sequence.charAt(j, sequence.charAt(i));
                if (qualScoreSize != 0) {
                    qualityScores.set(j, qualityScores.get(i));
                }
                j += 1;
            }
        }
        sequence.setLength(newLength);
        if (qualScoreSize != 0) {
            qualityScores.size(newLength);
        }
    }

    private byte[] trimQualityScores(final byte[] qualityScores, int trimReadStartLength, final int trimReadLength,
            final int initialLength) {
        if (trimReadStartLength < 0) {
            trimReadStartLength = 0;
        }
        final int newLength = Math.min(initialLength, trimReadLength) - trimReadStartLength;
        if (initialLength == newLength) {
            // no trimming needed.
            return qualityScores;
        }
        if (qualityScores == null) {
            return null;
        }

        final byte[] trimmedScores = new byte[newLength];
        System.arraycopy(qualityScores, trimReadStartLength + trimReadStartLength, trimmedScores,
                trimReadStartLength, newLength - trimReadStartLength);
        return trimmedScores;

    }

    /**
     * Introduce the given number of point mutations in the given sequence.
     *
     * @param sequence
     * @param numberOfMismatches
     */

    private void mutate(final MutableString sequence, final int numberOfMismatches) {
        final int sequenceLength = sequence.length();
        if (sequenceLength == 0) {
            // When there is no sequence, there is nothing to mutate. This is most notably true for
            // when there is no PAIR sequence.
            return;
        }
        for (final char base : sequence.toCharArray()) {
            bases.add(base);
        }
        if (sequenceLength < numberOfMismatches) {
            System.err.printf("Cannot introduce %d mismatches in a sequence that has only %d "
                    + "residues. Skipping this sequence.\n", numberOfMismatches, sequenceLength);
            return;
        }
        final IntSet alreadyMutated = new IntArraySet();
        final Random random = new Random(new Date().getTime());
        for (int i = 0; i < numberOfMismatches; i++) {
            int mutationPosition;
            do {
                mutationPosition = chooseRandom(random, 0, sequenceLength - 1);
            } while (alreadyMutated.contains(mutationPosition));
            char newBase;
            final char oldBase = sequence.charAt(mutationPosition);
            do {
                newBase = bases.toCharArray()[chooseRandom(random, 0, bases.size() - 1)];
                // introduce the mutation:
                sequence.charAt(mutationPosition, newBase);
            } while (newBase == oldBase);
            alreadyMutated.add(mutationPosition);
        }
    }

    /**
     * @param lo lower limit of range
     * @param hi upper limit of range
     * @return a random integer in the range <STRONG>lo</STRONG>,
     *         <STRONG>lo</STRONG>+1, ... ,<STRONG>hi</STRONG>
     */
    private int chooseRandom(final Random random, final int lo, final int hi) {
        final double r = random.nextDouble();
        final int result = (int) ((long) lo + (long) ((1L + (long) hi - (long) lo) * r));
        assert result >= lo && result <= hi;
        return result;
    }

    private String getOutputFilename(final String outputBasename, final int splitIndex) {
        final boolean neverSplits = sequencePerOutput == Integer.MAX_VALUE;
        return outputBasename + (neverSplits ? "" : "-" + splitIndex) + ".compact-reads";
    }

    /**
     * Get the filename including path WITHOUT fastx extensions (including .gz if it is there).
     *
     * @param name the full path to the file in question
     * @return the filename without the fastx/gz extensions or the same name of those extensions
     *         weren't found.
     */
    private static String stripCompactReadExtensions(final String name) {
        final String filename = FilenameUtils.getName(name);
        for (final String ext : FileExtensionHelper.COMPACT_READS_FILE_EXTS) {
            if (filename.endsWith(ext)) {
                return FilenameUtils.getFullPath(name) + filename.substring(0, filename.lastIndexOf(ext));
            }
        }
        return name;
    }

    public void setInputFilenames(final String... filenames) {
        inputFilenames = filenames;
    }

    public void setInputFilenames(final File... files) {
        inputFilenames = new String[files.length];
        int i = 0;
        for (final File in : files) {
            inputFilenames[i++] = in.getPath();
        }
    }

    /**
     * Set the number of sequences per output file.
     *
     * @param sequencePerOutput number of sequences per output file.
     */
    public void setSequencePerOutput(final int sequencePerOutput) {
        this.sequencePerOutput = sequencePerOutput;
    }

    /**
     * Returns the set of filenames where data was output.
     *
     * @return The list of filenames that were created as a result of the reformat operation
     */
    public String[] getOutputFilenames() {
        return outputFilenames.toArray(new String[outputFilenames.size()]);
    }

    public void setOutputFile(final String output) {
        outputFile = output;
    }

    /**
     * Set the end position. This will stop copying records
     * ending at the endPosition. If endPosition is in the
     * middle of a record, this will copy to the end of that record.
     *
     * @param endPosition the start position
     */
    public void setEndPosition(final long endPosition) {
        assert endPosition >= 0 : "End position must not be negative";
        this.endPosition = endPosition;
        hasStartOrEndPosition = true;
    }

    /**
     * Get the end position. This will stop copying records
     * ending at the endPosition. If endPosition is in the
     * middle of a record, this will copy to the end of that record.
     *
     * @return the start position
     */
    public long getEndPosition() {
        return endPosition;
    }

    /**
     * Set the start position. This will start copying records
     * starting at the startPosition. If startPosition is in the
     * middle of a record, this will advance to the start of the
     * next record.
     *
     * @param startPosition the start position
     */
    public void setStartPosition(final long startPosition) {
        assert startPosition >= 0 : "Start position must not be negative";
        this.startPosition = startPosition;
        hasStartOrEndPosition = true;
    }

    /**
     * Get the start position. This will start copying records
     * starting at the startPosition. If startPosition is in the
     * middle of a record, this will advance to the start of the
     * next record.
     *
     * @return the start position
     */
    public long getStartPosition() {
        return startPosition;
    }

    public int getMinReadLength() {
        return minReadLength;
    }

    public void setMinReadLength(final int minReadLength) {
        this.minReadLength = minReadLength;
    }

    public int getMaxReadLength() {
        return maxReadLength;
    }

    public void setMaxReadLength(final int maxReadLength) {
        this.maxReadLength = maxReadLength;
    }

    public int getTrimReadLength() {
        return trimReadLength;
    }

    public void setTrimReadLength(final int trimReadLength) {
        this.trimReadLength = trimReadLength;
    }

    public int getSequencePerChunk() {
        return sequencePerChunk;
    }

    public void setSequencePerChunk(final int sequencePerChunk) {
        this.sequencePerChunk = sequencePerChunk;
    }

    public static void main(final String[] args) throws IOException, JSAPException {
        new ReformatCompactReadsMode().configure(args).execute();
    }

    public void setWriter(final ReadsWriter writer) {
        this.writer = writer;
    }

    public void setTrimStart(final int startTrim) {
        this.trimReadStartLength = startTrim;
    }
}