it.crs4.seal.prq.PairReadsQSeqReducer.java Source code

Introduction

Here is the source code for it.crs4.seal.prq.PairReadsQSeqReducer.java
Source

// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal.  If not, see <http://www.gnu.org/licenses/>.

package it.crs4.seal.prq;

import it.crs4.seal.common.IMRContext;
import it.crs4.seal.common.ReadPair;
import it.crs4.seal.common.SequenceId;
import it.crs4.seal.common.WritableMapping;

import org.apache.hadoop.io.Text;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.IOException;
import java.nio.ByteBuffer;

public class PairReadsQSeqReducer {
    private static final Log LOG = LogFactory.getLog(PairReadsQSeqReducer.class);

    private Text outputKey = new Text();
    private ReadPair outputValue = new ReadPair();

    private ByteBuffer[] sequence;
    private ByteBuffer[] quality;
    private WritableMapping[] mapping;

    private int minBasesThreshold = 0;
    private boolean dropFailedFilter = true;
    private boolean warnOnlyIfUnpaired = false;
    private int nReadsPerTemplate = 2;

    private static final byte[] delimByte = { 9 }; // tab character
    private static final String delim = "\t";
    private static final char UnknownBase = 'N';
    private static final int INIT_BUF_SIZE = 150;

    public static enum ReadCounters {
        NotEnoughBases, FailedFilter, Unpaired, Dropped
    }

    public void setMinBasesThreshold(int v) {
        minBasesThreshold = v;
    }

    public void setDropFailedFilter(boolean v) {
        dropFailedFilter = v;
    }

    public void setWarnOnlyIfUnpaired(boolean v) {
        warnOnlyIfUnpaired = v;
    }

    public void setNumReadsPerTemplate(int v) {
        if (v <= 0)
            throw new IllegalArgumentException("number of reads per template must be 0 or 1");
        if (v > 2)
            throw new UnsupportedOperationException("number of reads per template must be 0 or 1");
        nReadsPerTemplate = v;
    }

    public void setup(IMRContext<Text, ReadPair> context) {
        sequence = new ByteBuffer[nReadsPerTemplate];
        quality = new ByteBuffer[nReadsPerTemplate];
        mapping = new WritableMapping[nReadsPerTemplate];

        for (int i = 0; i < sequence.length; ++i) {
            sequence[i] = ByteBuffer.allocate(INIT_BUF_SIZE);
            sequence[i].limit(0).position(0);
            quality[i] = ByteBuffer.allocate(INIT_BUF_SIZE);
            quality[i].limit(0).position(0);
            mapping[i] = new WritableMapping();
        }

        // create counters with a value of 0.
        context.increment(ReadCounters.NotEnoughBases, 0);
        context.increment(ReadCounters.FailedFilter, 0);
        context.increment(ReadCounters.Dropped, 0);

        LOG.debug("Set up prq reducer for templates with " + nReadsPerTemplate + " reads");
    }

    public void reduce(SequenceId key, Iterable<Text> values, IMRContext<Text, ReadPair> context)
            throws IOException, InterruptedException {
        outputKey.set(key.getLocation());
        outputValue.clear();

        int nReads = 0;
        int nBadReads = 0;
        for (Text read : values) {
            ++nReads;
            if (nReads > nReadsPerTemplate)
                throw new RuntimeException("got more than " + nReadsPerTemplate + " reads for sequence key " + key
                        + ". Record: " + read);

            int[] fieldsPos = findFields(read);
            // filtered read?
            // If dropFailedFilter is false it shortcuts the test and sets filterPassed directly to true.
            // If it's true then we check whether the field is equal to '1'
            boolean filterPassed = !dropFailedFilter || read.getBytes()[fieldsPos[2]] == (byte) '1';

            if (!filterPassed) {
                context.increment(ReadCounters.FailedFilter, 1);
                ++nBadReads;
            } else if (!checkReadQuality(read, fieldsPos)) {
                context.increment(ReadCounters.NotEnoughBases, 1);
                ++nBadReads;
            }

            // In here we do all the work to prepare the read for output.  It will be written to the
            // appropriate WritableMapping, which will in turn be inserted into the ReadPair outputValue.
            prepMapping(read.getBytes(), fieldsPos, nReads - 1);
        }

        if (nReads < nReadsPerTemplate) {
            context.increment(ReadCounters.Unpaired, nReads);
            String msg = String.format("Too few reads for template! (found %s). Key: %s", nReads, key);
            if (warnOnlyIfUnpaired)
                LOG.warn(msg);
            else
                throw new RuntimeException(msg + "\nread: " + outputValue.toString());
        }
        // nReads can't be > nReadsPerTemplate since that should be caught in the loop above.

        // If they're a complete template and they're not all bad write. Unpaired are dropped
        if (nReads == nReadsPerTemplate && nBadReads < nReads)
            context.write(outputKey, outputValue);
        else
            context.increment(ReadCounters.Dropped, nReads);

        context.progress();
    }

    /**
     * Set a mapping from the mapper-serialized data.
     *
     * @param data The byte array from the Text object where the data was serialized by the mapper.
     * @param fieldPositions Array of positions indexing the byte array, as produced by findFields.
     * @param index Index of the Mapping we're creating, whether 0 or 1.  We use this to index into
     * the sequence, quality and mapping arrays and to set the read number in the WritableMapping.
     *
     * @post The WritableMapping in mapping[index] will be reset with the data from the input byte
     * array data.  It will use the ByteBuffers in sequence and quality to store its
     * sequence and base quality data.  The reset mapping will be set as read 1 or 2 of the
     * outputValue ReadPair.
     */
    private void prepMapping(byte[] data, int[] fieldPositions, int index) {
        WritableMapping map = mapping[index];
        int length = fieldPositions[1] - 1;
        ensureCapacity(length);

        map.clear();

        // ByteBuffer.clear() resets position to 0 and limit to capacity()
        sequence[index].clear();
        sequence[index].put(data, 0, length).rewind().mark().limit(length);
        map.setSequence(sequence[index]);

        quality[index].clear();
        quality[index].put(data, fieldPositions[1], length).rewind().mark().limit(length);
        map.setBaseQualities(quality[index]);

        if (index == 0) {
            map.setIsRead1(true);
            outputValue.setRead1(map);
        } else if (index == 1) {
            map.setIsRead2(true);
            // set both reads as paired
            mapping[0].setIsPaired(true);
            map.setIsPaired(true);
            outputValue.setRead2(map);
        } else
            throw new UnsupportedOperationException(
                    "working with more than two reads per template is not supported");
    }

    private void ensureCapacity(int required) {
        int current = sequence[0].capacity();
        if (required > current) {
            int newSize = required * 2;
            // grow
            for (int i = 0; i < sequence.length; ++i) {
                ByteBuffer temp;
                // allocate and copy sequence first
                temp = ByteBuffer.allocate(newSize);
                temp.put(sequence[i]).rewind().mark().limit(sequence[i].limit());
                sequence[i] = temp;
                if (mapping[i].getSequence() != null)
                    mapping[i].setSequence(sequence[i]);

                // repeat for quality
                temp = ByteBuffer.allocate(newSize);
                temp.put(quality[i]).rewind().mark().limit(quality[i].limit());
                quality[i] = temp;
                if (mapping[i].getBaseQualities() != null)
                    mapping[i].setBaseQualities(quality[i]);
            }
        }
    }

    // read format:
    //             read1 <tab> quality1 <tab> filter flag
    // field idx   0           1              2
    private int[] findFields(Text read) {
        int[] fieldsPos = new int[3];
        fieldsPos[0] = 0;

        for (int i = 1; i <= 2; ++i) {
            fieldsPos[i] = read.find(delim, fieldsPos[i - 1]) + 1; // +1 since we get the position of the delimiter
            if (fieldsPos[i] <= 0)
                throw new RuntimeException("invalid read/quality format: " + read.toString());
        }

        int seqLength = fieldsPos[1] - 1;
        int qualLength = fieldsPos[2] - fieldsPos[1] - 1;
        if (seqLength != qualLength)
            throw new RuntimeException(
                    "sequence and quality lengths don't match! (got " + seqLength + " and " + qualLength + ")");

        return fieldsPos;
    }

    /**
     * Verify whether a read satisfies quality standards.
     * For now this method verifies whether the read has at least
     * minBasesThreshold known bases (ignoring unknown bases N).
     */
    protected boolean checkReadQuality(Text read, int[] fieldsPos) {
        /* The read's delimiter is at the bytes before the second field starts */
        int readEnd = fieldsPos[1] - 1;

        // The condition is "min number of valid bases".  However, we consider
        // the inverse condition "max number of unknowns".
        // readEnd is also the length of the read fragment
        // readEnd - minBasesThreshold gives us the maximum number of unknowns acceptable.
        int nAcceptableUnknowns = readEnd - minBasesThreshold;

        if (nAcceptableUnknowns < 0) // the fragment is shorter than minBasesThreshold
            return false;

        int nUnknownBases = 0;
        byte[] data = read.getBytes(); // we can work directly in bytes as long as we only have ASCII characters
        for (int pos = 0; pos < readEnd; ++pos) {
            if (data[pos] == UnknownBase) {
                ++nUnknownBases;
                if (nUnknownBases > nAcceptableUnknowns)
                    return false;
            }
        }
        return true;
    }
}