Source code for it.crs4.seal.recab.RecabTableMapper.java

Java tutorial

Introduction

Here is the source code for it.crs4.seal.recab.RecabTableMapper.java

Source

// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal.  If not, see <http://www.gnu.org/licenses/>.

package it.crs4.seal.recab;

import it.crs4.seal.common.IMRContext;
import it.crs4.seal.common.AbstractTaggedMapping;
import it.crs4.seal.common.ReadPair;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Mapper logic for building a base quality recalibration table.
 *
 * For each mapping that passes the read filters, this class walks the read's
 * bases (optionally trimming bases that run past the insert into the adaptor),
 * skips bad bases and known variant sites, and for every remaining base emits
 * a record keyed by the concatenated covariate values (read group, quality,
 * cycle, dinucleotide) with an {@link ObservationCount} value of
 * (1 observation, 0 or 1 mismatches).
 *
 * Not thread-safe: the {@code key}/{@code value} output objects and the
 * per-read scratch lists are reused across calls (standard Hadoop
 * object-reuse pattern).
 */
public class RecabTableMapper {
    private static final Log LOG = LogFactory.getLog(RecabTableMapper.class);

    /** Config key to disable skipping of known variant sites (mainly for testing). */
    public static final String CONF_SKIP_KNOWN_VAR_SITES = "seal.recab.skip-known-variant-sites"; // mainly for testing

    /** Sanger FASTQ quality encoding offset: phred score = ASCII value - 33. */
    private static final byte SANGER_OFFSET = 33;

    /** Per-base counters reported through the MR context. */
    public enum BaseCounters {
        All, Used, BadBases, VariantMismatches, VariantBases, NonVariantMismatches, AdaptorBasesTrimmed
    }

    /** Per-read counters reported through the MR context. */
    public enum ReadCounters {
        Processed, FilteredTotal, FilteredUnmapped, FilteredMapQ, FilteredDuplicate, FilteredQC, FilteredSecondaryAlignment
    }

    /** Table of known variant sites; bases at these positions are not used for recalibration. */
    private VariantTable snps;

    private AbstractTaggedMapping currentMapping;
    // Scratch buffers recomputed for each mapping (reused to avoid per-read allocation).
    private ArrayList<Integer> referenceCoordinates;
    private ArrayList<Boolean> referenceMatches;
    private ArrayList<Covariate> covariateList;
    private IMRContext<Text, ObservationCount> context;
    private boolean skipKnownVariantPositions = true;

    // Reused Hadoop output objects (object-reuse pattern).
    private Text key = new Text();
    private ObservationCount value = new ObservationCount();

    /**
     * Initializes the mapper: loads the known-variant table, builds the
     * covariate list, zeroes all counters and reads configuration.
     *
     * @param reader  source of known variant sites to load into the table
     * @param context MR context used for counters and output
     * @param conf    job configuration (covariate setup, skip-sites flag)
     * @throws IOException if loading the variant table fails
     */
    public void setup(VariantReader reader, IMRContext<Text, ObservationCount> context, Configuration conf)
            throws IOException {
        this.context = context;
        snps = new ArrayVariantTable();
        LOG.info("Using " + snps.getClass().getName() + " snp table implementation.");
        LOG.info("loading known variation sites.");
        snps.load(reader);
        if (LOG.isInfoEnabled())
            LOG.info("loaded " + snps.size() + " known variation sites.");

        referenceCoordinates = new ArrayList<Integer>(200);
        referenceMatches = new ArrayList<Boolean>(200);

        // TODO:  make it configurable
        covariateList = new ArrayList<Covariate>(5);
        covariateList.add(new ReadGroupCovariate(conf));
        covariateList.add(new QualityCovariate());
        covariateList.add(new CycleCovariate());
        covariateList.add(new DinucCovariate());

        // Touch every counter with 0 so they all appear in the job output
        // even if never incremented.
        for (BaseCounters c : BaseCounters.values())
            context.increment(c, 0);

        for (ReadCounters c : ReadCounters.values())
            context.increment(c, 0);

        skipKnownVariantPositions = conf.getBoolean(CONF_SKIP_KNOWN_VAR_SITES, true);
        if (!skipKnownVariantPositions)
            LOG.warn("Not skipping known variant sites.  This is not recommended for regular usage.");
    }

    /**
     * Applies the read-level filters, incrementing the counter for the first
     * filter that rejects the mapping (plus {@code FilteredTotal}).
     *
     * Rejects unmapped reads, reads with MAPQ 0 or 255 (255 = "unavailable"
     * per the SAM spec), duplicates, QC failures and secondary alignments.
     *
     * @return true if the mapping should be excluded from recalibration
     */
    protected boolean readFailsFilters(AbstractTaggedMapping map) {
        boolean fails = false;
        if (map.isUnmapped()) {
            context.increment(ReadCounters.FilteredUnmapped, 1);
            fails = true;
        } else if (map.getMapQ() == 0 || map.getMapQ() == 255) {
            context.increment(ReadCounters.FilteredMapQ, 1);
            fails = true;
        } else if (map.isDuplicate()) {
            context.increment(ReadCounters.FilteredDuplicate, 1);
            fails = true;
        } else if (map.isFailedQC()) {
            context.increment(ReadCounters.FilteredQC, 1);
            fails = true;
        } else if (map.isSecondaryAlign()) {
            context.increment(ReadCounters.FilteredSecondaryAlignment, 1);
            fails = true;
        }

        if (fails)
            context.increment(ReadCounters.FilteredTotal, 1);

        return fails;
    }

    /**
     * Map entry point: processes each mapping in the pair independently.
     */
    public void map(LongWritable ignored, ReadPair pair, IMRContext<Text, ObservationCount> context)
            throws IOException, InterruptedException {
        for (AbstractTaggedMapping mapping : pair)
            processMapping(mapping, context);
    }

    /**
     * Processes one mapping: filters, adaptor trimming, then one output
     * record per usable base (key = covariate values joined by the table
     * delimiter, value = (1, mismatch ? 1 : 0)).
     *
     * @throws IOException          on write failure
     * @throws InterruptedException if the MR framework interrupts the write
     */
    protected void processMapping(AbstractTaggedMapping currentMapping, IMRContext<Text, ObservationCount> context)
            throws IOException, InterruptedException {
        context.increment(ReadCounters.Processed, 1);
        context.increment(BaseCounters.All, currentMapping.getLength());

        if (readFailsFilters(currentMapping))
            return;

        final String contig = currentMapping.getContig();

        int left, right; // left and right "limits" within the current sequence
        if (currentMapping.isTemplateLengthAvailable()
                && currentMapping.getTemplateLength() < currentMapping.getLength()) {
            // Insert size is less than the read length, so we've sequenced past the
            // insert into the read adaptor.  Trim the last
            // read_length - insert_size bases sequenced.
            // NOTE(review): assumes getTemplateLength() is non-negative here
            // (SAM TLEN can be negative for the rightmost mate) — verify in
            // AbstractTaggedMapping.
            if (currentMapping.isOnReverse()) {
                // trim from front
                left = currentMapping.getLength() - currentMapping.getTemplateLength();
                right = currentMapping.getLength();
            } else {
                // trim from the back
                left = 0;
                right = currentMapping.getTemplateLength();
            }

            context.increment(BaseCounters.AdaptorBasesTrimmed, currentMapping.getLength() - (right - left));
        } else {
            left = 0;
            right = currentMapping.getLength();
        }

        // Per-base reference coordinate and match/mismatch flags for this read.
        currentMapping.calculateReferenceCoordinates(referenceCoordinates);
        currentMapping.calculateReferenceMatches(referenceMatches);

        // Let each covariate precompute its per-read state.
        for (Covariate cov : covariateList)
            cov.applyToMapping(currentMapping);

        final ByteBuffer seq = currentMapping.getSequence();
        final ByteBuffer qual = currentMapping.getBaseQualities();

        if (left > 0) {
            // we're using a relative get() method in the loop below, so here we
            // advance the buffer position to our starting point.
            seq.position(seq.position() + left);
            qual.position(qual.position() + left);
        }

        for (int i = left; i < right; ++i) {
            byte base = seq.get();
            byte quality = qual.get();

            // quality <= SANGER_OFFSET means phred score 0: no information.
            if (base == 'N' || quality <= SANGER_OFFSET)
                context.increment(BaseCounters.BadBases, 1);
            else {
                int pos = referenceCoordinates.get(i);
                // pos > 0 marks a valid reference position; presumably
                // insertions/clipped bases get a non-positive sentinel — see
                // calculateReferenceCoordinates.
                if (pos > 0) {
                    // is it a known variation site?
                    if (skipKnownVariantPositions && snps.isVariantLocation(contig, pos)) {
                        context.increment(BaseCounters.VariantBases, 1);

                        if (!referenceMatches.get(i))
                            context.increment(BaseCounters.VariantMismatches, 1);
                    } else {
                        // use this base
                        context.increment(BaseCounters.Used, 1);

                        // Build the key: covariate values each followed by the
                        // table delimiter.
                        key.clear();
                        for (Covariate cov : covariateList) {
                            String tmp = cov.getValue(i);
                            key.append(tmp.getBytes(RecabTable.ASCII), 0, tmp.length());
                            key.append(RecabTable.TableDelimBytes, 0, RecabTable.TableDelimBytes.length);
                        }

                        boolean match = referenceMatches.get(i);
                        if (match) {
                            value.set(1, 0); // (num observations, num mismatches)
                        } else { // mismatch
                            context.increment(BaseCounters.NonVariantMismatches, 1);
                            value.set(1, 1);
                        }

                        context.write(key, value);
                    }
                }
            }
        }
    }
}