fi.tkk.ics.hadoop.bam.BAMRecordReader.java Source code

Introduction

Here is the source code for fi.tkk.ics.hadoop.bam.BAMRecordReader.java, Hadoop-BAM's RecordReader that decodes SAMRecords from a BGZF-compressed BAM split and keys each record by reference index and alignment position.

Source

// Copyright (c) 2010 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

// File created: 2010-08-09 14:34:08

package fi.tkk.ics.hadoop.bam;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import net.sf.samtools.BAMRecordCodec;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.BlockCompressedInputStream;

import fi.tkk.ics.hadoop.bam.util.MurmurHash3;
import fi.tkk.ics.hadoop.bam.util.SAMHeaderReader;
import fi.tkk.ics.hadoop.bam.util.WrapSeekable;

import hbparquet.hadoop.util.ContextUtil;

/** The key is the bitwise OR of the reference sequence ID in the upper 32 bits
 * and the 0-based leftmost coordinate in the lower.
 */
public class BAMRecordReader extends RecordReader<LongWritable, SAMRecordWritable> {
    private final LongWritable key = new LongWritable();
    private final SAMRecordWritable record = new SAMRecordWritable();

    private SAMFileReader.ValidationStringency stringency;

    private BlockCompressedInputStream bci;
    private BAMRecordCodec codec;
    private long fileStart, virtualEnd;
    private boolean isInitialized = false;

    /** Note: this is the only getKey function that handles unmapped reads
     * specially!
     */
    public static long getKey(final SAMRecord rec) {
        final int refIdx = rec.getReferenceIndex();
        final int start = rec.getAlignmentStart();

        if (!(rec.getReadUnmappedFlag() || refIdx < 0 || start < 0))
            return getKey(refIdx, start);

        // Put unmapped reads at the end, but don't give them all the exact same
        // key so that they can be distributed to different reducers.
        //
        // A random number would probably be best, but to ensure that the same
        // record always gets the same key we use a fast hash instead.
        //
        // We avoid using hashCode(), because it's not guaranteed to have the
        // same value across different processes.

        int hash = 0;
        byte[] var;
        if ((var = rec.getVariableBinaryRepresentation()) != null) {
            // Undecoded BAM record: just hash its raw data.
            hash = (int) MurmurHash3.murmurhash3(var, hash);
        } else {
            // Decoded BAM record or any SAM record: hash a few representative
            // fields together.
            hash = (int) MurmurHash3.murmurhash3(rec.getReadName(), hash);
            hash = (int) MurmurHash3.murmurhash3(rec.getReadBases(), hash);
            hash = (int) MurmurHash3.murmurhash3(rec.getBaseQualities(), hash);
            hash = (int) MurmurHash3.murmurhash3(rec.getCigarString(), hash);
        }
        return getKey0(Integer.MAX_VALUE, hash);
    }

    /** @param alignmentStart 1-based leftmost coordinate. */
    public static long getKey(int refIdx, int alignmentStart) {
        return getKey0(refIdx, alignmentStart - 1);
    }

    /** @param alignmentStart0 0-based leftmost coordinate. */
    public static long getKey0(int refIdx, int alignmentStart0) {
        // Mask the coordinate to 32 bits so a negative int (e.g. the hash used
        // for unmapped reads above) doesn't sign-extend over the reference
        // bits and corrupt the key.
        return (long) refIdx << 32 | (alignmentStart0 & 0xffffffffL);
    }

    @Override
    public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
        // This method should only be called once (see the Hadoop API), but
        // implementations disagree: some callers invoke initialize()
        // themselves, while Hadoop-BAM's own code relies on
        // {@link BAMInputFormat} to call it when the reader is created.
        // Guard against double initialization for the time being.
        if (isInitialized)
            close();
        isInitialized = true;

        final Configuration conf = ContextUtil.getConfiguration(ctx);

        final FileVirtualSplit split = (FileVirtualSplit) spl;
        final Path file = split.getPath();
        final FileSystem fs = file.getFileSystem(conf);

        this.stringency = SAMHeaderReader.getValidationStringency(conf);

        final FSDataInputStream in = fs.open(file);

        codec = new BAMRecordCodec(SAMHeaderReader.readSAMHeaderFrom(in, conf));

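        // readSAMHeaderFrom() consumed the header bytes from the stream, so
        // rewind to the start before wrapping it for block-compressed reads.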
        in.seek(0);
        bci = new BlockCompressedInputStream(
                new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file));

        final long virtualStart = split.getStartVirtualOffset();

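        // A BGZF virtual offset packs the byte offset of the compressed block
        // into the upper 48 bits and the offset within the uncompressed block
        // into the lower 16, so ">>> 16" recovers the plain file position and
        // "& 0xffff" the intra-block offset.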
        fileStart = virtualStart >>> 16;
        virtualEnd = split.getEndVirtualOffset();

        bci.seek(virtualStart);
        codec.setInputStream(bci);

        if (BAMInputFormat.DEBUG_BAM_SPLITTER) {
            final long recordStart = virtualStart & 0xffff;
            System.err.println(
                    "XXX initialized BAMRecordReader byte offset: " + fileStart + " record offset: " + recordStart);
        }
    }

    @Override
    public void close() throws IOException {
        bci.close();
    }

    /** Unless the end has been reached, this only takes file position into
     * account, not the position within the block.
     */
    @Override
    public float getProgress() {
        final long virtPos = bci.getFilePointer();
        final long filePos = virtPos >>> 16;
        if (virtPos >= virtualEnd)
            return 1;
        else {
            final long fileEnd = virtualEnd >>> 16;
            // Add 1 to the denominator to make sure it doesn't reach 1 here when
            // filePos == fileEnd.
            return (float) (filePos - fileStart) / (fileEnd - fileStart + 1);
        }
    }

    @Override
    public LongWritable getCurrentKey() {
        return key;
    }

    @Override
    public SAMRecordWritable getCurrentValue() {
        return record;
    }

    @Override
    public boolean nextKeyValue() {
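        // Stop once the reader's virtual position reaches the end of this
        // split; records beyond it belong to the next split's reader.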
        if (bci.getFilePointer() >= virtualEnd)
            return false;

        final SAMRecord r = codec.decode();
        if (r == null)
            return false;

        // Since we're reading from a BAMRecordCodec directly we have to set the
        // validation stringency ourselves.
        if (this.stringency != null)
            r.setValidationStringency(this.stringency);

        key.set(getKey(r));
        record.set(r);
        return true;
    }
}
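
As a quick illustration of the key layout described in the class Javadoc, here is a minimal sketch (hypothetical, not part of Hadoop-BAM; it assumes only the static getKey0 helper from the listing above on the classpath) that packs a reference index and a 0-based coordinate into a key and unpacks them again:

import fi.tkk.ics.hadoop.bam.BAMRecordReader;

public class KeyLayoutDemo {
    public static void main(String[] args) {
        // Pack reference index 3 and 0-based coordinate 999 into one long.
        final long key = BAMRecordReader.getKey0(3, 999);

        // The upper 32 bits hold the reference index and the lower 32 the
        // coordinate, so keys sort by reference first, position second --
        // exactly what a MapReduce sort-by-key needs for coordinate order.
        System.out.println("refIdx = " + (int) (key >>> 32)); // prints 3
        System.out.println("start0 = " + (int) key);          // prints 999
    }
}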