gov.jgi.meta.hadoop.input.FastqLineReader.java Source code

Introduction

Here is the source code for gov.jgi.meta.hadoop.input.FastqLineReader.java
Source

/*
 * Copyright (c) 2010, The Regents of the University of California, through Lawrence Berkeley
 * National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy).
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided
 * that the following conditions are met:
 *
 * (1) Redistributions of source code must retain the above copyright notice, this list of conditions and the
 * following disclaimer.
 *
 * (2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions
 * and the following disclaimer in the documentation and/or other materials provided with the distribution.
 *
 * (3) Neither the name of the University of California, Lawrence Berkeley National Laboratory, U.S. Dept.
 * of Energy, nor the names of its contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades to the
 * features, functionality or performance of the source code ("Enhancements") to anyone; however,
 * if you choose to make your Enhancements available either publicly, or directly to Lawrence Berkeley
 * National Laboratory, without imposing a separate written license agreement for such Enhancements,
 * then you hereby grant the following license: a  non-exclusive, royalty-free perpetual license to install,
 * use, modify, prepare derivative works, incorporate into other computer software, distribute, and
 * sublicense such enhancements or derivative works thereof, in binary and source code form.
 */

package gov.jgi.meta.hadoop.input;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

/**
 * Reads FASTQ records from an input stream, one record per call.
 *
 * <p>Despite its name (inherited from Hadoop's line reader), each call to
 * {@code readLine} consumes one whole FASTQ record: the header line starting
 * with {@code '@'}, the sequence line(s), the {@code '+'} line and the quality
 * line(s). Only the read identifier and the sequence are handed back to the
 * caller; the quality information is consumed and discarded.
 *
 * <p>Record boundaries are detected heuristically: the next record starts at
 * the first {@code '@'} that sits at the beginning of a line after at least
 * four line terminators have been seen. This is exact for the usual four-line
 * layout; files that wrap sequence/quality over several lines may still be
 * split incorrectly when a wrapped quality line begins with {@code '@'}.
 *
 * <p>Instances keep mutable buffer state and perform no synchronization.
 */
public class FastqLineReader {
    private static final Log LOG = LogFactory.getLog(FastqLineReader.class);
    private static final int DEFAULT_BUFFER_SIZE = 64 * 1024;

    private static final byte CR = '\r';
    private static final byte LF = '\n';
    // first byte of a FASTQ header line
    private static final byte RECORD_START = '@';
    // first byte of the line separating the sequence from the qualities
    private static final byte QUALITY_HEADER = '+';
    // header, sequence, '+' and quality: a record has at least this many lines
    private static final int MIN_LINES_PER_RECORD = 4;
    // input used by main() when no path is given on the command line
    private static final String DEFAULT_TEST_FILE = "/ifs/scratch/karan/derep-perf/HiSeq-8343080.fq";

    private final InputStream in;
    private final byte[] buffer;

    // the number of bytes of real data in the buffer (<= 0 once EOF was hit)
    private int bufferLength = 0;
    // the current position in the buffer
    private int bufferPosn = 0;

    /**
     * Create a reader that reads from the given stream using the
     * default buffer-size (64k).
     *
     * @param in The input stream
     */
    public FastqLineReader(InputStream in) {
        this(in, DEFAULT_BUFFER_SIZE);
    }

    /**
     * Create a reader that reads from the given stream using the
     * given buffer-size.
     *
     * @param in         The input stream
     * @param bufferSize Size of the read buffer in bytes; must be positive
     * @throws IllegalArgumentException if {@code bufferSize} is not positive
     *                                  (a zero-length buffer would make every
     *                                  read look like end-of-file)
     */
    public FastqLineReader(InputStream in, int bufferSize) {
        if (bufferSize <= 0) {
            throw new IllegalArgumentException("bufferSize must be positive, got " + bufferSize);
        }
        this.in = in;
        this.buffer = new byte[bufferSize];
    }

    /**
     * Create a reader that reads from the given stream using the
     * <code>io.file.buffer.size</code> specified in the given
     * <code>Configuration</code> (64k if the property is not set).
     *
     * @param in   input stream
     * @param conf configuration
     * @throws java.io.IOException declared for source compatibility with
     *                             existing callers; not thrown by this
     *                             constructor's own code
     */
    public FastqLineReader(InputStream in, Configuration conf) throws IOException {
        this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE));
    }

    /**
     * Close the underlying stream.
     *
     * @throws java.io.IOException if closing the stream fails
     */
    public void close() throws IOException {
        in.close();
    }

    /**
     * Read the next FASTQ record from the InputStream.
     *
     * <p>Bytes in front of the next {@code '@'} are skipped. Both output
     * objects are cleared first, so after a call that returns 0 they are
     * empty rather than holding the previous record. Lines may be terminated
     * by '\n' (LF), '\r' (CR) or '\r\n' (CR+LF); EOF also terminates an
     * otherwise unterminated record.
     *
     * @param key               receives the read identifier: the first
     *                          whitespace-delimited word of the header line,
     *                          without the leading '@'
     * @param str               receives the sequence, with line terminators
     *                          removed (wrapped sequence lines are joined)
     * @param maxLineLength     accepted for signature compatibility; this
     *                          implementation does not enforce it
     * @param maxBytesToConsume accepted for signature compatibility; this
     *                          implementation does not enforce it
     * @return the number of bytes consumed from the stream by this call
     *         (skipped bytes plus the whole record, including its line
     *         terminators), or 0 if no further record start was found
     *         before end-of-file
     * @throws java.io.IOException if the underlying stream throws
     */
    public int readLine(Text key, Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
        key.clear();
        str.clear();

        int skipped = skipToRecordStart();
        if (skipped < 0) {
            // no '@' before EOF: report "nothing read" so callers stop instead
            // of treating the skipped bytes as a record
            return 0;
        }

        Text recordBlock = new Text();
        int recordBytes = readRecordBlock(recordBlock);
        parseRecord(recordBlock, key, str);
        return skipped + recordBytes;
    }

    /**
     * Read the next FASTQ record from the InputStream.
     *
     * @param key           receives the read identifier
     * @param str           receives the sequence
     * @param maxLineLength passed through; currently not enforced
     * @return the number of bytes consumed, or 0 if no record was found
     * @throws java.io.IOException if the underlying stream throws
     */
    public int readLine(Text key, Text str, int maxLineLength) throws IOException {
        return readLine(key, str, maxLineLength, Integer.MAX_VALUE);
    }

    /**
     * Read the next FASTQ record from the InputStream.
     *
     * @param key receives the read identifier
     * @param str receives the sequence
     * @return the number of bytes consumed, or 0 if no record was found
     * @throws java.io.IOException if the underlying stream throws
     */
    public int readLine(Text key, Text str) throws IOException {
        return readLine(key, str, Integer.MAX_VALUE, Integer.MAX_VALUE);
    }

    /**
     * Refill the buffer from the stream and rewind the buffer position.
     *
     * @return true if at least one byte was read, false at end-of-file
     */
    private boolean fillBuffer() throws IOException {
        bufferPosn = 0;
        bufferLength = in.read(buffer);
        return bufferLength > 0;
    }

    /**
     * Advance until the buffer position points at a '@'.
     *
     * @return the number of bytes skipped, or -1 if EOF was reached first
     */
    private int skipToRecordStart() throws IOException {
        // counted byte by byte so the total stays right across buffer refills
        int skipped = 0;
        while (true) {
            if (bufferPosn >= bufferLength && !fillBuffer()) {
                return -1;
            }
            if (buffer[bufferPosn] == RECORD_START) {
                return skipped;
            }
            bufferPosn++;
            skipped++;
        }
    }

    /**
     * Copy the raw bytes of the record starting at the current position
     * (which must point at its '@') into recordBlock, stopping in front of
     * the next record's '@' or at EOF.
     *
     * @return the number of bytes copied, i.e. consumed from the stream
     */
    private int readRecordBlock(Text recordBlock) throws IOException {
        int chunkStart = bufferPosn;
        bufferPosn++; // step over this record's own '@'

        int lineBreaks = 0;
        boolean atLineStart = false;
        boolean afterCr = false;
        while (true) {
            if (bufferPosn >= bufferLength) {
                // save what we have before the buffer is overwritten
                recordBlock.append(buffer, chunkStart, bufferPosn - chunkStart);
                chunkStart = 0;
                if (!fillBuffer()) {
                    return recordBlock.getLength(); // EOF ends the record
                }
            }

            byte b = buffer[bufferPosn];
            // '@' is also a valid quality character, so only accept it as a
            // record start at the beginning of a line once a full record's
            // worth of lines has gone by
            if (b == RECORD_START && atLineStart && lineBreaks >= MIN_LINES_PER_RECORD) {
                break;
            }

            // count CR+LF as a single line terminator
            if (b == CR || (b == LF && !afterCr)) {
                lineBreaks++;
            }
            atLineStart = isLineBreak(b);
            afterCr = (b == CR);
            bufferPosn++;
        }

        // bufferPosn is left pointing at the next record's '@'
        recordBlock.append(buffer, chunkStart, bufferPosn - chunkStart);
        return recordBlock.getLength();
    }

    /**
     * Extract identifier and sequence from the raw bytes of one record.
     * Tolerates truncated records (missing lines or missing final line
     * terminator); absent parts are left empty.
     */
    private static void parseRecord(Text recordBlock, Text key, Text str) {
        byte[] bytes = recordBlock.getBytes();
        int end = recordBlock.getLength(); // the backing array may be longer
        int pos = 1; // skip initial record separator "@"

        // the key is the first word of the header line
        int keyStart = pos;
        while (pos < end && !isLineBreak(bytes[pos]) && bytes[pos] != ' ' && bytes[pos] != '\t') {
            pos++;
        }
        key.append(bytes, keyStart, pos - keyStart);

        // ignore any additional metadata on the header line
        while (pos < end && !isLineBreak(bytes[pos])) {
            pos++;
        }
        pos = skipLineBreaks(bytes, pos, end);

        // sequence lines run until the '+' line; everything from there on
        // (qualities) is not returned to the caller
        while (pos < end && bytes[pos] != QUALITY_HEADER) {
            int lineStart = pos;
            while (pos < end && !isLineBreak(bytes[pos])) {
                pos++;
            }
            str.append(bytes, lineStart, pos - lineStart);
            pos = skipLineBreaks(bytes, pos, end);
        }
    }

    /** Return the first index at or after pos that is not a CR or LF. */
    private static int skipLineBreaks(byte[] bytes, int pos, int end) {
        while (pos < end && isLineBreak(bytes[pos])) {
            pos++;
        }
        return pos;
    }

    /** True for CR and LF. */
    private static boolean isLineBreak(byte b) {
        return b == CR || b == LF;
    }

    /**
     * Small manual check: counts the records in a FASTQ file and prints the
     * total.
     *
     * @param args optional; args[0] is the file to read. Without arguments a
     *             built-in default path is used.
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_TEST_FILE;

        FastqLineReader fqlr = null;
        try {
            fqlr = new FastqLineReader(new FileInputStream(path));

            Text key = new Text();
            Text sequence = new Text();

            int total = 0;
            int bytes;
            while ((bytes = fqlr.readLine(key, sequence)) > 0) {
                total++;
            }
            System.out.println("total = " + total);
        } catch (Exception e) {
            System.out.println(e);
        } finally {
            if (fqlr != null) {
                try {
                    fqlr.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if close fails in this test driver
                }
            }
        }
    }
}