gov.jgi.meta.hadoop.input.FastaLineReader.java Source code

Introduction

Here is the source code for gov.jgi.meta.hadoop.input.FastaLineReader.java
Source

/*
 * Copyright (c) 2010, The Regents of the University of California, through Lawrence Berkeley
 * National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy).
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided
 * that the following conditions are met:
 *
 * (1) Redistributions of source code must retain the above copyright notice, this list of conditions and the
 * following disclaimer.
 *
 * (2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions
 * and the following disclaimer in the documentation and/or other materials provided with the distribution.
 *
 * (3) Neither the name of the University of California, Lawrence Berkeley National Laboratory, U.S. Dept.
 * of Energy, nor the names of its contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades to the
 * features, functionality or performance of the source code ("Enhancements") to anyone; however,
 * if you choose to make your Enhancements available either publicly, or directly to Lawrence Berkeley
 * National Laboratory, without imposing a separate written license agreement for such Enhancements,
 * then you hereby grant the following license: a  non-exclusive, royalty-free perpetual license to install,
 * use, modify, prepare derivative works, incorporate into other computer software, distribute, and
 * sublicense such enhancements or derivative works thereof, in binary and source code form.
 */

package gov.jgi.meta.hadoop.input;

import gov.jgi.meta.sequence.SequenceString;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import java.io.IOException;
import java.io.InputStream;

/**
 * A class that provides a line reader from an input stream.
 */
public class FastaLineReader {
    private static final Log LOG = LogFactory.getLog(FastaLineReader.class);
    private static final int DEFAULT_BUFFER_SIZE = 64 * 1024;
    private int bufferSize = DEFAULT_BUFFER_SIZE;
    private InputStream in;
    private byte[] buffer;
    // the number of bytes of real data in the buffer
    private int bufferLength = 0;
    // the current position in the buffer
    private int bufferPosn = 0;

    private static final byte CR = '\r';
    private static final byte LF = '\n';
    private static final byte seperator = '>';

    /**
     * Create a line reader that reads from the given stream using the
     * default buffer-size (64k).
     * @param in The input stream
     * @throws java.io.IOException
     */
    public FastaLineReader(InputStream in) {
        this(in, DEFAULT_BUFFER_SIZE);
    }

    /**
     * Create a line reader that reads from the given stream using the
     * given buffer-size.
     * @param in The input stream
     * @param bufferSize Size of the read buffer
     * @throws java.io.IOException
     */
    public FastaLineReader(InputStream in, int bufferSize) {
        this.in = in;
        this.bufferSize = bufferSize;
        this.buffer = new byte[this.bufferSize];
    }

    /**
     * Create a line reader that reads from the given stream using the
     * <code>io.file.buffer.size</code> specified in the given
     * <code>Configuration</code>.
     * @param in input stream
     * @param conf configuration
     * @throws java.io.IOException
     */
    public FastaLineReader(InputStream in, Configuration conf) throws IOException {
        this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE));
    }

    /**
     * Close the underlying stream.
     * @throws IOException
     */
    public void close() throws IOException {
        in.close();
    }

    /**
     * Read one line from the InputStream into the given Text.  A line
     * can be terminated by one of the following: '\n' (LF) , '\r' (CR),
     * or '\r\n' (CR+LF).  EOF also terminates an otherwise unterminated
     * line.
     *
     * @param str the object to store the given line (without newline)
     * @param maxLineLength the maximum number of bytes to store into str;
     *  the rest of the line is silently discarded.
     * @param maxBytesToConsume the maximum number of bytes to consume
     *  in this call.  This is only a hint, because if the line cross
     *  this threshold, we allow it to happen.  It can overshoot
     *  potentially by as much as one buffer length.
     *
     * @return the number of bytes read including the (longest) newline
     * found.
     *
     * @throws IOException if the underlying stream throws
     */
    public int readLine(Text key, Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
        int totalBytesRead = 0;
        int numRecordsRead = 0;
        Boolean eof = false;
        int startPosn;
        StringBuilder recordBlock = new StringBuilder(this.bufferSize);

        /*
        first thing to do is to move forward till you see a start character
         */
        startPosn = bufferPosn;
        do {
            if (bufferPosn >= bufferLength) {
                totalBytesRead += bufferPosn - startPosn;
                bufferPosn = 0;
                bufferLength = in.read(buffer);
                if (bufferLength <= 0) {
                    eof = true;
                    break; // EOF
                }
            }
        } while (buffer[bufferPosn++] != '>');

        /*
        if we hit the end of file already, then just return 0 bytes processed
         */
        if (eof)
            return totalBytesRead;

        /*
        now bufferPosn should be at the start of a fasta record
         */
        totalBytesRead += (bufferPosn - 1) - startPosn;
        startPosn = bufferPosn - 1; // startPosn guaranteed to be at a ">"

        /*
        find the next record start:  first scan to end of the line
         */
        eof = false;
        do {
            if (bufferPosn >= bufferLength) {

                /*
                copy the current buffer before refreshing the buffer
                 */
                int appendLength = bufferPosn - startPosn;
                for (int copyi = startPosn; copyi < startPosn + appendLength; copyi++) {
                    recordBlock.append((char) buffer[copyi]);
                }
                //recordBlock.append(buffer, startPosn, appendLength);
                totalBytesRead += appendLength;

                startPosn = bufferPosn = 0;
                bufferLength = in.read(buffer);
                if (bufferLength <= 0) {
                    eof = true;
                    break; // EOF
                }
            }
            bufferPosn++;
        } while (buffer[bufferPosn - 1] != CR && buffer[bufferPosn - 1] != LF);

        /*
        find the next record start:  scan till next ">"
         */
        do {
            if (bufferPosn >= bufferLength) {

                /*
                copy the current buffer before refreshing the buffer
                 */
                int appendLength = bufferPosn - startPosn;
                for (int copyi = startPosn; copyi < startPosn + appendLength; copyi++) {
                    recordBlock.append((char) buffer[copyi]);
                }
                //recordBlock.append(buffer, startPosn, appendLength);
                totalBytesRead += appendLength;

                startPosn = bufferPosn = 0;
                bufferLength = in.read(buffer);
                if (bufferLength <= 0) {
                    eof = true;
                    break; // EOF
                }
            }
        } while (buffer[bufferPosn++] != '>'); // only read one record at a time

        if (!eof) {
            bufferPosn--; // make sure we leave bufferPosn pointing to the next record
            int appendLength = bufferPosn - startPosn;
            for (int copyi = startPosn; copyi < startPosn + appendLength; copyi++) {
                recordBlock.append((char) buffer[copyi]);
            }
            //recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;
        }

        /*
        record block now has the byte array we want to process for reads
         */

        int i = 1; // skip initial record seperator ">"
        int j = 1;
        do {
            key.clear();
            str.clear();
            /*
            first parse the key
             */
            i = j;
            Boolean junkOnLine = false;
            while (j < recordBlock.length()) {
                int c = recordBlock.charAt(j++);
                if (c == CR || c == LF) {
                    break;
                } //else if (c == ' ' || c == '\t') {
                  //  junkOnLine = true;
                  //  break;
                  //}
            }
            if (j == i) {
                // then we didn't parse out a proper id
                LOG.error("Unable to parse entry: " + recordBlock);
                str.clear();
                key.clear();
                return totalBytesRead;
            }
            key.set(recordBlock.substring(i, j - 1));

            /*
            in case there is additional metadata on the header line, ignore everything after
            the first word.
             */
            if (junkOnLine) {
                while (j < recordBlock.length() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF)
                    j++;
            }

            //LOG.info ("key = " + k.toString());

            /*
            now skip the newlines
            */
            while (j < recordBlock.length() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
                j++;

            /*
            now read the sequence
            */
            StringBuilder sequenceTmp = new StringBuilder(recordBlock.length());
            do {
                i = j;
                while (j < recordBlock.length()) {
                    int c = recordBlock.charAt(j++);
                    if (c == CR || c == LF) {
                        break;
                    }
                }
                //byte[] ba = recordBlock.getBytes();
                //if (ba.length <= i || ba.length <= j - i - 1) {
                //    LOG.fatal("hmm... ba.length = " + ba.length + " i = " + i + " j-i-1 = " + (j-i-1));
                //}

                if (j == i) {
                    // then we didn't parse out a proper id
                    LOG.error("Unable to parse entry: " + recordBlock);
                    str.clear();
                    key.clear();
                    return totalBytesRead;
                }
                for (int copyi = i; copyi < j - 1; copyi++) {
                    sequenceTmp.append((char) recordBlock.charAt(copyi));
                }

                while (j < recordBlock.length() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
                    j++;

            } while (j < recordBlock.length() && recordBlock.charAt(j) != '>');
            str.set(sequenceTmp.toString());

            numRecordsRead++;

            /*
            now skip characters (newline or carige return most likely) till record start
            */
            while (j < recordBlock.length() && recordBlock.charAt(j) != '>') {
                j++;
            }

            j++; // skip the ">"

        } while (j < recordBlock.length());

        //        LOG.info("");
        //        LOG.info("object key = " + key);
        byte[] strpacked = SequenceString.sequenceToByteArray(str.toString().toLowerCase());

        str.clear();
        str.append(strpacked, 0, strpacked.length);

        return totalBytesRead;
    }

    /**
     * Read from the InputStream into the given Text.
     * @param str the object to store the given line
     * @param maxLineLength the maximum number of bytes to store into str.
     * @return the number of bytes read including the newline
     * @throws IOException if the underlying stream throws
     */
    public int readLine(Text key, Text str, int maxLineLength) throws IOException {
        return readLine(key, str, maxLineLength, Integer.MAX_VALUE);
    }

    /**
     * Read from the InputStream into the given Text.
     * @param str the object to store the given line
     * @return the number of bytes read including the newline
     * @throws IOException if the underlying stream throws
     */
    public int readLine(Text key, Text str) throws IOException {
        return readLine(key, str, Integer.MAX_VALUE, Integer.MAX_VALUE);
    }

}