gov.jgi.meta.pig.storage.FastaStorage.java Source code

Java tutorial

Introduction

Here is the source code for gov.jgi.meta.pig.storage.FastaStorage.java

Source

/*
 * Copyright (c) 2010, The Regents of the University of California, through Lawrence Berkeley
 * National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy).
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided
 * that the following conditions are met:
 *
 * (1) Redistributions of source code must retain the above copyright notice, this list of conditions and the
 * following disclaimer.
 *
 * (2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions
 * and the following disclaimer in the documentation and/or other materials provided with the distribution.
 *
 * (3) Neither the name of the University of California, Lawrence Berkeley National Laboratory, U.S. Dept.
 * of Energy, nor the names of its contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades to the
 * features, functionality or performance of the source code ("Enhancements") to anyone; however,
 * if you choose to make your Enhancements available either publicly, or directly to Lawrence Berkeley
 * National Laboratory, without imposing a separate written license agreement for such Enhancements,
 * then you hereby grant the following license: a  non-exclusive, royalty-free perpetual license to install,
 * use, modify, prepare derivative works, incorporate into other computer software, distribute, and
 * sublicense such enhancements or derivative works thereof, in binary and source code form.
 */

package gov.jgi.meta.pig.storage;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import gov.jgi.meta.hadoop.input.*;
import org.apache.pig.LoadFunc;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

/**
 * A pig loader for fasta files.  The loader reads fasta sequence files and returns tuples of the form
 * <seqid: chararray, direction: int, sequence: bytearray, header: chararray>
 *
 * note that the sequence data itself is packed in a binary format with 3 bases/byte.  its ascii readable,
 * but won't give you atgc.
 *
 * usage: to use, do something like:
 *   pig> A = load 'test.fas' using gov.jgi.meta.pig.storage.FastaStorage as \
 *            (readid: chararray, d: int, seq: bytearray, header: chararray);
 *
 * if read is a read pair, the direction will be set to 1 or 2 and stripped off the readid.  if its not paired,
 * then direction will be 0.
 *
 * the header will be set to everything on the header line after the id (and direction).
 *
 * the id of the read is defined to be the first word after the > in the header.  that is, till the first space
 * or tab.
 *
 **/

public class FastaStorage extends LoadFunc {
    protected RecordReader in = null;
    private ArrayList<Object> mProtoTuple = null;
    private TupleFactory mTupleFactory = TupleFactory.getInstance();

    /**
     * null constructor
     */
    public FastaStorage() {
    }

    /**
     * returns the next sequence from the block
     */
    @Override
    public Tuple getNext() throws IOException {

        if (mProtoTuple == null) {
            mProtoTuple = new ArrayList<Object>();
        }

        try {
            boolean notDone = in.nextKeyValue();
            if (!notDone) {
                return (null);
            }

            /*
              check the id of the sequence to see if its a paired read
             */
            String seqid = (in.getCurrentKey()).toString();
            String seqkey = null;
            String seqkey2;
            String header = "";
            String direction;
            for (int i = 0; i < seqid.length(); i++) {
                if (seqid.charAt(i) == ' ' || seqid.charAt(i) == '\t') {
                    seqkey = seqid.substring(0, i);
                    header = seqid.substring(i, seqid.length());
                    break;
                }
            }
            if (seqkey == null)
                seqkey = seqid;
            if (seqkey.indexOf("/") >= 0) {
                String[] a = seqkey.split("/");
                seqkey2 = a[0];
                direction = a[1];
            } else {
                seqkey2 = seqkey;
                direction = "0";
            }
            Text value = ((Text) in.getCurrentValue());
            mProtoTuple.add(new DataByteArray(seqkey2.getBytes(), 0, seqkey2.length())); // add key
            mProtoTuple.add(new DataByteArray(direction.getBytes(), 0, direction.length())); // add direction
            mProtoTuple.add(new DataByteArray(value.getBytes(), 0, value.getLength())); // add sequence
            mProtoTuple.add(new DataByteArray(header.getBytes(), 0, header.length())); // add header

            Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
            mProtoTuple = null;
            return (t);
        } catch (InterruptedException e) {
            int errCode = 6018;
            String errMsg = "Error while reading input";
            throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
        }
    }

    @Override
    public InputFormat getInputFormat() {
        return (new FastaInputFormat());
    }

    @Override
    public void prepareToRead(RecordReader reader, PigSplit split) {
        in = reader;
    }

    @Override
    public void setLocation(String location, Job job) throws IOException {
        FileInputFormat.setInputPaths(job, location);
    }
}