gov.jgi.meta.sequence.SequenceStringCompress.java Source code

Introduction

Here is the source code for gov.jgi.meta.sequence.SequenceStringCompress.java
Source

/*
 * Copyright (c) 2017, The Regents of the University of California, through Lawrence Berkeley
 * National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy).
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided
 * that the following conditions are met:
 *
 * (1) Redistributions of source code must retain the above copyright notice, this list of conditions and the
 * following disclaimer.
 *
 * (2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions
 * and the following disclaimer in the documentation and/or other materials provided with the distribution.
 *
 * (3) Neither the name of the University of California, Lawrence Berkeley National Laboratory, U.S. Dept.
 * of Energy, nor the names of its contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades to the
 * features, functionality or performance of the source code ("Enhancements") to anyone; however,
 * if you choose to make your Enhancements available either publicly, or directly to Lawrence Berkeley
 * National Laboratory, without imposing a separate written license agreement for such Enhancements,
 * then you hereby grant the following license: a  non-exclusive, royalty-free perpetual license to install,
 * use, modify, prepare derivative works, incorporate into other computer software, distribute, and
 * sublicense such enhancements or derivative works thereof, in binary and source code form.
 */

package gov.jgi.meta.sequence;

import org.apache.hadoop.io.Text;
import org.apache.commons.lang.StringUtils;

import java.io.UnsupportedEncodingException;
import java.util.HashMap;

/**
 * For debug
 * --lanhin
 */
// TODO: Remove these after debug
import org.apache.pig.data.*;
import java.io.IOException;
import java.util.*;
import org.apache.pig.EvalFunc;
import org.apache.pig.impl.logicalLayer.schema.Schema;

/**
 * This file is modified on the base of SequenceString.java
 * User: lanhin
 * Date: Dec 11, 2016
 * Time: 9:37:22 PM
 */
// TODO: Remove extends after debug
public class SequenceStringCompress extends EvalFunc<DataBag> {

    static HashMap<String, Byte> hash = null;
    static HashMap<Byte, String> reverseHash = null;

    /**
     * Init hash and reverseHash
     * Used in this file, I think.  --lanhin
     */
    public static void init() {
        if (hash == null) {
            hash = new HashMap();
            reverseHash = new HashMap();
            initHash();
        }
    }

    /**
     * To count how many bases there are in a given bytearray
     * The logic has changed  --lanhin
     */
    // TODO: 
    public static int numBases(byte[] seqarray) {
        int bitsLengthValid;
        int numberOfFill = (seqarray[0] >>> 6) & 0x3;
        bitsLengthValid = 8 * seqarray.length - 2;
        bitsLengthValid -= bitsLengthValid % 7;
        int last = seqarray[seqarray.length - 1] & 0x7f;
        if (last == 0) {
            bitsLengthValid -= 7;
        }
        return bitsLengthValid / 7 * 3 - numberOfFill;
    }

    /**
     * Return a sub-bytearray of the given one, from start to end
     * The logic has changed  --lanhin
     */
    // TODO: 
    public static byte[] subseq(byte[] seqarray, int start, int end) {
        String unpackedSeqSegment = byteArrayToSequence(seqarray);
        return pack(unpackedSeqSegment.substring(start, start + (end - start)));
    }

    /**
     * Check if a base contains in the given sequence in bytearray type
     * --lanhin
     */
    public static boolean contains(String sequence, CharSequence c) throws UnsupportedEncodingException {
        return byteArrayToSequence(sequence.getBytes("ISO-8859-1")).contains(c);
    }

    /**
     * Transfer sequence to bytearray
     * --lanhin
     */
    public static byte[] sequenceToByteArray(String sequence) {
        init();
        return pack(sequence.toLowerCase());

    }

    /**
     * Transfer bytearray to sequence
     * The logic has changed.  --lanhin
     */
    public static String byteArrayToSequence(byte[] bytes) {
        init();

        StringBuffer sb = new StringBuffer();

        int numberOfFill, leftBits, bytesLength = bytes.length;
        byte former, latter, current = 0;

        //In case the input is too short
        if (bytes.length <= 1)
            return sb.toString();

        former = latter = bytes[0];
        numberOfFill = (former >>> 6) & 0x3; //unsigned shift right
        leftBits = 6;

        for (int i = 1; i < bytes.length; i++) {
            latter = bytes[i];
            current = (byte) (former << (8 - leftBits));
            int latter_int = (int) latter & 0xff;
            current = (byte) (latter_int >>> leftBits | current);
            current = (byte) ((current >>> 1) & 0x7f);
            if (current == 0)
                break;

            sb.append(reverseHash.get(current));
            leftBits++;
            former = latter;
            if (leftBits == 8) {
                current = (byte) ((former >>> 1) & 0x7f);
                if (current == 0)
                    break;
                sb.append(reverseHash.get(current));
                leftBits = 1;
            }
        }

        //The last base group will miss only when leftBits = 7
        if (leftBits == 7 && current != 0) {
            current = (byte) (former << (8 - leftBits));
            int latter_int = (int) latter & 0xff;
            current = (byte) (latter_int >>> leftBits | current);
            current = (byte) ((current >>> 1) & 0x7f);

            //debug
            System.out.printf("leftBits:%d, current = %d, %s, Fill: %d\n ", leftBits, current,
                    reverseHash.get(current), numberOfFill);
            if (current != 0)
                sb.append(reverseHash.get(current));
        }
        //debug
        System.out.println("sb:" + sb.toString());
        assert (sb.length() > numberOfFill);
        return sb.toString().substring(0, sb.length() - numberOfFill);
    }

    /**
     * Another version
     * Two more parameters
     * This method cannot work correctly with the new compression way.  --lanhin
     */
    /*    public static String byteArrayToSequence(byte[] bytes, int startindex, int length)
    {
       init();
        
       StringBuffer sb = new StringBuffer();
        
       for (int i = startindex; i < startindex+length; i++) {
      sb.append(reverseHash.get(bytes[i]));
       }
        
       return sb.toString();
       }*/

    /**
     * Third version
     * Different input type
     */
    // because Text.bytes.length is not always the right length to use.
    public static String byteArrayToSequence(Text seq) {
        byte[] ba = seq.getBytes();

        return byteArrayToSequence(ba);
    }

    /**
     * Init the two hash tables for compress & decompress
     * The logic has changed.  --lanhin
     */
    private static void initHash() {
        String[] alphabet = { "a", "t", "g", "c", "n" };

        for (byte i = 0; i < 5; i++) {
            for (byte j = 0; j < 5; j++) {
                for (byte k = 0; k < 5; k++) {
                    hash.put(alphabet[i] + alphabet[j] + alphabet[k], (byte) (i * 25 + j * 5 + k + 1));
                    reverseHash.put((byte) (i * 25 + j * 5 + k + 1), alphabet[i] + alphabet[j] + alphabet[k]);
                }
            }
        }
    }

    /**
     * The real method to do sequenceToBytearray
     * The logic has changed.  --lanhin
     */
    private static byte[] pack(String sequenceToPack) {
        int numberOfBases = sequenceToPack.length();
        int numberOfFill = (3 - numberOfBases % 3) % 3;
        int numberOfBytes = (int) Math.ceil((2 + Math.ceil((double) numberOfBases / 3) * 7) / 8);//Wish I didn't make a mistake here  --lanhin
        int numberOfBaseGroup = (numberOfBases + numberOfFill) / 3;

        byte[] bytes = new byte[numberOfBytes];
        int leftBits, position = 0;// Position index in bytes[]

        byte former, latter, current;

        String newSequence = sequenceToPack;
        //Fill by 'n'
        for (int i = 0; i < numberOfFill; i++) {
            newSequence = newSequence + 'n';
        }

        former = (byte) (((byte) numberOfFill) << 6);//Record this var in the very first two bits.
        leftBits = 6;

        try {
            for (int i = 0; i < numberOfBaseGroup; i++) {

                StringBuilder subseq = new StringBuilder(newSequence.substring(i * 3, i * 3 + 3));
                for (int si = 0; si < 3; si++) {
                    char sichar = subseq.charAt(si);
                    if (sichar != 'a' && sichar != 't' && sichar != 'g' && sichar != 'c' && sichar != 'n') {
                        subseq.setCharAt(si, 'n');
                    }
                }
                current = hash.get(subseq.toString());
                if (leftBits == 8)
                    former = (byte) (current << 1);
                else
                    former = (byte) ((current >> (7 - leftBits)) | former);
                latter = (byte) (current << (1 + leftBits));
                if (leftBits <= 7) {//A new byte produced
                    bytes[position++] = former;
                    former = latter;
                }
                leftBits = (1 + leftBits) % 8;
                if (leftBits == 0) {
                    leftBits = 8;
                }
            }
            if (leftBits < 8)//Fill the last one byte
                bytes[position++] = former;

        } catch (Exception e) {
            System.out.println("Pack Error, Please Check." + e);
        }

        // sanity check
        assert (position == numberOfBytes);

        return bytes;
    }

    /**
     * Merge two bytearrays into one
     * --lanhin
     */
    static public byte[] merge(byte[] seq1, byte[] seq2) {

        // reverse the second sequence and merge

        String s1 = byteArrayToSequence(seq1);
        String s2 = byteArrayToSequence(seq2);

        String s3 = s1 + StringUtils.reverse(s2);

        return sequenceToByteArray(s3);
    }

    /**
     * Use this to try to test this unit, wish this can work.
     * --lanhin
     */
    public DataBag exec(Tuple input) throws IOException {
        DataBag output = DefaultBagFactory.getInstance().newDistinctBag();

        byte[] ba;
        String ab;
        Object values = input.get(0);

        try {
            if (values instanceof DataByteArray) {
                ba = ((DataByteArray) values).get();
                System.out.println("ins of");
                ab = byteArrayToSequence(ba);
                System.out.println("Lengthofbases:" + numBases(ba));
                System.out.println("ab=" + ab);
                System.out.println("ba=" + ba);
                Tuple t = DefaultTupleFactory.getInstance().newTuple(1);
                t.set(0, ab);
                output.add(t);
            } else {

                System.out.println("[Debug: ] values = " + values);
                ab = values.toString();
                ba = sequenceToByteArray(ab);
                //System.out.println("no ins");

                Tuple t = DefaultTupleFactory.getInstance().newTuple(1);
                t.set(0, new DataByteArray(ba));
                output.add(t);
            }
        } catch (Exception e) {
            System.err.println("SequenceStringCompress: failed to process input; error - " + e.getMessage());
            return (null);
        }

        return output;

    }
}