gov.jgi.meta.sequence.SequenceString.java Source code

Java tutorial

Introduction

Here is the source code for gov.jgi.meta.sequence.SequenceString.java

Source

/*
 * Copyright (c) 2010, The Regents of the University of California, through Lawrence Berkeley
 * National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy).
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided
 * that the following conditions are met:
 *
 * (1) Redistributions of source code must retain the above copyright notice, this list of conditions and the
 * following disclaimer.
 *
 * (2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions
 * and the following disclaimer in the documentation and/or other materials provided with the distribution.
 *
 * (3) Neither the name of the University of California, Lawrence Berkeley National Laboratory, U.S. Dept.
 * of Energy, nor the names of its contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades to the
 * features, functionality or performance of the source code ("Enhancements") to anyone; however,
 * if you choose to make your Enhancements available either publicly, or directly to Lawrence Berkeley
 * National Laboratory, without imposing a separate written license agreement for such Enhancements,
 * then you hereby grant the following license: a  non-exclusive, royalty-free perpetual license to install,
 * use, modify, prepare derivative works, incorporate into other computer software, distribute, and
 * sublicense such enhancements or derivative works thereof, in binary and source code form.
 */

package gov.jgi.meta.sequence;

import org.apache.hadoop.io.Text;
import org.apache.commons.lang.StringUtils;

import java.io.UnsupportedEncodingException;
import java.util.HashMap;

/**
 * Packs nucleotide sequences over the alphabet {a, t, g, c, n} into byte arrays, storing up to
 * three bases per byte, and unpacks them again.
 *
 * <p>Every string of one, two or three alphabet letters is assigned its own byte code (values are
 * listed unsigned; they are stored as signed Java bytes):
 * <ul>
 *   <li>three bases: 32 .. 156 ({@code 32 + 25*i + 5*j + k})</li>
 *   <li>one base: 160 .. 164 ({@code 32 + 128 + i})</li>
 *   <li>two bases: 224 .. 248 ({@code 32 + 192 + 5*i + j})</li>
 * </ul>
 * where i, j, k are the positions of the letters in "atgcn". {@code pack} only ever emits a
 * one- or two-base code as the final byte of an array.
 *
 * <p>The lookup tables are built once in a static initializer, so every method can be called
 * from any thread without a prior call to {@link #init()}.
 *
 * @author kbhatia (created Dec 3, 2010)
 */
public class SequenceString {

    /** Letters of the supported alphabet; the array index is the digit used in the byte code. */
    private static final String[] ALPHABET = { "a", "t", "g", "c", "n" };

    /** Maximum number of bases stored in a single byte. */
    private static final int BASES_PER_BYTE = 3;

    /** First code used for three-base groups. */
    private static final int TRIPLE_OFFSET = 32;

    /** First code used for single bases. */
    private static final int SINGLE_OFFSET = 32 + 128;

    /** First code used for two-base groups. */
    private static final int PAIR_OFFSET = 32 + 192;

    /** Maps a group of one to three bases to its byte code. */
    static final HashMap<String, Byte> hash = new HashMap<String, Byte>();

    /** Maps a byte code back to the group of one to three bases it stands for. */
    static final HashMap<Byte, String> reverseHash = new HashMap<Byte, String>();

    static {
        // Class initialisation runs exactly once and is safely published to all threads,
        // which the previous lazy "if (hash == null)" check did not guarantee.
        initHash();
    }

    /**
     * Retained for backward compatibility. The lookup tables are populated when the class is
     * loaded, so there is nothing left to do here.
     */
    public static void init() {
        // Intentionally empty: see the static initializer.
    }

    /**
     * Returns the number of bases stored in a packed sequence.
     *
     * <p>Every byte except the last is counted as three bases; the last byte contributes the
     * length of the group it decodes to.
     *
     * @param seqarray packed sequence
     * @return number of bases, or 0 for an empty array
     * @throws NullPointerException if {@code seqarray} is null or its last byte is not a known code
     */
    public static int numBases(byte[] seqarray) {
        if (seqarray.length == 0) {
            // Previously this indexed seqarray[-1].
            return 0;
        }
        int lastIndex = seqarray.length - 1;
        String lastGroup = reverseHash.get(seqarray[lastIndex]);
        return BASES_PER_BYTE * lastIndex + lastGroup.length();
    }

    /**
     * Extracts the bases in the half-open range [{@code start}, {@code end}) from a packed
     * sequence and returns them re-packed.
     *
     * <p>Only the bytes overlapping the requested range are unpacked. Base positions are mapped
     * to bytes by dividing by three, so every byte before the range is taken to hold three bases.
     *
     * @param seqarray packed sequence
     * @param start    index of the first base to include
     * @param end      index one past the last base to include
     * @return the packed sub-sequence
     */
    public static byte[] subseq(byte[] seqarray, int start, int end) {
        int firstByte = start / BASES_PER_BYTE;
        // Round up so that a partially covered trailing byte is included.
        int endByte = end / BASES_PER_BYTE + ((end % BASES_PER_BYTE == 0) ? 0 : 1);

        String unpacked = byteArrayToSequence(seqarray, firstByte, endByte - firstByte);
        int offsetInFirstByte = start % BASES_PER_BYTE;
        return pack(unpacked.substring(offsetInFirstByte, offsetInFirstByte + (end - start)));
    }

    /**
     * Tests whether an unpacked sequence contains the given bases.
     *
     * @param sequence packed sequence held in a String, one packed byte per char (the bytes are
     *                 recovered with the ISO-8859-1 charset)
     * @param c        bases to look for in the unpacked sequence
     * @return true if the unpacked sequence contains {@code c}
     * @throws UnsupportedEncodingException if ISO-8859-1 is not available
     */
    public static boolean contains(String sequence, CharSequence c) throws UnsupportedEncodingException {
        return byteArrayToSequence(sequence.getBytes("ISO-8859-1")).contains(c);
    }

    /**
     * Packs a sequence of bases into bytes.
     *
     * <p>Any character other than lower-case a, t, g, c or n is stored as n; the comparison is
     * case-sensitive, so upper-case letters are stored as n as well.
     *
     * @param sequence bases to pack
     * @return packed sequence, three bases per byte with a shorter final group if needed
     */
    public static byte[] sequenceToByteArray(String sequence) {
        return pack(sequence);
    }

    /**
     * Unpacks a whole byte array.
     *
     * @param bytes packed sequence
     * @return the bases as a string
     */
    public static String byteArrayToSequence(byte[] bytes) {
        return decode(bytes, 0, bytes.length);
    }

    /**
     * Unpacks {@code length} bytes starting at byte index {@code startindex}.
     *
     * @param bytes      packed sequence
     * @param startindex index of the first byte to unpack
     * @param length     number of bytes (not bases) to unpack
     * @return the bases as a string
     */
    public static String byteArrayToSequence(byte[] bytes, int startindex, int length) {
        return decode(bytes, startindex, length);
    }

    /**
     * Unpacks a packed sequence held in a {@link Text}.
     *
     * <p>Uses {@code seq.getLength()} rather than the backing array's length, because
     * Text.bytes.length is not always the right length to use.
     *
     * @param seq packed sequence
     * @return the bases as a string
     */
    public static String byteArrayToSequence(Text seq) {
        return decode(seq.getBytes(), 0, seq.getLength());
    }

    /**
     * Concatenates the first sequence with the second sequence reversed, and re-packs the result.
     * The second sequence is reversed character by character; it is not complemented.
     *
     * @param seq1 packed sequence placed first
     * @param seq2 packed sequence that is reversed and appended
     * @return the packed merged sequence
     */
    public static byte[] merge(byte[] seq1, byte[] seq2) {
        String head = byteArrayToSequence(seq1);
        String tail = byteArrayToSequence(seq2);
        return sequenceToByteArray(head + StringUtils.reverse(tail));
    }

    /**
     * Shared unpacking loop behind the public byteArrayToSequence overloads.
     *
     * NOTE(review): a byte that is not a known code is appended as the text "null", exactly as
     * before this refactoring; callers are expected to pass only bytes produced by this class.
     */
    private static String decode(byte[] bytes, int offset, int count) {
        StringBuilder bases = new StringBuilder();
        for (int i = offset; i < offset + count; i++) {
            bases.append(reverseHash.get(bytes[i]));
        }
        return bases.toString();
    }

    /** Fills both lookup tables with every group of one, two and three alphabet letters. */
    private static void initHash() {
        int n = ALPHABET.length;
        for (int i = 0; i < n; i++) {
            register(ALPHABET[i], SINGLE_OFFSET + i);
            for (int j = 0; j < n; j++) {
                register(ALPHABET[i] + ALPHABET[j], PAIR_OFFSET + i * n + j);
                for (int k = 0; k < n; k++) {
                    register(ALPHABET[i] + ALPHABET[j] + ALPHABET[k],
                            TRIPLE_OFFSET + i * n * n + j * n + k);
                }
            }
        }
    }

    /** Records one group/code pair in both directions. */
    private static void register(String bases, int code) {
        Byte packed = Byte.valueOf((byte) code);
        hash.put(bases, packed);
        reverseHash.put(packed, bases);
    }

    /** Returns a copy of {@code bases} in which every character outside "atgcn" is replaced by 'n'. */
    private static String sanitize(String bases) {
        StringBuilder cleaned = new StringBuilder(bases);
        for (int i = 0; i < cleaned.length(); i++) {
            char base = cleaned.charAt(i);
            if (base != 'a' && base != 't' && base != 'g' && base != 'c' && base != 'n') {
                cleaned.setCharAt(i, 'n');
            }
        }
        return cleaned.toString();
    }

    /**
     * Packs a sequence three bases at a time; a trailing group of one or two bases gets its own
     * final byte.
     */
    private static byte[] pack(String sequenceToPack) {
        int numberOfBases = sequenceToPack.length();
        int numberOfBytes = numberOfBases / BASES_PER_BYTE
                + ((numberOfBases % BASES_PER_BYTE == 0) ? 0 : 1);

        byte[] bytes = new byte[numberOfBytes];
        for (int i = 0; i < numberOfBytes; i++) {
            int from = i * BASES_PER_BYTE;
            int to = Math.min(from + BASES_PER_BYTE, numberOfBases);
            // After sanitising, the group is 1-3 alphabet letters, so the lookup always succeeds.
            bytes[i] = hash.get(sanitize(sequenceToPack.substring(from, to)));
        }
        return bytes;
    }
}