org.apache.tajo.util.BytesUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tajo.util.BytesUtils.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tajo.util;

import io.netty.buffer.ByteBuf;

import java.io.ByteArrayOutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import static io.netty.util.internal.StringUtil.isSurrogate;

/**
 * Extra utilities for raw byte arrays: variable-length long (vlong) encoding,
 * delimiter-based splitting, padding, trimming, and UTF-8 encoding of a
 * {@code char[]} into a {@link ByteBuf}.
 */
public class BytesUtils {

    /**
     * Parses the first byte of a vint/vlong to determine the total encoded size.
     *
     * @param value the first byte of the vint/vlong
     * @return the total number of bytes (1 to 9), including the first byte
     */
    public static int decodeVIntSize(byte value) {
        if (value >= -112) {
            // The byte is the value itself; there is no separate payload.
            return 1;
        } else if (value < -120) {
            // Marker range used by vlongToBytes for negative values.
            return -119 - value;
        }
        // Marker range used by vlongToBytes for positive values.
        return -111 - value;
    }

    /**
     * Encodes a long as a variable-length byte sequence.
     *
     * <p>Values in {@code [-112, 127]} are stored as a single byte. Any other
     * value is stored as one marker byte (which encodes both the sign and the
     * number of payload bytes) followed by the magnitude in big-endian order.
     * Negative values are stored as their one's complement.
     *
     * @param n Long to make a VLong of.
     * @return VLong as bytes array.
     */
    public static byte[] vlongToBytes(long n) {
        if (n >= -112 && n <= 127) {
            return new byte[] { (byte) n };
        }

        int len = -112;
        if (n < 0) {
            n ^= -1L; // take one's complement so the remaining value is non-negative
            len = -120;
        }

        // Lower the marker by one for every payload byte the value needs.
        for (long tmp = n; tmp != 0; tmp >>= 8) {
            len--;
        }

        byte[] result = new byte[decodeVIntSize((byte) len)];
        int offset = 0;
        result[offset++] = (byte) len;

        // Recover the payload byte count from the marker and emit the most significant byte first.
        int payloadBytes = (len < -120) ? -(len + 120) : -(len + 112);
        for (int idx = payloadBytes; idx != 0; idx--) {
            int shiftBits = (idx - 1) * 8;
            // The narrowing cast keeps exactly the eight bits selected by the shift.
            result[offset++] = (byte) (n >>> shiftBits);
        }
        return result;
    }

    /**
     * Appends the vlong encoding of {@code l} (see {@link #vlongToBytes(long)})
     * to the given stream.
     *
     * @param byteStream the stream to append to
     * @param l the value to encode
     */
    public static void writeVLong(ByteArrayOutputStream byteStream, long l) {
        byte[] vLongBytes = vlongToBytes(l);
        byteStream.write(vLongBytes, 0, vLongBytes.length);
    }

    /**
     * Converts a char array to a byte array by narrowing every char to a byte.
     * Only the low eight bits of each char are kept, so the result is only
     * meaningful for ASCII input.
     *
     * @param chars the characters to convert
     * @return a new byte array of the same length as {@code chars}
     */
    static byte[] toASCIIBytes(char[] chars) {
        byte[] buffer = new byte[chars.length];
        for (int i = 0; i < chars.length; i++) {
            buffer[i] = (byte) chars[i];
        }
        return buffer;
    }

    /**
     * Splits the whole of {@code str} on a single-byte separator, materializing
     * only the columns listed in {@code target}.
     *
     * @param str the bytes to split, may be {@code null}
     * @param separatorChar the separator; it is narrowed to a single byte
     * @param target indexes of the columns to materialize, or {@code null} for the first {@code numColumns}
     * @param numColumns the length of the returned array
     * @return the column values, or {@code null} if {@code str} is {@code null}
     */
    public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int[] target, int numColumns) {
        return splitWorker(str, 0, -1, separatorChar, target, numColumns);
    }

    /**
     * Splits a region of {@code str} on a (possibly multi-byte) separator,
     * materializing only the columns listed in {@code target}.
     *
     * @param str the bytes to split, may be {@code null}
     * @param offset the index of the first byte of the region
     * @param length the number of bytes in the region; negative means "up to the end of {@code str}"
     * @param separator the separator byte sequence
     * @param target indexes of the columns to materialize, or {@code null} for the first {@code numColumns}
     * @param numColumns the length of the returned array
     * @return the column values, or {@code null} if {@code str} is {@code null}
     */
    public static byte[][] splitPreserveAllTokens(byte[] str, int offset, int length, byte[] separator,
            int[] target, int numColumns) {
        return splitWorker(str, offset, length, separator, target, numColumns);
    }

    /**
     * Splits the whole of {@code str} on a single-byte separator and returns
     * the first {@code numColumns} columns.
     *
     * @param str the bytes to split, may be {@code null}
     * @param separatorChar the separator; it is narrowed to a single byte
     * @param numColumns the length of the returned array
     * @return the column values, or {@code null} if {@code str} is {@code null}
     */
    public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int numColumns) {
        return splitWorker(str, 0, -1, separatorChar, null, numColumns);
    }

    /**
     * Adapts a single-character separator to the byte-array based worker.
     */
    private static byte[][] splitWorker(byte[] str, int offset, int length, char separatorChar, int[] target,
            int numColumns) {
        return splitWorker(str, offset, length, new byte[] { (byte) separatorChar }, target, numColumns);
    }

    /**
     * Performs the logic for the <code>splitPreserveAllTokens</code> methods.
     *
     * <p>In the returned array a column that was not requested through
     * {@code target} is {@code null}, and a requested column that does not
     * exist in the input is an empty array. The one exception is
     * {@code length == 0}, where every column is an empty array.
     *
     * @param str  the bytes to parse, may be <code>null</code>
     * @param offset the index of the first byte to parse
     * @param length the number of bytes to parse; negative means "up to the end of {@code str}"
     * @param separator the separator byte sequence
     * @param target the projection target (column indexes), or <code>null</code> to take the first
     *               {@code numColumns} columns
     * @param numColumns number of columns to be retrieved
     * @return an array of parsed columns, <code>null</code> if null input
     */
    private static byte[][] splitWorker(byte[] str, int offset, int length, byte[] separator, int[] target,
            int numColumns) {
        if (str == null) {
            return null;
        }
        if (length == 0) {
            return new byte[numColumns][0];
        }
        if (length < 0) {
            length = str.length - offset;
        }

        // Only scan as far as the right-most column that is actually needed.
        int indexMax = 0;
        if (target != null) {
            for (int index : target) {
                indexMax = Math.max(indexMax, index + 1);
            }
        } else {
            indexMax = numColumns;
        }

        int[][] indices = split(str, offset, length, separator, new int[indexMax][]);
        byte[][] result = new byte[numColumns][];

        // not-picked -> null, picked but not-exists -> byte[0]
        if (target != null) {
            for (int i : target) {
                result[i] = copyColumn(str, indices[i]);
            }
        } else {
            for (int i = 0; i < result.length; i++) {
                result[i] = copyColumn(str, indices[i]);
            }
        }
        return result;
    }

    /**
     * Copies the bytes of one column out of {@code str}; a missing column
     * ({@code range == null}) yields an empty array.
     */
    private static byte[] copyColumn(byte[] str, int[] range) {
        return range == null ? new byte[0] : Arrays.copyOfRange(str, range[0], range[1]);
    }

    /**
     * Locates delimiter-separated columns inside a region of {@code str}.
     *
     * <p>Each located column is stored in {@code indices} as a two-element
     * array {@code {start, end}} with {@code end} exclusive. Scanning stops as
     * soon as {@code indices} is full; entries for columns that were not found
     * are left untouched.
     *
     * @param str the bytes to scan
     * @param offset the index of the first byte of the region
     * @param length the number of bytes in the region
     * @param separator the separator byte sequence
     * @param indices the array to fill; its length bounds the number of columns located
     * @return the same {@code indices} array that was passed in
     */
    public static int[][] split(byte[] str, int offset, int length, byte[] separator, int[][] indices) {
        if (indices.length == 0) {
            return indices; // trivial
        }
        final int limit = offset + length;

        int start = offset;
        int colIndex = 0;
        for (int index = offset; index < limit;) {
            if (onDelimiter(str, index, limit, separator)) {
                indices[colIndex++] = new int[] { start, index };
                if (colIndex >= indices.length) {
                    return indices;
                }
                index += separator.length;
                start = index;
            } else {
                index++;
            }
        }
        // The remainder after the last separator is the final column.
        if (colIndex < indices.length) {
            indices[colIndex] = new int[] { start, limit };
        }
        return indices;
    }

    /**
     * Tells whether the complete {@code delimiter} occurs in {@code input}
     * starting at {@code offset} without running past {@code limit}.
     */
    private static boolean onDelimiter(byte[] input, int offset, int limit, byte[] delimiter) {
        for (int i = 0; i < delimiter.length; i++) {
            if (offset + i >= limit || input[offset + i] != delimiter[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Splits {@code value} on every occurrence of a single delimiter byte.
     * Empty tokens are preserved, so the result always holds exactly one more
     * element than there are delimiters in the input.
     *
     * @param value the bytes to split
     * @param delimiter the delimiter byte
     * @return the tokens, in order, each as a fresh array
     */
    public static byte[][] splitTrivial(byte[] value, byte delimiter) {
        List<byte[]> split = new ArrayList<>();
        int prev = 0;
        for (int i = 0; i < value.length; i++) {
            if (value[i] == delimiter) {
                split.add(Arrays.copyOfRange(value, prev, i));
                prev = i + 1;
            }
        }
        // prev never exceeds value.length, so there is always a (possibly empty) trailing token.
        split.add(Arrays.copyOfRange(value, prev, value.length));
        return split.toArray(new byte[split.size()][]);
    }

    /**
     * It gets the maximum length among all the given byte arrays.
     * Then, it adds padding to the tail of the byte arrays which are shorter
     * than the maximum length, by delegating to {@code Bytes.padTail}.
     * Arrays that already have the maximum length are returned as-is (not copied).
     *
     * @param bytes Byte arrays to be padded
     * @return The array of padded bytes
     */
    public static byte[][] padBytes(byte[]... bytes) {
        byte[][] padded = new byte[bytes.length][];

        int maxLen = 0;
        for (byte[] aByte : bytes) {
            maxLen = Math.max(maxLen, aByte.length);
        }

        for (int i = 0; i < bytes.length; i++) {
            // maxLen is the maximum over all lengths, so padLen is never negative.
            int padLen = maxLen - bytes[i].length;
            padded[i] = padLen == 0 ? bytes[i] : Bytes.padTail(bytes[i], padLen);
        }

        return padded;
    }

    /**
     * Removes leading and trailing bytes whose unsigned value is at most
     * {@code 0x20} (space and the ASCII control codes), which is the same rule
     * {@link String#trim()} applies to chars.
     *
     * <p>The trimming is done directly on the bytes instead of decoding them
     * with the platform default charset and encoding them again, so the
     * remaining bytes are returned unmodified whatever they contain.
     *
     * @param bytes the bytes to trim
     * @return a new array holding the trimmed content
     */
    public static byte[] trimBytes(byte[] bytes) {
        int start = 0;
        int end = bytes.length;
        while (start < end && (bytes[start] & 0xFF) <= ' ') {
            start++;
        }
        while (end > start && (bytes[end - 1] & 0xFF) <= ' ') {
            end--;
        }
        return Arrays.copyOfRange(bytes, start, end);
    }

    /**
     * Encodes {@code chars} as UTF-8 into {@code buffer}, starting at its
     * current writer index, and advances the writer index past the written bytes.
     * This is an implementation copied from ByteBufUtil in netty4.
     *
     * @param buffer the destination buffer
     * @param chars the characters to encode
     * @param ignoreSurrogate if {@code true}, surrogate chars are not paired up or validated; each one
     *                        is written on its own as a three-byte sequence
     * @return the number of bytes written
     * @throws IllegalArgumentException if {@code ignoreSurrogate} is {@code false} and {@code chars}
     *                                  contains an unpaired or misordered surrogate
     */
    public static int writeUtf8(ByteBuf buffer, char[] chars, boolean ignoreSurrogate) {
        int oldWriterIndex = buffer.writerIndex();
        int writerIndex = oldWriterIndex;

        // The absolute setByte(...) calls are used and the writer index is only moved once at the end.
        // NOTE(review): this method does not call ensureWritable(...) itself, so the caller presumably
        // has to reserve enough room beforehand (at most 3 bytes per char) - confirm against callers.
        for (int i = 0; i < chars.length; i++) {
            char c = chars[i];
            if (c < 0x80) {
                buffer.setByte(writerIndex++, (byte) c);
            } else if (c < 0x800) {
                buffer.setByte(writerIndex++, (byte) (0xc0 | (c >> 6)));
                buffer.setByte(writerIndex++, (byte) (0x80 | (c & 0x3f)));
            } else if (!ignoreSurrogate && Character.isSurrogate(c)) {
                if (!Character.isHighSurrogate(c)) {
                    throw new IllegalArgumentException("Invalid encoding. "
                            + "Expected high (leading) surrogate at index " + i + " but got " + c);
                }
                final char c2;
                try {
                    // A surrogate pair consumes 2 characters. Optimistically read the next one and, if the
                    // array ends here, re-throw a more informative exception describing the problem.
                    c2 = chars[++i];
                } catch (IndexOutOfBoundsException e) {
                    throw new IllegalArgumentException("Underflow. " + "Expected low (trailing) surrogate at index "
                            + i + " but no more characters found.", e);
                }
                if (!Character.isLowSurrogate(c2)) {
                    throw new IllegalArgumentException("Invalid encoding. "
                            + "Expected low (trailing) surrogate at index " + i + " but got " + c2);
                }
                int codePoint = Character.toCodePoint(c, c2);
                // See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630.
                buffer.setByte(writerIndex++, (byte) (0xf0 | (codePoint >> 18)));
                buffer.setByte(writerIndex++, (byte) (0x80 | ((codePoint >> 12) & 0x3f)));
                buffer.setByte(writerIndex++, (byte) (0x80 | ((codePoint >> 6) & 0x3f)));
                buffer.setByte(writerIndex++, (byte) (0x80 | (codePoint & 0x3f)));
            } else {
                buffer.setByte(writerIndex++, (byte) (0xe0 | (c >> 12)));
                buffer.setByte(writerIndex++, (byte) (0x80 | ((c >> 6) & 0x3f)));
                buffer.setByte(writerIndex++, (byte) (0x80 | (c & 0x3f)));
            }
        }
        // update the writerIndex once, after all bytes have been set
        buffer.writerIndex(writerIndex);
        return writerIndex - oldWriterIndex;
    }
}