org.apache.hadoop.hive.serde2.lazy.LazyUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.hive.serde2.lazy.LazyUtils.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.serde2.lazy;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.util.Arrays;
import java.util.Map;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveIntervalDayTimeObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveIntervalYearMonthObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampTZObjectInspector;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;

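/**
 * Static utility methods for the lazy SerDe package: digit parsing, cheap
 * "could this be a number/date" pre-checks, byte-range comparison and hashing,
 * escaping/unescaping of delimited text, and text/binary serialization of
 * primitive values.
 */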
public final class LazyUtils {

    /**
     * Maps an ASCII character code to the numeric value of the digit it denotes.
     *
     * <p>{@code '0'..'9'} map to 0..9; letters of either case map to 10..35.
     *
     * @param b
     *          The ascii code of the character
     * @param radix
     *          The radix; a digit value is only valid when strictly below it
     * @return the digit value, or -1 if {@code b} is not a digit in the given radix
     */
    public static int digit(int b, int radix) {
        final int value;
        if ('0' <= b && b <= '9') {
            value = b - '0';
        } else if ('A' <= b && b <= 'Z') {
            value = 10 + (b - 'A');
        } else if ('a' <= b && b <= 'z') {
            value = 10 + (b - 'a');
        } else {
            // Not an alphanumeric ASCII character at all.
            return -1;
        }
        return value < radix ? value : -1;
    }

    /**
     * Cheap pre-check that rejects byte ranges which obviously cannot be parsed
     * as a base-10 number (Long or Double), without doing a real parse.
     *
     * <p>Only a few short lengths are inspected:
     * <ul>
     * <li>0 bytes: never a number;</li>
     * <li>1 byte: must be a digit (rejects e.g. a lone space);</li>
     * <li>2 bytes: at least one byte must be a digit (rejects {@code \N}, accepts {@code -1});</li>
     * <li>4 bytes: rejected when the first byte is {@code N} or {@code n} (null / NULL).</li>
     * </ul>
     * Everything else is reported as "maybe".
     *
     * @param buf    the buffer holding the candidate text
     * @param offset index of the first byte of the candidate
     * @param len    number of bytes in the candidate
     * @return {@code false} if the bytes definitely are not a number; {@code true}
     *         if they might be (they can still turn out to be invalid)
     */
    public static boolean isNumberMaybe(byte[] buf, int offset, int len) {
        if (len == 0) {
            return false;
        }
        if (len == 1) {
            // space usually
            return Character.isDigit(buf[offset]);
        }
        if (len == 2) {
            // \N or -1 (allow latter)
            return Character.isDigit(buf[offset + 1]) || Character.isDigit(buf[offset]);
        }
        if (len == 4) {
            // null or NULL
            final byte first = buf[offset];
            return first != 'N' && first != 'n';
        }
        // maybe valid - too expensive to check without a parse
        return true;
    }

    /**
     * Cheap pre-check that rejects byte ranges which are too short to be a
     * date or timestamp.
     *
     * <p>A four-digit year plus the two dashes means the shortest possible date
     * text is {@code yyyy-m-d}, i.e. 8 bytes. A timestamp needs more bytes than
     * that, so for timestamps this check is necessary but not sufficient.
     * Only the length is examined; the buffer contents are not read.
     *
     * @param buf    the buffer holding the candidate text (not inspected)
     * @param offset index of the first byte of the candidate (not inspected)
     * @param len    number of bytes in the candidate
     * @return {@code false} if the bytes definitely are not a date/timestamp;
     *         {@code true} if they might be
     */
    public static boolean isDateMaybe(byte[] buf, int offset, int len) {
        // Length of "yyyy-m-d"; anything longer is too expensive to check without a parse.
        final int shortestDateLength = 8;
        return len >= shortestDateLength;
    }

    /**
     * Compares two byte ranges position by position.
     *
     * <p>Bytes are compared as signed Java {@code byte} values. If one range is a
     * prefix of the other, the shorter range sorts first.
     *
     * @param b1      buffer holding the first range
     * @param start1  index of the first byte of the first range
     * @param length1 number of bytes in the first range
     * @param b2      buffer holding the second range
     * @param start2  index of the first byte of the second range
     * @param length2 number of bytes in the second range
     * @return -1 if the first range is less than the second, +1 if it is
     *         greater, 0 if both ranges have the same length and content
     */
    public static int compare(byte[] b1, int start1, int length1, byte[] b2, int start2, int length2) {
        final int shared = length1 <= length2 ? length1 : length2;

        for (int k = 0; k < shared; k++) {
            final byte left = b1[start1 + k];
            final byte right = b2[start2 + k];
            if (left != right) {
                return left < right ? -1 : 1;
            }
        }

        // Common prefix is identical: the length decides.
        if (length1 == length2) {
            return 0;
        }
        return length1 < length2 ? -1 : 1;
    }

    /**
     * Decodes a range of a UTF-8 byte array into a String.
     *
     * @param bytes
     *          The byte[] containing the UTF-8 String.
     * @param start
     *          The start position inside the bytes.
     * @param length
     *          The length of the data, starting from "start"
     * @return The decoded String, or {@code null} if decoding fails with a
     *         {@link CharacterCodingException}
     */
    public static String convertToString(byte[] bytes, int start, int length) {
        String decoded;
        try {
            decoded = Text.decode(bytes, start, length);
        } catch (CharacterCodingException e) {
            // Undecodable input is reported to the caller as null rather than as an exception.
            decoded = null;
        }
        return decoded;
    }

    /**
     * ASCII bytes of the literal {@code true}. This array is a shared, mutable
     * static; callers should treat it as read-only.
     */
    public static byte[] trueBytes = new byte[] { 't', 'r', 'u', 'e' };

    /**
     * ASCII bytes of the literal {@code false}. This array is a shared, mutable
     * static; callers should treat it as read-only.
     */
    public static byte[] falseBytes = new byte[] { 'f', 'a', 'l', 's', 'e' };

    /**
     * Writes a byte range to a stream, optionally prefixing special bytes with
     * an escape byte.
     *
     * <p>When escaping is on, every byte flagged in {@code needsEscape} is
     * preceded by {@code escapeChar}. Carriage return and line feed are
     * additionally replaced by the letters {@code r} and {@code n}; any other
     * flagged byte is written unchanged after the escape byte.
     *
     * @param out
     *          The stream to write to.
     * @param bytes
     *          The buffer holding the data.
     * @param start
     *          Index of the first byte to write.
     * @param len
     *          Number of bytes to write.
     * @param escaped
     *          Whether the data should be written out in an escaped way.
     * @param escapeChar
     *          If escaped, the char for prefixing special characters.
     * @param needsEscape
     *          If escaped, whether a specific character needs escaping. This
     *          array should have size of 256.
     * @throws IOException
     *           if writing to {@code out} fails
     */
    public static void writeEscaped(OutputStream out, byte[] bytes, int start, int len, boolean escaped,
            byte escapeChar, boolean[] needsEscape) throws IOException {
        if (!escaped) {
            out.write(bytes, start, len);
            return;
        }

        final int limit = start + len;
        // Index of the first byte that has been scanned but not yet written.
        int pending = start;

        for (int pos = start; pos < limit; pos++) {
            final byte current = bytes[pos];
            // Mask converts a negative byte into a non-negative table index.
            if (!needsEscape[current & 0xFF]) {
                continue;
            }

            // Flush the run of ordinary bytes that precedes this special byte.
            if (pos > pending) {
                out.write(bytes, pending, pos - pending);
            }
            out.write(escapeChar);

            final int replacement = (current == '\r') ? 'r' : (current == '\n') ? 'n' : -1;
            if (replacement >= 0) {
                out.write(replacement);
                pending = pos + 1;
            } else {
                // The special byte itself is emitted as the head of the next run.
                pending = pos;
            }
        }

        // Flush whatever is left after the last special byte.
        if (limit > pending) {
            out.write(bytes, pending, limit - pending);
        }
    }

    /**
     * Writes the text form of a primitive value to a UTF-8 byte stream.
     *
     * <p>The primitive category reported by {@code oi} selects the encoding:
     * booleans are written as {@code true}/{@code false}, FLOAT and DOUBLE via
     * {@link String#valueOf}, BINARY as Base64, and CHAR using its padded
     * value. Escaping is applied only to STRING, CHAR and VARCHAR values; the
     * other categories delegate to the matching {@code Lazy*} writer.
     *
     * @param out
     *          The UTF8 byte OutputStream
     * @param o
     *          The primitive Object
     * @param oi
     *          The inspector used to read {@code o}; its category picks the format
     * @param escaped
     *          Whether string-like values should be written out escaped
     * @param escapeChar
     *          If escaped, the byte that prefixes special characters
     * @param needsEscape
     *          Whether a character needs escaping.
     * @throws IOException
     *           if writing to {@code out} fails
     * @throws RuntimeException
     *           if the primitive category is not handled
     */
    public static void writePrimitiveUTF8(OutputStream out, Object o, PrimitiveObjectInspector oi, boolean escaped,
            byte escapeChar, boolean[] needsEscape) throws IOException {

        final PrimitiveObjectInspector.PrimitiveCategory kind = oi.getPrimitiveCategory();
        switch (kind) {
        case BOOLEAN: {
            final byte[] literal = ((BooleanObjectInspector) oi).get(o) ? trueBytes : falseBytes;
            out.write(literal, 0, literal.length);
            return;
        }
        case BYTE:
            LazyInteger.writeUTF8(out, ((ByteObjectInspector) oi).get(o));
            return;
        case SHORT:
            LazyInteger.writeUTF8(out, ((ShortObjectInspector) oi).get(o));
            return;
        case INT:
            LazyInteger.writeUTF8(out, ((IntObjectInspector) oi).get(o));
            return;
        case LONG:
            LazyLong.writeUTF8(out, ((LongObjectInspector) oi).get(o));
            return;
        case FLOAT:
            writeEncodedString(out, String.valueOf(((FloatObjectInspector) oi).get(o)));
            return;
        case DOUBLE:
            writeEncodedString(out, String.valueOf(((DoubleObjectInspector) oi).get(o)));
            return;
        case STRING:
            writeEscapedText(out, ((StringObjectInspector) oi).getPrimitiveWritableObject(o), escaped,
                    escapeChar, needsEscape);
            return;
        case CHAR: {
            final HiveCharWritable charValue = ((HiveCharObjectInspector) oi).getPrimitiveWritableObject(o);
            writeEscapedText(out, charValue.getPaddedValue(), escaped, escapeChar, needsEscape);
            return;
        }
        case VARCHAR: {
            final HiveVarcharWritable varcharValue = ((HiveVarcharObjectInspector) oi)
                    .getPrimitiveWritableObject(o);
            writeEscapedText(out, varcharValue.getTextValue(), escaped, escapeChar, needsEscape);
            return;
        }
        case BINARY: {
            final BytesWritable binary = ((BinaryObjectInspector) oi).getPrimitiveWritableObject(o);
            // Only the first getLength() bytes of the backing array are valid data.
            final byte[] payload = Arrays.copyOf(binary.getBytes(), binary.getLength());
            final byte[] encoded = Base64.encodeBase64(payload);
            out.write(encoded, 0, encoded.length);
            return;
        }
        case DATE:
            LazyDate.writeUTF8(out, ((DateObjectInspector) oi).getPrimitiveWritableObject(o));
            return;
        case TIMESTAMP:
            LazyTimestamp.writeUTF8(out, ((TimestampObjectInspector) oi).getPrimitiveWritableObject(o));
            return;
        case TIMESTAMPTZ:
            LazyTimestampTZ.writeUTF8(out, ((TimestampTZObjectInspector) oi).getPrimitiveWritableObject(o));
            return;
        case INTERVAL_YEAR_MONTH:
            LazyHiveIntervalYearMonth.writeUTF8(out,
                    ((HiveIntervalYearMonthObjectInspector) oi).getPrimitiveWritableObject(o));
            return;
        case INTERVAL_DAY_TIME:
            LazyHiveIntervalDayTime.writeUTF8(out,
                    ((HiveIntervalDayTimeObjectInspector) oi).getPrimitiveWritableObject(o));
            return;
        case DECIMAL: {
            final HiveDecimalObjectInspector decimalInspector = (HiveDecimalObjectInspector) oi;
            LazyHiveDecimal.writeUTF8(out, decimalInspector.getPrimitiveJavaObject(o), decimalInspector.scale());
            return;
        }
        default:
            throw new RuntimeException("Unknown primitive type: " + kind);
        }
    }

    /** Encodes {@code value} with {@link Text#encode(String)} and writes the resulting bytes to {@code out}. */
    private static void writeEncodedString(OutputStream out, String value) throws IOException {
        final ByteBuffer encoded = Text.encode(value);
        out.write(encoded.array(), 0, encoded.limit());
    }

    /** Writes the valid bytes of {@code text} through {@link #writeEscaped}. */
    private static void writeEscapedText(OutputStream out, Text text, boolean escaped, byte escapeChar,
            boolean[] needsEscape) throws IOException {
        writeEscaped(out, text.getBytes(), 0, text.getLength(), escaped, escapeChar, needsEscape);
    }

    /**
     * Writes the binary representation of a primitive value to a byte stream.
     *
     * <p>Fixed-width categories (BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE)
     * go through a {@link DataOutputStream} wrapped around {@code out}. BINARY
     * values are written to {@code out} directly as raw bytes. Any other
     * category is rejected.
     *
     * <p>The wrapping {@code DataOutputStream} is closed before returning, and
     * closing it also closes {@code out}.
     *
     * @param out ByteStream.Output, an unsynchronized version of ByteArrayOutputStream, used as a
     *            backing buffer for the DataOutputStream
     * @param o the PrimitiveObject
     * @param oi the PrimitiveObjectInspector
     * @throws IOException on error during the write operation
     * @throws RuntimeException if the primitive category is not one of those listed above
     */
    public static void writePrimitive(OutputStream out, Object o, PrimitiveObjectInspector oi) throws IOException {

        final DataOutputStream dataOut = new DataOutputStream(out);

        try {
            switch (oi.getPrimitiveCategory()) {
            case BOOLEAN:
                dataOut.writeBoolean(((BooleanObjectInspector) oi).get(o));
                break;

            case BYTE:
                dataOut.writeByte(((ByteObjectInspector) oi).get(o));
                break;

            case SHORT:
                dataOut.writeShort(((ShortObjectInspector) oi).get(o));
                break;

            case INT:
                dataOut.writeInt(((IntObjectInspector) oi).get(o));
                break;

            case LONG:
                dataOut.writeLong(((LongObjectInspector) oi).get(o));
                break;

            case FLOAT:
                dataOut.writeFloat(((FloatObjectInspector) oi).get(o));
                break;

            case DOUBLE:
                dataOut.writeDouble(((DoubleObjectInspector) oi).get(o));
                break;

            case BINARY: {
                final BytesWritable binary = ((BinaryObjectInspector) oi).getPrimitiveWritableObject(o);
                // Raw bytes bypass the DataOutputStream and go straight to the target stream.
                out.write(binary.getBytes(), 0, binary.getLength());
                break;
            }

            default:
                throw new RuntimeException("Hive internal error.");
            }
        } finally {
            // closing the underlying ByteStream should have no effect, the data should still be
            // accessible
            dataOut.close();
        }
    }

    /**
     * Computes a hash over {@code len} bytes of {@code data}, beginning at
     * index {@code start}.
     *
     * <p>The hash starts at 1 and folds in each byte as
     * {@code hash = 31 * hash + b}, which is the same recurrence
     * {@link java.util.Arrays#hashCode(byte[])} applies to a whole array.
     *
     * <p>The loop previously stopped at index {@code len} instead of
     * {@code start + len}, so any call with {@code start > 0} hashed too few
     * bytes (often none). Results for {@code start == 0} are unchanged.
     *
     * @param data  the buffer holding the bytes to hash
     * @param start index of the first byte to include
     * @param len   number of bytes to include
     * @return the hash of the range; 1 when {@code len} is zero or negative
     */
    public static int hashBytes(byte[] data, int start, int len) {
        int hash = 1;
        // len is a count, not an end index, so the range is [start, start + len).
        final int end = start + len;
        for (int i = start; i < end; i++) {
            hash = (31 * hash) + data[i];
        }
        return hash;
    }

    /**
     * Returns a new byte[] holding a copy of the data in the given BytesWritable.
     *
     * <p>Only the first {@code getLength()} bytes of the writable's backing
     * array are copied.
     *
     * @param sourceBw - source BytesWritable
     * @return a freshly allocated array of length {@code sourceBw.getLength()}
     */
    public static byte[] createByteArray(BytesWritable sourceBw) {
        //TODO should replace with BytesWritable.copyData() once Hive
        //removes support for the Hadoop 0.20 series.
        final byte[] backing = sourceBw.getBytes();
        final int validLength = sourceBw.getLength();
        return Arrays.copyOf(backing, validLength);
    }

    /**
     * Looks up the separator byte used at a given nesting level during
     * serialization.
     *
     * <p>This exists to turn an out-of-range lookup into a descriptive
     * {@link SerDeException}. Depending on how many separators are configured,
     * the message also names a serde property to set.
     *
     * @param separators - array of separators byte, byte at index x indicates
     *  separator used at that level
     * @param level - nesting level
     * @return separator at given level
     * @throws SerDeException if {@code level} is outside the bounds of {@code separators}
     */
    static byte getSeparator(byte[] separators, int level) throws SerDeException {
        try {
            return separators[level];
        } catch (ArrayIndexOutOfBoundsException e) {
            final int available = separators.length;

            final StringBuilder message = new StringBuilder();
            message.append("Number of levels of nesting supported for " + "LazySimpleSerde is ");
            message.append(available - 1);
            message.append(" Unable to work with level ");
            message.append(level);

            final String hint = ". Use %s serde property for tables using LazySimpleSerde.";

            // NOTE(review): 9 and 25 presumably correspond to the separator counts of the
            // default and first-extended configurations - confirm against LazySerDeParameters.
            if (available < 9) {
                message.append(String.format(hint, LazySerDeParameters.SERIALIZATION_EXTEND_NESTING_LEVELS));
            } else if (available < 25) {
                message.append(
                        String.format(hint, LazySerDeParameters.SERIALIZATION_EXTEND_ADDITIONAL_NESTING_LEVELS));
            }

            throw new SerDeException(message.toString(), e);
        }
    }

    /**
     * Copies a byte range into {@code data}, resolving escape sequences on the way.
     *
     * <p>An escape byte followed by another byte collapses into a single output
     * byte: {@code r} becomes carriage return, {@code n} becomes line feed, and
     * any other byte is kept as is. An escape byte in the very last position
     * has nothing to escape and is copied unchanged.
     *
     * @param inputBytes the buffer holding the escaped input
     * @param start      index of the first input byte
     * @param length     number of input bytes
     * @param escapeChar the byte that introduces an escape sequence
     * @param data       the Text that receives the unescaped bytes
     */
    public static void copyAndEscapeStringDataToText(byte[] inputBytes, int start, int length, byte escapeChar,
            Text data) {

        // Pass 1: work out how many bytes the unescaped result will occupy.
        int unescapedLength = 0;
        int scan = 0;
        while (scan < length) {
            // An escape byte and the byte that follows it count as one output byte.
            scan += (inputBytes[start + scan] == escapeChar) ? 2 : 1;
            unescapedLength++;
        }

        // Bulk copy first, so that the internal state of Text is sized to the
        // required output length.
        data.set(inputBytes, start, unescapedLength);

        // Without any escape sequence the bulk copy is already the final result.
        if (unescapedLength >= length) {
            return;
        }

        // Pass 2: overwrite the Text buffer byte by byte with the unescaped data.
        final byte[] target = data.getBytes();
        final int lastIndex = length - 1;
        int written = 0;
        for (int read = 0; read < length; read++) {
            final byte current = inputBytes[start + read];
            if (current != escapeChar || read >= lastIndex) {
                target[written++] = current;
                continue;
            }

            // Consume the byte after the escape byte and translate it.
            read++;
            final byte following = inputBytes[start + read];
            if (following == 'r') {
                target[written++] = '\r';
            } else if (following == 'n') {
                target[written++] = '\n';
            } else {
                target[written++] = following;
            }
        }
        assert (written == unescapedLength);
    }

    /**
     * Interprets a string as a single byte value.
     *
     * <p>The string is first parsed as a decimal byte. If that fails, its first
     * character, narrowed to a byte, is used instead.
     *
     * @param altValue
     *          The string containing a number.
     * @param defaultVal
     *          Returned when {@code altValue} is null or empty.
     * @return the parsed byte, the first character as a byte, or {@code defaultVal}
     */
    public static byte getByte(String altValue, byte defaultVal) {
        if (altValue == null || altValue.isEmpty()) {
            return defaultVal;
        }
        try {
            return Byte.parseByte(altValue);
        } catch (NumberFormatException notNumeric) {
            // Not a decimal byte: fall back to the first character.
            return (byte) altValue.charAt(0);
        }
    }

    /** Private so that this class cannot be instantiated from outside. */
    private LazyUtils() {
        // prevent instantiation
    }
}