org.apache.hadoop.hive.serde2.teradata.TeradataBinaryDataInputStream.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.serde2.teradata.TeradataBinaryDataInputStream.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.serde2.teradata;

import org.apache.commons.io.input.SwappedDataInputStream;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.common.type.Date;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.Timestamp;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigInteger;
import java.text.ParseException;

import static java.lang.String.format;

/**
 * The TeradataBinaryDataInputStream is used to handle the Teradata binary format input for record.
 * Since the TD binary format uses little-endian to handle the SHORT, INT, LONG, DOUBLE and etc.
 * while the Hadoop uses big-endian,
 * We extend SwappedDataInputStream to handle these types and extend to handle the Teradata
 * specific types like VARCHAR, CHAR, TIMESTAMP, DATE...
 */
public class TeradataBinaryDataInputStream extends SwappedDataInputStream {

    private static final int DATE_STRING_LENGTH = 8;

    /**
     * Instantiates a new Teradata binary data input stream.
     *
     * @param input the input
     */
    public TeradataBinaryDataInputStream(InputStream input) {
        super(input);
    }

    /**
     * Read VARCHAR(N).
     * The representation of Varchar in Teradata binary format is:
     * the first two bytes represent the length N of this varchar field,
     * the next N bytes represent the content of this varchar field.
     * To pad the null varchar, the length will be 0 and the content will be none.
     *
     * @return the string
     * @throws IOException the io exception
     */
    public String readVarchar() throws IOException {
        int varcharLength = readUnsignedShort();
        byte[] varcharContent = new byte[varcharLength];
        int numOfBytesRead = in.read(varcharContent);
        if (varcharContent.length != 0 && numOfBytesRead != varcharLength) {
            throw new EOFException(format("Fail to read the varchar. Expect %d bytes, get %d bytes", varcharLength,
                    numOfBytesRead));
        }
        //force it to be UTF8 string
        return new String(varcharContent, "UTF8");
    }

    /**
     * Read TIMESTAMP(P).
     * The representation of timestamp in Teradata binary format is:
     * the byte number to read is based on the precision of timestamp,
     * each byte represents one char and the timestamp is using string representation,
     * eg: for TIMESTAMP(6), we need to read 26 bytes
     * 31 39  31 31 2d 31 31 2d 31 31 20 31 39 3a 32 30 3a 32 31 2e 34 33 33 32 30 30
     * will represent 1911-11-11 19:20:21.433200.
     * the null timestamp will use space to pad.
     *
     * @param byteNum the byte number that will be read from inputstream
     * @return the timestamp
     * @throws IOException the io exception
     */
    public Timestamp readTimestamp(Integer byteNum) throws IOException {
        // yyyy-mm-dd hh:mm:ss
        byte[] timestampContent = new byte[byteNum];
        int numOfBytesRead = in.read(timestampContent);
        if (timestampContent.length != 0 && numOfBytesRead != byteNum) {
            throw new EOFException(
                    format("Fail to read the timestamp. Expect %d bytes, get %d bytes", byteNum, numOfBytesRead));
        }
        String timestampStr = new String(timestampContent, "UTF8");
        if (timestampStr.trim().length() == 0) {
            return null;
        }
        return Timestamp.valueOf(timestampStr);
    }

    /**
     * Read DATE.
     * The representation of date in Teradata binary format is:
     * The Date D is a int with 4 bytes using little endian,
     * The representation is (D+19000000).ToString -> YYYYMMDD,
     * eg: Date 07 b2 01 00 -> 111111 in little endian -> 19111111 - > 1911.11.11.
     * the null date will use 0 to pad.
     *
     * @return the date
     * @throws IOException the io exception
     * @throws ParseException the parse exception
     */
    public Date readDate() throws IOException, ParseException {
        int di = readInt();
        if (di == 0) {
            return null;
        }
        String dateString = String.valueOf(di + 19000000);
        if (dateString.length() < DATE_STRING_LENGTH) {
            dateString = StringUtils.leftPad(dateString, DATE_STRING_LENGTH, '0');
        }
        Date date = new Date();
        date.setYear(Integer.parseInt(dateString.substring(0, 4)));
        date.setMonth(Integer.parseInt(dateString.substring(4, 6)));
        date.setDayOfMonth(Integer.parseInt(dateString.substring(6, 8)));
        return date;
    }

    /**
     * Read CHAR(N).
     * The representation of char in Teradata binary format is
     * the byte number to read is based on the [charLength] * [bytePerChar] <- totalLength,
     * bytePerChar is decided by the charset: LATAIN charset is 2 bytes per char and UNICODE charset is 3 bytes per char.
     * the null char will use space to pad.
     *
     * @param totalLength the total length
     * @return the string
     * @throws IOException the io exception
     */
    public String readChar(int totalLength) throws IOException {
        byte[] charContent = new byte[totalLength];
        int numOfBytesRead = in.read(charContent);
        if (charContent.length != 0 && numOfBytesRead != totalLength) {
            throw new EOFException(
                    format("Fail to read the varchar. Expect %d bytes, get %d bytes", totalLength, numOfBytesRead));
        }
        return new String(charContent, "UTF8");
    }

    /**
     * Read DECIMAL(P, S).
     * The representation of decimal in Teradata binary format is
     * the byte number to read is decided solely by the precision(P),
     * HiveDecimal is constructed through the byte array and scale.
     * the null DECIMAL will use 0x00 to pad.
     *
     * @param scale the scale
     * @param byteNum the byte num
     * @return the hive decimal
     * @throws IOException the io exception
     */
    public HiveDecimal readDecimal(int scale, int byteNum) throws IOException {
        byte[] decimalContent = new byte[byteNum];
        int numOfBytesRead = in.read(decimalContent);
        if (decimalContent.length != 0 && numOfBytesRead != byteNum) {
            throw new EOFException(
                    format("Fail to read the decimal. Expect %d bytes, get %d bytes", byteNum, numOfBytesRead));
        }
        ArrayUtils.reverse(decimalContent);
        return HiveDecimal.create(new BigInteger(decimalContent), scale);
    }

    /**
     * Read VARBYTE(N).
     * The representation of VARBYTE in Teradata binary format is:
     * the first two bytes represent the length N of this varchar field
     * the next N bytes represent the content of this varchar field.
     * To pad the null varbyte, the length will be 0 and the content will be none.
     *
     * @return the byte [ ]
     * @throws IOException the io exception
     */
    public byte[] readVarbyte() throws IOException {
        int varbyteLength = readUnsignedShort();
        byte[] varbyteContent = new byte[varbyteLength];
        int numOfBytesRead = in.read(varbyteContent);
        if (varbyteContent.length != 0 && numOfBytesRead != varbyteLength) {
            throw new EOFException(format("Fail to read the varbyte. Expect %d bytes, get %d bytes", varbyteLength,
                    numOfBytesRead));
        }
        return varbyteContent;
    }
}