org.apache.hawq.pxf.service.io.Text.java Source code

Introduction

Here is the source code for org.apache.hawq.pxf.service.io.Text.java
Source

package org.apache.hawq.pxf.service.io;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.*;
import java.util.Arrays;

/**
 * This class stores text using standard UTF8 encoding. It provides methods to
 * serialize, deserialize. The type of length is integer and is serialized using
 * zero-compressed format.
 */
public class Text implements Writable {

    // for write
    private byte[] buf;
    private static final Log LOG = LogFactory.getLog(Text.class);
    int curLoc;
    private static final char LINE_DELIMITER = '\n';
    private static final int BUF_SIZE = 1024;
    private static final int EOF = -1;

    private static final byte[] EMPTY_BYTES = new byte[0];
    private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY = new ThreadLocal<CharsetEncoder>() {
        @Override
        protected CharsetEncoder initialValue() {
            return Charset.forName("UTF-8").newEncoder().onMalformedInput(CodingErrorAction.REPORT)
                    .onUnmappableCharacter(CodingErrorAction.REPORT);
        }
    };
    private static ThreadLocal<CharsetDecoder> DECODER_FACTORY = new ThreadLocal<CharsetDecoder>() {
        @Override
        protected CharsetDecoder initialValue() {
            return Charset.forName("UTF-8").newDecoder().onMalformedInput(CodingErrorAction.REPORT)
                    .onUnmappableCharacter(CodingErrorAction.REPORT);
        }
    };
    private byte[] bytes;
    private int length;

    public Text() {
        bytes = EMPTY_BYTES;
        buf = new byte[BUF_SIZE];
    }

    /**
     * Construct from a string.
     *
     * @param string input string
     */
    public Text(String string) {
        set(string);
    }

    /**
     * Construct from another text.
     *
     * @param utf8 text to copy
     */
    public Text(Text utf8) {
        set(utf8);
    }

    /**
     * Construct from a byte array.
     *
     * @param utf8 input byte array
     */
    public Text(byte[] utf8) {
        set(utf8);
    }

    public static boolean isNegativeVInt(byte value) {
        return value < -120 || (value >= -112 && value < 0);
    }

    public static long readVLong(DataInput stream) throws IOException {
        byte firstByte = stream.readByte();
        int len = decodeVIntSize(firstByte);
        if (len == 1) {
            return firstByte;
        }
        long i = 0;
        for (int idx = 0; idx < len - 1; idx++) {
            byte b = stream.readByte();
            i = i << 8;
            i = i | (b & 0xFF);
        }
        return (isNegativeVInt(firstByte) ? (i ^ -1L) : i);
    }

    public static int decodeVIntSize(byte value) {
        if (value >= -112) {
            return 1;
        } else if (value < -120) {
            return -119 - value;
        }
        return -111 - value;
    }

    public static String decode(byte[] utf8, int start, int length) throws CharacterCodingException {
        return decode(ByteBuffer.wrap(utf8, start, length), true);
    }

    /**
     * Converts the provided byte array to a String using the UTF-8 encoding. If
     * <code>replace</code> is true, then malformed input is replaced with the
     * substitution character, which is U+FFFD. Otherwise the method throws a
     * MalformedInputException.
     *
     * @param utf8 UTF-8 encoded byte array
     * @param start start point
     * @param length length of array
     * @param replace whether to replace malformed input with substitution
     *            character
     * @return decoded string
     * @throws MalformedInputException if a malformed input is used
     * @throws CharacterCodingException if the conversion failed
     */
    public static String decode(byte[] utf8, int start, int length, boolean replace)
            throws CharacterCodingException {
        return decode(ByteBuffer.wrap(utf8, start, length), replace);
    }

    private static String decode(ByteBuffer utf8, boolean replace) throws CharacterCodingException {
        CharsetDecoder decoder = DECODER_FACTORY.get();
        if (replace) {
            decoder.onMalformedInput(java.nio.charset.CodingErrorAction.REPLACE);
            decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
        }
        String str = decoder.decode(utf8).toString();
        // set decoder back to its default value: REPORT
        if (replace) {
            decoder.onMalformedInput(CodingErrorAction.REPORT);
            decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
        }
        return str;
    }

    /**
     * Converts the provided String to bytes using the UTF-8 encoding. If the
     * input is malformed, invalid chars are replaced by a default value.
     *
     * @param string string to encode
     * @return ByteBuffer: bytes stores at ByteBuffer.array() and length is
     *         ByteBuffer.limit()
     * @throws CharacterCodingException if conversion failed
     */
    public static ByteBuffer encode(String string) throws CharacterCodingException {
        return encode(string, true);
    }

    /**
     * Converts the provided String to bytes using the UTF-8 encoding. If
     * <code>replace</code> is true, then malformed input is replaced with the
     * substitution character, which is U+FFFD. Otherwise the method throws a
     * MalformedInputException.
     *
     * @param string string to encode
     * @param replace whether to replace malformed input with substitution
     *            character
     * @return ByteBuffer: bytes stores at ByteBuffer.array() and length is
     *         ByteBuffer.limit()
     * @throws MalformedInputException if a malformed input is used
     * @throws CharacterCodingException if the conversion failed
     */
    public static ByteBuffer encode(String string, boolean replace) throws CharacterCodingException {
        CharsetEncoder encoder = ENCODER_FACTORY.get();
        if (replace) {
            encoder.onMalformedInput(CodingErrorAction.REPLACE);
            encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
        }
        ByteBuffer bytes = encoder.encode(CharBuffer.wrap(string.toCharArray()));
        if (replace) {
            encoder.onMalformedInput(CodingErrorAction.REPORT);
            encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
        }
        return bytes;
    }

    /**
     * Returns the raw bytes; however, only data up to {@link #getLength()} is
     * valid.
     *
     * @return raw bytes of byte array
     */
    public byte[] getBytes() {
        return bytes;
    }

    /**
     * Returns the number of bytes in the byte array
     *
     * @return number of bytes in byte array
     */
    public int getLength() {
        return length;
    }

    /**
     * Sets to contain the contents of a string.
     *
     * @param string input string
     */
    public void set(String string) {
        try {
            ByteBuffer bb = encode(string, true);
            bytes = bb.array();
            length = bb.limit();
        } catch (CharacterCodingException e) {
            throw new RuntimeException("Should not have happened " + e.toString());
        }
    }

    /**
     * Sets to a UTF-8 byte array.
     *
     * @param utf8 input UTF-8 byte array
     */
    public void set(byte[] utf8) {
        set(utf8, 0, utf8.length);
    }

    /**
     * Copies a text.
     *
     * @param other text object to copy.
     */
    public void set(Text other) {
        set(other.getBytes(), 0, other.getLength());
    }

    /**
     * Sets the Text to range of bytes.
     *
     * @param utf8 the data to copy from
     * @param start the first position of the new string
     * @param len the number of bytes of the new string
     */
    public void set(byte[] utf8, int start, int len) {
        setCapacity(len, false);
        System.arraycopy(utf8, start, bytes, 0, len);
        this.length = len;
    }

    /**
     * Appends a range of bytes to the end of the given text.
     *
     * @param utf8 the data to copy from
     * @param start the first position to append from utf8
     * @param len the number of bytes to append
     */
    public void append(byte[] utf8, int start, int len) {
        setCapacity(length + len, true);
        System.arraycopy(utf8, start, bytes, length, len);
        length += len;
    }

    /**
     * Clears the string to empty.
     */
    public void clear() {
        length = 0;
    }

    /*
     * Sets the capacity of this Text object to <em>at least</em>
     * <code>len</code> bytes. If the current buffer is longer, then the
     * capacity and existing content of the buffer are unchanged. If
     * <code>len</code> is larger than the current capacity, the Text object's
     * capacity is increased to match.
     *
     * @param len the number of bytes we need
     *
     * @param keepData should the old data be kept
     */
    private void setCapacity(int len, boolean keepData) {
        if (bytes == null || bytes.length < len) {
            byte[] newBytes = new byte[len];
            if (bytes != null && keepData) {
                System.arraycopy(bytes, 0, newBytes, 0, length);
            }
            bytes = newBytes;
        }
    }

    /**
     * Convert text back to string
     *
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        try {
            return decode(bytes, 0, length);
        } catch (CharacterCodingException e) {
            throw new RuntimeException("Should not have happened " + e.toString());
        }
    }

    @Override
    public void write(DataOutput out) throws IOException {
        byte[] bytes = getBytes();
        out.write(bytes, 0, getLength());
    }

    /**
     * deserialize
     */
    @Override
    public void readFields(DataInput inputStream) throws IOException {

        byte c;
        curLoc = 0;
        clear();
        while ((c = (byte) ((DataInputStream) inputStream).read()) != EOF) {
            buf[curLoc] = c;
            curLoc++;

            if (c == LINE_DELIMITER) {
                LOG.trace("read one line, size " + curLoc);
                break;
            }

            if (isBufferFull()) {
                flushBuffer();
            }
        }

        if (!isBufferEmpty()) {
            // the buffer doesn't end with a line break.
            if (c == EOF) {
                LOG.warn("Stream ended without line break");
            }
            flushBuffer();
        }
    }

    private boolean isBufferEmpty() {
        return (curLoc == 0);
    }

    private boolean isBufferFull() {
        return (curLoc == BUF_SIZE);
    }

    private void flushBuffer() {
        append(buf, 0, curLoc);
        curLoc = 0;
    }

    /**
     * Returns true iff <code>o</code> is a Text with the same contents.
     */
    @Override
    public boolean equals(Object o) {
        return (o instanceof Text && Arrays.equals(bytes, ((Text) o).bytes));
    }

    @Override
    public int hashCode() {
        return Arrays.hashCode(bytes);
    }
}