org.cloudata.core.common.io.CUTF8.java Source code

Introduction

Here is the source code for org.cloudata.core.common.io.CUTF8.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.cloudata.core.common.io;

import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;

import org.apache.commons.logging.*;

/** A WritableComparable for strings that uses the UTF8 encoding.
 *
 * <p>Also includes utilities for efficiently reading and writing UTF-8.
 *
 * <p>Encoding, as implemented by {@link #writeChars} and {@link #utf8Length}: every
 * {@code char} of the string is encoded on its own. Chars in {@code 0x0001..0x007F} take one
 * byte, chars up to {@code 0x07FF} (and {@code 0x0000}) take two bytes, and all other chars
 * take three bytes. The serialized form is an unsigned 16-bit byte count followed by that many
 * encoded bytes, so an encoded value never exceeds {@code 0xffff} bytes; longer strings are
 * truncated with a warning.
 *
 * <p>The static scratch buffers are shared by all instances and are guarded by
 * {@code synchronized} blocks; the per-instance fields are not guarded.
 *
 * @author Doug Cutting
 */
public class CUTF8 implements CWritableComparable {
    // NOTE(review): the logger is registered under the Hadoop UTF8 class name rather than this
    // class's own name. Left unchanged so logging configuration keyed on that name keeps working.
    private static final Log LOG = LogFactory.getLog("org.apache.hadoop.io.UTF8");

    /** Shared scratch buffers; every use synchronizes on the buffer being used. */
    private static final CDataOutputBuffer OBUF = new CDataOutputBuffer();
    private static final CDataInputBuffer IBUF = new CDataInputBuffer();

    private static final byte[] EMPTY_BYTES = new byte[0];

    /** Largest encoded size that fits in the unsigned 16-bit length prefix. */
    private static final int MAX_ENCODED_LENGTH = 0xffff;

    /** Longest string, in chars, that always fits: a char needs at most three bytes. */
    private static final int MAX_CHARS = MAX_ENCODED_LENGTH / 3;

    /** Backing store; only the first {@link #length} bytes are meaningful. */
    private byte[] bytes = EMPTY_BYTES;
    private int length;

    /** Construct an empty value (zero encoded bytes). */
    public CUTF8() {
    }

    /** Construct from a given string. */
    public CUTF8(String string) {
        set(string);
    }

    /** Construct a copy of another CUTF8. */
    public CUTF8(CUTF8 utf8) {
        set(utf8);
    }

    /**
     * The raw bytes.
     *
     * <p>This is the internal array, not a copy. It may be longer than {@link #getLength()};
     * only the first {@code getLength()} bytes hold the encoded string.
     */
    public byte[] getBytes() {
        return bytes;
    }

    /** The number of bytes in the encoded string. */
    public int getLength() {
        return length;
    }

    /**
     * Set to contain the contents of a string.
     *
     * <p>Strings longer than {@code 0xffff / 3} chars are truncated to that many chars and a
     * warning is logged.
     *
     * @param string the string to encode
     * @throws RuntimeException if the encoded form would still exceed {@code 0xffff} bytes, or
     *         if writing to the scratch buffer fails
     */
    public void set(String string) {
        if (string.length() > MAX_CHARS) { // maybe too long
            LOG.warn("truncating long string: " + string.length() + " chars, starting with "
                    + string.substring(0, 20));
            string = string.substring(0, MAX_CHARS);
        }

        length = utf8Length(string); // compute length
        if (length > MAX_ENCODED_LENGTH) { // double-check length
            throw new RuntimeException("string too long!");
        }

        if (bytes == null || length > bytes.length) { // grow buffer
            bytes = new byte[length];
        }

        try { // avoid sync'd allocations
            synchronized (OBUF) {
                OBUF.reset();
                writeChars(OBUF, string, 0, string.length());
                System.arraycopy(OBUF.getData(), 0, bytes, 0, length);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Set to contain the contents of another CUTF8.
     *
     * <p>The bytes are copied; the two instances do not share a backing array afterwards.
     *
     * @param other the value to copy
     */
    public void set(CUTF8 other) {
        length = other.length;
        if (bytes == null || length > bytes.length) { // grow buffer
            bytes = new byte[length];
        }
        System.arraycopy(other.bytes, 0, bytes, 0, length);
    }

    /**
     * Reads the serialized form: an unsigned 16-bit byte count followed by that many bytes.
     *
     * @param in the input to read from
     * @throws IOException if reading from {@code in} fails
     */
    public void readFields(DataInput in) throws IOException {
        length = in.readUnsignedShort();
        if (bytes == null || bytes.length < length) {
            bytes = new byte[length];
        }
        in.readFully(bytes, 0, length);
    }

    /** Skips over one UTF8 in the input. */
    public static void skip(DataInput in) throws IOException {
        int length = in.readUnsignedShort();
        CWritableUtils.skipFully(in, length);
    }

    /**
     * Writes the serialized form: the byte count as a 16-bit value followed by the bytes.
     *
     * @param out the output to write to
     * @throws IOException if writing to {@code out} fails
     */
    public void write(DataOutput out) throws IOException {
        out.writeShort(length);
        out.write(bytes, 0, length);
    }

    /** Compare two UTF8s by their encoded bytes. */
    public int compareTo(Object o) {
        CUTF8 that = (CUTF8) o;
        return CWritableComparator.compareBytes(bytes, 0, length, that.bytes, 0, that.length);
    }

    /**
     * Convert to a String.
     *
     * @throws RuntimeException wrapping the {@link IOException} raised when the bytes cannot
     *         be decoded (for example when they end in the middle of a multi-byte sequence)
     */
    @Override
    public String toString() {
        StringBuilder buffer = new StringBuilder(length);
        try {
            synchronized (IBUF) {
                IBUF.reset(bytes, length);
                readChars(IBUF, buffer, length);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return buffer.toString();
    }

    /** Returns true iff <code>o</code> is a UTF8 with the same contents.  */
    @Override
    public boolean equals(Object o) {
        if (!(o instanceof CUTF8)) {
            return false;
        }
        CUTF8 that = (CUTF8) o;
        return this.length == that.length
                && CWritableComparator.compareBytes(bytes, 0, length, that.bytes, 0, that.length) == 0;
    }

    /** Hash of the first {@link #getLength()} encoded bytes. */
    @Override
    public int hashCode() {
        return CWritableComparator.hashBytes(bytes, length);
    }

    /** A WritableComparator optimized for UTF8 keys. */
    public static class Comparator extends CWritableComparator {
        public Comparator() {
            super(CUTF8.class);
        }

        /**
         * Compares two serialized values in place.
         *
         * <p>Each value's byte count is read from the two-byte prefix at its start offset, and
         * the bytes after the prefix are compared. {@code l1} and {@code l2} are not used.
         */
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            int n1 = readUnsignedShort(b1, s1);
            int n2 = readUnsignedShort(b2, s2);
            return compareBytes(b1, s1 + 2, n1, b2, s2 + 2, n2);
        }
    }

    static { // register this comparator
        CWritableComparator.define(CUTF8.class, new Comparator());
    }

    /// STATIC UTILITIES FROM HERE DOWN

    /// These are probably not used much anymore, and might be removed...

    /** Convert a string to a UTF-8 encoded byte array.
     *
     * <p>Unlike {@link #set(String)}, this does not truncate long strings.
     *
     * @see String#getBytes(String)
     */
    public static byte[] getBytes(String string) {
        byte[] result = new byte[utf8Length(string)];
        try { // avoid sync'd allocations
            synchronized (OBUF) {
                OBUF.reset();
                writeChars(OBUF, string, 0, string.length());
                System.arraycopy(OBUF.getData(), 0, result, 0, OBUF.getLength());
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return result;
    }

    /** Read a UTF-8 encoded string.
     *
     * <p>Expects an unsigned 16-bit byte count followed by that many encoded bytes.
     *
     * @throws IOException if reading fails or the bytes end inside a multi-byte sequence
     * @see DataInput#readUTF()
     */
    public static String readString(DataInput in) throws IOException {
        int bytes = in.readUnsignedShort();
        StringBuilder buffer = new StringBuilder(bytes);
        readChars(in, buffer, bytes);
        return buffer.toString();
    }

    /**
     * Decodes {@code nBytes} encoded bytes from {@code in} and appends the chars to
     * {@code buffer}.
     *
     * <p>The input is first copied into the shared scratch buffer, so the whole decode runs
     * while holding the lock on {@code OBUF}.
     *
     * @throws IOException if reading fails, or if the last sequence needs continuation bytes
     *         that lie beyond {@code nBytes}
     */
    private static void readChars(DataInput in, StringBuilder buffer, int nBytes) throws IOException {
        synchronized (OBUF) {
            OBUF.reset();
            OBUF.write(in, nBytes);
            byte[] bytes = OBUF.getData();
            int i = 0;
            while (i < nBytes) {
                byte b = bytes[i++];
                if ((b & 0x80) == 0) {
                    // single-byte sequence
                    buffer.append((char) (b & 0x7F));
                } else if ((b & 0xE0) != 0xE0) {
                    // two-byte sequence: one continuation byte must still be inside nBytes,
                    // otherwise we would read data that does not belong to this string
                    if (i >= nBytes) {
                        throw new IOException("Truncated UTF8 sequence at byte " + (i - 1) + " of " + nBytes);
                    }
                    buffer.append((char) (((b & 0x1F) << 6) | (bytes[i++] & 0x3F)));
                } else {
                    // three-byte sequence: two continuation bytes must still be inside nBytes
                    if (i + 1 >= nBytes) {
                        throw new IOException("Truncated UTF8 sequence at byte " + (i - 1) + " of " + nBytes);
                    }
                    buffer.append((char) (((b & 0x0F) << 12) | ((bytes[i++] & 0x3F) << 6) | (bytes[i++] & 0x3F)));
                }
            }
        }
    }

    /** Write a UTF-8 encoded string.
     *
     * <p>Strings longer than {@code 0xffff / 3} chars are truncated to that many chars and a
     * warning is logged. The output is a 16-bit byte count followed by the encoded bytes.
     *
     * @return the number of encoded bytes written, not counting the two-byte length prefix
     * @throws IOException if the encoded form exceeds {@code 0xffff} bytes or writing fails
     * @see DataOutput#writeUTF(String)
     */
    public static int writeString(DataOutput out, String s) throws IOException {
        if (s.length() > MAX_CHARS) { // maybe too long
            LOG.warn("truncating long string: " + s.length() + " chars, starting with " + s.substring(0, 20));
            s = s.substring(0, MAX_CHARS);
        }

        int len = utf8Length(s);
        if (len > MAX_ENCODED_LENGTH) { // double-check length
            throw new IOException("string too long!");
        }

        out.writeShort(len);
        writeChars(out, s, 0, s.length());
        return len;
    }

    /** Returns the number of bytes required to write this. */
    private static int utf8Length(String string) {
        int stringLength = string.length();
        int utf8Length = 0;
        for (int i = 0; i < stringLength; i++) {
            int c = string.charAt(i);
            if ((c >= 0x0001) && (c <= 0x007F)) {
                utf8Length++;
            } else if (c > 0x07FF) {
                utf8Length += 3;
            } else {
                // 0x0080..0x07FF, and also 0x0000, take two bytes
                utf8Length += 2;
            }
        }
        return utf8Length;
    }

    /**
     * Encodes {@code length} chars of {@code s}, starting at {@code start}, to {@code out}.
     *
     * <p>Must stay in step with {@link #utf8Length(String)}, which callers use to size buffers
     * and length prefixes.
     */
    private static void writeChars(DataOutput out, String s, int start, int length) throws IOException {
        final int end = start + length;
        for (int i = start; i < end; i++) {
            int code = s.charAt(i);
            if (code >= 0x01 && code <= 0x7F) {
                out.writeByte((byte) code);
            } else if (code <= 0x07FF) {
                // also reached for code == 0, which is written as two bytes
                out.writeByte((byte) (0xC0 | ((code >> 6) & 0x1F)));
                out.writeByte((byte) (0x80 | (code & 0x3F)));
            } else {
                out.writeByte((byte) (0xE0 | ((code >> 12) & 0x0F)));
                out.writeByte((byte) (0x80 | ((code >> 6) & 0x3F)));
                out.writeByte((byte) (0x80 | (code & 0x3F)));
            }
        }
    }

}