com.buaa.cfs.io.UTF8.java Source code

Introduction

Here is the source code for com.buaa.cfs.io.UTF8.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE
 * file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
 * License.  You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */

package com.buaa.cfs.io;

import com.buaa.cfs.utils.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.UTFDataFormatException;

/**
 * A {@link WritableComparable} for strings, stored as a byte sequence in a UTF-8-like encoding.
 * <p>
 * Also includes static utilities for reading and writing such strings.
 * <p>
 * Note that this decodes UTF-8 (including 4-byte sequences) but actually encodes CESU-8, a variant of UTF-8 in which
 * every UTF-16 code unit, surrogates included, is written as its own sequence of at most three bytes: see
 * http://en.wikipedia.org/wiki/CESU-8
 * <p>
 * The serialized form is an unsigned 16-bit byte count followed by that many encoded bytes, so the encoded form can
 * never exceed {@code 0xffff} bytes; {@link #set(String)} and {@link #writeString(DataOutput, String)} truncate longer
 * input (with a warning) instead of failing.
 * <p>
 * Instances are mutable and perform no synchronization on their own state.
 */
public class UTF8 implements WritableComparable<UTF8> {
    private static final Log LOG = LogFactory.getLog(UTF8.class);

    /** Shared input buffer used for decoding in {@link #toStringChecked()}; every use holds its monitor. */
    private static final DataInputBuffer IBUF = new DataInputBuffer();

    /** Per-thread scratch buffer used while encoding and decoding. */
    private static final ThreadLocal<DataOutputBuffer> OBUF_FACTORY = new ThreadLocal<DataOutputBuffer>() {
        @Override
        protected DataOutputBuffer initialValue() {
            return new DataOutputBuffer();
        }
    };

    private static final byte[] EMPTY_BYTES = new byte[0];

    /** Largest encoded size, in bytes: the length prefix is written as an unsigned 16-bit value. */
    private static final int MAX_ENCODED_LENGTH = 0xffff;

    /** Number of chars that is guaranteed to fit, since a single char encodes to at most three bytes. */
    private static final int MAX_CHARS = MAX_ENCODED_LENGTH / 3;

    /** Backing buffer; it is only ever grown, so just the first {@link #length} bytes are meaningful. */
    private byte[] bytes = EMPTY_BYTES;

    /** Number of valid encoded bytes in {@link #bytes}. */
    private int length;

    /** Construct an empty value (zero encoded bytes). */
    public UTF8() {
    }

    /** Construct from a given string. */
    public UTF8(String string) {
        set(string);
    }

    /** Construct a copy of the given UTF8. */
    public UTF8(UTF8 utf8) {
        set(utf8);
    }

    /**
     * The raw bytes. This is the internal buffer, not a copy, and it may be longer than the encoded string: only the
     * first {@link #getLength()} bytes are valid.
     */
    public byte[] getBytes() {
        return bytes;
    }

    /** The number of bytes in the encoded string. */
    public int getLength() {
        return length;
    }

    /**
     * Set to contain the contents of a string.
     * <p>
     * Strings longer than {@code 0xffff / 3} chars are truncated and a warning is logged.
     *
     * @param string the new contents; must not be {@code null}
     *
     * @throws RuntimeException if the encoded form is still longer than {@code 0xffff} bytes, or if encoding fails
     */
    public void set(String string) {
        string = truncateIfTooLong(string);

        length = utf8Length(string); // compute length
        if (length > MAX_ENCODED_LENGTH) { // double-check length
            throw new RuntimeException("string too long!");
        }

        if (bytes == null || length > bytes.length) { // grow buffer
            bytes = new byte[length];
        }

        try { // avoid sync'd allocations
            DataOutputBuffer obuf = OBUF_FACTORY.get();
            obuf.reset();
            writeChars(obuf, string, 0, string.length());
            System.arraycopy(obuf.getData(), 0, bytes, 0, length);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Set to contain a copy of the contents of another UTF8.
     *
     * @param other the value whose encoded bytes are copied; must not be {@code null}
     */
    public void set(UTF8 other) {
        length = other.length;
        if (bytes == null || length > bytes.length) { // grow buffer
            bytes = new byte[length];
        }
        System.arraycopy(other.bytes, 0, bytes, 0, length);
    }

    /**
     * Reads an unsigned 16-bit byte count followed by that many encoded bytes, reusing the current buffer when it is
     * large enough.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        length = in.readUnsignedShort();
        if (bytes == null || bytes.length < length) {
            bytes = new byte[length];
        }
        in.readFully(bytes, 0, length);
    }

    /** Skips over one UTF8 in the input. */
    public static void skip(DataInput in) throws IOException {
        int length = in.readUnsignedShort();
        WritableUtils.skipFully(in, length);
    }

    /** Writes the byte count as a 16-bit value followed by the encoded bytes. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeShort(length);
        out.write(bytes, 0, length);
    }

    /** Compare two UTF8s by their encoded bytes. */
    @Override
    public int compareTo(UTF8 o) {
        return WritableComparator.compareBytes(bytes, 0, length, o.bytes, 0, o.length);
    }

    /**
     * Convert to a String.
     *
     * @throws RuntimeException wrapping the {@link IOException} raised when the bytes cannot be decoded
     */
    @Override
    public String toString() {
        try {
            return toStringChecked();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Convert to a string, checking for valid UTF8.
     *
     * @return the converted string
     *
     * @throws UTFDataFormatException if the underlying bytes contain invalid UTF8 data.
     */
    public String toStringChecked() throws IOException {
        StringBuilder buffer = new StringBuilder(length);
        synchronized (IBUF) { // IBUF is shared by all instances
            IBUF.reset(bytes, length);
            readChars(IBUF, buffer, length);
        }
        return buffer.toString();
    }

    /** Returns true iff <code>o</code> is a UTF8 with the same contents. */
    @Override
    public boolean equals(Object o) {
        if (!(o instanceof UTF8)) {
            return false;
        }
        UTF8 that = (UTF8) o;
        return this.length == that.length
                && WritableComparator.compareBytes(bytes, 0, length, that.bytes, 0, that.length) == 0;
    }

    /** Hashes the valid encoded bytes, so equal values hash alike. */
    @Override
    public int hashCode() {
        return WritableComparator.hashBytes(bytes, length);
    }

    /** A WritableComparator optimized for UTF8 keys. */
    public static class Comparator extends WritableComparator {
        public Comparator() {
            super(UTF8.class);
        }

        /**
         * Compares two serialized UTF8 values in place: each starts with an unsigned 16-bit byte count, and the bytes
         * after that 2-byte prefix are compared. The record lengths {@code l1} and {@code l2} are not consulted.
         */
        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            int n1 = readUnsignedShort(b1, s1);
            int n2 = readUnsignedShort(b2, s2);
            return compareBytes(b1, s1 + 2, n1, b2, s2 + 2, n2);
        }
    }

    static { // register this comparator
        WritableComparator.define(UTF8.class, new Comparator());
    }

    /// STATIC UTILITIES FROM HERE DOWN

    /// These are probably not used much anymore, and might be removed...

    /**
     * Convert a string to an encoded byte array. Unlike {@link #set(String)}, no truncation is applied here.
     *
     * @see String#getBytes(String)
     */
    public static byte[] getBytes(String string) {
        byte[] result = new byte[utf8Length(string)];
        try { // avoid sync'd allocations
            DataOutputBuffer obuf = OBUF_FACTORY.get();
            obuf.reset();
            writeChars(obuf, string, 0, string.length());
            System.arraycopy(obuf.getData(), 0, result, 0, obuf.getLength());
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return result;
    }

    /**
     * Convert a UTF-8 encoded byte array back into a string.
     *
     * @throws IOException if the byte array is invalid UTF8
     */
    public static String fromBytes(byte[] bytes) throws IOException {
        DataInputBuffer dbuf = new DataInputBuffer();
        dbuf.reset(bytes, 0, bytes.length);
        StringBuilder buf = new StringBuilder(bytes.length);
        readChars(dbuf, buf, bytes.length);
        return buf.toString();
    }

    /**
     * Read a UTF-8 encoded string: an unsigned 16-bit byte count followed by that many bytes.
     *
     * @see DataInput#readUTF()
     */
    public static String readString(DataInput in) throws IOException {
        int bytes = in.readUnsignedShort();
        StringBuilder buffer = new StringBuilder(bytes);
        readChars(in, buffer, bytes);
        return buffer.toString();
    }

    /**
     * Copies {@code nBytes} bytes from {@code in} into the per-thread scratch buffer and appends the chars they decode
     * to onto {@code buffer}.
     * <p>
     * Only the lead byte of each sequence is inspected to decide its length; continuation bytes are masked to their
     * low six bits without being validated.
     *
     * @throws UTFDataFormatException if a sequence is cut off by the end of the input, or a lead byte does not start
     *                                a 1- to 4-byte sequence
     */
    private static void readChars(DataInput in, StringBuilder buffer, int nBytes)
            throws UTFDataFormatException, IOException {
        DataOutputBuffer obuf = OBUF_FACTORY.get();
        obuf.reset();
        obuf.write(in, nBytes);
        byte[] bytes = obuf.getData();
        int i = 0;
        while (i < nBytes) {
            byte b = bytes[i++];
            if ((b & 0x80) == 0) {
                // 0b0xxxxxxx: 1-byte sequence
                buffer.append((char) (b & 0x7F));
            } else if ((b & 0xE0) == 0xC0) {
                // 0b110xxxxx: 2-byte sequence
                if (i >= nBytes) {
                    throw truncatedSequence(bytes, i - 1, nBytes);
                }
                buffer.append((char) (((b & 0x1F) << 6) | (bytes[i++] & 0x3F)));
            } else if ((b & 0xF0) == 0xE0) {
                // 0b1110xxxx: 3-byte sequence
                if (i + 1 >= nBytes) {
                    throw truncatedSequence(bytes, i - 1, nBytes);
                }
                buffer.append((char) (((b & 0x0F) << 12) | ((bytes[i++] & 0x3F) << 6) | (bytes[i++] & 0x3F)));
            } else if ((b & 0xF8) == 0xF0) {
                // 0b11110xxx: 4-byte sequence
                if (i + 2 >= nBytes) {
                    throw truncatedSequence(bytes, i - 1, nBytes);
                }
                int codepoint = ((b & 0x07) << 18) | ((bytes[i++] & 0x3F) << 12) | ((bytes[i++] & 0x3F) << 6)
                        | ((bytes[i++] & 0x3F));
                buffer.append(highSurrogate(codepoint)).append(lowSurrogate(codepoint));
            } else {
                // The UTF8 standard describes 5-byte and 6-byte sequences, but
                // these are no longer allowed as of 2003 (see RFC 3629)

                // Only show the next 6 bytes max in the error code - in case the
                // buffer is large, this will prevent an exceedingly large message.
                int endForError = Math.min(i + 5, nBytes);
                throw new UTFDataFormatException(
                        "Invalid UTF8 at " + StringUtils.byteToHexString(bytes, i - 1, endForError));
            }
        }
    }

    /**
     * Builds the exception for a multi-byte sequence that is cut off by the end of the input; the message shows the
     * bytes from the lead byte at {@code start} up to (not including) {@code end}.
     * <p>
     * NOTE(review): this passes an end index as the third argument of {@code StringUtils.byteToHexString}, the same
     * way the "Invalid UTF8" message in {@code readChars} does - confirm against that helper's signature.
     */
    private static UTFDataFormatException truncatedSequence(byte[] bytes, int start, int end) {
        return new UTFDataFormatException("Truncated UTF8 at " + StringUtils.byteToHexString(bytes, start, end));
    }

    /** Returns the UTF-16 high (leading) surrogate for a supplementary code point. */
    private static char highSurrogate(int codePoint) {
        return (char) ((codePoint >>> 10)
                + (Character.MIN_HIGH_SURROGATE - (Character.MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
    }

    /** Returns the UTF-16 low (trailing) surrogate for a supplementary code point. */
    private static char lowSurrogate(int codePoint) {
        return (char) ((codePoint & 0x3ff) + Character.MIN_LOW_SURROGATE);
    }

    /**
     * Write a UTF-8 encoded string: a 16-bit byte count followed by the encoded bytes.
     * <p>
     * Strings longer than {@code 0xffff / 3} chars are truncated and a warning is logged.
     *
     * @return the number of encoded bytes written, not counting the 2-byte length prefix
     *
     * @throws IOException if the encoded form is still longer than {@code 0xffff} bytes, or if writing fails
     *
     * @see DataOutput#writeUTF(String)
     */
    public static int writeString(DataOutput out, String s) throws IOException {
        s = truncateIfTooLong(s);

        int len = utf8Length(s);
        if (len > MAX_ENCODED_LENGTH) { // double-check length
            throw new IOException("string too long!");
        }

        out.writeShort(len);
        writeChars(out, s, 0, s.length());
        return len;
    }

    /**
     * Returns {@code s} unchanged when it is short enough to always fit in {@code 0xffff} encoded bytes; otherwise
     * logs a warning and returns a prefix of at most {@code 0xffff / 3} chars.
     */
    private static String truncateIfTooLong(String s) {
        if (s.length() <= MAX_CHARS) {
            return s;
        }
        LOG.warn("truncating long string: " + s.length() + " chars, starting with " + s.substring(0, 20));
        int end = MAX_CHARS;
        if (Character.isSurrogatePair(s.charAt(end - 1), s.charAt(end))) {
            end--; // the cut would split a surrogate pair; drop its first half too
        }
        return s.substring(0, end);
    }

    /** Returns the number of bytes required to write this. */
    private static int utf8Length(String string) {
        int stringLength = string.length();
        int utf8Length = 0;
        for (int i = 0; i < stringLength; i++) {
            int c = string.charAt(i);
            if (c <= 0x007F) {
                utf8Length++;
            } else if (c > 0x07FF) {
                utf8Length += 3;
            } else {
                utf8Length += 2;
            }
        }
        return utf8Length;
    }

    /**
     * Encodes {@code length} chars of {@code s}, beginning at index {@code start}, one char at a time: one byte up
     * to 0x7F, two bytes up to 0x7FF, three bytes otherwise. Surrogates are not combined, which is what makes the
     * output CESU-8 rather than standard UTF-8.
     */
    private static void writeChars(DataOutput out, String s, int start, int length) throws IOException {
        final int end = start + length;
        for (int i = start; i < end; i++) {
            int code = s.charAt(i);
            if (code <= 0x7F) {
                out.writeByte((byte) code);
            } else if (code <= 0x07FF) {
                out.writeByte((byte) (0xC0 | ((code >> 6) & 0x1F)));
                out.writeByte((byte) (0x80 | code & 0x3F));
            } else {
                out.writeByte((byte) (0xE0 | ((code >> 12) & 0X0F)));
                out.writeByte((byte) (0x80 | ((code >> 6) & 0x3F)));
                out.writeByte((byte) (0x80 | (code & 0x3F)));
            }
        }
    }

}