org.apache.hadoop.io.UTF8.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.io.UTF8.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.io;

import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;

import org.apache.commons.logging.*;

/** A WritableComparable for strings that uses the UTF8 encoding.
 * 
 * <p>Also includes utilities for efficiently reading and writing UTF-8.
 *
 * @deprecated replaced by Text
 */
public class UTF8 implements WritableComparable {
    private static final Log LOG = LogFactory.getLog(UTF8.class);
    private static final DataInputBuffer IBUF = new DataInputBuffer();

    private static final ThreadLocal<DataOutputBuffer> OBUF_FACTORY = new ThreadLocal<DataOutputBuffer>() {
        @Override
        protected DataOutputBuffer initialValue() {
            return new DataOutputBuffer();
        }
    };

    private static final byte[] EMPTY_BYTES = new byte[0];

    private byte[] bytes = EMPTY_BYTES;
    private int length;

    public UTF8() {
        //set("");
    }

    /** Construct from a given string. */
    public UTF8(String string) {
        set(string);
    }

    /** Construct from a given string. */
    public UTF8(UTF8 utf8) {
        set(utf8);
    }

    /** The raw bytes. */
    public byte[] getBytes() {
        return bytes;
    }

    /** The number of bytes in the encoded string. */
    public int getLength() {
        return length;
    }

    /** Set to contain the contents of a string. */
    public void set(String string) {
        if (string.length() > 0xffff / 3) { // maybe too long
            LOG.warn("truncating long string: " + string.length() + " chars, starting with "
                    + string.substring(0, 20));
            string = string.substring(0, 0xffff / 3);
        }

        length = utf8Length(string); // compute length
        if (length > 0xffff) // double-check length
            throw new RuntimeException("string too long!");

        if (bytes == null || length > bytes.length) // grow buffer
            bytes = new byte[length];

        try { // avoid sync'd allocations
            DataOutputBuffer obuf = OBUF_FACTORY.get();
            obuf.reset();
            writeChars(obuf, string, 0, string.length());
            System.arraycopy(obuf.getData(), 0, bytes, 0, length);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Set to contain the contents of a string. */
    public void set(UTF8 other) {
        length = other.length;
        if (bytes == null || length > bytes.length) // grow buffer
            bytes = new byte[length];
        System.arraycopy(other.bytes, 0, bytes, 0, length);
    }

    public void readFields(DataInput in) throws IOException {
        length = in.readUnsignedShort();
        if (bytes == null || bytes.length < length)
            bytes = new byte[length];
        in.readFully(bytes, 0, length);
    }

    /** Skips over one UTF8 in the input. */
    public static void skip(DataInput in) throws IOException {
        int length = in.readUnsignedShort();
        WritableUtils.skipFully(in, length);
    }

    public void write(DataOutput out) throws IOException {
        out.writeShort(length);
        out.write(bytes, 0, length);
    }

    /** Compare two UTF8s. */
    public int compareTo(Object o) {
        UTF8 that = (UTF8) o;
        return WritableComparator.compareBytes(bytes, 0, length, that.bytes, 0, that.length);
    }

    /** Convert to a String. */
    public String toString() {
        StringBuffer buffer = new StringBuffer(length);
        try {
            synchronized (IBUF) {
                IBUF.reset(bytes, length);
                readChars(IBUF, buffer, length);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return buffer.toString();
    }

    /** Returns true iff <code>o</code> is a UTF8 with the same contents.  */
    public boolean equals(Object o) {
        if (!(o instanceof UTF8))
            return false;
        UTF8 that = (UTF8) o;
        if (this.length != that.length)
            return false;
        else
            return WritableComparator.compareBytes(bytes, 0, length, that.bytes, 0, that.length) == 0;
    }

    public int hashCode() {
        return WritableComparator.hashBytes(bytes, length);
    }

    /** A WritableComparator optimized for UTF8 keys. */
    public static class Comparator extends WritableComparator {
        public Comparator() {
            super(UTF8.class);
        }

        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            int n1 = readUnsignedShort(b1, s1);
            int n2 = readUnsignedShort(b2, s2);
            return compareBytes(b1, s1 + 2, n1, b2, s2 + 2, n2);
        }
    }

    static { // register this comparator
        WritableComparator.define(UTF8.class, new Comparator());
    }

    /// STATIC UTILITIES FROM HERE DOWN

    /// These are probably not used much anymore, and might be removed...

    /** Convert a string to a UTF-8 encoded byte array.
     * @see String#getBytes(String)
     */
    public static byte[] getBytes(String string) {
        byte[] result = new byte[utf8Length(string)];
        try { // avoid sync'd allocations
            DataOutputBuffer obuf = OBUF_FACTORY.get();
            obuf.reset();
            writeChars(obuf, string, 0, string.length());
            System.arraycopy(obuf.getData(), 0, result, 0, obuf.getLength());
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return result;
    }

    /** Read a UTF-8 encoded string.
     *
     * @see DataInput#readUTF()
     */
    public static String readString(DataInput in) throws IOException {
        int bytes = in.readUnsignedShort();
        StringBuffer buffer = new StringBuffer(bytes);
        readChars(in, buffer, bytes);
        return buffer.toString();
    }

    private static void readChars(DataInput in, StringBuffer buffer, int nBytes) throws IOException {
        DataOutputBuffer obuf = OBUF_FACTORY.get();
        obuf.reset();
        obuf.write(in, nBytes);
        byte[] bytes = obuf.getData();
        int i = 0;
        while (i < nBytes) {
            byte b = bytes[i++];
            if ((b & 0x80) == 0) {
                buffer.append((char) (b & 0x7F));
            } else if ((b & 0xE0) != 0xE0) {
                buffer.append((char) (((b & 0x1F) << 6) | (bytes[i++] & 0x3F)));
            } else {
                buffer.append((char) (((b & 0x0F) << 12) | ((bytes[i++] & 0x3F) << 6) | (bytes[i++] & 0x3F)));
            }
        }
    }

    /** Write a UTF-8 encoded string.
     *
     * @see DataOutput#writeUTF(String)
     */
    public static int writeString(DataOutput out, String s) throws IOException {
        if (s.length() > 0xffff / 3) { // maybe too long
            LOG.warn("truncating long string: " + s.length() + " chars, starting with " + s.substring(0, 20));
            s = s.substring(0, 0xffff / 3);
        }

        int len = utf8Length(s);
        if (len > 0xffff) // double-check length
            throw new IOException("string too long!");

        out.writeShort(len);
        writeChars(out, s, 0, s.length());
        return len;
    }

    /** Returns the number of bytes required to write this. */
    private static int utf8Length(String string) {
        int stringLength = string.length();
        int utf8Length = 0;
        for (int i = 0; i < stringLength; i++) {
            int c = string.charAt(i);
            if ((c >= 0x0001) && (c <= 0x007F)) {
                utf8Length++;
            } else if (c > 0x07FF) {
                utf8Length += 3;
            } else {
                utf8Length += 2;
            }
        }
        return utf8Length;
    }

    private static void writeChars(DataOutput out, String s, int start, int length) throws IOException {
        final int end = start + length;
        for (int i = start; i < end; i++) {
            int code = s.charAt(i);
            if (code >= 0x01 && code <= 0x7F) {
                out.writeByte((byte) code);
            } else if (code <= 0x07FF) {
                out.writeByte((byte) (0xC0 | ((code >> 6) & 0x1F)));
                out.writeByte((byte) (0x80 | code & 0x3F));
            } else {
                out.writeByte((byte) (0xE0 | ((code >> 12) & 0X0F)));
                out.writeByte((byte) (0x80 | ((code >> 6) & 0x3F)));
                out.writeByte((byte) (0x80 | (code & 0x3F)));
            }
        }
    }

}