org.commoncrawl.util.TextBytes.java Source code

Introduction

Here is the source code for org.commoncrawl.util.TextBytes.java
Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.util;

import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.MalformedInputException;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;

import junit.framework.Assert;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.BinaryComparable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;

/**
 * basically a clone of hadoop Text class with FlexBuffer as the backing store
 * and a cacheable String object that is lazilly populated by the call to the
 * toString API.
 * 
 * From Text.java:
 * 
 * This class stores text using standard UTF8 encoding. It provides methods to
 * serialize, deserialize, and compare texts at byte level. The type of length
 * is integer and is serialized using zero-compressed format.
 * <p>
 * In addition, it provides methods for string traversal without converting the
 * byte array to a string.
 * <p>
 * Also includes utilities for serializing/deserialing a string, coding/decoding
 * a string, checking if a byte array contains valid UTF8 code, calculating the
 * length of an encoded string.
 * 
 * @author rana
 * 
 */
public final class TextBytes extends BinaryComparableWithOffset implements WritableComparable<BinaryComparable> {

    private static final Log LOG = LogFactory.getLog(TextBytes.class);

    private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY = new ThreadLocal<CharsetEncoder>() {
        protected CharsetEncoder initialValue() {
            return Charset.forName("UTF-8").newEncoder().onMalformedInput(CodingErrorAction.REPORT)
                    .onUnmappableCharacter(CodingErrorAction.REPORT);
        }
    };

    private static ThreadLocal<CharsetDecoder> DECODER_FACTORY = new ThreadLocal<CharsetDecoder>() {
        protected CharsetDecoder initialValue() {
            return Charset.forName("UTF-8").newDecoder().onMalformedInput(CodingErrorAction.REPORT)
                    .onUnmappableCharacter(CodingErrorAction.REPORT);
        }
    };

    private static final byte[] EMPTY_BYTES = new byte[0];
    private static final String EMPTY_STRING = new String("");

    private FlexBuffer bytes = new FlexBuffer();
    private String cachedUTF8 = null;

    public TextBytes() {

    }

    /**
     * Construct from a string.
     */
    public TextBytes(String string) {
        set(string);
    }

    /** Construct from another textbytes. */
    public TextBytes(TextBytes utf8) {
        bytes = new FlexBuffer(utf8.bytes);
        cachedUTF8 = utf8.cachedUTF8;
    }

    /** Construct from another text. */
    public TextBytes(Text utf8) {
        set(utf8);
    }

    /**
     * Construct from a byte array.
     */
    public TextBytes(byte[] utf8) {
        set(utf8);
    }

    /**
     * get underlying buffer object
     */
    public FlexBuffer getBuffer() {
        return bytes;
    }

    /**
     * Returns the raw bytes; however, only data up to {@link #getLength()} is
     * valid.
     */
    public byte[] getBytes() {
        return bytes.get();
    }

    /** Returns the number of bytes in the byte array */
    public int getLength() {
        return bytes.getCount();
    }

    /** Get offset (if set) **/
    public int getOffset() {
        return bytes.getOffset();
    }

    public void setLength(int newLength) {
        if (newLength > getCapacity()) {
            setCapacity(newLength, true);
        }
        bytes.setCount(newLength);
    }

    /**
     * Returns the Unicode Scalar Value (32-bit integer value) for the character
     * at <code>position</code>. Note that this method avoids using the converter
     * or doing String instatiation
     * 
     * @return the Unicode scalar value at position or -1 if the position is
     *         invalid or points to a trailing byte
     */
    public int charAt(int position) {
        if (position > bytes.getCount())
            return -1; // too long
        if (position < 0)
            return -1; // duh.

        ByteBuffer bb = (ByteBuffer) ByteBuffer.wrap(bytes.get(), bytes.getOffset(), bytes.getCount())
                .position(position);
        return bytesToCodePoint(bb.slice());
    }

    public int find(String what) {
        return find(what, 0);
    }

    /**
     * Finds any occurence of <code>what</code> in the backing buffer, starting as
     * position <code>start</code>. The starting position is measured in bytes and
     * the return value is in terms of byte position in the buffer. The backing
     * buffer is not converted to a string for this operation.
     * 
     * @return byte position of the first occurence of the search string in the
     *         UTF-8 buffer or -1 if not found
     */
    public int find(String what, int start) {
        try {
            ByteBuffer src = ByteBuffer.wrap(bytes.get(), bytes.getOffset(), bytes.getCount());
            ByteBuffer tgt = encode(what);
            byte b = tgt.get();
            src.position(start);

            while (src.hasRemaining()) {
                if (b == src.get()) { // matching first byte
                    src.mark(); // save position in loop
                    tgt.mark(); // save position in target
                    boolean found = true;
                    int pos = src.position() - 1;
                    while (tgt.hasRemaining()) {
                        if (!src.hasRemaining()) { // src expired first
                            tgt.reset();
                            src.reset();
                            found = false;
                            break;
                        }
                        if (!(tgt.get() == src.get())) {
                            tgt.reset();
                            src.reset();
                            found = false;
                            break; // no match
                        }
                    }
                    if (found)
                        return pos;
                }
            }
            return -1; // not found
        } catch (CharacterCodingException e) {
            // can't get here
            e.printStackTrace();
            return -1;
        }
    }

    /**
     * Set to contain the contents of a string.
     */
    public void set(String string) {
        try {
            ByteBuffer bb = encode(string, true);
            set(bb.array(), 0, bb.limit());
            cachedUTF8 = string;
            // zbytes = bb.array();
            // length = bb.limit();
            // offset = 0;
        } catch (CharacterCodingException e) {
            throw new RuntimeException("Should not have happened " + e.toString());
        }
    }

    /**
     * Set to a utf8 byte array
     */
    public void set(byte[] utf8) {
        set(utf8, 0, utf8.length);
    }

    /** copy a text. */
    public void set(Text other) {
        set(other.getBytes(), 0, other.getLength());
    }

    /** copy a textbytes. */
    public void set(TextBytes other) {
        set(other.getBytes(), other.getOffset(), other.getLength());
    }

    /**
     * Set the Text to range of bytes
     * 
     * @param utf8
     *          the data to copy from
     * @param start
     *          the first position of the new string
     * @param len
     *          the number of bytes of the new string
     */
    public void set(byte[] utf8, int start, int len) {
        bytes.set(utf8, start, len);
        // reset string cache ...
        cachedUTF8 = null;
    }

    /**
     * Append a range of bytes to the end of the given text
     * 
     * @param utf8
     *          the data to copy from
     * @param start
     *          the first position to append from utf8
     * @param len
     *          the number of bytes to append
     */
    public void append(byte[] utf8, int start, int len) {
        setCapacity(bytes.getCount() + len, true);
        if (bytes.isShared()) {
            bytes.copyOnWrite();
        }
        System.arraycopy(utf8, start, bytes.get(), bytes.getCount(), len);
        bytes.setCount(bytes.getCount() + len);
        cachedUTF8 = null;
    }

    /**
     * Clear the string to empty.
     */
    public void clear() {
        bytes.setCount(0);
        cachedUTF8 = null;
    }

    /*
     * Sets the capacity of this Text object to <em>at least</em> <code>len</code>
     * bytes. If the current buffer is longer, then the capacity and existing
     * content of the buffer are unchanged. If <code>len</code> is larger than the
     * current capacity, the Text object's capacity is increased to match.
     * 
     * @param len the number of bytes we need
     * 
     * @param keepData should the old data be kept
     */
    private void setCapacity(int len, boolean keepData) {
        if (!keepData) {
            bytes.setCount(0);
        }
        bytes.setCapacity(len);
    }

    private int getCapacity() {
        return bytes.getCapacity();
    }

    /**
     * Convert text back to string
     * 
     * @see java.lang.Object#toString()
     */
    public String toString() {
        if (cachedUTF8 == null) {
            if (bytes.getCount() == 0) {
                cachedUTF8 = EMPTY_STRING;
            } else {
                try {
                    cachedUTF8 = decode(bytes.get(), bytes.getOffset(), bytes.getCount());
                    return cachedUTF8;
                } catch (CharacterCodingException e) {
                    throw new RuntimeException("Should not have happened " + e.toString());
                }
            }
        }
        return cachedUTF8;
    }

    /**
     * deserialize
     */
    public void readFields(DataInput in) throws IOException {
        int newLength = WritableUtils.readVInt(in);
        // ensure capacity
        setCapacity(newLength, false);
        // in case we need to, ensure we have a private copy of the underlying
        // array
        bytes.copyOnWrite();
        // read into the array
        in.readFully(bytes.get(), bytes.getOffset(), newLength);
        // reset count varaible
        bytes.setCount(newLength);
        // clear cached String pointer
        cachedUTF8 = null;
    }

    /** Skips over one Text in the input. */
    public static void skip(DataInput in) throws IOException {
        int length = WritableUtils.readVInt(in);
        WritableUtils.skipFully(in, length);
    }

    /**
     * serialize write this object to out length uses zero-compressed encoding
     * 
     * @see Writable#write(DataOutput)
     */
    public void write(DataOutput out) throws IOException {
        WritableUtils.writeVInt(out, bytes.getCount());
        if (bytes.getCount() != 0) {
            out.write(bytes.get(), bytes.getOffset(), bytes.getCount());
        }
    }

    /** Returns true iff <code>o</code> is a TextBytes with the same contents. */
    public boolean equals(Object o) {
        if (o instanceof TextBytes)
            return super.equals(o);
        return false;
    }

    public int hashCode() {
        int hash = 1;
        int offset = getOffset();
        int length = getLength();
        byte bytesArray[] = bytes.get();
        for (int i = offset; i < offset + length; i++)
            hash = (31 * hash) + (int) bytesArray[i];
        return hash;
    }

    /** A WritableComparator optimized for Text keys. */
    public static class Comparator extends WritableComparator {
        public Comparator() {
            super(Text.class);
        }

        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            int n1 = WritableUtils.decodeVIntSize(b1[s1]);
            int n2 = WritableUtils.decodeVIntSize(b2[s2]);
            return compareBytes(b1, s1 + n1, l1 - n1, b2, s2 + n2, l2 - n2);
        }
    }

    static {
        // register this comparator
        WritableComparator.define(Text.class, new Comparator());
    }

    // / STATIC UTILITIES FROM HERE DOWN
    /**
     * Converts the provided byte array to a String using the UTF-8 encoding. If
     * the input is malformed, replace by a default value.
     */
    public static String decode(byte[] utf8) throws CharacterCodingException {
        return decode(ByteBuffer.wrap(utf8), true);
    }

    public static String decode(byte[] utf8, int start, int length) throws CharacterCodingException {
        return decode(ByteBuffer.wrap(utf8, start, length), true);
    }

    /**
     * Converts the provided byte array to a String using the UTF-8 encoding. If
     * <code>replace</code> is true, then malformed input is replaced with the
     * substitution character, which is U+FFFD. Otherwise the method throws a
     * MalformedInputException.
     */
    public static String decode(byte[] utf8, int start, int length, boolean replace)
            throws CharacterCodingException {
        return decode(ByteBuffer.wrap(utf8, start, length), replace);
    }

    private static String decode(ByteBuffer utf8, boolean replace) throws CharacterCodingException {
        CharsetDecoder decoder = DECODER_FACTORY.get();
        if (replace) {
            decoder.onMalformedInput(java.nio.charset.CodingErrorAction.REPLACE);
            decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
        }
        String str = decoder.decode(utf8).toString();
        // set decoder back to its default value: REPORT
        if (replace) {
            decoder.onMalformedInput(CodingErrorAction.REPORT);
            decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
        }
        return str;
    }

    /**
     * Converts the provided String to bytes using the UTF-8 encoding. If the
     * input is malformed, invalid chars are replaced by a default value.
     * 
     * @return ByteBuffer: bytes stores at ByteBuffer.array() and length is
     *         ByteBuffer.limit()
     */

    public static ByteBuffer encode(String string) throws CharacterCodingException {
        return encode(string, true);
    }

    /**
     * Converts the provided String to bytes using the UTF-8 encoding. If
     * <code>replace</code> is true, then malformed input is replaced with the
     * substitution character, which is U+FFFD. Otherwise the method throws a
     * MalformedInputException.
     * 
     * @return ByteBuffer: bytes stores at ByteBuffer.array() and length is
     *         ByteBuffer.limit()
     */
    public static ByteBuffer encode(String string, boolean replace) throws CharacterCodingException {
        CharsetEncoder encoder = ENCODER_FACTORY.get();
        if (replace) {
            encoder.onMalformedInput(CodingErrorAction.REPLACE);
            encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
        }
        ByteBuffer bytes = encoder.encode(CharBuffer.wrap(string.toCharArray()));
        if (replace) {
            encoder.onMalformedInput(CodingErrorAction.REPORT);
            encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
        }
        return bytes;
    }

    /**
     * Read a UTF8 encoded string from in
     */
    public static String readString(DataInput in) throws IOException {
        int length = WritableUtils.readVInt(in);
        byte[] bytes = new byte[length];
        in.readFully(bytes, 0, length);
        return decode(bytes);
    }

    /**
     * Write a UTF8 encoded string to out
     */
    public static int writeString(DataOutput out, String s) throws IOException {
        ByteBuffer bytes = encode(s);
        int length = bytes.limit();
        WritableUtils.writeVInt(out, length);
        out.write(bytes.array(), 0, length);
        return length;
    }

    // //// states for validateUTF8

    private static final int LEAD_BYTE = 0;

    private static final int TRAIL_BYTE_1 = 1;

    private static final int TRAIL_BYTE = 2;

    /**
     * Check if a byte array contains valid utf-8
     * 
     * @param utf8
     *          byte array
     * @throws MalformedInputException
     *           if the byte array contains invalid utf-8
     */
    public static void validateUTF8(byte[] utf8) throws MalformedInputException {
        validateUTF8(utf8, 0, utf8.length);
    }

    /**
     * Check to see if a byte array is valid utf-8
     * 
     * @param utf8
     *          the array of bytes
     * @param start
     *          the offset of the first byte in the array
     * @param len
     *          the length of the byte sequence
     * @throws MalformedInputException
     *           if the byte array contains invalid bytes
     */
    public static void validateUTF8(byte[] utf8, int start, int len) throws MalformedInputException {
        int count = start;
        int leadByte = 0;
        int length = 0;
        int state = LEAD_BYTE;
        while (count < start + len) {
            int aByte = ((int) utf8[count] & 0xFF);

            switch (state) {
            case LEAD_BYTE:
                leadByte = aByte;
                length = bytesFromUTF8[aByte];

                switch (length) {
                case 0: // check for ASCII
                    if (leadByte > 0x7F)
                        throw new MalformedInputException(count);
                    break;
                case 1:
                    if (leadByte < 0xC2 || leadByte > 0xDF)
                        throw new MalformedInputException(count);
                    state = TRAIL_BYTE_1;
                    break;
                case 2:
                    if (leadByte < 0xE0 || leadByte > 0xEF)
                        throw new MalformedInputException(count);
                    state = TRAIL_BYTE_1;
                    break;
                case 3:
                    if (leadByte < 0xF0 || leadByte > 0xF4)
                        throw new MalformedInputException(count);
                    state = TRAIL_BYTE_1;
                    break;
                default:
                    // too long! Longest valid UTF-8 is 4 bytes (lead + three)
                    // or if < 0 we got a trail byte in the lead byte position
                    throw new MalformedInputException(count);
                } // switch (length)
                break;

            case TRAIL_BYTE_1:
                if (leadByte == 0xF0 && aByte < 0x90)
                    throw new MalformedInputException(count);
                if (leadByte == 0xF4 && aByte > 0x8F)
                    throw new MalformedInputException(count);
                if (leadByte == 0xE0 && aByte < 0xA0)
                    throw new MalformedInputException(count);
                if (leadByte == 0xED && aByte > 0x9F)
                    throw new MalformedInputException(count);
                // falls through to regular trail-byte test!!
            case TRAIL_BYTE:
                if (aByte < 0x80 || aByte > 0xBF)
                    throw new MalformedInputException(count);
                if (--length == 0) {
                    state = LEAD_BYTE;
                } else {
                    state = TRAIL_BYTE;
                }
                break;
            } // switch (state)
            count++;
        }
    }

    /**
     * Magic numbers for UTF-8. These are the number of bytes that <em>follow</em>
     * a given lead byte. Trailing bytes have the value -1. The values 4 and 5 are
     * presented in this table, even though valid UTF-8 cannot include the five
     * and six byte sequences.
     */
    static final int[] bytesFromUTF8 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            // trail bytes
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
            3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };

    /**
     * Returns the next code point at the current position in the buffer. The
     * buffer's position will be incremented. Any mark set on this buffer will be
     * changed by this method!
     */
    public static int bytesToCodePoint(ByteBuffer bytes) {
        bytes.mark();
        byte b = bytes.get();
        bytes.reset();
        int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
        if (extraBytesToRead < 0)
            return -1; // trailing byte!
        int ch = 0;

        switch (extraBytesToRead) {
        case 5:
            ch += (bytes.get() & 0xFF);
            ch <<= 6; /* remember, illegal UTF-8 */
        case 4:
            ch += (bytes.get() & 0xFF);
            ch <<= 6; /* remember, illegal UTF-8 */
        case 3:
            ch += (bytes.get() & 0xFF);
            ch <<= 6;
        case 2:
            ch += (bytes.get() & 0xFF);
            ch <<= 6;
        case 1:
            ch += (bytes.get() & 0xFF);
            ch <<= 6;
        case 0:
            ch += (bytes.get() & 0xFF);
        }
        ch -= offsetsFromUTF8[extraBytesToRead];

        return ch;
    }

    static final int offsetsFromUTF8[] = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };

    /**
     * For the given string, returns the number of UTF-8 bytes required to encode
     * the string.
     * 
     * @param string
     *          text to encode
     * @return number of UTF-8 bytes required to encode
     */
    public static int utf8Length(String string) {
        CharacterIterator iter = new StringCharacterIterator(string);
        char ch = iter.first();
        int size = 0;
        while (ch != CharacterIterator.DONE) {
            if ((ch >= 0xD800) && (ch < 0xDC00)) {
                // surrogate pair?
                char trail = iter.next();
                if ((trail > 0xDBFF) && (trail < 0xE000)) {
                    // valid pair
                    size += 4;
                } else {
                    // invalid pair
                    size += 3;
                    iter.previous(); // rewind one
                }
            } else if (ch < 0x80) {
                size++;
            } else if (ch < 0x800) {
                size += 2;
            } else {
                // ch < 0x10000, that is, the largest char value
                size += 3;
            }
            ch = iter.next();
        }
        return size;
    }

    public static void main(String[] args) {
        // run some tests on the new code
        String aTestString = new String("A Test Strnig");
        // convert it to bytes
        byte bytes[] = aTestString.getBytes();
        // over allocate an array
        byte overAllocated[] = new byte[bytes.length * 2];
        // copy source
        System.arraycopy(bytes, 0, overAllocated, bytes.length, bytes.length);
        // now allocate a TextBytes
        TextBytes textBytes = new TextBytes();
        // set the overallocated buffer as the backing store
        textBytes.set(overAllocated, bytes.length, bytes.length);
        // convert it to string first
        String toString = textBytes.toString();
        // validate equal to original
        Assert.assertTrue(aTestString.equals(toString));
        // ok now write it to output buffer
        DataOutputBuffer outputBuffer = new DataOutputBuffer();
        // write string
        try {
            textBytes.write(outputBuffer);
            // read length
            DataInputBuffer inputBuffer = new DataInputBuffer();
            inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size());
            int encodedLength = WritableUtils.readVInt(inputBuffer);
            // validate arrays match ...
            Assert.assertTrue(encodedLength == bytes.length);
            Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, outputBuffer.getData(),
                    inputBuffer.getPosition(), outputBuffer.getLength() - inputBuffer.getPosition()), 0);
            // ok reset input buffer again ...
            inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size());
            // read in fields
            textBytes.readFields(inputBuffer);
            // ok see if we are not using the original backing store ...
            Assert.assertTrue(textBytes.getBytes() != overAllocated);
            // validate buffers match to original
            Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, textBytes.getBytes(),
                    textBytes.getOffset(), textBytes.getLength()), 0);

        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    static class TextBytesArray extends ArrayWritable {

        public TextBytesArray() {
            super(TextBytes.class);
        }
    }
}