org.apache.hyracks.data.std.primitive.UTF8StringPointable.java Source code

Introduction

Here is the source code for org.apache.hyracks.data.std.primitive.UTF8StringPointable.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.hyracks.data.std.primitive;

import java.io.IOException;
import java.nio.charset.Charset;

import org.apache.commons.lang3.CharSet;
import org.apache.hyracks.api.dataflow.value.ITypeTraits;
import org.apache.hyracks.data.std.api.AbstractPointable;
import org.apache.hyracks.data.std.api.IComparable;
import org.apache.hyracks.data.std.api.IHashable;
import org.apache.hyracks.data.std.api.IPointable;
import org.apache.hyracks.data.std.api.IPointableFactory;
import org.apache.hyracks.data.std.util.GrowableArray;
import org.apache.hyracks.data.std.util.UTF8StringBuilder;
import org.apache.hyracks.util.string.UTF8StringUtil;

/**
 * A pointable over a length-prefixed UTF-8 string stored in a byte array. The stored layout, as used by
 * this class, is a variable-length "meta" prefix (the encoded byte length) followed by the character bytes.
 * <p>
 * Unless stated otherwise, "byte offset" arguments and results of the string functions below are relative
 * to the first character byte, i.e. they do not include the meta prefix.
 */
public final class UTF8StringPointable extends AbstractPointable implements IHashable, IComparable {

    // Resolved once instead of performing a charset lookup on every toString() call.
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    // These values are cached to speed up the length data access.
    // Since we are using the variable-length encoding, we can save the repeated decoding efforts.
    // They are refreshed by afterReset().
    private int utf8Length;
    private int metaLength;
    private int hashValue;
    private int stringLength;

    /**
     * Resets the cached values.
     * Since the {@code utf8Length} and the {@code metaLength} are often used, we compute those two values in advance.
     * As for the {@code stringLength} and the {@code hashValue}, they will be lazily initialized after the first call.
     */
    @Override
    protected void afterReset() {
        utf8Length = UTF8StringUtil.getUTFLength(bytes, start);
        metaLength = UTF8StringUtil.getNumBytesToStoreLength(utf8Length);
        hashValue = 0;
        stringLength = -1;
    }

    public static final ITypeTraits TYPE_TRAITS = new ITypeTraits() {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean isFixedLength() {
            return false;
        }

        @Override
        public int getFixedLength() {
            return 0;
        }
    };

    public static final IPointableFactory FACTORY = new IPointableFactory() {
        private static final long serialVersionUID = 1L;

        @Override
        public IPointable createPointable() {
            return new UTF8StringPointable();
        }

        @Override
        public ITypeTraits getTypeTraits() {
            return TYPE_TRAITS;
        }
    };

    /**
     * Creates a new pointable over a freshly allocated byte array produced by
     * {@code UTF8StringUtil.writeStringToBytes(string)}.
     *
     * @param string
     *            the Java string to encode.
     * @return a pointable covering the whole encoded array.
     */
    public static UTF8StringPointable generateUTF8Pointable(String string) {
        final byte[] encoded = UTF8StringUtil.writeStringToBytes(string);
        final UTF8StringPointable ptr = new UTF8StringPointable();
        ptr.set(encoded, 0, encoded.length);
        return ptr;
    }

    /**
     * Returns the character at the given byte offset. The caller is responsible for making sure that
     * the provided offset is within bounds and points to the beginning of a valid UTF8 character.
     *
     * @param offset
     *            - Byte offset, relative to the start of this pointable (i.e. including the meta prefix)
     * @return Character at the given offset.
     */
    public char charAt(int offset) {
        return UTF8StringUtil.charAt(bytes, start + offset);
    }

    /**
     * Returns the value of {@code UTF8StringUtil.charSize} for the character at the given byte offset.
     * The same preconditions as for {@link #charAt(int)} apply.
     *
     * @param offset
     *            - Byte offset, relative to the start of this pointable (i.e. including the meta prefix)
     * @return size reported for the character at the given offset.
     */
    public int charSize(int offset) {
        return UTF8StringUtil.charSize(bytes, start + offset);
    }

    /**
     * Gets the length of the string in characters.
     * The first call needs to go through the entire string; following calls just return the
     * pre-calculated result.
     *
     * @return length of string in characters
     */
    public int getStringLength() {
        if (stringLength < 0) {
            stringLength = UTF8StringUtil.getStringLength(bytes, start);
        }
        return stringLength;
    }

    /**
     * Gets the length of the UTF-8 encoded string in bytes.
     *
     * @return length of UTF-8 encoded string in bytes
     */
    public int getUTF8Length() {
        return utf8Length;
    }

    /**
     * @return the number of bytes used by the length prefix, as computed in {@link #afterReset()}.
     */
    public int getMetaDataLength() {
        return metaLength;
    }

    /**
     * @return the offset, within the backing byte array, of the first character byte
     *         (start offset plus the length-prefix size).
     */
    public int getCharStartOffset() {
        return getStartOffset() + getMetaDataLength();
    }

    @Override
    public int compareTo(IPointable pointer) {
        return compareTo(pointer.getByteArray(), pointer.getStartOffset(), pointer.getLength());
    }

    /**
     * Compares this string with the encoded string found at {@code bytes[start]} by delegating to
     * {@code UTF8StringUtil.compareTo}. The {@code length} argument is not used by this implementation.
     */
    @Override
    public int compareTo(byte[] bytes, int start, int length) {
        return UTF8StringUtil.compareTo(this.bytes, this.start, bytes, start);
    }

    /**
     * Returns the hash computed by {@code UTF8StringUtil.hash}, caching it after the first call.
     * A cached value of 0 means "not computed yet", so a string whose hash is 0 is re-hashed on every call.
     */
    @Override
    public int hash() {
        if (hashValue == 0) {
            hashValue = UTF8StringUtil.hash(this.bytes, this.start);
        }
        return hashValue;
    }

    /**
     * Appends this string to the given buffer via {@code UTF8StringUtil.toString}.
     *
     * @param buffer
     *            the buffer to append to.
     */
    public void toString(StringBuilder buffer) {
        UTF8StringUtil.toString(buffer, bytes, start);
    }

    /**
     * Decodes the character bytes (excluding the length prefix) as UTF-8 into a Java string.
     */
    @Override
    public String toString() {
        return new String(this.bytes, this.getCharStartOffset(), this.getUTF8Length(), UTF_8);
    }

    /****
     * String functions
     */

    /**
     * Compares this string with {@code other} by delegating to {@code UTF8StringUtil.lowerCaseCompareTo}.
     */
    public int ignoreCaseCompareTo(UTF8StringPointable other) {
        return UTF8StringUtil.lowerCaseCompareTo(this.getByteArray(), this.getStartOffset(), other.getByteArray(),
                other.getStartOffset());
    }

    public int find(UTF8StringPointable pattern, boolean ignoreCase) {
        return find(this, pattern, ignoreCase);
    }

    /**
     * @param src,
     *            the source string.
     * @param pattern,
     *            the pattern string.
     * @param ignoreCase,
     *            to ignore case or not.
     * @return the byte offset of the first character of the matching string, or -1 if there is no match.
     *         Not including the MetaLength.
     */
    public static int find(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase) {
        return find(src, pattern, ignoreCase, 0);
    }

    /**
     * @param src,
     *            the source string.
     * @param pattern,
     *            the pattern string.
     * @param ignoreCase,
     *            to ignore case or not.
     * @param startMatch,
     *            the byte offset (not including the MetaLength) at which to start searching. It is passed
     *            on to {@link #charAt(int)} unchecked, so it must be non-negative and point to the
     *            beginning of a character.
     * @return the byte offset of the first character of the first match at or after <code>startMatch</code>,
     *         or -1 if there is no such match. Not including the MetaLength.
     */
    public static int find(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase,
            int startMatch) {
        int startMatchPos = startMatch;
        final int srcUtfLen = src.getUTF8Length();
        final int pttnUtfLen = pattern.getUTF8Length();
        final int srcStart = src.getMetaDataLength();
        final int pttnStart = pattern.getMetaDataLength();

        // A match cannot start later than this offset, otherwise the pattern would not fit.
        final int maxStart = srcUtfLen - pttnUtfLen;
        while (startMatchPos <= maxStart) {
            int c1 = startMatchPos;
            int c2 = 0;
            while (c1 < srcUtfLen && c2 < pttnUtfLen) {
                char ch1 = src.charAt(srcStart + c1);
                char ch2 = pattern.charAt(pttnStart + c2);
                if (!charsMatch(ch1, ch2, ignoreCase)) {
                    break;
                }
                c1 += src.charSize(srcStart + c1);
                c2 += pattern.charSize(pttnStart + c2);
            }
            // The whole pattern was consumed, hence it matched at startMatchPos.
            if (c2 == pttnUtfLen) {
                return startMatchPos;
            }
            startMatchPos += src.charSize(srcStart + startMatchPos);
        }
        return -1;
    }

    public boolean contains(UTF8StringPointable pattern, boolean ignoreCase) {
        return contains(this, pattern, ignoreCase);
    }

    /**
     * @return true iff {@link #find(UTF8StringPointable, UTF8StringPointable, boolean)} finds a match.
     */
    public static boolean contains(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase) {
        return find(src, pattern, ignoreCase) >= 0;
    }

    public boolean startsWith(UTF8StringPointable pattern, boolean ignoreCase) {
        return startsWith(this, pattern, ignoreCase);
    }

    /**
     * Checks whether {@code src} begins with {@code pattern}, comparing character by character.
     *
     * @param src
     *            , the source string.
     * @param pattern
     *            , the expected prefix.
     * @param ignoreCase
     *            , whether characters that differ only by {@link Character#toLowerCase(char)} are equal.
     * @return true if the whole pattern matched at the beginning of the source string.
     */
    public static boolean startsWith(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase) {
        final int utflen1 = src.getUTF8Length();
        final int utflen2 = pattern.getUTF8Length();
        if (utflen2 > utflen1) {
            return false;
        }

        final int s1Start = src.getMetaDataLength();
        final int s2Start = pattern.getMetaDataLength();

        int c1 = 0;
        int c2 = 0;
        while (c1 < utflen1 && c2 < utflen2) {
            char ch1 = src.charAt(s1Start + c1);
            char ch2 = pattern.charAt(s2Start + c2);
            if (!charsMatch(ch1, ch2, ignoreCase)) {
                break;
            }
            c1 += src.charSize(s1Start + c1);
            c2 += pattern.charSize(s2Start + c2);
        }
        return c2 == utflen2;
    }

    public boolean endsWith(UTF8StringPointable pattern, boolean ignoreCase) {
        return endsWith(this, pattern, ignoreCase);
    }

    /**
     * Checks whether {@code src} ends with {@code pattern}. The comparison starts at the byte offset
     * {@code srcLength - patternLength} of the source string.
     *
     * @param src
     *            , the source string.
     * @param pattern
     *            , the expected suffix.
     * @param ignoreCase
     *            , whether characters that differ only by {@link Character#toLowerCase(char)} are equal.
     * @return true if the whole pattern matched at the end of the source string.
     */
    public static boolean endsWith(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase) {
        final int len1 = src.getUTF8Length();
        final int len2 = pattern.getUTF8Length();
        if (len2 > len1) {
            return false;
        }

        final int s1Start = src.getMetaDataLength();
        final int s2Start = pattern.getMetaDataLength();

        int c1 = len1 - len2;
        // The candidate suffix may begin in the middle of a multi-byte character. charAt() requires an
        // offset at the beginning of a character, and such a suffix cannot equal a pattern that itself
        // begins on a character boundary, so report "no match" instead of decoding from there.
        // (len2 > 0 guarantees that the inspected byte lies inside the string.)
        if (len2 > 0 && isContinuationByte(src.getByteArray()[src.getCharStartOffset() + c1])) {
            return false;
        }

        int c2 = 0;
        while (c1 < len1 && c2 < len2) {
            char ch1 = src.charAt(s1Start + c1);
            char ch2 = pattern.charAt(s2Start + c2);
            if (!charsMatch(ch1, ch2, ignoreCase)) {
                break;
            }
            c1 += src.charSize(s1Start + c1);
            c2 += pattern.charSize(s2Start + c2);
        }
        return c2 == len2;
    }

    /**
     * Tells whether two characters are considered equal by the search functions: either they are
     * identical, or {@code ignoreCase} is set and their lower-case forms are identical.
     */
    private static boolean charsMatch(char ch1, char ch2, boolean ignoreCase) {
        return ch1 == ch2 || (ignoreCase && Character.toLowerCase(ch1) == Character.toLowerCase(ch2));
    }

    /**
     * Tells whether the byte has the UTF-8 continuation form {@code 10xxxxxx}, i.e. it is not the
     * first byte of an encoded character.
     */
    private static boolean isContinuationByte(byte b) {
        return (b & 0xC0) == 0x80;
    }

    public void concat(UTF8StringPointable next, UTF8StringBuilder builder, GrowableArray out) throws IOException {
        concat(this, next, builder, out);
    }

    /**
     * Writes {@code first} followed by {@code next} into {@code out} using the given builder.
     *
     * @param first
     *            , the leading string.
     * @param next
     *            , the trailing string.
     * @param builder
     *            , a builder for the resulting string.
     * @param out
     *            , the storage for a result string.
     * @throws IOException
     */
    public static void concat(UTF8StringPointable first, UTF8StringPointable next, UTF8StringBuilder builder,
            GrowableArray out) throws IOException {
        final int totalUtfLen = first.getUTF8Length() + next.getUTF8Length();

        builder.reset(out, totalUtfLen);
        builder.appendUtf8StringPointable(first);
        builder.appendUtf8StringPointable(next);
        builder.finish();
    }

    public void substr(int charOffset, int charLength, UTF8StringBuilder builder, GrowableArray out)
            throws IOException {
        substr(this, charOffset, charLength, builder, out);
    }

    /**
     * Writes at most {@code charLength} characters of {@code src}, starting at the character index
     * {@code charOffset}. Negative arguments are treated as 0. An empty string is written if
     * {@code charOffset} is at or beyond the end of the source string.
     *
     * @param src
     *            , the input source string.
     * @param charOffset
     *            , the index (in characters) of the first character to copy.
     * @param charLength
     *            , the maximum number of characters to copy.
     * @param builder
     *            , a builder for the resulting string.
     * @param out
     *            , the storage for a result string.
     * @throws IOException
     */
    public static void substr(UTF8StringPointable src, int charOffset, int charLength, UTF8StringBuilder builder,
            GrowableArray out) throws IOException {
        // Really don't understand why we need to support the charOffset < 0 case.
        // Usually this indicates a mistake on the user side; a warning would be better.
        if (charOffset < 0) {
            charOffset = 0;
        }
        if (charLength < 0) {
            charLength = 0;
        }

        final int utfLen = src.getUTF8Length();
        final int metaLen = src.getMetaDataLength();

        // Skip the first charOffset characters.
        int chIdx = 0;
        int byteIdx = 0;
        while (byteIdx < utfLen && chIdx < charOffset) {
            byteIdx += src.charSize(metaLen + byteIdx);
            chIdx++;
        }
        if (byteIdx >= utfLen) {
            // Again, why do we tolerate this kind of mistake?
            builder.reset(out, 0);
            builder.finish();
            return;
        }

        // Estimate the result size from the average bytes-per-character of the skipped prefix. When
        // nothing was skipped there is no sample (the division would be 0.0 / 0, i.e. NaN, which casts
        // to 0), so assume one byte per character. The estimate never exceeds the remaining bytes.
        // NOTE(review): assumes the builder treats this value as a sizing hint only - confirm.
        final int remainingBytes = utfLen - byteIdx;
        final int estimatedBytes = chIdx == 0 ? charLength : (int) (charLength * 1.0 * byteIdx / chIdx);
        builder.reset(out, Math.min(remainingBytes, estimatedBytes));

        chIdx = 0;
        while (byteIdx < utfLen && chIdx < charLength) {
            builder.appendChar(src.charAt(metaLen + byteIdx));
            chIdx++;
            byteIdx += src.charSize(metaLen + byteIdx);
        }
        builder.finish();
    }

    public void substrBefore(UTF8StringPointable match, UTF8StringBuilder builder, GrowableArray out)
            throws IOException {
        substrBefore(this, match, builder, out);
    }

    /**
     * Write the substring before the given pattern (case-sensitive search). It will write an empty
     * string if the matching fails.
     *
     * @param src
     *            , the input source string.
     * @param match
     *            , the pattern to look for.
     * @param builder
     *            , a builder for the resulting string.
     * @param out
     *            , the storage for a result string.
     * @throws IOException
     */
    public static void substrBefore(UTF8StringPointable src, UTF8StringPointable match, UTF8StringBuilder builder,
            GrowableArray out) throws IOException {

        final int byteOffset = find(src, match, false);
        if (byteOffset < 0) {
            builder.reset(out, 0);
            builder.finish();
            return;
        }

        final int srcMetaLen = src.getMetaDataLength();

        // The result is exactly the first byteOffset bytes of the source string.
        builder.reset(out, byteOffset);
        int idx = 0;
        while (idx < byteOffset) {
            builder.appendChar(src.charAt(srcMetaLen + idx));
            idx += src.charSize(srcMetaLen + idx);
        }
        builder.finish();
    }

    public void substrAfter(UTF8StringPointable match, UTF8StringBuilder builder, GrowableArray out)
            throws IOException {
        substrAfter(this, match, builder, out);
    }

    /**
     * Write the substring after the given pattern (case-sensitive search). It will write an empty
     * string if the matching fails.
     *
     * @param src
     *            , the input source string.
     * @param match
     *            , the pattern to look for.
     * @param builder
     *            , a builder for the resulting string.
     * @param out
     *            , the storage for a result string.
     * @throws IOException
     */
    public static void substrAfter(UTF8StringPointable src, UTF8StringPointable match, UTF8StringBuilder builder,
            GrowableArray out) throws IOException {

        final int byteOffset = find(src, match, false);
        if (byteOffset < 0) {
            builder.reset(out, 0);
            builder.finish();
            return;
        }

        final int srcUtfLen = src.getUTF8Length();
        final int matchUtfLen = match.getUTF8Length();

        // Everything that follows the matched bytes.
        final int resultLen = srcUtfLen - byteOffset - matchUtfLen;
        builder.reset(out, resultLen);
        builder.appendUtf8StringPointable(src, src.getCharStartOffset() + byteOffset + matchUtfLen, resultLen);
        builder.finish();
    }

    public void lowercase(UTF8StringBuilder builder, GrowableArray out) throws IOException {
        lowercase(this, builder, out);
    }

    /**
     * Generates a lower case string of an input string, converting each character with
     * {@link Character#toLowerCase(char)}.
     *
     * @param src
     *            , the input source string.
     * @param builder
     *            , a builder for the resulting string.
     * @param out
     *            , the storage for a result string.
     * @throws IOException
     */
    public static void lowercase(UTF8StringPointable src, UTF8StringBuilder builder, GrowableArray out)
            throws IOException {
        final int srcUtfLen = src.getUTF8Length();
        final int srcStart = src.getMetaDataLength();

        builder.reset(out, srcUtfLen);
        int byteIndex = 0;
        while (byteIndex < srcUtfLen) {
            builder.appendChar(Character.toLowerCase(src.charAt(srcStart + byteIndex)));
            byteIndex += src.charSize(srcStart + byteIndex);
        }
        builder.finish();
    }

    public void uppercase(UTF8StringBuilder builder, GrowableArray out) throws IOException {
        uppercase(this, builder, out);
    }

    /**
     * Generates an upper case string of an input string, converting each character with
     * {@link Character#toUpperCase(char)}.
     *
     * @param src
     *            , the input source string.
     * @param builder
     *            , a builder for the resulting string.
     * @param out
     *            , the storage for a result string.
     * @throws IOException
     */
    public static void uppercase(UTF8StringPointable src, UTF8StringBuilder builder, GrowableArray out)
            throws IOException {
        final int srcUtfLen = src.getUTF8Length();
        final int srcStart = src.getMetaDataLength();

        builder.reset(out, srcUtfLen);
        int byteIndex = 0;
        while (byteIndex < srcUtfLen) {
            builder.appendChar(Character.toUpperCase(src.charAt(srcStart + byteIndex)));
            byteIndex += src.charSize(srcStart + byteIndex);
        }
        builder.finish();
    }

    public void initCap(UTF8StringBuilder builder, GrowableArray out) throws IOException {
        initCap(this, builder, out);
    }

    /**
     * Generates a "title" format string from an input source string, i.e., the first letter of each word
     * is in the upper case while the other letters are in the lower case. A "word" is a maximal run of
     * characters for which {@link Character#isLetter(char)} is true; other characters are copied as is.
     *
     * @param src
     *            , the input source string.
     * @param builder
     *            , a builder for the resulting string.
     * @param out
     *            , the storage for a result string.
     * @throws IOException
     */
    public static void initCap(UTF8StringPointable src, UTF8StringBuilder builder, GrowableArray out)
            throws IOException {
        final int srcUtfLen = src.getUTF8Length();
        final int srcStart = src.getMetaDataLength();

        builder.reset(out, srcUtfLen);
        // True when the next letter starts a new word.
        boolean toUpperCase = true;
        int byteIndex = 0;
        while (byteIndex < srcUtfLen) {
            final char originalChar = src.charAt(srcStart + byteIndex);
            final boolean isLetter = Character.isLetter(originalChar);

            // Make the first letter of a word upper case and the later ones lower case.
            final char resultChar;
            if (!isLetter) {
                resultChar = originalChar;
            } else if (toUpperCase) {
                resultChar = Character.toUpperCase(originalChar);
            } else {
                resultChar = Character.toLowerCase(originalChar);
            }
            builder.appendChar(resultChar);
            byteIndex += src.charSize(srcStart + byteIndex);

            // A non-letter ends the current word, so the next letter must be capitalized.
            toUpperCase = !isLetter;
        }
        builder.finish();
    }

    public void trim(UTF8StringBuilder builder, GrowableArray out, boolean left, boolean right, CharSet charSet)
            throws IOException {
        trim(this, builder, out, left, right, charSet);
    }

    /**
     * Generates a trimmed string of an input source string.
     *
     * @param srcPtr
     *            , the input source string.
     * @param builder
     *            , the result string builder.
     * @param out
     *            , the storage for the output string.
     * @param left
     *            , whether to trim the left side.
     * @param right
     *            , whether to trim the right side.
     * @param charSet
     *            , the chars that should be trimmed.
     * @throws IOException
     */
    public static void trim(UTF8StringPointable srcPtr, UTF8StringBuilder builder, GrowableArray out, boolean left,
            boolean right, CharSet charSet) throws IOException {
        final int srcUtfLen = srcPtr.getUTF8Length();
        final int srcStart = srcPtr.getMetaDataLength();

        // Finds the start index (inclusive): the first character that is not in the trim set.
        int startIndex = 0;
        if (left) {
            while (startIndex < srcUtfLen) {
                char ch = srcPtr.charAt(srcStart + startIndex);
                if (!charSet.contains(ch)) {
                    break;
                }
                startIndex += srcPtr.charSize(srcStart + startIndex);
            }
        }

        // Finds the end index (exclusive): the offset just past the last character that is not in the
        // trim set. It stays at startIndex if every remaining character is in the trim set.
        int endIndex = srcUtfLen;
        if (right) {
            endIndex = startIndex;
            int cursorIndex = startIndex;
            while (cursorIndex < srcUtfLen) {
                char ch = srcPtr.charAt(srcStart + cursorIndex);
                cursorIndex += srcPtr.charSize(srcStart + cursorIndex);
                if (!charSet.contains(ch)) {
                    endIndex = cursorIndex;
                }
            }
        }

        // Outputs the desired substring.
        final int len = endIndex - startIndex;
        builder.reset(out, len);
        builder.appendUtf8StringPointable(srcPtr, srcPtr.getCharStartOffset() + startIndex, len);
        builder.finish();
    }

}