Interprets the given byte array as UTF-8 and converts to UTF-16.

Description

Demo Code

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Static helpers for decoding UTF-8 encoded bytes into UTF-16 chars and for rendering a
 * string in a readable, hex-annotated form.
 */
public class Main {
    /** First code unit of the UTF-16 high (leading) surrogate range. */
    public static final int UNI_SUR_HIGH_START = 0xD800;
    /** Last code unit of the UTF-16 high (leading) surrogate range. */
    public static final int UNI_SUR_HIGH_END = 0xDBFF;
    /** First code unit of the UTF-16 low (trailing) surrogate range. */
    public static final int UNI_SUR_LOW_START = 0xDC00;
    /** Last code unit of the UTF-16 low (trailing) surrogate range. */
    public static final int UNI_SUR_LOW_END = 0xDFFF;
    /** Largest code point that fits in a single UTF-16 code unit. */
    private static final int UNI_MAX_BMP = 0x0000FFFF;
    /** Mask selecting the low 10 bits that are stored in a low surrogate. */
    private static final int HALF_MASK = 0x3FF;

    /**
     * Interprets the given byte array as UTF-8 and converts to UTF-16. It is the
     * responsibility of the caller to make sure that the destination array is large enough.
     * <p>
     * Output is always written starting at index 0 of {@code out}.
     * <p>
     * NOTE: Full characters are read, even if this reads past the length passed (and
     * can result in an ArrayIndexOutOfBoundsException if invalid UTF-8 is passed).
     * Explicit checks for valid UTF-8 are not performed.
     *
     * @param utf8   array holding the UTF-8 encoded input
     * @param offset index of the first byte to decode
     * @param length number of bytes to decode, starting at {@code offset}
     * @param out    destination for the decoded UTF-16 code units
     * @return the number of chars written to {@code out}
     */
    public static int UTF8toUTF16(byte[] utf8, int offset, int length,
            char[] out) {
        int outOffset = 0;
        final int limit = offset + length;
        while (offset < limit) {
            int b = utf8[offset++] & 0xff;
            if (b < 0xc0) {
                // Single-byte (ASCII) sequence; a lead byte in 0x80..0xBF is malformed.
                assert b < 0x80;
                out[outOffset++] = (char) b;
            } else if (b < 0xe0) {
                // Two-byte sequence: 5 payload bits + 6 payload bits.
                out[outOffset++] = (char) (((b & 0x1f) << 6) + (utf8[offset++] & 0x3f));
            } else if (b < 0xf0) {
                // Three-byte sequence: 4 + 6 + 6 payload bits.
                out[outOffset++] = (char) (((b & 0xf) << 12)
                        + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f));
                offset += 2;
            } else {
                // Four-byte sequence: 3 + 6 + 6 + 6 payload bits.
                assert b < 0xf8 : "b = 0x" + Integer.toHexString(b);
                int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12)
                        + ((utf8[offset + 1] & 0x3f) << 6)
                        + (utf8[offset + 2] & 0x3f);
                offset += 3;
                // The bound is inclusive: U+FFFF still fits in one char. Excluding it would
                // send 0xFFFF down the surrogate path with a negative chHalf and produce a
                // bogus pair. (Only non-shortest-form input decodes to <= 0xFFFF here.)
                if (ch <= UNI_MAX_BMP) {
                    out[outOffset++] = (char) ch;
                } else {
                    // Supplementary code point: split the 20-bit remainder into a
                    // high surrogate (top 10 bits) and a low surrogate (bottom 10 bits).
                    int chHalf = ch - 0x0010000;
                    out[outOffset++] = (char) ((chHalf >> 10) + UNI_SUR_HIGH_START);
                    out[outOffset++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START);
                }
            }
        }
        return outOffset;
    }

    /**
     * Utility method for {@link #UTF8toUTF16(byte[], int, int, char[])} that takes the
     * input array, offset and length from the {@code bytes}, {@code offset} and
     * {@code length} fields of the given {@code BytesRef}.
     *
     * @param bytesRef source of the UTF-8 bytes, start offset and length
     * @param chars    destination for the decoded UTF-16 code units
     * @return the number of chars written to {@code chars}
     * @see #UTF8toUTF16(byte[], int, int, char[])
     */
    public static int UTF8toUTF16(BytesRef bytesRef, char[] chars) {
        return UTF8toUTF16(bytesRef.bytes, bytesRef.offset,
                bytesRef.length, chars);
    }

    /**
     * Renders each char of {@code s} in a readable form, separated by single spaces.
     * <p>
     * Chars below 128 are appended unchanged. Every other char is appended as
     * {@code 0x} followed by its hex value, preceded by a tag where one applies:
     * {@code H:} for a high surrogate, {@code L:} for a low surrogate, {@code F:} for
     * {@code 0xffff}, and {@code E:} for any other char above the low surrogate range.
     * Chars from 128 up to (but excluding) the high surrogate range get no tag.
     *
     * @param s the string to render
     * @return the space-separated rendering of {@code s}
     */
    public static String toHexString(String s) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < s.length(); i++) {
            char ch = s.charAt(i);
            if (i > 0) {
                sb.append(' ');
            }
            if (ch < 128) {
                sb.append(ch);
                continue;
            }
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
                sb.append("H:");
            } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
                sb.append("L:");
            } else if (ch > UNI_SUR_LOW_END) {
                sb.append(ch == 0xffff ? "F:" : "E:");
            }
            sb.append("0x").append(Integer.toHexString(ch));
        }
        return sb.toString();
    }
}
Interprets the given byte array as UTF-8 and converts to UTF-16. - Java java.lang

Description

Demo Code

Related Tutorials