Java UTF from toUtf8Codes(String str)

Here you can find the source of toUtf8Codes(String str)

Description

Converts a string into an array of UTF-8 codes: one int per character, holding that character's UTF-8 bytes packed big-endian.

License

Apache License

Parameter

Parameter | Description
str | the string to convert; must not be null

Declaration

public static int[] toUtf8Codes(String str) 

Method Source Code

//package com.java2s;
/*
 * Copyright (C) 2014-2015 Nagisa Sekiguchi
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

public class Main {
    /** Charset used for every String-to-bytes conversion in this class. */
    public static final Charset DEFAULT_CHARSET = Charset.forName("UTF-8");

    /**
     * Sequence length for each possible lead byte, indexed by the unsigned byte value.
     * 5- and 6-byte forms are not supported: lead bytes 0xF8-0xFF map to length 1,
     * as do the ASCII range and the continuation bytes 0x80-0xBF.
     */
    private static final byte[] utf8SkipData = buildUtf8SkipData();

    /**
     * Builds the lead-byte length table.
     *
     * @return a 256-entry table: 2 for 0xC0-0xDF, 3 for 0xE0-0xEF, 4 for 0xF0-0xF7, 1 otherwise
     */
    private static byte[] buildUtf8SkipData() {
        byte[] table = new byte[256];
        for (int lead = 0; lead < table.length; lead++) {
            if (lead >= 0xC0 && lead <= 0xDF) {
                table[lead] = 2;
            } else if (lead >= 0xE0 && lead <= 0xEF) {
                table[lead] = 3;
            } else if (lead >= 0xF0 && lead <= 0xF7) {
                table[lead] = 4;
            } else {
                table[lead] = 1;
            }
        }
        return table;
    }

    /**
     * Converts a string to utf8 codes. The string is encoded with {@link #DEFAULT_CHARSET}
     * and every encoded character becomes one int holding its bytes packed big-endian
     * (see {@link #toUtf8Code(byte[], int, int)}).
     *
     * @param str not null
     * @return one utf8 code per encoded character, in order; empty for an empty string
     */
    public static int[] toUtf8Codes(String str) {
        byte[] buf = str.getBytes(DEFAULT_CHARSET);

        // Every character occupies at least one byte, so buf.length bounds the code count.
        int[] codes = new int[buf.length];
        int count = 0;
        int index = 0;
        while (index < buf.length) {
            int charLength = getUtf8Length(buf[index]);
            codes[count++] = toUtf8Code(buf, index, charLength);
            index += charLength;
        }

        if (count == codes.length) {
            return codes;
        }
        int[] trimmed = new int[count];
        System.arraycopy(codes, 0, trimmed, 0, count);
        return trimmed;
    }

    /**
     * Gets the length of the utf8 character introduced by the given lead byte.
     *
     * @param ch start byte of the character
     * @return 1, 2, 3 or 4; bytes that do not start a multi-byte sequence yield 1
     */
    public static int getUtf8Length(byte ch) {
        return utf8SkipData[Byte.toUnsignedInt(ch)];
    }

    /**
     * Converts a java character to its utf8 code.
     *
     * @param utf16 java character
     * @return utf8 code of the bytes produced by encoding {@code utf16} with {@link #DEFAULT_CHARSET}
     */
    public static int toUtf8Code(char utf16) {
        byte[] buf = String.valueOf(utf16).getBytes(DEFAULT_CHARSET);
        return toUtf8Code(buf, 0, buf.length);
    }

    /**
     * Converts a unicode code point to its utf8 code.
     *
     * @param codePoint unicode code point; must be accepted by {@link Character#toChars(int)}
     * @return utf8 code
     */
    public static int toUtf8Code(int codePoint) {
        byte[] buf = new String(Character.toChars(codePoint)).getBytes(DEFAULT_CHARSET);
        return toUtf8Code(buf, 0, buf.length);
    }

    /**
     * Converts {@code charLength} bytes of a byte array to a utf8 code. The bytes are packed
     * big-endian, each treated as unsigned: the byte at {@code startIndex} ends up in the most
     * significant occupied position. A 4-byte code therefore fills all 32 bits and may be
     * negative when viewed as a signed int.
     *
     * @param buf        encoded bytes
     * @param startIndex index of the first byte of the character
     * @param charLength 1, 2, 3 or 4
     * @return utf8 code
     * @throws IllegalArgumentException       if {@code charLength} is not 1, 2, 3 or 4
     * @throws ArrayIndexOutOfBoundsException if the requested bytes do not all lie inside {@code buf}
     */
    public static int toUtf8Code(byte[] buf, int startIndex, int charLength) {
        if (charLength < 1 || charLength > 4) {
            throw new IllegalArgumentException("broken string");
        }
        int code = 0;
        for (int offset = 0; offset < charLength; offset++) {
            // Unsigned widening for every byte, including the single-byte case, so that
            // values >= 0x80 are never sign-extended.
            code = (code << 8) | Byte.toUnsignedInt(buf[startIndex + offset]);
        }
        return code;
    }
}

Related

  1. toUTF8(String string)
  2. toUtf8(String texto)
  3. toUTF8ByteArray(String s)
  4. toUtf8ByteArray(String source)
  5. toUtf8Code(char utf16)
  6. toUTF8FromLatin1(byte[] outputBuffer, String string)
  7. toUTF8k(String in)
  8. toUtf8Path(String path)
  9. toUTF8String(byte[] b, int offset, int length)