Java examples for java.lang:String UTF
Interprets the given byte array as UTF-8 and converts to UTF-16.
/*//from w w w . j av a2s.c om * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ public class Main{ public static final int UNI_SUR_HIGH_START = 0xD800; public static final int UNI_SUR_HIGH_END = 0xDBFF; public static final int UNI_SUR_LOW_START = 0xDC00; public static final int UNI_SUR_LOW_END = 0xDFFF; private static final long UNI_MAX_BMP = 0x0000FFFF; private static final long HALF_MASK = 0x3FFL; /** * Interprets the given byte array as UTF-8 and converts to UTF-16. It is the * responsibility of the caller to make sure that the destination array is large enough. * <p> * NOTE: Full characters are read, even if this reads past the length passed (and * can result in an ArrayOutOfBoundsException if invalid UTF-8 is passed). * Explicit checks for valid UTF-8 are not performed. */ // TODO: broken if chars.offset != 0 public static int UTF8toUTF16(byte[] utf8, int offset, int length, char[] out) { int out_offset = 0; final int limit = offset + length; while (offset < limit) { int b = utf8[offset++] & 0xff; if (b < 0xc0) { assert b < 0x80; out[out_offset++] = (char) b; } else if (b < 0xe0) { out[out_offset++] = (char) (((b & 0x1f) << 6) + (utf8[offset++] & 0x3f)); } else if (b < 0xf0) { out[out_offset++] = (char) (((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f)); offset += 2; } else { assert b < 0xf8 : "b = 0x" + Integer.toHexString(b); int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f); offset += 3; if (ch < UNI_MAX_BMP) { out[out_offset++] = (char) ch; } else { int chHalf = ch - 0x0010000; out[out_offset++] = (char) ((chHalf >> 10) + 0xD800); out[out_offset++] = (char) ((chHalf & HALF_MASK) + 0xDC00); } } } return out_offset; } /** * Utility method for {@link #UTF8toUTF16(byte[], int, int, char[])} * @see #UTF8toUTF16(byte[], int, int, char[]) */ public static int UTF8toUTF16(BytesRef bytesRef, char[] chars) { return UTF8toUTF16(bytesRef.bytes, bytesRef.offset, bytesRef.length, chars); } public static String toHexString(String s) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < s.length(); i++) { char ch = s.charAt(i); if (i > 0) { sb.append(' '); } if (ch < 128) { sb.append(ch); } else { if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { sb.append("H:"); } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { sb.append("L:"); } else if (ch > UNI_SUR_LOW_END) { if (ch == 0xffff) { sb.append("F:"); } else { sb.append("E:"); } } sb.append("0x" + Integer.toHexString(ch)); } } return sb.toString(); } }