// Return the Unicode char which is coded in the bytes at position 0.
import java.io.File;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
/**
* Various string manipulation methods that are more efficient than chaining
* string operations: all is done in the same buffer without creating a bunch of
* string objects.
*
* @author <a href="mailto:dev@labs.apache.org">Dungeon Project</a>
*/
public class Main {
    /** High bit of a byte: clear for a single-byte (ASCII) char, set otherwise. */
    private static final int UTF8_MULTI_BYTES_MASK = 0x0080;

    /** Mask / expected value identifying a two-byte lead byte (110x-xxxx). */
    private static final int UTF8_TWO_BYTES_MASK = 0x00E0;
    private static final int UTF8_TWO_BYTES = 0x00C0;

    /** Mask / expected value identifying a three-byte lead byte (1110-xxxx). */
    private static final int UTF8_THREE_BYTES_MASK = 0x00F0;
    private static final int UTF8_THREE_BYTES = 0x00E0;

    /** Mask / expected value identifying a four-byte lead byte (1111-0xxx). */
    private static final int UTF8_FOUR_BYTES_MASK = 0x00F8;
    private static final int UTF8_FOUR_BYTES = 0x00F0;

    /** Mask / expected value identifying a five-byte lead byte (1111-10xx). */
    private static final int UTF8_FIVE_BYTES_MASK = 0x00FC;
    private static final int UTF8_FIVE_BYTES = 0x00F8;

    /** Mask / expected value identifying a six-byte lead byte (1111-110x). */
    private static final int UTF8_SIX_BYTES_MASK = 0x00FE;
    private static final int UTF8_SIX_BYTES = 0x00FC;

    /** Payload bits carried by every continuation byte (10zz-zzzz). */
    private static final int UTF8_CONTINUATION_PAYLOAD_MASK = 0x003F;

    /** Number of payload bits in a continuation byte. */
    private static final int UTF8_CONTINUATION_BITS = 6;

    /** Value returned when no char can be decoded: ( char ) -1, i.e. 0xFFFF. */
    private static final char NO_CHAR = ( char ) -1;


    /**
     * Return the Unicode char which is coded in the bytes at position 0.
     *
     * @param bytes
     *            The byte[] representation of a Unicode string.
     * @return The first char found, or ( char ) -1 if no char can be decoded
     *         (see {@link #bytesToChar(byte[], int)}).
     */
    public static final char bytesToChar( byte[] bytes )
    {
        return bytesToChar( bytes, 0 );
    }


    /**
     * Return the Unicode char which is coded in the bytes at the given
     * position.
     * <p>
     * The lead byte at <code>pos</code> selects the sequence length (1 to 6
     * bytes). The payload bits of the lead byte and of the following bytes are
     * concatenated, and the result is narrowed to a <code>char</code>. As a
     * char is only 16 bits wide, sequences of four bytes or more keep only the
     * low 16 bits of the decoded value.
     * <p>
     * The following bytes are not checked for the 10xx-xxxx continuation
     * pattern; only their low six bits are used.
     *
     * @param bytes
     *            The byte[] representation of a Unicode string.
     * @param pos
     *            The current position to start decoding the char
     * @return The decoded char, or ( char ) -1 if no char can be decoded:
     *         <code>bytes</code> is null, <code>pos</code> is outside the
     *         array, the byte at <code>pos</code> is not a valid lead byte, or
     *         the array ends before the sequence is complete.
     */
    public static final char bytesToChar( byte[] bytes, int pos )
    {
        if ( ( bytes == null ) || ( pos < 0 ) || ( pos >= bytes.length ) )
        {
            return NO_CHAR;
        }

        // Sign extension is harmless: every use below masks the value first.
        int lead = bytes[pos];

        if ( ( lead & UTF8_MULTI_BYTES_MASK ) == 0 )
        {
            // 0xxx-xxxx : single byte char
            return ( char ) lead;
        }

        // Total number of bytes in the sequence, and the payload bits of the lead byte
        int length;
        int leadPayloadMask;

        if ( ( lead & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
        {
            // 110x-xxyy 10zz-zzzz -> 0000-0xxx yyzz-zzzz (07FF)
            length = 2;
            leadPayloadMask = 0x1F;
        }
        else if ( ( lead & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
        {
            // 1110-tttt 10xx-xxyy 10zz-zzzz -> tttt-xxxx yyzz-zzzz (FFFF)
            length = 3;
            leadPayloadMask = 0x0F;
        }
        else if ( ( lead & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
        {
            // 1111-0ttt + 3 continuation bytes -> 21 bits (1FFFFF)
            length = 4;
            leadPayloadMask = 0x07;
        }
        else if ( ( lead & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
        {
            // 1111-10tt + 4 continuation bytes -> 26 bits (03FFFFFF)
            length = 5;
            leadPayloadMask = 0x03;
        }
        else if ( ( lead & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
        {
            // 1111-110s + 5 continuation bytes -> 31 bits (7FFFFFFF)
            length = 6;
            leadPayloadMask = 0x01;
        }
        else
        {
            // 10xx-xxxx (stray continuation byte) or 1111-111x : not a lead byte
            return NO_CHAR;
        }

        if ( length > ( bytes.length - pos ) )
        {
            // The array ends in the middle of the sequence
            return NO_CHAR;
        }

        int decoded = lead & leadPayloadMask;

        for ( int i = 1; i < length; i++ )
        {
            // Append the six payload bits of each following byte
            decoded = ( decoded << UTF8_CONTINUATION_BITS ) | ( bytes[pos + i] & UTF8_CONTINUATION_PAYLOAD_MASK );
        }

        // Narrowing keeps the low 16 bits only
        return ( char ) decoded;
    }
}
// Related examples in the same category