// Count the number of chars included in the given byte[].
import java.io.File;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
/**
* Various string manipulation methods that are more efficient than chaining
* string operations: all is done in the same buffer without creating a bunch of
* string objects.
*
* @author <a href="mailto:dev@labs.apache.org">Dungeon Project</a>
*/
public class Main {
    /** High bit of a byte; when clear, the byte is a complete one-byte (7-bit) char. */
    private static final int UTF8_MULTI_BYTES_MASK = 0x0080;

    /** Mask / expected value pair for a lead byte of the form 110xxxxx (two-byte sequence). */
    private static final int UTF8_TWO_BYTES_MASK = 0x00E0;
    private static final int UTF8_TWO_BYTES = 0x00C0;

    /** Mask / expected value pair for a lead byte of the form 1110xxxx (three-byte sequence). */
    private static final int UTF8_THREE_BYTES_MASK = 0x00F0;
    private static final int UTF8_THREE_BYTES = 0x00E0;

    /** Mask / expected value pair for a lead byte of the form 11110xxx (four-byte sequence). */
    private static final int UTF8_FOUR_BYTES_MASK = 0x00F8;
    private static final int UTF8_FOUR_BYTES = 0x00F0;

    /** Mask / expected value pair for a lead byte of the form 111110xx (legacy five-byte sequence). */
    private static final int UTF8_FIVE_BYTES_MASK = 0x00FC;
    private static final int UTF8_FIVE_BYTES = 0x00F8;

    /** Mask / expected value pair for a lead byte of the form 1111110x (legacy six-byte sequence). */
    private static final int UTF8_SIX_BYTES_MASK = 0x00FE;
    private static final int UTF8_SIX_BYTES = 0x00FC;

    /**
     * Count the number of bytes needed to encode the Unicode char starting at
     * the given position. This can be from 1 to 6. Only the lead byte is
     * inspected; the following bytes are neither read nor validated.
     *
     * @param bytes
     *            The bytes to read
     * @param pos
     *            Position to start counting. It must be a valid start of an
     *            encoded char !
     * @return The number of bytes to create a char, or -1 if the array is
     *         null, the position is outside the array, or the byte at that
     *         position is not a valid lead byte (wrong encoding).
     *         TODO : Should stop after the third byte, as a char is only
     *         2 bytes long.
     */
    public static final int countBytesPerChar( byte[] bytes, int pos )
    {
        // Same "cannot decode" answer for a missing array and for a position
        // that does not designate any byte of it.
        if ( ( bytes == null ) || ( pos < 0 ) || ( pos >= bytes.length ) )
        {
            return -1;
        }

        // The byte is sign-extended here, but every mask below only keeps
        // bits of the low octet, so the comparisons are unaffected.
        int lead = bytes[pos];

        if ( ( lead & UTF8_MULTI_BYTES_MASK ) == 0 )
        {
            return 1;
        }

        if ( ( lead & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
        {
            return 2;
        }

        if ( ( lead & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
        {
            return 3;
        }

        if ( ( lead & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
        {
            return 4;
        }

        if ( ( lead & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
        {
            return 5;
        }

        if ( ( lead & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
        {
            return 6;
        }

        // Continuation byte (10xxxxxx) or 0xFE / 0xFF : not a lead byte.
        return -1;
    }

    /**
     * Count the number of chars included in the given byte[].
     * <p>
     * Each encoded sequence counts as one char, whatever its length. A byte
     * that is not a valid lead byte is counted as one char and skipped, so
     * that malformed input can neither move the cursor backwards nor loop
     * forever. A sequence truncated by the end of the array still counts as
     * one char.
     *
     * @param bytes
     *            The byte array to decode
     * @return The number of char in the byte array, 0 if the array is null
     *         or empty
     */
    public static final int countChars( byte[] bytes )
    {
        if ( bytes == null )
        {
            return 0;
        }

        int nbChars = 0;
        int currentPos = 0;

        while ( currentPos < bytes.length )
        {
            int width = countBytesPerChar( bytes, currentPos );

            // A non positive width signals a malformed lead byte : step over
            // that single byte instead of adding -1 to the position.
            currentPos += ( width > 0 ) ? width : 1;
            nbChars++;
        }

        return nbChars;
    }
}
// Related examples in the same category