// UTF8 String utilities
//
// Copyright 2004-2005 Mort Bay Consulting Pty. Ltd.
// ------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
import java.io.UnsupportedEncodingException;
//
/**
 * Fast String Utilities.
 *
 * A collection of static helpers for ASCII-only case handling, prefix/suffix
 * matching, byte-to-String conversion and control-character stripping. The
 * helpers are written to avoid allocating new objects unless a result really
 * differs from the input.
 *
 * @author Greg Wilkins (gregw)
 */
public class StringUtil {
    /** Carriage return followed by line feed. */
    public static final String CRLF = "\015\012";

    /** Value of the "line.separator" system property, "\n" if it is not set. */
    public static final String __LINE_SEPARATOR = System.getProperty("line.separator", "\n");

    /**
     * Charset name to use for ISO 8859-1. Taken from the "ISO_8859_1" system
     * property when that is set; otherwise "ISO-8859-1" if the runtime accepts
     * that name, else "ISO8859_1".
     */
    public static String __ISO_8859_1;
    static {
        String configured = System.getProperty("ISO_8859_1");
        if (configured == null) {
            String probed;
            try {
                // Probe only: the decoded String itself is not needed.
                new String(new byte[] { (byte) 20 }, "ISO-8859-1");
                probed = "ISO-8859-1";
            } catch (java.io.UnsupportedEncodingException e) {
                probed = "ISO8859_1";
            }
            __ISO_8859_1 = probed;
        } else {
            __ISO_8859_1 = configured;
        }
    }

    /** Charset name used for UTF-8. */
    public final static String __UTF8 = "UTF-8";

    /**
     * Lookup table for the 128 ASCII characters: 'A'..'Z' map to 'a'..'z',
     * every other entry maps to itself.
     */
    private static char[] lowercases = new char[128];
    static {
        for (char ch = 0; ch < lowercases.length; ch++) {
            boolean upper = ch >= 'A' && ch <= 'Z';
            lowercases[ch] = upper ? (char) (ch + ('a' - 'A')) : ch;
        }
    }

    /* ------------------------------------------------------------ */
    /**
     * Fast lower case conversion. Only ASCII characters are converted;
     * characters above 127 are left untouched.
     *
     * @param s
     *            the string to convert
     * @return a lower case version of s; s itself when nothing had to change
     */
    public static String asciiToLowerCase(String s) {
        // Walk backwards to the last character that actually needs converting.
        int idx = s.length() - 1;
        for (; idx >= 0; idx--) {
            char ch = s.charAt(idx);
            if (ch <= 127 && lowercases[ch] != ch) {
                break;
            }
        }
        if (idx < 0) {
            // Nothing to convert: avoid allocating a copy.
            return s;
        }

        // Everything after idx is already lower case; convert idx and below.
        char[] chars = s.toCharArray();
        for (; idx >= 0; idx--) {
            char ch = chars[idx];
            if (ch <= 127) {
                chars[idx] = lowercases[ch];
            }
        }
        return new String(chars);
    }

    /* ------------------------------------------------------------ */
    /**
     * Tests whether s begins with w, ignoring the case of ASCII letters.
     *
     * @param s
     *            the string to inspect, may be null
     * @param w
     *            the prefix to look for, may be null
     * @return true if w is null or s starts with w; false if s is null or
     *         shorter than w
     */
    public static boolean startsWithIgnoreCase(String s, String w) {
        if (w == null) {
            return true;
        }
        if (s == null) {
            return false;
        }
        int prefixLength = w.length();
        if (s.length() < prefixLength) {
            return false;
        }
        for (int k = 0; k < prefixLength; k++) {
            if (!sameIgnoringAsciiCase(s.charAt(k), w.charAt(k))) {
                return false;
            }
        }
        return true;
    }

    /* ------------------------------------------------------------ */
    /**
     * Tests whether s ends with w, ignoring the case of ASCII letters.
     *
     * @param s
     *            the string to inspect, may be null
     * @param w
     *            the suffix to look for, may be null
     * @return true if w is null or s ends with w; false if s is null or
     *         shorter than w
     */
    public static boolean endsWithIgnoreCase(String s, String w) {
        if (w == null) {
            return true;
        }
        if (s == null) {
            return false;
        }
        // Index in s at which the candidate suffix would start.
        int shift = s.length() - w.length();
        if (shift < 0) {
            return false;
        }
        for (int k = w.length() - 1; k >= 0; k--) {
            if (!sameIgnoringAsciiCase(s.charAt(shift + k), w.charAt(k))) {
                return false;
            }
        }
        return true;
    }

    /* ------------------------------------------------------------ */
    /**
     * Compares two characters, treating ASCII upper and lower case letters as
     * equal. Characters above 127 only match themselves.
     */
    private static boolean sameIgnoringAsciiCase(char a, char b) {
        if (a == b) {
            return true;
        }
        char foldedA = a <= 127 ? lowercases[a] : a;
        char foldedB = b <= 127 ? lowercases[b] : b;
        return foldedA == foldedB;
    }

    /* ------------------------------------------------------------ */
    /**
     * Compares a String with a region of a char array, case sensitively.
     *
     * @param s
     *            the string to compare
     * @param buf
     *            the array holding the characters to compare against
     * @param offset
     *            index in buf of the first character of the region
     * @param length
     *            number of characters in the region
     * @return true if s has exactly length characters and they all match
     */
    public static boolean equals(String s, char[] buf, int offset, int length) {
        if (length != s.length()) {
            return false;
        }
        int k = 0;
        while (k < length) {
            if (buf[offset + k] != s.charAt(k)) {
                return false;
            }
            k++;
        }
        return true;
    }

    /* ------------------------------------------------------------ */
    /**
     * Decodes a region of a byte array as UTF-8. Regions shorter than 32 bytes
     * are decoded with {@link Utf8StringBuffer}; longer ones with the String
     * constructor.
     *
     * @param b
     *            the bytes to decode
     * @param offset
     *            index of the first byte to decode
     * @param length
     *            number of bytes to decode
     * @return the decoded String, or null (after printing the stack trace) if
     *         the runtime reports the charset as unsupported
     */
    public static String toUTF8String(byte[] b, int offset, int length) {
        try {
            if (length >= 32) {
                return new String(b, offset, length, __UTF8);
            }
            Utf8StringBuffer decoder = new Utf8StringBuffer(length);
            decoder.append(b, offset, length);
            return decoder.toString();
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return null;
        }
    }

    /* ------------------------------------------------------------ */
    /**
     * Decodes a region of a byte array using the named charset. A null charset
     * or any spelling accepted by {@link #isUTF8(String)} is handled by
     * {@link #toUTF8String(byte[], int, int)}.
     *
     * @param b
     *            the bytes to decode
     * @param offset
     *            index of the first byte to decode
     * @param length
     *            number of bytes to decode
     * @param charset
     *            the charset name, may be null
     * @return the decoded String, or null (after printing the stack trace) if
     *         the charset is not supported
     */
    public static String toString(byte[] b, int offset, int length, String charset) {
        boolean utf8 = charset == null || StringUtil.isUTF8(charset);
        if (utf8) {
            return toUTF8String(b, offset, length);
        }
        try {
            return new String(b, offset, length, charset);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return null;
        }
    }

    /* ------------------------------------------------------------ */
    /**
     * Tests whether a charset name is "UTF-8", ignoring case.
     *
     * @param charset
     *            the charset name, may be null
     * @return true if the name equals "UTF-8" in any letter case
     */
    public static boolean isUTF8(String charset) {
        // Identity check first: cheap hit when callers pass the constant itself.
        if (charset == __UTF8) {
            return true;
        }
        return __UTF8.equalsIgnoreCase(charset);
    }

    /* ------------------------------------------------------------ */
    /**
     * Removes every ISO control character from a String.
     *
     * @param name
     *            the string to clean, may be null
     * @return name without its ISO control characters, or null if name is null
     */
    public static String printable(String name) {
        if (name == null) {
            return null;
        }
        int total = name.length();
        StringBuffer kept = new StringBuffer(total);
        for (int k = 0; k < total; k++) {
            char ch = name.charAt(k);
            if (Character.isISOControl(ch)) {
                continue;
            }
            kept.append(ch);
        }
        return kept.toString();
    }
}
//
// Copyright 2006 Mort Bay Consulting Pty. Ltd.
// ------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
/* ------------------------------------------------------------ */
/**
 * UTF-8 StringBuffer.
 *
 * This class wraps a standard {@link java.lang.StringBuffer} and provides
 * methods to append UTF-8 encoded bytes, which are converted into characters.
 *
 * This class is stateful: a multi-byte sequence is accumulated across calls to
 * {@link #append(byte)}, so up to 6 calls may be needed before a character is
 * appended to the string buffer.
 *
 * Malformed input is replaced by '?' and recorded, see {@link #isError()}.
 * Code points above U+FFFF are appended as a surrogate pair. Overlong
 * encodings are not rejected.
 *
 * The UTF-8 decoding is done by this class and no additional buffers or Readers
 * are used. The UTF-8 code was inspired by http://javolution.org
 *
 */
class Utf8StringBuffer {
    /** The decoded characters. */
    StringBuffer _buffer;
    /** Number of continuation bytes still expected for the current sequence. */
    int _more;
    /** Payload bits collected so far for the current sequence. */
    int _bits;
    /** True once any malformed input has been replaced by '?'. */
    boolean _errors;

    /** Creates a decoder backed by a StringBuffer of default capacity. */
    Utf8StringBuffer() {
        _buffer = new StringBuffer();
    }

    /**
     * Creates a decoder backed by a StringBuffer of the given capacity.
     *
     * @param capacity
     *            initial capacity, in characters, of the backing buffer
     */
    Utf8StringBuffer(int capacity) {
        _buffer = new StringBuffer(capacity);
    }

    /**
     * Decodes a region of a byte array, one byte at a time.
     *
     * @param b
     *            the array holding the UTF-8 bytes
     * @param offset
     *            index of the first byte to decode
     * @param length
     *            number of bytes to decode
     */
    public void append(byte[] b, int offset, int length) {
        int end = offset + length;
        for (int i = offset; i < end; i++)
            append(b[i]);
    }

    /**
     * Decodes a single UTF-8 byte. A character is appended to the buffer only
     * when the byte completes a sequence. Malformed input appends '?' and sets
     * the error flag.
     *
     * @param b
     *            the next byte of the UTF-8 stream
     */
    public void append(byte b) {
        if (_more > 0) {
            if ((b & 0xc0) == 0x80) {
                // 10xxxxxx: the continuation byte we were waiting for
                _bits = (_bits << 6) | (b & 0x3f);
                if (--_more == 0)
                    appendDecoded(_bits);
                return;
            }
            // The pending sequence was cut short. Replace it, then fall
            // through so this byte is decoded as the start of a new character
            // rather than being thrown away.
            malformed();
        }

        if (b >= 0) {
            // 0xxxxxxx: plain ASCII, including NUL
            _buffer.append((char) b);
        } else if ((b & 0xe0) == 0xc0) {
            // 110xxxxx
            _more = 1;
            _bits = b & 0x1f;
        } else if ((b & 0xf0) == 0xe0) {
            // 1110xxxx
            _more = 2;
            _bits = b & 0x0f;
        } else if ((b & 0xf8) == 0xf0) {
            // 11110xxx
            _more = 3;
            _bits = b & 0x07;
        } else if ((b & 0xfc) == 0xf8) {
            // 111110xx (legacy 5-byte form; consumed so it yields a single '?')
            _more = 4;
            _bits = b & 0x03;
        } else if ((b & 0xfe) == 0xfc) {
            // 1111110x (legacy 6-byte form; consumed so it yields a single '?')
            _more = 5;
            _bits = b & 0x01;
        } else {
            // 10xxxxxx with no sequence in progress, or 0xFE / 0xFF
            malformed();
        }
    }

    /**
     * Appends a fully decoded code point, as one char or as a surrogate pair.
     * Values beyond U+10FFFF are treated as malformed.
     */
    private void appendDecoded(int codePoint) {
        if (codePoint <= 0xffff) {
            _buffer.append((char) codePoint);
        } else if (codePoint <= 0x10ffff) {
            // Supplementary character: split into high and low surrogates.
            int v = codePoint - 0x10000;
            _buffer.append((char) (0xd800 | (v >> 10)));
            _buffer.append((char) (0xdc00 | (v & 0x3ff)));
        } else {
            malformed();
        }
    }

    /** Records a decoding error: appends '?' and clears the pending sequence. */
    private void malformed() {
        _buffer.append('?');
        _more = 0;
        _bits = 0;
        _errors = true;
    }

    /**
     * @return the number of characters decoded so far
     */
    public int length() {
        return _buffer.length();
    }

    /** Empties the buffer and clears any pending sequence and error state. */
    public void reset() {
        _buffer.setLength(0);
        _more = 0;
        _bits = 0;
        _errors = false;
    }

    /**
     * @return the backing StringBuffer itself, not a copy
     */
    public StringBuffer getStringBuffer() {
        return _buffer;
    }

    /**
     * @return the characters decoded so far
     */
    public String toString() {
        return _buffer.toString();
    }

    /* ------------------------------------------------------------ */
    /**
     * @return True if there are non UTF-8 characters or incomplete UTF-8
     *         characters in the buffer.
     */
    public boolean isError() {
        return _errors || _more > 0;
    }
}
// Related examples in the same category