List of usage examples for java.lang Character isLowSurrogate
public static boolean isLowSurrogate(char ch)
From source file:de.fau.cs.osr.utils.StringUtils.java
public static String escHtml(String text, boolean forAttribute) { // StringEscapeUtils.escapeHtml(in) does not escape '\'' but a lot of // other stuff that doesn't need escaping. if (text == null) return ""; int n = text.length(); StringBuilder sb = new StringBuilder(n * 4 / 3); for (int i = 0; i < n; i++) { char ch = text.charAt(i); switch (ch) { case ' ': case '\n': case '\t': sb.append(ch);/*w w w . ja v a 2 s . c o m*/ break; case '<': sb.append("<"); break; case '>': sb.append(forAttribute ? ">" : ">"); break; case '&': sb.append("&"); break; case '\'': // ' cannot safely be used, see wikipedia sb.append("'"); break; case '"': sb.append(forAttribute ? """ : "\""); break; default: if ((ch >= 0 && ch < 0x20) || (ch == 0xFE)) { hexCharRef(sb, ch); break; } else if (Character.isHighSurrogate(ch)) { ++i; if (i < n) { char ch2 = text.charAt(i); if (Character.isLowSurrogate(ch2)) { int codePoint = Character.toCodePoint(ch, ch2); switch (Character.getType(codePoint)) { case Character.CONTROL: case Character.PRIVATE_USE: case Character.UNASSIGNED: hexCharRef(sb, codePoint); break; default: sb.append(ch); sb.append(ch2); break; } continue; } } } else if (!Character.isLowSurrogate(ch)) { sb.append(ch); continue; } // No low surrogate followed or only low surrogate throw new IllegalArgumentException("String contains isolated surrogates!"); } } return sb.toString(); }
From source file:Main.java
/** * Gets the index of the longest NCName that is the suffix of a character * sequence.//ww w . j a v a2 s . c o m * * @param s * The character sequence. * @return The index of the longest suffix of the specified character * sequence {@code s} that is an NCName, or -1 if the character * sequence {@code s} does not have a suffix that is an NCName. */ public static int getNCNameSuffixIndex(CharSequence s) { // identify bnode labels and do not try to split them if (s.length() > 1 && s.charAt(0) == '_' && s.charAt(1) == ':') { return -1; } int index = -1; for (int i = s.length() - 1; i > -1; i--) { if (!Character.isLowSurrogate(s.charAt(i))) { int codePoint = Character.codePointAt(s, i); if (isNCNameStartChar(codePoint)) { index = i; } if (!isNCNameChar(codePoint)) { break; } } } return index; }
From source file:it.geosolutions.httpproxy.utils.Utils.java
/** * @param ch//from w w w . j a v a 2s . c o m * @return */ final static int escapeHtmlFull(int ch) { if (ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z' || ch >= '0' && ch <= '9') { // safe return ch; } else if (Character.isWhitespace(ch)) { if (ch != '\n' && ch != '\r' && ch != '\t') // safe return ch; } else if (Character.isDefined(ch)) { // safe return ch; } else if (Character.isISOControl(ch)) { // paranoid version:isISOControl which are not isWhitespace // removed ! // do nothing do not include in output ! return -1; } else if (Character.isHighSurrogate((char) ch)) { // do nothing do not include in output ! return -1; } else if (Character.isLowSurrogate((char) ch)) { // wrong char[] sequence, //TODO: LOG !!! return -1; } return -1; }
From source file:Main.java
/** * Check if the passed character is valid for XML content. Works for XML 1.0 * and XML 1.1.<br>/*from www. j av a 2 s . c om*/ * Note: makes no difference between the runtime JAXP solution and the * explicit Xerces version * * @param c * The character to be checked. * @return <code>true</code> if the character is valid in XML, * <code>false</code> otherwise. */ public static boolean isInvalidXMLCharacter(final char c) { // Based on: http://www.w3.org/TR/2006/REC-xml11-20060816/#charsets // Speed up by separating the most common use cases first if (c < 256) { // Character <= 0x00ff - use precomposed table return ILLEGAL_XML_CHARS[c]; } // Character >= 0x0100 // For completeness, the Unicode line separator character, #x2028, is // also supported. // Surrogate blocks (no Java IDs found) // High surrogate: 0xd800-0xdbff // Low surrogate: 0xdc00-0xdfff return c == '\u2028' || (c >= '\ufdd0' && c <= '\ufddf') || c == '\ufffe' || c == '\uffff' || Character.isHighSurrogate(c) || Character.isLowSurrogate(c); }
From source file:com.ikon.util.FormatUtil.java
/**
 * Removes every UTF-16 surrogate code unit from a string.
 *
 * <p>Both halves of a well-formed surrogate pair are removed as well, so
 * supplementary characters disappear from the result entirely.
 *
 * http://en.wikipedia.org/wiki/Mapping_of_Unicode_characters#Surrogates
 *
 * @param text
 *            the text to filter; must not be {@code null}
 * @return {@code text} without any high or low surrogate characters
 */
public static String trimUnicodeSurrogates(String text) {
    StringBuilder kept = new StringBuilder();
    for (char unit : text.toCharArray()) {
        if (Character.isSurrogate(unit)) {
            continue;
        }
        kept.append(unit);
    }
    return kept.toString();
}
From source file:de.fau.cs.osr.utils.StringUtils.java
/**
 * Tells whether a string contains a surrogate that is not part of a
 * well-formed pair.
 *
 * <p>A surrogate is isolated when a high surrogate is not immediately
 * followed by a low surrogate (including at the end of the string), or when
 * a low surrogate appears without a preceding high surrogate.
 *
 * @param text
 *            the string to inspect; must not be {@code null}
 * @return {@code true} if at least one isolated surrogate is present
 */
public static boolean hasIsolatedSurrogates(String text) {
    final int end = text.length();
    int pos = 0;
    while (pos < end) {
        final char unit = text.charAt(pos);
        if (Character.isLowSurrogate(unit)) {
            // A low surrogate reached here was not consumed as part of a pair.
            return true;
        }
        if (Character.isHighSurrogate(unit)) {
            final int partner = pos + 1;
            if (partner >= end || !Character.isLowSurrogate(text.charAt(partner))) {
                return true;
            }
            // Step over the low half of the pair.
            pos = partner;
        }
        pos++;
    }
    return false;
}
From source file:org.bimserver.ifc.step.serializer.IfcStepSerializer.java
private void writePrimitive(Object val) throws SerializerException, IOException { if (val.getClass().getSimpleName().equals("Tristate")) { if (val.toString().equals("TRUE")) { print(BOOLEAN_TRUE);/*w w w . j ava 2s . co m*/ } else if (val.toString().equals("FALSE")) { print(BOOLEAN_FALSE); } else if (val.toString().equals("UNDEFINED")) { print(BOOLEAN_UNDEFINED); } } else if (val instanceof Double) { if (((Double) val).isInfinite() || (((Double) val).isNaN())) { LOGGER.info("Serializing infinite or NaN double as 0.0"); print("0.0"); } else { String string = val.toString(); if (string.endsWith(DOT_0)) { print(string.substring(0, string.length() - 1)); } else { print(string); } } } else if (val instanceof Boolean) { Boolean bool = (Boolean) val; if (bool) { print(BOOLEAN_TRUE); } else { print(BOOLEAN_FALSE); } } else if (val instanceof String) { print(SINGLE_QUOTE); String stringVal = (String) val; for (int i = 0; i < stringVal.length(); i++) { char c = stringVal.charAt(i); if (c == '\'') { print("\'\'"); } else if (c == '\\') { print("\\\\"); } else if (c >= 32 && c <= 126) { // ISO 8859-1 print("" + c); } else if (c < 255) { // ISO 10646 and ISO 8859-1 are the same < 255 , using ISO_8859_1 print("\\X\\" + new String(Hex.encodeHex( Charsets.ISO_8859_1.encode(CharBuffer.wrap(new char[] { (char) c })).array())) .toUpperCase()); } else { if (useIso8859_1) { // ISO 8859-1 with -128 offset ByteBuffer encode = Charsets.ISO_8859_1.encode(new String(new char[] { (char) (c - 128) })); print("\\S\\" + (char) encode.get()); } else { // The following code has not been tested (2012-04-25) // Use UCS-2 or UCS-4 // TODO when multiple sequential characters should be encoded in UCS-2 or UCS-4, we don't really need to add all those \X0\ \X2\ and \X4\ chars if (Character.isLowSurrogate(c)) { throw new SerializerException("Unexpected low surrogate range char"); } else if (Character.isHighSurrogate(c)) { // We need UCS-4, this is probably never happening if (i + 1 < stringVal.length()) 
{ char low = stringVal.charAt(i + 1); if (!Character.isLowSurrogate(low)) { throw new SerializerException( "High surrogate char should be followed by char in low surrogate range"); } try { print("\\X4\\" + new String(Hex.encodeHex(Charset.forName("UTF-32") .encode(new String(new char[] { c, low })).array())).toUpperCase() + "\\X0\\"); } catch (UnsupportedCharsetException e) { throw new SerializerException(e); } i++; } else { throw new SerializerException( "High surrogate char should be followed by char in low surrogate range, but end of string reached"); } } else { // UCS-2 will do print("\\X2\\" + new String(Hex .encodeHex(Charsets.UTF_16BE.encode(CharBuffer.wrap(new char[] { c })).array())) .toUpperCase() + "\\X0\\"); } } } } print(SINGLE_QUOTE); } else if (val instanceof Enumerator) { print("." + val + "."); } else { print(val == null ? "$" : val.toString()); } }
From source file:se.sawano.java.security.otp.google.keyuri.UnicodeEscaper.java
/** * Returns the Unicode code point of the character at the given index. * * <p>Unlike {@link Character#codePointAt(CharSequence, int)} or {@link String#codePointAt(int)} * this method will never fail silently when encountering an invalid surrogate pair. * * <p>The behaviour of this method is as follows: * <ol>//from w ww . j ava2 s. co m * <li>If {@code index >= end}, {@link IndexOutOfBoundsException} is thrown. * <li><b>If the character at the specified index is not a surrogate, it is returned.</b> * <li>If the first character was a high surrogate value, then an attempt is made to read the next * character. * <ol> * <li><b>If the end of the sequence was reached, the negated value of the trailing high * surrogate is returned.</b> * <li><b>If the next character was a valid low surrogate, the code point value of the * high/low surrogate pair is returned.</b> * <li>If the next character was not a low surrogate value, then {@link * IllegalArgumentException} is thrown. * </ol> * <li>If the first character was a low surrogate value, {@link IllegalArgumentException} is * thrown. 
* </ol> * * @param seq * the sequence of characters from which to decode the code point * @param index * the index of the first character to decode * @param end * the index beyond the last valid character to decode * * @return the Unicode code point for the given index or the negated value of the trailing high surrogate character at the end of the sequence */ protected static int codePointAt(CharSequence seq, int index, int end) { notNull(seq); if (index < end) { char c1 = seq.charAt(index++); if (c1 < Character.MIN_HIGH_SURROGATE || c1 > Character.MAX_LOW_SURROGATE) { // Fast path (first test is probably all we need to do) return c1; } else if (c1 <= Character.MAX_HIGH_SURROGATE) { // If the high surrogate was the last character, return its inverse if (index == end) { return -c1; } // Otherwise look for the low surrogate following it char c2 = seq.charAt(index); if (Character.isLowSurrogate(c2)) { return Character.toCodePoint(c1, c2); } throw new IllegalArgumentException("Expected low surrogate but got char '" + c2 + "' with value " + (int) c2 + " at index " + index + " in '" + seq + "'"); } else { throw new IllegalArgumentException("Unexpected low surrogate character '" + c1 + "' with value " + (int) c1 + " at index " + (index - 1) + " in '" + seq + "'"); } } throw new IndexOutOfBoundsException("Index exceeds specified range"); }
From source file:CodePointInputMethod.java
/**
 * Tries to turn the composition buffer into committed text.
 *
 * <p>Three buffer shapes are accepted:
 * <ul>
 * <li>length 6 with any format other than {@code SPECIAL_ESCAPE}: positions
 * 2..5 are parsed as a single {@code char};</li>
 * <li>length 8 with format {@code SPECIAL_ESCAPE}: positions 2..7 are parsed
 * as a code point;</li>
 * <li>length 12 with format {@code SURROGATE_PAIR}: positions 2..5 and 8..11
 * are parsed as a high and a low surrogate.</li>
 * </ul>
 * When the parsed value passes its validity check, the buffer is replaced by
 * the resulting character(s) and committed. In every other case the method
 * beeps and leaves the buffer untouched.
 */
private void finishComposition() {
    final int length = buffer.length();
    boolean accepted = false;

    if (length == 6 && format != SPECIAL_ESCAPE) {
        final char unit = (char) getCodePoint(buffer, 2, 5);
        if (Character.isValidCodePoint(unit) && unit != 0xFFFF) {
            buffer.setLength(0);
            buffer.append(unit);
            accepted = true;
        }
    } else if (length == 8 && format == SPECIAL_ESCAPE) {
        final int scalar = getCodePoint(buffer, 2, 7);
        if (Character.isValidCodePoint(scalar) && scalar != 0xFFFF) {
            buffer.setLength(0);
            buffer.appendCodePoint(scalar);
            accepted = true;
        }
    } else if (length == 12 && format == SURROGATE_PAIR) {
        final char[] pair = { (char) getCodePoint(buffer, 2, 5), (char) getCodePoint(buffer, 8, 11) };
        if (Character.isHighSurrogate(pair[0]) && Character.isLowSurrogate(pair[1])) {
            buffer.setLength(0);
            buffer.append(pair);
            accepted = true;
        }
    }

    if (accepted) {
        sendCommittedText();
    } else {
        beep();
    }
}
From source file:nl.tue.ddss.ifcrdf.model.IfcStepSerializer.java
private void writePrimitive(Resource val) throws IOException, SerializerException { if (isLogical(val)) { if (val.hasProperty(HASLOGICAL, EXPRESS_TRUE)) { print(BOOLEAN_TRUE);/*from w ww . ja v a 2 s . c o m*/ } else if (val.hasProperty(HASLOGICAL, EXPRESS_FALSE)) { print(BOOLEAN_FALSE); } else if (val.hasProperty(HASLOGICAL, EXPRESS_UNDEFINED)) { print(BOOLEAN_UNDEFINED); } } else if (isReal(val) || isNumber(val)) { Double valDouble = val.getProperty(HASDOUBLE).getObject().asLiteral().getDouble(); if ((valDouble).isInfinite() || ((valDouble).isNaN())) { LOGGER.info("Serializing infinite or NaN double as 0.0"); print("0.0"); } else { String string = valDouble.toString(); if (string.endsWith(DOT_0)) { print(string.substring(0, string.length() - 1)); } else { print(string); } } } else if (isInteger(val)) { Integer valInteger = val.getProperty(HASINTEGER).getObject().asLiteral().getInt(); String string = valInteger.toString(); if (string.endsWith(DOT_0)) { print(string.substring(0, string.length() - 2)); } else { print(string); } } else if (isBoolean(val)) { if (val.hasLiteral(HASBOOLEAN, true)) { print(BOOLEAN_TRUE); } else if (val.hasLiteral(HASBOOLEAN, false)) { print(BOOLEAN_FALSE); } } else if (isString(val)) { print(SINGLE_QUOTE); String stringVal = val.getProperty(HASSTRING).getObject().asLiteral().getString(); for (int i = 0; i < stringVal.length(); i++) { char c = stringVal.charAt(i); if (c == '\'') { print("\'\'"); } else if (c == '\\') { print("\\\\"); } else if (c >= 32 && c <= 126) { // ISO 8859-1 print("" + c); } else if (c < 255) { // ISO 10646 and ISO 8859-1 are the same < 255 , using // ISO_8859_1 print("\\X\\" + new String(Hex.encodeHex( Charsets.ISO_8859_1.encode(CharBuffer.wrap(new char[] { (char) c })).array())) .toUpperCase()); } else { if (useIso8859_1) { // ISO 8859-1 with -128 offset ByteBuffer encode = Charsets.ISO_8859_1.encode(new String(new char[] { (char) (c - 128) })); print("\\S\\" + (char) encode.get()); } else { // The following code 
has not been tested (2012-04-25) // Use UCS-2 or UCS-4 // TODO when multiple sequential characters should be // encoded in UCS-2 or UCS-4, we don't really need to // add all those \X0\ \X2\ and \X4\ chars if (Character.isLowSurrogate(c)) { throw new SerializerException("Unexpected low surrogate range char"); } else if (Character.isHighSurrogate(c)) { // We need UCS-4, this is probably never happening if (i + 1 < stringVal.length()) { char low = stringVal.charAt(i + 1); if (!Character.isLowSurrogate(low)) { throw new SerializerException( "High surrogate char should be followed by char in low surrogate range"); } try { print("\\X4\\" + new String(Hex.encodeHex(Charset.forName("UTF-32") .encode(new String(new char[] { c, low })).array())).toUpperCase() + "\\X0\\"); } catch (UnsupportedCharsetException e) { throw new SerializerException(e); } i++; } else { throw new SerializerException( "High surrogate char should be followed by char in low surrogate range, but end of string reached"); } } else { // UCS-2 will do print("\\X2\\" + new String(Hex .encodeHex(Charsets.UTF_16BE.encode(CharBuffer.wrap(new char[] { c })).array())) .toUpperCase() + "\\X0\\"); } } } } print(SINGLE_QUOTE); } else if (isEnumeration(val)) { String enumVal = val.getLocalName(); print("." + enumVal + "."); } else { print(val == null ? "$" : val.toString()); } }