List of utility methods for string normalization
String | normalizeToAlpha(String input) normalize To Alpha StringBuffer buf = new StringBuffer(); for (int i = 0; i < input.length(); i++) { char charAt = input.charAt(i); if (Character.isLetterOrDigit(charAt)) buf.append(charAt); return buf.toString(); |
String | normalizeUnicode(CharSequence text) Apply Unicode NFC normalization to a string. return Normalizer.isNormalized(text, Normalizer.Form.NFC) ? text.toString()
: Normalizer.normalize(text, Normalizer.Form.NFC);
|
String | normalizeUnicode(final String str) Normalize a string potentially containing Unicode to NFC form. if (Normalizer.isNormalized(str, Normalizer.Form.NFC)) { return str; } else { return Normalizer.normalize(str, Normalizer.Form.NFC); |
String | normalizeUnicode(String input) performs a unicode normalization to NFC form (java.text.Normalizer.Form.NFC) for the given input if (input != null && !Normalizer.isNormalized(input, Normalizer.Form.NFC)) { return Normalizer.normalize(input, Normalizer.Form.NFC); return input; |
String | normalizeUnicode(String str) Normalize to "Normalization Form Canonical Decomposition" (NFD) REF: http: //stackoverflow.com/questions/3610013/file-listfiles-mangles-unicode- names-with-jdk-6-unicode-normalization-issues This supports proper file name retrieval from file system, among other things. Normalizer.Form form = Normalizer.Form.NFD; if (!Normalizer.isNormalized(str, form)) { return Normalizer.normalize(str, form); return str; |
String | normalizeUnicodeDiacritics(String text) normalize Unicode Diacritics text = Normalizer.normalize(text, Form.NFD); Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+"); text = pattern.matcher(text).replaceAll(""); return text; |
String | normalizeWhitespace(final String str) Utility to normalize whitespace in a String, i.e. if (isNullOrEmpty(str)) return str; final StringBuilder buf = new StringBuilder(); final CharacterIterator iter = new StringCharacterIterator(str); boolean inWhitespace = false; for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) { if (Character.isWhitespace(c)) { if (!inWhitespace) { ... |
String | normalizeWhiteSpace(String str) normalize White Space if (str == null) { return null; String normalized = Normalizer.normalize(str, java.text.Normalizer.Form.NFD); int len = normalized.length(); StringBuffer sb = new StringBuffer(); int spaceCount = 0; for (int i = 0; i < len;) { ... |
String | normalizeWidth(String text) Normalize the width of characters in the supplied text. StringBuilder sb = new StringBuilder(text); int ch; for (int i = 0; i < sb.length(); i++) { ch = sb.charAt(i); if ((ch >= 0xFF01) && (ch <= 0xFF5E)) { sb.setCharAt(i, (char) (ch - 0xFEE0)); continue; if (ch == 0x3000) { sb.setCharAt(i, ' '); switch (ch) { case 0xFF61: sb.setCharAt(i, (char) 0x3002); break; case 0xFF62: sb.setCharAt(i, (char) 0x300C); break; case 0xFF63: sb.setCharAt(i, (char) 0x300D); break; case 0xFF64: sb.setCharAt(i, (char) 0x3001); break; case 0xFF65: sb.setCharAt(i, (char) 0x30FB); break; case 0xFF66: sb.setCharAt(i, (char) 0x30F2); break; case 0xFF67: sb.setCharAt(i, (char) 0x30A1); break; case 0xFF68: sb.setCharAt(i, (char) 0x30A3); break; case 0xFF69: sb.setCharAt(i, (char) 0x30A5); break; case 0xFF6A: sb.setCharAt(i, (char) 0x30A7); break; case 0xFF6B: sb.setCharAt(i, (char) 0x30A9); break; case 0xFF6C: sb.setCharAt(i, (char) 0x30E3); break; case 0xFF6D: sb.setCharAt(i, (char) 0x30E5); break; case 0xFF6E: sb.setCharAt(i, (char) 0x30E7); break; case 0xFF6F: sb.setCharAt(i, (char) 0x30C3); break; case 0xFF70: sb.setCharAt(i, (char) 0x30FC); break; case 0xFF71: sb.setCharAt(i, (char) 0x30A2); break; case 0xFF72: sb.setCharAt(i, (char) 0x30A4); break; case 0xFF73: sb.setCharAt(i, (char) 0x30A6); break; case 0xFF74: sb.setCharAt(i, (char) 0x30A8); break; case 0xFF75: sb.setCharAt(i, (char) 0x30AA); break; case 0xFF76: sb.setCharAt(i, (char) 0x30AB); break; case 0xFF77: sb.setCharAt(i, (char) 0x30AD); break; case 0xFF78: sb.setCharAt(i, (char) 0x30AF); break; case 0xFF79: sb.setCharAt(i, (char) 0x30B1); break; case 0xFF7A: sb.setCharAt(i, (char) 0x30B3); break; case 0xFF7B: sb.setCharAt(i, (char) 0x30B5); break; case 0xFF7C: sb.setCharAt(i, (char) 0x30B7); break; case 0xFF7D: sb.setCharAt(i, (char) 0x30B9); break; case 0xFF7E: sb.setCharAt(i, (char) 0x30BB); break; case 0xFF7F: sb.setCharAt(i, (char) 0x30BD); break; case 0xFF80: sb.setCharAt(i, (char) 
0x30BF); break; case 0xFF81: sb.setCharAt(i, (char) 0x30C1); break; case 0xFF82: sb.setCharAt(i, (char) 0x30C4); break; case 0xFF83: sb.setCharAt(i, (char) 0x30C6); break; case 0xFF84: sb.setCharAt(i, (char) 0x30C8); break; case 0xFF85: sb.setCharAt(i, (char) 0x30CA); break; case 0xFF86: sb.setCharAt(i, (char) 0x30CB); break; case 0xFF87: sb.setCharAt(i, (char) 0x30CC); break; case 0xFF88: sb.setCharAt(i, (char) 0x30CD); break; case 0xFF89: sb.setCharAt(i, (char) 0x30CE); break; case 0xFF8A: sb.setCharAt(i, (char) 0x30CF); break; case 0xFF8B: sb.setCharAt(i, (char) 0x30D2); break; case 0xFF8C: sb.setCharAt(i, (char) 0x30D5); break; case 0xFF8D: sb.setCharAt(i, (char) 0x30D8); break; case 0xFF8E: sb.setCharAt(i, (char) 0x30DB); break; case 0xFF8F: sb.setCharAt(i, (char) 0x30DE); break; case 0xFF90: sb.setCharAt(i, (char) 0x30DF); break; case 0xFF91: sb.setCharAt(i, (char) 0x30E0); break; case 0xFF92: sb.setCharAt(i, (char) 0x30E1); break; case 0xFF93: sb.setCharAt(i, (char) 0x30E2); break; case 0xFF94: sb.setCharAt(i, (char) 0x30E4); break; case 0xFF95: sb.setCharAt(i, (char) 0x30E6); break; case 0xFF96: sb.setCharAt(i, (char) 0x30E8); break; case 0xFF97: sb.setCharAt(i, (char) 0x30E9); break; case 0xFF98: sb.setCharAt(i, (char) 0x30EA); break; case 0xFF99: sb.setCharAt(i, (char) 0x30EB); break; case 0xFF9A: sb.setCharAt(i, (char) 0x30EC); break; case 0xFF9B: sb.setCharAt(i, (char) 0x30ED); break; case 0xFF9C: sb.setCharAt(i, (char) 0x30EF); break; case 0xFF9D: sb.setCharAt(i, (char) 0x30F3); break; case 0xFF9E: sb.setCharAt(i, (char) 0x3099); break; case 0xFF9F: sb.setCharAt(i, (char) 0x309A); break; if ((ch > 0xFFA1) && (ch <= 0xFFBE)) { sb.setCharAt(i, (char) (ch - 0xCE70)); continue; switch (ch) { case 0xFFA0: sb.setCharAt(i, (char) 0x3164); break; case 0xFFDA: sb.setCharAt(i, (char) 0x3161); break; case 0xFFDB: sb.setCharAt(i, (char) 0x3162); break; case 0xFFDC: sb.setCharAt(i, (char) 0x3163); break; case 0xFFE8: sb.setCharAt(i, (char) 0x2502); break; case 
0xFFE9: sb.setCharAt(i, (char) 0x2190); break; case 0xFFEA: sb.setCharAt(i, (char) 0x2191); break; case 0xFFEB: sb.setCharAt(i, (char) 0x2192); break; case 0xFFEC: sb.setCharAt(i, (char) 0x2193); break; case 0xFFED: sb.setCharAt(i, (char) 0x25A0); break; case 0xFFEE: sb.setCharAt(i, (char) 0x25CB); break; switch (ch) { case 0x2100: sb.setCharAt(i, 'a'); sb.insert(i + 1, "/c"); i += 2; break; case 0x2101: sb.setCharAt(i, 'a'); sb.insert(i + 1, "/s"); i += 2; break; case 0x2105: sb.setCharAt(i, 'c'); sb.insert(i + 1, "/o"); i += 2; break; case 0x2103: sb.setCharAt(i, (char) 0x00B0); sb.insert(i + 1, "C"); i++; break; case 0x2109: sb.setCharAt(i, (char) 0x00B0); sb.insert(i + 1, "F"); i++; break; case 0x2116: sb.setCharAt(i, 'N'); sb.insert(i + 1, "o"); i++; break; case 0x212A: sb.setCharAt(i, 'K'); break; case 0x212B: sb.setCharAt(i, (char) 0x00C5); break; switch (ch) { case 0x3371: sb.setCharAt(i, 'h'); sb.insert(i + 1, "Pa"); i += 2; break; case 0x3372: sb.setCharAt(i, 'd'); sb.insert(i + 1, "a"); i++; break; case 0x3373: sb.setCharAt(i, 'A'); sb.insert(i + 1, "U"); i++; break; case 0x3374: sb.setCharAt(i, 'b'); sb.insert(i + 1, "ar"); i += 2; break; case 0x3375: sb.setCharAt(i, 'o'); sb.insert(i + 1, "V"); i++; break; case 0x3376: sb.setCharAt(i, 'p'); sb.insert(i + 1, "c"); i++; break; case 0x3377: sb.setCharAt(i, 'd'); sb.insert(i + 1, "m"); i++; break; case 0x3378: sb.setCharAt(i, 'd'); sb.insert(i + 1, "m\u00B2"); i += 2; break; case 0x3379: sb.setCharAt(i, 'd'); sb.insert(i + 1, "m\u00B3"); i += 2; break; case 0x337A: sb.setCharAt(i, 'I'); sb.insert(i + 1, "U"); i++; break; case 0x3380: sb.setCharAt(i, 'p'); sb.insert(i + 1, "A"); i++; break; case 0x3381: sb.setCharAt(i, 'n'); sb.insert(i + 1, "A"); i++; break; case 0x3382: sb.setCharAt(i, (char) 0x03BC); sb.insert(i + 1, "A"); i++; break; case 0x3383: sb.setCharAt(i, 'm'); sb.insert(i + 1, "A"); i++; break; case 0x3384: sb.setCharAt(i, 'k'); sb.insert(i + 1, "A"); i++; break; case 0x3385: sb.setCharAt(i, 
'K'); sb.insert(i + 1, "B"); i++; break; case 0x3386: sb.setCharAt(i, 'M'); sb.insert(i + 1, "B"); i++; break; case 0x3387: sb.setCharAt(i, 'G'); sb.insert(i + 1, "B"); i++; break; case 0x3388: sb.setCharAt(i, 'c'); sb.insert(i + 1, "al"); i += 2; break; case 0x3389: sb.setCharAt(i, 'k'); sb.insert(i + 1, "cal"); i += 3; break; case 0x338A: sb.setCharAt(i, 'p'); sb.insert(i + 1, "F"); i++; break; case 0x338B: sb.setCharAt(i, 'n'); sb.insert(i + 1, "F"); i++; break; case 0x338C: sb.setCharAt(i, (char) 0x03BC); sb.insert(i + 1, "F"); i++; break; case 0x338D: sb.setCharAt(i, (char) 0x03BC); sb.insert(i + 1, "g"); i++; break; case 0x338E: sb.setCharAt(i, 'm'); sb.insert(i + 1, "g"); i++; break; case 0x338F: sb.setCharAt(i, 'k'); sb.insert(i + 1, "g"); i++; break; case 0x3390: sb.setCharAt(i, 'H'); sb.insert(i + 1, "z"); i++; break; case 0x3391: sb.setCharAt(i, 'k'); sb.insert(i + 1, "Hz"); i += 2; break; case 0x3392: sb.setCharAt(i, 'M'); sb.insert(i + 1, "Hz"); i += 2; break; case 0x3393: sb.setCharAt(i, 'G'); sb.insert(i + 1, "Hz"); i += 2; break; case 0x3394: sb.setCharAt(i, 'T'); sb.insert(i + 1, "Hz"); i += 2; break; case 0x3395: sb.setCharAt(i, (char) 0x03BC); sb.insert(i + 1, "\u2113"); i++; break; case 0x3396: sb.setCharAt(i, 'm'); sb.insert(i + 1, "\u2113"); i++; break; case 0x3397: sb.setCharAt(i, 'd'); sb.insert(i + 1, "\u2113"); i++; break; case 0x3398: sb.setCharAt(i, 'k'); sb.insert(i + 1, "\u2113"); i++; break; case 0x3399: sb.setCharAt(i, 'f'); sb.insert(i + 1, "m"); i++; break; case 0x339A: sb.setCharAt(i, 'n'); sb.insert(i + 1, "m"); i++; break; case 0x339B: sb.setCharAt(i, (char) 0x03BC); sb.insert(i + 1, "m"); i++; break; case 0x339C: sb.setCharAt(i, 'm'); sb.insert(i + 1, "m"); i++; break; case 0x339D: sb.setCharAt(i, 'c'); sb.insert(i + 1, "m"); i++; break; case 0x339E: sb.setCharAt(i, 'k'); sb.insert(i + 1, "m"); i++; break; case 0x339F: sb.setCharAt(i, 'm'); sb.insert(i + 1, "m\u00B2"); i += 2; break; case 0x33A0: sb.setCharAt(i, 'c'); 
sb.insert(i + 1, "m\u00B2"); i += 2; break; case 0x33A1: sb.setCharAt(i, 'm'); sb.insert(i + 1, "\u00B2"); i++; break; case 0x33A2: sb.setCharAt(i, 'k'); sb.insert(i + 1, "m\u00B2"); i += 2; break; case 0x33A3: sb.setCharAt(i, 'm'); sb.insert(i + 1, "m\u00B3"); i += 2; break; case 0x33A4: sb.setCharAt(i, 'c'); sb.insert(i + 1, "m\u00B3"); i += 2; break; case 0x33A5: sb.setCharAt(i, 'm'); sb.insert(i + 1, "\u00B3"); i++; break; case 0x33A6: sb.setCharAt(i, 'k'); sb.insert(i + 1, "m\u00B3"); i += 2; break; case 0x33A7: sb.setCharAt(i, 'm'); sb.insert(i + 1, "/s"); i += 2; break; case 0x33A8: sb.setCharAt(i, 'm'); sb.insert(i + 1, "/s\u00B2"); i += 3; break; case 0x33A9: sb.setCharAt(i, 'P'); sb.insert(i + 1, "a"); i++; break; case 0x33AA: sb.setCharAt(i, 'k'); sb.insert(i + 1, "Pa"); i += 2; break; case 0x33AB: sb.setCharAt(i, 'M'); sb.insert(i + 1, "Pa"); i += 2; break; case 0x33AC: sb.setCharAt(i, 'G'); sb.insert(i + 1, "Pa"); i += 2; break; case 0x33AD: sb.setCharAt(i, 'r'); sb.insert(i + 1, "ad"); i += 2; break; case 0x33AE: sb.setCharAt(i, 'r'); sb.insert(i + 1, "ad/s"); i += 4; break; case 0x33AF: sb.setCharAt(i, 'r'); sb.insert(i + 1, "ad/s\u00B2"); i += 5; break; case 0x33B0: sb.setCharAt(i, 'p'); sb.insert(i + 1, "s"); i++; break; case 0x33B1: sb.setCharAt(i, 'n'); sb.insert(i + 1, "s"); i++; break; case 0x33B2: sb.setCharAt(i, (char) 0x03BC); sb.insert(i + 1, "s"); i++; break; case 0x33B3: sb.setCharAt(i, 'm'); sb.insert(i + 1, "s"); i++; break; case 0x33B4: sb.setCharAt(i, 'p'); sb.insert(i + 1, "V"); i++; break; case 0x33B5: sb.setCharAt(i, 'n'); sb.insert(i + 1, "V"); i++; break; case 0x33B6: sb.setCharAt(i, (char) 0x03BC); sb.insert(i + 1, "V"); i++; break; case 0x33B7: sb.setCharAt(i, 'm'); sb.insert(i + 1, "V"); i++; break; case 0x33B8: sb.setCharAt(i, 'k'); sb.insert(i + 1, "V"); i++; break; case 0x33B9: sb.setCharAt(i, 'M'); sb.insert(i + 1, "V"); i++; break; case 0x33BA: sb.setCharAt(i, 'p'); sb.insert(i + 1, "W"); i++; break; case 0x33BB: 
sb.setCharAt(i, 'n'); sb.insert(i + 1, "W"); i++; break; case 0x33BC: sb.setCharAt(i, (char) 0x03BC); sb.insert(i + 1, "W"); i++; break; case 0x33BD: sb.setCharAt(i, 'm'); sb.insert(i + 1, "W"); i++; break; case 0x33BE: sb.setCharAt(i, 'k'); sb.insert(i + 1, "W"); i++; break; case 0x33BF: sb.setCharAt(i, 'M'); sb.insert(i + 1, "W"); i++; break; case 0x33C0: sb.setCharAt(i, 'k'); sb.insert(i + 1, "\u03A9"); i++; break; case 0x33C1: sb.setCharAt(i, 'M'); sb.insert(i + 1, "\u03A9"); i++; break; case 0x33C2: sb.setCharAt(i, 'a'); sb.insert(i + 1, ".m."); i += 3; break; case 0x33C3: sb.setCharAt(i, 'B'); sb.insert(i + 1, "q"); i++; break; case 0x33C4: sb.setCharAt(i, 'c'); sb.insert(i + 1, "c"); i++; break; case 0x33C5: sb.setCharAt(i, 'c'); sb.insert(i + 1, "d"); i++; break; case 0x33C6: sb.setCharAt(i, 'C'); sb.insert(i + 1, "/kg"); i += 3; break; case 0x33C7: sb.setCharAt(i, 'C'); sb.insert(i + 1, "o."); i += 2; break; case 0x33C8: sb.setCharAt(i, 'd'); sb.insert(i + 1, "B"); i++; break; case 0x33C9: sb.setCharAt(i, 'G'); sb.insert(i + 1, "y"); i++; break; case 0x33CA: sb.setCharAt(i, 'h'); sb.insert(i + 1, "a"); i++; break; case 0x33CB: sb.setCharAt(i, 'H'); sb.insert(i + 1, "P"); i++; break; case 0x33CC: sb.setCharAt(i, 'i'); sb.insert(i + 1, "n"); i++; break; case 0x33CD: sb.setCharAt(i, 'K'); sb.insert(i + 1, "K"); i++; break; case 0x33CE: sb.setCharAt(i, 'K'); sb.insert(i + 1, "M"); i++; break; case 0x33CF: sb.setCharAt(i, 'K'); sb.insert(i + 1, "t"); i++; break; case 0x33D0: sb.setCharAt(i, 'l'); sb.insert(i + 1, "m"); i++; break; case 0x33D1: sb.setCharAt(i, 'l'); sb.insert(i + 1, "n"); i++; break; case 0x33D2: sb.setCharAt(i, 'l'); sb.insert(i + 1, "og"); i += 2; break; case 0x33D3: sb.setCharAt(i, 'l'); sb.insert(i + 1, "x"); i++; break; case 0x33D4: sb.setCharAt(i, 'm'); sb.insert(i + 1, "b"); i++; break; case 0x33D5: sb.setCharAt(i, 'm'); sb.insert(i + 1, "il"); i += 2; break; case 0x33D6: sb.setCharAt(i, 'm'); sb.insert(i + 1, "ol"); i += 2; break; case 
0x33D7: sb.setCharAt(i, 'p'); sb.insert(i + 1, "H"); i++; break; case 0x33D8: sb.setCharAt(i, 'p'); sb.insert(i + 1, ".m."); i += 3; break; case 0x33D9: sb.setCharAt(i, 'P'); sb.insert(i + 1, "PM"); i += 2; break; case 0x33DA: sb.setCharAt(i, 'P'); sb.insert(i + 1, "R"); i++; break; case 0x33DB: sb.setCharAt(i, 's'); sb.insert(i + 1, "r"); i++; break; case 0x33DC: sb.setCharAt(i, 'S'); sb.insert(i + 1, "v"); i++; break; case 0x33DD: sb.setCharAt(i, 'W'); sb.insert(i + 1, "b"); i++; break; case 0x33DE: sb.setCharAt(i, 'v'); sb.insert(i + 1, "/m"); i += 2; break; case 0x33DF: sb.setCharAt(i, 'a'); sb.insert(i + 1, "/m"); i += 2; break; case 0x33FF: sb.setCharAt(i, 'g'); sb.insert(i + 1, "al"); i += 2; break; String result = sb.toString(); if (text.equals(result)) { return text; return normalizeUnicode(result); |
String | normalizeWord(String word) normalize Word try { int i; Class<?> normalizerClass = Class.forName("java.text.Normalizer"); Class<?> normalizerFormClass = null; Class<?>[] nestedClasses = normalizerClass.getDeclaredClasses(); for (i = 0; i < nestedClasses.length; i++) { Class<?> nestedClass = nestedClasses[i]; if (nestedClass.getName().equals("java.text.Normalizer$Form")) { ... |