List of utility methods for normalizing strings
String | normaliseConll(String input) Remove punctuation, quotation marks, and brackets, from CoNLL input, as they are discarded from the PLTAG parser List<String> tokens = unpackConllSentence(input); for (Iterator<String> iter = tokens.iterator(); iter.hasNext();) { String token = iter.next(); String word = token.split("\t")[1]; if (word.equals("``") || word.equals("`") || word.equals("''") || word.equals("{") || word.equals("}") || word.equals("(") || word.equals(")")) { iter.remove(); String finalToken = tokens.get(tokens.size() - 1); String finalWord = finalToken.split("\t")[1]; while (finalWord.matches("\\p{Punct}") && !finalWord.equals("%") && !finalWord.equals(":") && !finalWord.equals(",")) { tokens.remove(tokens.size() - 1); finalToken = tokens.get(tokens.size() - 1); finalWord = finalToken.split("\t")[1]; return repackConllSentence(tokens); |
String | normaliseUnicode(String unicodeText, char[] mappings) normalise Unicode String result = unicodeText; result = java.text.Normalizer.normalize(result, java.text.Normalizer.Form.NFC); for (int i = 0; i < mappings.length; i += 2) { result = result.replace(mappings[i], mappings[i + 1]); return result; |
String | normalize(final String input) Normalize a string if (input == null || input.length() == 0) return ""; return Normalizer.normalize(input, Form.NFD).replaceAll("[^\\p{ASCII}]", ""); |
String | normalize(final String s) normalize a string return stripAccents(String.valueOf(s.toLowerCase()));
|
String | normalize(final String s) trims the string, removes accents, replaces spaces by underscores et remove all non-ascii characters. String tmp = removeAccents(s).replaceAll("[^a-zA-Z0-9_]", "").trim().replaceAll("\\p{Space}", "_"); if (s.matches("^[0-9].*")) { return "_" + tmp; } else { return tmp; |
String | normalize(final String string) Normalizes the given string (which must be encoded into UTF-8) in order that the result contains only unified chars. String normalized = string; if (normalized != null) { normalized = Normalizer.normalize(normalized, Normalizer.Form.NFC); return normalized; |
String | normalize(final String string) normalize return Normalizer.normalize(string, Normalizer.Form.NFD).replaceAll("[^\\p{Alnum}\\s]", ""); |
void | normalize(Object o, StringBuffer sb) normalize if (o == null) { sb.append("<null>"); } else { sb.append(o.toString()); |
String | normalize(String adoc) Normalize with Unicode aware. String normalized = Normalizer.normalize(adoc, Normalizer.Form.NFC); normalized = adoc.toLowerCase(Locale.ROOT); normalized = NOT_PRINT.matcher(adoc).replaceAll(" "); normalized = WHITESPACE.matcher(adoc).replaceAll(" "); normalized = normalized.trim(); return normalized; |
String | normalize(String in) normalize String s; if (in == null) { return ""; s = in.trim(); s = s.toLowerCase(); s = removeAccents(s); s = deletedChars.matcher(s).replaceAll(""); ... |