Java Utility Methods: String Normalize

List of utility methods for normalizing strings

Description

The methods for normalizing strings are organized into topics.

Method

StringnormaliseConll(String input)
Remove punctuation, quotation marks, and brackets, from CoNLL input, as they are discarded from the PLTAG parser
List<String> tokens = unpackConllSentence(input);
for (Iterator<String> iter = tokens.iterator(); iter.hasNext();) {
    String token = iter.next();
    String word = token.split("\t")[1];
    if (word.equals("``") || word.equals("`") || word.equals("''") || word.equals("{") || word.equals("}")
            || word.equals("(") || word.equals(")")) {
        iter.remove();
String finalToken = tokens.get(tokens.size() - 1);
String finalWord = finalToken.split("\t")[1];
while (finalWord.matches("\\p{Punct}") && !finalWord.equals("%") && !finalWord.equals(":")
        && !finalWord.equals(",")) {
    tokens.remove(tokens.size() - 1);
    finalToken = tokens.get(tokens.size() - 1);
    finalWord = finalToken.split("\t")[1];
return repackConllSentence(tokens);
StringnormaliseUnicode(String unicodeText, char[] mappings)
normalise Unicode
String result = unicodeText;
result = java.text.Normalizer.normalize(result, java.text.Normalizer.Form.NFC);
for (int i = 0; i < mappings.length; i += 2) {
    result = result.replace(mappings[i], mappings[i + 1]);
return result;
Stringnormalize(final String input)
Normalize a string
if (input == null || input.length() == 0)
    return "";
return Normalizer.normalize(input, Form.NFD).replaceAll("[^\\p{ASCII}]", "");
Stringnormalize(final String s)

normalize a string

return stripAccents(String.valueOf(s.toLowerCase()));
Stringnormalize(final String s)
trims the string, removes accents, replaces spaces by underscores et remove all non-ascii characters.
String tmp = removeAccents(s).replaceAll("[^a-zA-Z0-9_]", "").trim().replaceAll("\\p{Space}", "_");
if (s.matches("^[0-9].*")) {
    return "_" + tmp;
} else {
    return tmp;
Stringnormalize(final String string)
Normalizes the given string (which must be encoded into UTF-8) in order that the result contains only unified chars.
String normalized = string;
if (normalized != null) {
    normalized = Normalizer.normalize(normalized, Normalizer.Form.NFC);
return normalized;
Stringnormalize(final String string)
normalize
return Normalizer.normalize(string, Normalizer.Form.NFD).replaceAll("[^\\p{Alnum}\\s]", "");
voidnormalize(Object o, StringBuffer sb)
normalize
if (o == null) {
    sb.append("<null>");
} else {
    sb.append(o.toString());
Stringnormalize(String adoc)
Normalize with Unicode aware.
String normalized = Normalizer.normalize(adoc, Normalizer.Form.NFC);
normalized = adoc.toLowerCase(Locale.ROOT);
normalized = NOT_PRINT.matcher(adoc).replaceAll(" ");
normalized = WHITESPACE.matcher(adoc).replaceAll(" ");
normalized = normalized.trim();
return normalized;
Stringnormalize(String in)
normalize
String s;
if (in == null) {
    return "";
s = in.trim();
s = s.toLowerCase();
s = removeAccents(s);
s = deletedChars.matcher(s).replaceAll("");
...