Java String Accent removeAccents(String text)

Here you can find the source of removeAccents(String text)

Description

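Removes accents from the given text by decomposing it (Unicode NFD) and stripping the combining diacritical marks.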

License

Open Source License

Declaration

public static String removeAccents(String text) 

Method Source Code

//package com.java2s;
/* from www.java2s.com
 * Utilities.java
 *
 * Created on February 13, 2007, 8:18 AM
 *
 * (c) 2009  The Echo Nest
 * See "license.txt" for terms
 *
 */

import java.text.Normalizer;
import java.util.Locale;
import java.util.regex.Pattern;

public class Main {
    /** Double quotes, apostrophes and periods; these are deleted outright. */
    static final Pattern deletedChars = Pattern.compile("[\"'.]");
    /** A literal ampersand; replaced with the word "and" surrounded by spaces. */
    static final Pattern ampersand = Pattern.compile("&");
    /** Any character outside the POSIX {@code Alnum} class; deleted in the final step. */
    static final Pattern everythingBut = Pattern.compile("[^\\p{Alnum}]");
    /** One or more underscores at the start of the string (despite the "dash" name). */
    static final Pattern leadingDash = Pattern.compile("^_+");
    /** One or more underscores at the end of the string (despite the "dash" name). */
    static final Pattern trailingDash = Pattern.compile("_+$");
    /** A leading "the" followed by one whitespace character. */
    static final Pattern leadingThe = Pattern.compile("^the\\s");
    /** A trailing "the" preceded by one whitespace character. */
    static final Pattern trailingThe = Pattern.compile("\\sthe$");
    /** A leading "a" followed by one whitespace character. */
    static final Pattern leadingA = Pattern.compile("^a\\s");
    /** A trailing "a" preceded by one whitespace character. */
    static final Pattern trailingA = Pattern.compile("\\sa$");
    /** A run of two or more underscores. */
    static final Pattern multiDash = Pattern.compile("_{2,}");

    /** Runs of characters from the Unicode "Combining Diacritical Marks" block. */
    private static final Pattern combiningMarks = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /**
     * Strips accents from {@code text}.
     *
     * The text is decomposed to Unicode normalization form NFD, so that an
     * accented letter becomes a base letter followed by combining marks, and
     * every run of combining diacritical marks is then removed.
     *
     * @param text the text to process; must not be {@code null}
     * @return the text in NFD form with combining diacritical marks removed
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static String removeAccents(String text) {
        String decomposed = Normalizer.normalize(text, Normalizer.Form.NFD);
        // Precompiled pattern: avoids recompiling the regex on every call.
        return combiningMarks.matcher(decomposed).replaceAll("");
    }

    /**
     * Normalizes a name (for example an artist name) so that different
     * spellings of the same name collapse to the same key. Loosely based on
     * the rules described at
     * http://www.ee.columbia.edu/~dpwe/research/musicsim/normalization.html
     *
     * Steps, applied in this order:
     * <ol>
     * <li>Trim surrounding whitespace and convert to lower case.</li>
     * <li>Remove accents (see {@link #removeAccents(String)}).</li>
     * <li>Delete double quotes, apostrophes and periods.</li>
     * <li>Replace each ampersand with " and ".</li>
     * <li>Drop leading and trailing underscores.</li>
     * <li>Drop a leading or trailing "the", then a leading or trailing "a".</li>
     * <li>Fold runs of underscores into a single underscore.</li>
     * <li>Delete every remaining non-alphanumeric character. Note that this
     * deletes such characters (including whitespace and underscores) instead
     * of mapping them to "_" as the referenced rules suggest.</li>
     * </ol>
     *
     * Known issues: numbers are not folded ('3' vs. 'three'), and names
     * written in non-Latin scripts are not normalized at all (see the
     * fallback described under the return value).
     *
     * @param in the raw name; may be {@code null}
     * @return the normalized name; the empty string if {@code in} is
     *         {@code null}; or {@code in} itself, unchanged, if normalization
     *         removed every character
     */
    public static String normalize(String in) {
        if (in == null) {
            return "";
        }

        String s = in.trim();
        // Locale.ROOT keeps case folding independent of the JVM default
        // locale. With a Turkish default locale, "I" would otherwise become
        // dotless "ı", which the final alphanumeric filter silently deletes.
        s = s.toLowerCase(Locale.ROOT);
        s = removeAccents(s);
        s = deletedChars.matcher(s).replaceAll("");
        s = ampersand.matcher(s).replaceAll(" and ");
        s = leadingDash.matcher(s).replaceAll("");
        s = trailingDash.matcher(s).replaceAll("");
        s = leadingThe.matcher(s).replaceAll("");
        s = trailingThe.matcher(s).replaceAll("");
        s = leadingA.matcher(s).replaceAll("");
        s = trailingA.matcher(s).replaceAll("");
        s = multiDash.matcher(s).replaceAll("_");
        s = everythingBut.matcher(s).replaceAll("");

        // If the input was reduced to nothing, fall back on the original
        // input (necessary for non-western names).
        if (s.isEmpty()) {
            return in;
        }

        return s;
    }
}

Related

  1. removeAccents(final String value)
  2. removeAccents(final String value)
  3. removeAccents(String input)
  4. removeAccents(String s)
  5. removeAccents(String str)
  6. removeAccents(String textWithAccent)
  7. removeAccentsAndNonStandardCharacters(String string)
  8. removeAccentuation(String str)
  9. replaceAccent(String strInit)