Here you can find the source of normalize(String in)
public static String normalize(String in)
//package com.java2s;
/*
 * from www.java2s.com
 *
 * Utilities.java
 *
 * Created on February 13, 2007, 8:18 AM
 *
 * (c) 2009 The Echo Nest
 * See "license.txt" for terms
 *
 */

import java.text.Normalizer;
import java.util.Locale;
import java.util.regex.Pattern;

/**
 * String normalization helpers for artist / title names.
 *
 * <p>The rules are modelled on
 * http://www.ee.columbia.edu/~dpwe/research/musicsim/normalization.html
 * with some deliberate differences. What this implementation actually does:
 *
 * <ol>
 *   <li>Trims the input and maps it to lower case (locale-independent).</li>
 *   <li>Strips accents / combining diacritical marks.</li>
 *   <li>Deletes double quotes, apostrophes and periods.</li>
 *   <li>Replaces every ampersand with the word "and".</li>
 *   <li>Drops leading and trailing underscores.</li>
 *   <li>Drops a leading or trailing "the", then a leading or trailing "a".</li>
 *   <li>Deletes every remaining character that is not an ASCII letter or
 *       digit. Note that the referenced rules map such characters to "_";
 *       this code removes them instead, so words are run together.</li>
 *   <li>If nothing is left, falls back to the trimmed input.</li>
 * </ol>
 *
 * <p>Proper names are never reordered.
 *
 * <p>Known issues: number folding ('3' vs. 'three') and non-English names.
 */
public class Main {

    /** Characters removed outright: double quote, apostrophe, period. */
    static final Pattern deletedChars = Pattern.compile("[\"'.]");
    /** Ampersands, which are spelled out as "and". */
    static final Pattern ampersand = Pattern.compile("&");
    /** Anything outside the (ASCII-only) POSIX Alnum class. */
    static final Pattern everythingBut = Pattern.compile("[^\\p{Alnum}]");
    /** One or more underscores at the start of the string. */
    static final Pattern leadingDash = Pattern.compile("^_+");
    /** One or more underscores at the end of the string. */
    static final Pattern trailingDash = Pattern.compile("_+$");
    /** A leading "the" followed by whitespace. */
    static final Pattern leadingThe = Pattern.compile("^the\\s");
    /** A trailing "the" preceded by whitespace (e.g. "beatles, the"). */
    static final Pattern trailingThe = Pattern.compile("\\sthe$");
    /** A leading indefinite article "a" followed by whitespace. */
    static final Pattern leadingA = Pattern.compile("^a\\s");
    /** A trailing indefinite article "a" preceded by whitespace. */
    static final Pattern trailingA = Pattern.compile("\\sa$");
    /** Runs of two or more underscores. */
    static final Pattern multiDash = Pattern.compile("_{2,}");

    /** Combining marks left behind by NFD decomposition; compiled once. */
    private static final Pattern COMBINING_MARKS =
            Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /**
     * Normalizes a name to a compact, comparable form.
     *
     * <p>See the class documentation for the exact sequence of rules.
     *
     * @param in the raw name; may be {@code null}
     * @return the normalized name; the empty string when {@code in} is
     *         {@code null}; the trimmed input when normalization would
     *         otherwise leave nothing (for example names written entirely
     *         in a non-Latin script or consisting only of punctuation)
     */
    public static String normalize(String in) {
        if (in == null) {
            return "";
        }

        String trimmed = in.trim();

        // Locale.ROOT keeps the mapping stable regardless of the default
        // locale (under a Turkish locale 'I' would otherwise lower-case to a
        // dotless i, which the ASCII-only filter below then deletes).
        String s = trimmed.toLowerCase(Locale.ROOT);
        s = removeAccents(s);
        s = deletedChars.matcher(s).replaceAll("");
        s = ampersand.matcher(s).replaceAll(" and ");

        // Underscores at the edges must go before the article checks so that
        // "_the who_" is still recognised as starting with "the ".
        s = leadingDash.matcher(s).replaceAll("");
        s = trailingDash.matcher(s).replaceAll("");

        s = leadingThe.matcher(s).replaceAll("");
        s = trailingThe.matcher(s).replaceAll("");
        s = leadingA.matcher(s).replaceAll("");
        s = trailingA.matcher(s).replaceAll("");

        // Kept for parity with the documented rules; the next step deletes
        // all underscores anyway, so this does not affect the result.
        s = multiDash.matcher(s).replaceAll("_");
        s = everythingBut.matcher(s).replaceAll("");

        // If the input has been reduced to nothing, fall back on the trimmed
        // input (necessary for non-western names). Using the trimmed form
        // keeps whitespace-only differences from producing different keys.
        if (s.isEmpty()) {
            s = trimmed;
        }
        return s;
    }

    /**
     * Removes accents by decomposing the text to NFD form and deleting the
     * resulting combining diacritical marks.
     *
     * @param text the text to strip; must not be {@code null}
     * @return {@code text} with combining diacritical marks removed
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static String removeAccents(String text) {
        String decomposed = Normalizer.normalize(text, Normalizer.Form.NFD);
        return COMBINING_MARKS.matcher(decomposed).replaceAll("");
    }
}