Java String Normalize normalize(String in)

Here you can find the source of normalize(String in)

Description

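Normalizes a name (typically an artist name) into a comparison key: the input is trimmed, lower-cased and stripped of accents, quotes and periods are deleted, "&" becomes "and", a leading or trailing "the" or "a" is dropped, and every remaining character that is not an ASCII letter or digit is removed. A null input yields an empty string; an input that would normalize to nothing is returned unchanged.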

License

Open Source License

Declaration

public static String normalize(String in) 

Method Source Code

//package com.java2s;
/* from www.java2s.com
 * Utilities.java
 *
 * Created on February 13, 2007, 8:18 AM
 *
 * (c) 2009  The Echo Nest
 * See "license.txt" for terms
 *
 */

import java.text.Normalizer;

import java.util.Locale;
import java.util.regex.Pattern;

/**
 * String helpers that reduce a name (typically an artist name) to a compact
 * key so that variant spellings compare equal.
 *
 * The rules are loosely based on
 * http://www.ee.columbia.edu/~dpwe/research/musicsim/normalization.html
 * augmented with: ampersands are replaced with "and".
 *
 * Known limitations: number folding ('3' vs. 'three') is not attempted, a
 * leading "an" is not dropped, and names written entirely in non-Latin
 * scripts are not normalized at all (see {@link #normalize(String)}).
 */
public class Main {
    // Quotes, apostrophes and periods are deleted outright.
    static final Pattern deletedChars = Pattern.compile("[\"'.]");
    static final Pattern ampersand = Pattern.compile("&");
    // \p{Alnum} is ASCII-only here (no UNICODE_CHARACTER_CLASS flag), so this
    // matches every character that is not a-z, A-Z or 0-9.
    static final Pattern everythingBut = Pattern.compile("[^\\p{Alnum}]");
    // Despite the "Dash" names, these three patterns match underscores.
    static final Pattern leadingDash = Pattern.compile("^_+");
    static final Pattern trailingDash = Pattern.compile("_+$");
    static final Pattern leadingThe = Pattern.compile("^the\\s");
    static final Pattern trailingThe = Pattern.compile("\\sthe$");
    static final Pattern leadingA = Pattern.compile("^a\\s");
    static final Pattern trailingA = Pattern.compile("\\sa$");
    static final Pattern multiDash = Pattern.compile("_{2,}");
    // Compiled once instead of on every removeAccents call.
    private static final Pattern combiningMarks = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /**
     * Normalizes a name into a comparison key.
     *
     * Steps, applied in this order:
     * <ol>
     * <li>Trim surrounding whitespace and lower-case the text. Lower-casing
     * uses {@link Locale#ROOT} so the result does not depend on the JVM's
     * default locale (under a Turkish default locale "I" would otherwise
     * become a dotless i and then be discarded by the last step).</li>
     * <li>Strip accents via {@link #removeAccents(String)}.</li>
     * <li>Delete double quotes, apostrophes and periods.</li>
     * <li>Replace each {@code &} with {@code " and "}.</li>
     * <li>Drop leading and trailing underscores.</li>
     * <li>Drop a leading "the" followed by whitespace and a trailing "the"
     * preceded by whitespace; then the same for "a".</li>
     * <li>Collapse runs of underscores into one.</li>
     * <li>Delete every character that is not an ASCII letter or digit. This
     * also removes spaces and any underscores that are left.</li>
     * </ol>
     *
     * @param in the text to normalize; may be {@code null}
     * @return the normalized key; the empty string when {@code in} is
     *         {@code null}; or {@code in} itself, unmodified, when the steps
     *         above leave nothing (for example a name written entirely in a
     *         non-Latin script)
     */
    public static String normalize(String in) {
        if (in == null) {
            return "";
        }

        String s = in.trim().toLowerCase(Locale.ROOT);
        s = removeAccents(s);
        s = deletedChars.matcher(s).replaceAll("");
        s = ampersand.matcher(s).replaceAll(" and ");
        // Underscores at the edges must go before the article checks, or
        // "_the x" would keep its "the".
        s = leadingDash.matcher(s).replaceAll("");
        s = trailingDash.matcher(s).replaceAll("");
        s = leadingThe.matcher(s).replaceAll("");
        s = trailingThe.matcher(s).replaceAll("");
        s = leadingA.matcher(s).replaceAll("");
        s = trailingA.matcher(s).replaceAll("");
        s = multiDash.matcher(s).replaceAll("_");
        s = everythingBut.matcher(s).replaceAll("");

        // If the input was reduced to nothing, fall back on the raw input
        // (necessary for non-western names, which contain no ASCII
        // alphanumerics).
        if (s.isEmpty()) {
            return in;
        }
        return s;
    }

    /**
     * Removes accents by decomposing the text to Unicode NFD form and deleting
     * the characters in the Combining Diacritical Marks block.
     *
     * @param text the text to process; must not be {@code null}
     * @return {@code text} in NFD form with combining diacritical marks removed
     */
    public static String removeAccents(String text) {
        String decomposed = Normalizer.normalize(text, Normalizer.Form.NFD);
        return combiningMarks.matcher(decomposed).replaceAll("");
    }
}

Related

  1. normalize(final String s)
  2. normalize(final String string)
  3. normalize(final String string)
  4. normalize(Object o, StringBuffer sb)
  5. normalize(String adoc)
  6. normalize(String name)
  7. normalize(String s)
  8. normalize(String s)
  9. normalize(String s)