de.micromata.genome.util.matcher.norm.StringNormalizeUtils.java Source code

Java tutorial

Introduction

Here is the source code for de.micromata.genome.util.matcher.norm.StringNormalizeUtils.java

Source

//
// Copyright (C) 2010-2016 Micromata GmbH
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package de.micromata.genome.util.matcher.norm;

import java.text.Normalizer;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.commons.codec.language.ColognePhonetic;
import org.apache.commons.codec.language.Soundex;
import org.apache.commons.lang3.StringUtils;

/**
 * Normalization of a String.
 *
 * <p>
 * The entry point is {@link #normalize(String, String)}, which applies a set of transformations selected by
 * single-character flags (see the public constants of this class).
 *
 * <p>
 * All shared state in this class consists of immutable constants, among them compiled
 * {@link java.util.regex.Pattern} instances, which are documented as safe for concurrent use.
 *
 * @author Roger Rene Kommer (r.kommer.extern@micromata.de)
 *
 */
public class StringNormalizeUtils {
    /**
     * Convert the string into upper case.
     */
    public static final char UPPERCASE = 'u';
    /**
     * throw away all non ascii letter/numbers.
     */
    public static final char ASCIILETTERNUMBERSONLY = 'a';
    /**
     * throw away all whitespace characters.
     */
    public static final char NOWHITESPACE = 'w';

    /**
     * remove umlauts and accents.
     */
    public static final char REPLACEUMLAUTS = 'x';

    /**
     * convert to soundex.
     */
    public static final char SOUNDEX = 'e';

    /**
     * convert to german Koelnische Phonetik.
     */
    public static final char COLOGNE = 'c';

    /**
     * Matches runs of combining diacritical marks, i.e. the accent/umlaut signs that remain as separate
     * characters after NFD decomposition.
     */
    private static final Pattern DEACCENT_PATTERN = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /**
     * Matches every character that is not an ASCII letter or an ASCII digit.
     */
    private static final Pattern NON_ALPHANUM_PATTERN = Pattern.compile("[^A-Za-z0-9]");

    /**
     * The white chars, written as regex escapes so the string can be embedded in a character class.
     */
    private static final String WHITE_CHARS = "" /* dummy empty string for homogeneity */
            + "\\u0009" // CHARACTER TABULATION
            + "\\u000A" // LINE FEED (LF)
            + "\\u000B" // LINE TABULATION
            + "\\u000C" // FORM FEED (FF)
            + "\\u000D" // CARRIAGE RETURN (CR)
            + "\\u0020" // SPACE
            + "\\u0085" // NEXT LINE (NEL)
            + "\\u00A0" // NO-BREAK SPACE
            + "\\u1680" // OGHAM SPACE MARK
            + "\\u180E" // MONGOLIAN VOWEL SEPARATOR
            + "\\u2000" // EN QUAD
            + "\\u2001" // EM QUAD
            + "\\u2002" // EN SPACE
            + "\\u2003" // EM SPACE
            + "\\u2004" // THREE-PER-EM SPACE
            + "\\u2005" // FOUR-PER-EM SPACE
            + "\\u2006" // SIX-PER-EM SPACE
            + "\\u2007" // FIGURE SPACE
            + "\\u2008" // PUNCTUATION SPACE
            + "\\u2009" // THIN SPACE
            + "\\u200A" // HAIR SPACE
            + "\\u2028" // LINE SEPARATOR
            + "\\u2029" // PARAGRAPH SEPARATOR
            + "\\u202F" // NARROW NO-BREAK SPACE
            + "\\u205F" // MEDIUM MATHEMATICAL SPACE
            + "\\u3000" // IDEOGRAPHIC SPACE
    ;

    /* A \s that actually works for Javas native character set: Unicode */
    /**
     * Matches runs of the whitespace characters listed in {@link #WHITE_CHARS}.
     */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("[" + WHITE_CHARS + "]+");

    /**
     * Replace specific composite characters that NFD decomposition does not split up. Currently this is only
     * the German sharp s, which is replaced by "ss".
     *
     * @param text the text, may be null
     * @return the text with the replacements applied
     */
    public static String replaceSpecifcComposits(String text) {
        return StringUtils.replace(text, "\u00df", "ss");
    }

    /**
     * Remove umlauts and accents. Also converts scharfes s to ss.
     *
     * @param str the input, may be null
     * @return the converted string, or null if the input was null
     */
    public static String deAccent(String str) {
        if (str == null) {
            return null;
        }
        // NFD splits a composite character into its base character followed by the accent/umlaut sign
        String decomposed = Normalizer.normalize(str, Normalizer.Form.NFD);
        // throw away the now separate accent/umlaut characters
        String stripped = DEACCENT_PATTERN.matcher(decomposed).replaceAll("");
        return replaceSpecifcComposits(stripped);
    }

    /**
     * normalize a string.
     *
     * <p>
     * The selected transformations are always applied in this fixed order, independent of the order of the
     * characters in {@code flags}: {@link #REPLACEUMLAUTS}, {@link #UPPERCASE}, {@link #NOWHITESPACE},
     * {@link #ASCIILETTERNUMBERSONLY}, {@link #SOUNDEX}, {@link #COLOGNE}.
     *
     * <p>
     * NOTE(review): the phonetic encoders are handed whatever the earlier steps produced; how they treat
     * characters outside their alphabet should be confirmed against the commons-codec documentation.
     *
     * @param text the text, may be null
     * @param flags a collection of characters. See public static finals in this class. Null or empty means
     *          that no transformation is applied.
     * @return the normalized string, or null if {@code text} was null
     */
    public static String normalize(String text, String flags) {
        if (text == null || flags == null || flags.isEmpty()) {
            return text;
        }
        String result = text;
        if (flags.indexOf(REPLACEUMLAUTS) != -1) {
            result = deAccent(result);
        }
        if (flags.indexOf(UPPERCASE) != -1) {
            // Locale.ROOT keeps the result independent of the default locale of the JVM
            result = result.toUpperCase(Locale.ROOT);
        }
        if (flags.indexOf(NOWHITESPACE) != -1) {
            result = WHITESPACE_PATTERN.matcher(result).replaceAll("");
        }
        if (flags.indexOf(ASCIILETTERNUMBERSONLY) != -1) {
            result = NON_ALPHANUM_PATTERN.matcher(result).replaceAll("");
        }
        if (flags.indexOf(SOUNDEX) != -1) {
            result = Soundex.US_ENGLISH.encode(result);
        }
        if (flags.indexOf(COLOGNE) != -1) {
            result = new ColognePhonetic().encode(result);
        }
        return result;
    }

}