pl.edu.icm.coansys.commons.java.DiacriticsRemover.java Source code

Introduction

Here is the source code for pl.edu.icm.coansys.commons.java.DiacriticsRemover.java
Source

/*
 * This file is part of CoAnSys project.
 * Copyright (c) 2012-2015 ICM-UW
 * 
 * CoAnSys is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
    
 * CoAnSys is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
 */

package pl.edu.icm.coansys.commons.java;

import java.text.Normalizer;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;

/**
 * Mapping to the basic Latin alphabet (a-z, A-Z). In most cases, a character is
 * mapped to the closest visual form, rather than functional one, e.g.: "" is
 * mapped to "o" rather than "oe", and "" is mapped to "d" rather than "dj" or
 * "gj". Notable exceptions include: "" mapped to "q", "" mapped to "ss", and
 * "", "" mapped to "Y", "y".
 *
 * <p> Each character is processed as follows: <ol> <li>the character is
 * compatibility decomposed,</li> <li>all the combining marks are removed,</li>
 * <li>the character is compatibility composed,</li> <li>additional "manual"
 * substitutions are applied.</li> </ol> </p>
 *
 * <p> All the characters from the "Latin-1 Supplement" and "Latin Extended-A"
 * Unicode blocks are mapped to the "Basic Latin" block. Characters from other
 * alphabets are generally left intact, although the decomposable ones may be
 * affected by the procedure. </p>
 *
 * @author Lukasz Bolikowski (bolo@icm.edu.pl)
 *
 */
public final class DiacriticsRemover {

    private DiacriticsRemover() {
    }

private static final Character[] from = {
    '', '?', '', '', '', '', '', '', '', '?', '', '',
    '', '', '', '?', '', '', '', '', '', '', ''};
    private static final String[] to = { "AE", "D", "O", "Y", "ss", "ae", "d", "o", "y", "D", "d", "H", "h", "i",
            "q", "L", "l", "N", "n", "OE", "oe", "T", "t" };
    private static final char[] INTERESTING_CHARACTERS = { '%', '_' };
    private static final int[] INTERESTING_TYPES = new int[] { Character.LOWERCASE_LETTER,
            Character.DECIMAL_DIGIT_NUMBER, Character.NON_SPACING_MARK, Character.SPACE_SEPARATOR };
    private static final String MAGIC = "IeG2Ut!3";
    private static Map<Character, String> lookup = buildLookup();
    private static Map<Character, String> alphaSortableMapping = alphaSortableMapping();

    private static Map<Character, String> buildLookup() {
        if (from.length != to.length) {
            throw new IllegalStateException();
        }

        Map<Character, String> _lookup = new HashMap<Character, String>();
        for (int i = 0; i < from.length; i++) {
            _lookup.put(from[i], to[i]);
        }

        return _lookup;
    }

private static Map<Character, String> alphaSortableMapping() {
    Map<Character, String> result = new HashMap<Character, String>();

    //  is not handled by normalization to NFKD form and so
    // we use artificial mapping:
    //  -> l + combining long stroke overlay
    result.put('', "l\u0336");

    return result;
}

    /**
     * Removes diacritics from a text.
     *
     * @param text Text to process.
     * @return Text without diacritics.
     */
    public static String removeDiacritics(String text) {
        if (text == null) {
            return null;
        }

        String tmp = Normalizer.normalize(text, Normalizer.Form.NFKD);

        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < tmp.length(); i++) {
            Character ch = tmp.charAt(i);
            if (Character.getType(ch) == Character.NON_SPACING_MARK) {
                continue;
            }

            if (lookup.containsKey(ch)) {
                builder.append(lookup.get(ch));
            } else {
                builder.append(ch);
            }
        }

        return builder.toString();
    }

    /**
     * Backwards-compatible removal of diacritics with optional escaping.
     *
     * @deprecated Use {@link #removeDiacritics(String)} for actual diacritics
     * removal or {@link #alphaSortable(String, boolean)} for sort key
     * generation.
     *
     * @param text Text to process.
     * @param escapeNonSpacingMarks If
     * <code>true</code> then the result of
     * <code>alphaSortable(text, false)</code> is returned, otherwise the result
     * of
     * <code>removeDiacritics(text)</code> is returned.
     * @return
     */
    @Deprecated
    public static String removeDiacritics(String text, boolean escapeNonSpacingMarks) {
        if (escapeNonSpacingMarks) {
            return alphaSortable(text, false);
        } else {
            return removeDiacritics(text);
        }
    }

    /**
     * Generates a sort key for a given text. This key is useful in environments
     * where only basic Latin characters are reliably sorted (for example, a
     * RDBMS with unknown collation settings).
     *
     * @param text Text to process.
     * @param idempotent Whether the conversion should be idempotent. This is
     * guaranteed to be true:
     * <code>alphaSortable(s, true).equals(alphaSortable(alphaSortable(s, true), true)</code>,
     * while this is not necessarily true:
     * <code>alphaSortable(s, false).equals(alphaSortable(alphaSortable(s, false), false)</code>.
     * @return
     */
    public static String alphaSortable(String text, boolean idempotent) {
        if (text == null) {
            return null;
        }

        if (idempotent && text.startsWith(MAGIC)) {
            return text;
        }

        String tmp = text.toLowerCase(Locale.ENGLISH);
        tmp = Normalizer.normalize(tmp, Normalizer.Form.NFKD);

        StringBuilder builder = new StringBuilder();
        if (idempotent) {
            builder.append(MAGIC);
        }

        boolean wasSpaceSeparator = false;
        for (int i = 0; i < tmp.length(); i++) {
            Character ch = tmp.charAt(i);
            if (!ArrayUtils.contains(INTERESTING_TYPES, Character.getType(ch))
                    && !ArrayUtils.contains(INTERESTING_CHARACTERS, ch)) {
                continue;
            }

            String s;

            // TODO quick fix of mantis 3231
            if (isSpaceSeparator(ch)) {
                if (wasSpaceSeparator) {
                    continue;
                }
                wasSpaceSeparator = true;
            } else {
                wasSpaceSeparator = false;
            }

            if (alphaSortableMapping.containsKey(ch)) {
                s = alphaSortableMapping.get(ch);
            } else if (lookup.containsKey(ch)) {
                s = lookup.get(ch);
            } else {
                s = ch.toString();
            }

            for (int j = 0; j < s.length(); j++) {
                Character c = s.charAt(j);
                // TODO Very ugly workaround of the problem described in 0002643
                if (ArrayUtils.contains(INTERESTING_CHARACTERS, c)) {
                    builder.append(c);
                } else {
                    builder.append(StringUtils.leftPad(Integer.toHexString(c.charValue()), 4, '0'));
                }
            }
        }

        return builder.toString();
    }

    private static boolean isSpaceSeparator(char ch) {
        return Character.SPACE_SEPARATOR == Character.getType(ch);
    }
}