Java String Accent stripAccents(String input)

Description

Removes diacritics (~= accents) from a string.

License

Apache License

Parameter

Parameter	Description
input	String to be stripped

Return

The input text with diacritics removed, or null if the input is null.

Declaration




public static String stripAccents(String input)

Method Source Code

//package com.java2s;
//License from project: Apache License 

import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;

import java.util.regex.Pattern;

public class Main {
    /**
     * Matches runs of characters from the Unicode "Combining Diacritical
     * Marks" block. After canonical decomposition (NFD) an accented letter
     * is a base letter followed by such marks, so deleting every match
     * leaves only the base letters. Compiled once because {@link Pattern}
     * instances are immutable and safe to share.
     */
    private static final Pattern DIACRITICS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /**
     * <p>
     * Removes diacritics (~= accents) from a string. The case will not be
     * altered.
     * </p>
     * <p>
     * For instance, '&agrave;' will be replaced by 'a'.
     * </p>
     * <p>
     * Note that ligatures will be left as is: only characters that decompose
     * into a base character plus combining diacritical marks are affected.
     * </p>
     * <p>
     * The implementation decomposes the input with
     * {@link java.text.Normalizer} (form NFD) and then deletes every
     * combining diacritical mark. Earlier revisions tried to locate a
     * normalizer through reflection, but the availability flags were never
     * initialised, so every non-null input ended in an
     * {@code UnsupportedOperationException}; calling the JDK class directly
     * removes that failure mode.
     * </p>
     *
     * <pre>
     * stripAccents(null)                = null
     * stripAccents("")                  = ""
     * stripAccents("control")           = "control"
     * stripAccents("&eacute;clair")     = "eclair"
     * </pre>
     *
     * @param input
     *            String to be stripped, may be {@code null}
     * @return input text with diacritics removed, or {@code null} if
     *         {@code input} is {@code null}
     */
    public static String stripAccents(String input) {
        if (input == null) {
            return null;
        }
        // NFD splits precomposed characters (e.g. U+00E9) into base letter + combining mark(s).
        String decomposed = java.text.Normalizer.normalize(input, java.text.Normalizer.Form.NFD);
        // Drop the combining marks; everything else is left untouched.
        return DIACRITICS.matcher(decomposed).replaceAll("");
    }
}

Java String Accent stripAccents(String input)

Description

License

Parameter

Return

Declaration

Method Source Code

Related