Example usage for java.text.Normalizer.normalize

Introduction

This page collects example usages of java.text.Normalizer.normalize.

Prototype

public static String normalize(CharSequence src, Form form)

Source Link

Document

Normalize a sequence of char values.

Usage

From source file:org.silverpeas.core.util.StringUtil.java

/**
 * Returns the canonically composed form (NFC) of the given string, which is expected to be
 * encoded in UTF-8, so that the result contains only unified characters.
 * <p>Depending on the user's environment, data is sometimes sent with combined characters,
 * which can make the server misbehave, for instance by throwing an error on file download.</p>
 * @param string the string to normalize; may be {@code null}. No guarantee is given when the
 * string is not encoded in UTF-8.
 * @return the normalized string, or {@code null} when the given string is {@code null}.
 */
public static String normalize(final String string) {
    if (string == null) {
        return null;
    }
    return Normalizer.normalize(string, Normalizer.Form.NFC);
}

From source file:com.yahoo.semsearch.fastlinking.utils.Normalize.java

License:asdf

/**
 * Normalizes a string using regular expressions. Might not be the most efficient way to do this, but it is flexible.
 * It overwrites the string argument variable
 *
 * @param norm string to normalize/*from w  w w .j a va2 s  . c  o m*/
 * @return processed string
 */
public static String normalizeRegExp(String norm) {
    norm = SPACE.matcher(norm).replaceAll(SUBST_SPACE);
    norm = Normalizer.normalize(norm, Normalizer.Form.NFD);
    norm = DIACRITICS.matcher(norm).replaceAll(SUBST_EMPTY);
    return norm.toLowerCase().trim();
}

From source file:br.com.grupofortress.controller.Agendamentos.java

/**
 * Reduces a string to its ASCII characters.
 * <p>
 * The input is first decomposed (NFD), so an accented letter becomes its base letter followed by
 * combining marks; every character outside the ASCII range is then deleted.
 *
 * @param s the string to convert; must not be null
 * @return the decomposed string with all non-ASCII characters removed
 */
public static String formatString(String s) {
    return Normalizer.normalize(s, Normalizer.Form.NFD).replaceAll("[^\\p{ASCII}]", "");
}

From source file:org.lanes.utility.string.TextNormaliser.java

/**
 * Converts a fragment of lightly marked-up HTML into a list of plain-text lines.
 * <p>
 * The text is rewritten by an ordered chain of regular-expression substitutions (the order
 * matters, see the inline notes): emphasis-style tags become {@code [text]}, list items and
 * bullet lines become {@code {text}}, remaining layout tags become spaces, e-mail addresses and
 * URL-like tokens are masked, and diacritics are stripped. The result is then split on newlines.
 *
 * @param html the HTML fragment to clean; must not be null
 * @return the non-empty, trimmed lines of the cleaned text, in their original order
 */
public static List<String> cleanLightHTML(String html) {

    // Non-breaking-space entities become plain spaces.
    html = html.replaceAll("&nbsp;", " ");
    // Drop literal braces and brackets from the input; presumably so that the [..] and {..}
    // markers generated below cannot be confused with original content.
    html = html.replaceAll("[\\{\\}\\[\\]]", "");
    html = html.replaceAll("&amp;", "&");
    // A <div>..</div> pair is unwrapped and terminated with a newline.
    html = html.replaceAll("(?i)<div.*?>(.*?)<\\/div>", "$1\n");
    // Emphasis-like elements are rewritten as "[text] ".
    html = html.replaceAll("(?i)<strong.*?>(.*?)<\\/strong>", "[$1] ");
    html = html.replaceAll("(?i)<br\\/?>", "\n");//MUST COME BEFORE <b>
    html = html.replaceAll("(?i)<b.*?>(.*?)<\\/b>", "[$1] ");
    // The following three only match tags written without attributes.
    html = html.replaceAll("(?i)<em>(.*?)<\\/em>", "[$1] ");
    html = html.replaceAll("(?i)<i>(.*?)<\\/i>", "[$1] ");
    html = html.replaceAll("(?i)<u>(.*?)<\\/u>", "[$1] ");

    // Tidy the generated brackets: no whitespace just inside them, and no trailing colon
    // (with any whitespace before it) right before the closing bracket.
    html = html.replaceAll("[\\s\\n]+\\]", "]");
    html = html.replaceAll("\\[[\\s\\n]+", "[");
    html = html.replaceAll("[\\s]*:\\]", "]");
    // Opening and closing <ul>/<ol> tags are removed outright.
    html = html.replaceAll("(?i)<[\\/]?[uo]l.*?>", "");

    // List items become "{text}" lines: first items that are followed by another "<li>" tag,
    // then items that run up to the end of the line.
    html = html.replaceAll("(?i)<li.*?>(.+?)(?=<li>)", "{$1}\n");
    html = html.replaceAll("(?i)<li.*?>(.+?)\\n", "{$1}\n");
    // Whatever is left of these layout tags is replaced with a single space.
    html = html.replaceAll("(?i)<\\/li>", " ");
    html = html.replaceAll("(?i)<[\\/]?div.*?>", " ");
    html = html.replaceAll("(?i)<\\/?center>", " ");
    html = html.replaceAll("(?i)<\\/?p.*?>", " ");
    html = html.replaceAll("(?i)<\\/?li>", " ");
    html = html.replaceAll("(?i)<\\/?font.*?>", " ");
    html = html.replaceAll("(?i)<\\/?hr.*?>", " ");
    // Remove bracket pairs that ended up empty.
    html = html.replaceAll("\\[\\]", "");

    // Text following a middle dot (U+00B7) or bullet (U+2022), up to the next newline, is
    // treated like a list item and wrapped in braces.
    Pattern pattern = Pattern.compile("[\u00B7\u2022]\\s*(.+?)\n",
            (Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE));
    Matcher matcher = pattern.matcher(html);
    html = matcher.replaceAll("{$1}\n");

    // Drop a single whitespace character directly before a closing brace.
    html = html.replaceAll("\\s\\}", "}");

    // Mask e-mail addresses first: the URL pattern below would otherwise match their domain part.
    html = html.replaceAll("(?i)(?:[\\w\\.]+)@(?:[\\w]+\\.)+(?:[\\w]+)", "<EMAIL>");
    // Any dotted word sequence, with or without a leading "http://", is masked as a URL.
    html = html.replaceAll("(?i)(?:http:\\/\\/)?(?:[\\w]+\\.)+(?:[\\w]+)", "<URL>");
    // Remaining slashes (with surrounding whitespace) are turned into comma separators.
    html = html.replaceAll("\\s*\\/\\s*", ", ");

    //html = html.replaceAll("\\s+", " ");

    // Decompose to NFD so accents become separate combining marks, then delete those marks.
    html = Normalizer.normalize(html, Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "");

    // Split into lines, keeping only those that are non-empty after trimming.
    List<String> lineobj = new ArrayList<String>();
    String[] lines = html.split("\\n");
    for (String line : lines) {
        line = line.trim();
        if (!line.equals("")) {
            lineobj.add(line);
        }
    }

    return lineobj;
}

From source file:org.cellcore.code.engine.page.extractor.AbstractEditionsExtractor.java

/**
 * Resolves a raw edition name against the keys of {@code getEditions()}.
 * <p>
 * Names containing "foil" (case-insensitively) are rejected. Otherwise leading and trailing
 * spaces are removed, and both the name and each key are decomposed (NFD), lower-cased and
 * reduced to ASCII before being compared. A key matches when all of the following hold:
 * <ul>
 * <li>one normalized string contains the other, or their Levenshtein distance is at most 2;</li>
 * <li>the normalized name is longer than 3 characters;</li>
 * <li>if the normalized name contains digits, they equal the digits of the normalized key.</li>
 * </ul>
 *
 * @param name the raw edition name; must not be null
 * @return the value mapped to the first matching key, or {@code null} when the name mentions
 *         "foil" or no key matches
 */
protected String checkList(String name) {
    if (name.toLowerCase().contains("foil")) {
        return null;
    }

    // The candidate name is the same for every key, so normalize it (and pull out its digits)
    // once up front instead of on each loop iteration.
    final String stripped = name.replaceAll("(^( )+|( )+$)", "");
    final String iname = Normalizer.normalize(stripped, Normalizer.Form.NFD).toLowerCase()
            .replaceAll("[^\\p{ASCII}]", "");
    final String inameDigits = iname.replaceAll("\\D+", "");
    final boolean longEnough = iname.length() > 3;

    for (String key : getEditions().keySet()) {
        final String kname = Normalizer.normalize(key, Normalizer.Form.NFD).toLowerCase()
                .replaceAll("[^\\p{ASCII}]", "");

        // A name without digits places no constraint; otherwise the digit sequences must agree.
        final boolean digitsAgree = inameDigits.isEmpty()
                || inameDigits.equals(kname.replaceAll("\\D+", ""));
        final boolean similar = kname.contains(iname) || iname.contains(kname)
                || StringUtils.getLevenshteinDistance(iname, kname) <= 2;

        if (similar && longEnough && digitsAgree) {
            final String edition = getEditions().get(key);
            logger.info("Found " + iname + " " + edition);
            return edition;
        }
    }
    return null;
}

From source file:com.sp.keyword_generator.Keyword.java

/**
 * Prepares a string for keyword use: the lower-cased input is run repeatedly through the
 * {@code InputReplace} to {@code OutputReplace} substitutions, escaped for SQL, lower-cased
 * again and returned in decomposed (NFD) form.
 * <p>
 * NOTE(review): this method only decomposes accented characters; it does not delete the
 * resulting combining marks itself. Presumably the replacement tables take care of accents;
 * confirm against their definitions.
 *
 * @param s the string to process; must not be null
 * @return the processed string in NFD form
 */
public static String stripAccents(String s) {
    final String replaced = StringUtils.replaceEachRepeatedly(s.toLowerCase(), InputReplace, OutputReplace);
    final String escaped = StringEscapeUtils.escapeSql(replaced);
    return Normalizer.normalize(escaped.toLowerCase(), Normalizer.Form.NFD);
}

From source file:mitm.common.dlp.impl.TextNormalizerImpl.java

/**
 * Copies the words read from {@code input} to {@code output} in normalized form.
 * <p>
 * Each word is trimmed, lower-cased and Unicode-normalized to NFC, then written followed by a
 * single space. Blank words are dropped, as are words rejected by {@code wordSkipper} when one
 * is set.
 *
 * @param input the text to read words from; must not be null
 * @param output the destination for the normalized words; must not be null
 * @throws IOException if reading from the input or writing to the output fails
 */
@Override
public void normalize(Reader input, Writer output) throws IOException {
    Check.notNull(input, "input");
    Check.notNull(output, "output");

    final WordIterator words = new WordIterator(input);

    for (String raw = words.nextWord(); raw != null; raw = words.nextWord()) {
        final String trimmed = StringUtils.trimToNull(raw);
        if (trimmed == null) {
            continue;
        }

        // Unicode normalize the word so that it has exactly one representation.
        final String normalized = Normalizer.normalize(trimmed.toLowerCase(), Normalizer.Form.NFC);

        final boolean skipped = wordSkipper != null && wordSkipper.isSkip(normalized);
        if (!skipped) {
            output.append(normalized).append(' ');
        }
    }
}

From source file:io.stallion.utils.GeneralUtils.java

/**
 * Builds a URL-style slug from the given text.
 * <p>
 * Matches of {@code WHITESPACE} are replaced with a hyphen, the text is decomposed to NFD,
 * matches of {@code NONLATIN} are replaced with a hyphen, matches of {@code MULTIHYPHENS} are
 * replaced with a single hyphen, and the result is lower-cased using the English locale.
 * The intent is a string made only of hyphens, lower-case letters and numbers; the exact
 * character set depends on those patterns, which are defined outside this method.
 *
 * @param input the text to convert; must not be null
 * @return the slug
 */
public static String slugify(String input) {
    String slug = WHITESPACE.matcher(input).replaceAll("-");
    slug = Normalizer.normalize(slug, Normalizer.Form.NFD);
    slug = NONLATIN.matcher(slug).replaceAll("-");
    slug = MULTIHYPHENS.matcher(slug).replaceAll("-");
    return slug.toLowerCase(Locale.ENGLISH);
}

From source file:org.openo.sdnhub.overlayvpndriver.common.util.IpAddressUtil.java

/**
 * Check valid IP address<br/>/*  w  ww. j  av  a2 s.c  o  m*/
 *
 * @param ipAddr IP Address
 * @return boolean result for IP validation
 * @since SDNHUB 0.5
 */
public static boolean isValidIpAddr(String ipAddr) {
    String regex = "^(25[0-5]|2[0-4]\\d?|1\\d{2}|[1-9]\\d?|0)\\.(25[0-5]|2[0-4]\\d?|1\\d{2}|[1-9]\\d?|0)\\.(25[0-5]|2[0-4]\\d?|1\\d{2}|[1-9]\\d?|0)\\.(25[0-5]|2[0-4]\\d?|1\\d{2}|[1-9]\\d?|0)$";

    if (StringUtils.isEmpty(ipAddr)) {
        LOGGER.error("invalid ip addresss: " + ipAddr);
        return false;
    }

    Pattern pattern = Pattern.compile(regex);
    Matcher matcher = pattern.matcher(Normalizer.normalize(ipAddr, Form.NFKC));

    boolean match = matcher.matches();
    if (!match) {
        LOGGER.error("invalid ip addresss:" + ipAddr);
    }

    return match;
}

From source file:pl.edu.icm.coansys.commons.java.DiacriticsRemover.java

/**
 * Removes diacritics from a text.
 * <p>
 * The text is decomposed with compatibility decomposition (NFKD). Characters of type
 * {@link Character#NON_SPACING_MARK} are then dropped, characters present in {@code lookup} are
 * replaced with their mapped value, and all other characters are kept unchanged.
 *
 * @param text Text to process.
 * @return Text without diacritics, or {@code null} if {@code text} is {@code null}.
 */
public static String removeDiacritics(String text) {
    if (text == null) {
        return null;
    }

    final String decomposed = Normalizer.normalize(text, Normalizer.Form.NFKD);
    final StringBuilder result = new StringBuilder();

    for (final char current : decomposed.toCharArray()) {
        if (Character.getType(current) != Character.NON_SPACING_MARK) {
            if (!lookup.containsKey(current)) {
                result.append(current);
            } else {
                result.append(lookup.get(current));
            }
        }
    }

    return result.toString();
}