Usage examples for java.text.Normalizer.normalize
public static String normalize(CharSequence src, Form form)
From source file:org.silverpeas.core.util.StringUtil.java
/**
 * Returns the canonical composed (NFC) form of the given text, so that a base character
 * followed by combining marks is folded into a single unified character where one exists.
 * <p>Depending on the user's environment, data is sometimes received with combined
 * characters, which can make the server misbehave (for example, fail on file download).</p>
 * <p>The original contract states that the text is expected to come from UTF-8 encoded
 * data; no guarantee is given otherwise.</p>
 *
 * @param string the text to normalize; may be {@code null}.
 * @return the NFC form of {@code string}, or {@code null} when {@code string} is {@code null}.
 */
public static String normalize(final String string) {
  if (string == null) {
    return null;
  }
  return Normalizer.normalize(string, Normalizer.Form.NFC);
}
From source file:com.yahoo.semsearch.fastlinking.utils.Normalize.java
License:asdf
/** * Normalizes a string using regular expressions. Might not be the most efficient way to do this, but it is flexible. * It overwrites the string argument variable * * @param norm string to normalize/*from w w w .j a va2 s . c o m*/ * @return processed string */ public static String normalizeRegExp(String norm) { norm = SPACE.matcher(norm).replaceAll(SUBST_SPACE); norm = Normalizer.normalize(norm, Normalizer.Form.NFD); norm = DIACRITICS.matcher(norm).replaceAll(SUBST_EMPTY); return norm.toLowerCase().trim(); }
From source file:br.com.grupofortress.controller.Agendamentos.java
/**
 * Reduces a string to its ASCII characters after canonical decomposition (NFD).
 * Accented letters therefore keep their base letter (the combining mark is dropped), while
 * characters with no ASCII base are removed entirely.
 *
 * @param s the text to convert; must not be {@code null}
 * @return {@code s} decomposed to NFD with every non-ASCII character removed
 */
public static String formatString(String s) {
  final String decomposed = Normalizer.normalize(s, Normalizer.Form.NFD);
  final StringBuilder ascii = new StringBuilder(decomposed.length());
  for (int i = 0; i < decomposed.length(); i++) {
    final char c = decomposed.charAt(i);
    // ASCII is the range 0x00-0x7F; everything else (including surrogates) is dropped.
    if (c < 0x80) {
      ascii.append(c);
    }
  }
  return ascii.toString();
}
From source file:org.lanes.utility.string.TextNormaliser.java
public static List<String> cleanLightHTML(String html) { html = html.replaceAll(" ", " "); html = html.replaceAll("[\\{\\}\\[\\]]", ""); html = html.replaceAll("&", "&"); html = html.replaceAll("(?i)<div.*?>(.*?)<\\/div>", "$1\n"); html = html.replaceAll("(?i)<strong.*?>(.*?)<\\/strong>", "[$1] "); html = html.replaceAll("(?i)<br\\/?>", "\n");//MUST COME BEFORE <b> html = html.replaceAll("(?i)<b.*?>(.*?)<\\/b>", "[$1] "); html = html.replaceAll("(?i)<em>(.*?)<\\/em>", "[$1] "); html = html.replaceAll("(?i)<i>(.*?)<\\/i>", "[$1] "); html = html.replaceAll("(?i)<u>(.*?)<\\/u>", "[$1] "); html = html.replaceAll("[\\s\\n]+\\]", "]"); html = html.replaceAll("\\[[\\s\\n]+", "["); html = html.replaceAll("[\\s]*:\\]", "]"); html = html.replaceAll("(?i)<[\\/]?[uo]l.*?>", ""); html = html.replaceAll("(?i)<li.*?>(.+?)(?=<li>)", "{$1}\n"); html = html.replaceAll("(?i)<li.*?>(.+?)\\n", "{$1}\n"); html = html.replaceAll("(?i)<\\/li>", " "); html = html.replaceAll("(?i)<[\\/]?div.*?>", " "); html = html.replaceAll("(?i)<\\/?center>", " "); html = html.replaceAll("(?i)<\\/?p.*?>", " "); html = html.replaceAll("(?i)<\\/?li>", " "); html = html.replaceAll("(?i)<\\/?font.*?>", " "); html = html.replaceAll("(?i)<\\/?hr.*?>", " "); html = html.replaceAll("\\[\\]", ""); Pattern pattern = Pattern.compile("[\u00B7\u2022]\\s*(.+?)\n", (Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE)); Matcher matcher = pattern.matcher(html); html = matcher.replaceAll("{$1}\n"); html = html.replaceAll("\\s\\}", "}"); html = html.replaceAll("(?i)(?:[\\w\\.]+)@(?:[\\w]+\\.)+(?:[\\w]+)", "<EMAIL>"); html = html.replaceAll("(?i)(?:http:\\/\\/)?(?:[\\w]+\\.)+(?:[\\w]+)", "<URL>"); html = html.replaceAll("\\s*\\/\\s*", ", "); //html = html.replaceAll("\\s+", " "); html = Normalizer.normalize(html, Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", ""); List<String> lineobj = new ArrayList<String>(); String[] lines = html.split("\\n"); for (String line : lines) { line = line.trim();//from w ww.j a v a 2s . 
c o m if (!line.equals("")) { lineobj.add(line); } } return lineobj; }
From source file:org.cellcore.code.engine.page.extractor.AbstractEditionsExtractor.java
/**
 * Looks up the edition whose key best matches the given name.
 * <p>Names containing "foil" (case-insensitively) are rejected outright. Otherwise the name
 * and each key of {@code getEditions()} are compared after NFD decomposition, lower-casing
 * and removal of non-ASCII characters. A key matches when all of the following hold:</p>
 * <ul>
 * <li>one normalised name contains the other, or their Levenshtein distance is at most 2;</li>
 * <li>the normalised input is longer than 3 characters;</li>
 * <li>if the input contains digits, the key contains exactly the same digits.</li>
 * </ul>
 *
 * @param name the edition name to look up; must not be {@code null}
 * @return the value mapped to the first matching key, or {@code null} when nothing matches
 */
protected String checkList(String name) {
  if (name.toLowerCase().contains("foil")) {
    return null;
  }
  final String trimmed = name.replaceAll("(^( )+|( )+$)", "");

  // The candidate does not change while scanning the editions, so normalise it (and
  // extract its digits) once instead of once per key.
  final String iname = Normalizer.normalize(trimmed, Normalizer.Form.NFD).toLowerCase()
      .replaceAll("[^\\p{ASCII}]", "");
  final String inameDigits = iname.replaceAll("\\D+", "");

  for (String key : getEditions().keySet()) {
    final String kname = Normalizer.normalize(key, Normalizer.Form.NFD).toLowerCase()
        .replaceAll("[^\\p{ASCII}]", "");
    final int distance = StringUtils.getLevenshteinDistance(iname, kname);

    // A name without digits places no constraint on the key; a name with digits requires
    // the key to carry the very same digit string.
    final boolean numeralsAgree =
        inameDigits.isEmpty() || inameDigits.equals(kname.replaceAll("\\D+", ""));
    final boolean similar = kname.contains(iname) || iname.contains(kname) || distance <= 2;

    if (similar && iname.length() > 3 && numeralsAgree) {
      logger.info("Found " + iname + " " + getEditions().get(key));
      return getEditions().get(key);
    }
  }
  return null;
}
From source file:com.sp.keyword_generator.Keyword.java
/**
 * Prepares a keyword: lower-cases it, applies the {@code InputReplace} to
 * {@code OutputReplace} substitutions repeatedly, SQL-escapes the result, lower-cases it
 * again and decomposes it to NFD.
 * <p>Note that this method only decomposes accented characters; the combining marks
 * produced by the decomposition are not removed here.</p>
 *
 * @param s the text to process; must not be {@code null}
 * @return the processed text in NFD form
 */
public static String stripAccents(String s) {
  final String replaced =
      StringUtils.replaceEachRepeatedly(s.toLowerCase(), InputReplace, OutputReplace);
  final String escaped = StringEscapeUtils.escapeSql(replaced);
  return Normalizer.normalize(escaped.toLowerCase(), Normalizer.Form.NFD);
}
From source file:mitm.common.dlp.impl.TextNormalizerImpl.java
/**
 * Copies the words read from {@code input} to {@code output}, each lower-cased, converted
 * to NFC and followed by a single space. Words that are blank after trimming, and words
 * rejected by the configured {@code wordSkipper} (when one is set), are left out.
 *
 * @param input source of the text; must not be {@code null}
 * @param output destination of the normalised words; must not be {@code null}
 * @throws IOException if reading the input or writing the output fails
 */
@Override
public void normalize(Reader input, Writer output) throws IOException {
  Check.notNull(input, "input");
  Check.notNull(output, "output");

  final WordIterator words = new WordIterator(input);

  for (String raw = words.nextWord(); raw != null; raw = words.nextWord()) {
    final String trimmed = StringUtils.trimToNull(raw);
    if (trimmed == null) {
      continue;
    }
    // Unicode-normalise so that every word has exactly one representation.
    final String canonical = Normalizer.normalize(trimmed.toLowerCase(), Normalizer.Form.NFC);
    if (wordSkipper != null && wordSkipper.isSkip(canonical)) {
      continue;
    }
    output.append(canonical).append(' ');
  }
}
From source file:io.stallion.utils.GeneralUtils.java
/**
 * Builds a slug from the given text: a lower-case string intended to contain only hyphens,
 * letters and numbers.
 * <p>Every {@code WHITESPACE} match becomes a hyphen, the text is decomposed to NFD, every
 * {@code NONLATIN} match becomes a hyphen, each {@code MULTIHYPHENS} match is collapsed to
 * a single hyphen, and the result is lower-cased using the English locale.</p>
 *
 * @param input the text to convert; must not be {@code null}
 * @return the slug
 */
public static String slugify(String input) {
  final String hyphenated = WHITESPACE.matcher(input).replaceAll("-");
  final String decomposed = Normalizer.normalize(hyphenated, Normalizer.Form.NFD);
  final String latinOnly = NONLATIN.matcher(decomposed).replaceAll("-");
  final String collapsed = MULTIHYPHENS.matcher(latinOnly).replaceAll("-");
  return collapsed.toLowerCase(Locale.ENGLISH);
}
From source file:org.openo.sdnhub.overlayvpndriver.common.util.IpAddressUtil.java
/** * Check valid IP address<br/>/* w ww. j av a2 s.c o m*/ * * @param ipAddr IP Address * @return boolean result for IP validation * @since SDNHUB 0.5 */ public static boolean isValidIpAddr(String ipAddr) { String regex = "^(25[0-5]|2[0-4]\\d?|1\\d{2}|[1-9]\\d?|0)\\.(25[0-5]|2[0-4]\\d?|1\\d{2}|[1-9]\\d?|0)\\.(25[0-5]|2[0-4]\\d?|1\\d{2}|[1-9]\\d?|0)\\.(25[0-5]|2[0-4]\\d?|1\\d{2}|[1-9]\\d?|0)$"; if (StringUtils.isEmpty(ipAddr)) { LOGGER.error("invalid ip addresss: " + ipAddr); return false; } Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(Normalizer.normalize(ipAddr, Form.NFKC)); boolean match = matcher.matches(); if (!match) { LOGGER.error("invalid ip addresss:" + ipAddr); } return match; }
From source file:pl.edu.icm.coansys.commons.java.DiacriticsRemover.java
/**
 * Removes diacritics from a text.
 * <p>The text is decomposed to NFKD; every character of type
 * {@link Character#NON_SPACING_MARK} is then dropped, and every remaining character that is
 * a key of {@code lookup} is replaced by its mapped value.</p>
 *
 * @param text Text to process; may be {@code null}.
 * @return Text without diacritics, or {@code null} when {@code text} is {@code null}.
 */
public static String removeDiacritics(String text) {
  if (text == null) {
    return null;
  }

  final String decomposed = Normalizer.normalize(text, Normalizer.Form.NFKD);
  final StringBuilder out = new StringBuilder();

  for (int pos = 0; pos < decomposed.length(); pos++) {
    final Character current = decomposed.charAt(pos);
    if (Character.getType(current) != Character.NON_SPACING_MARK) {
      if (lookup.containsKey(current)) {
        out.append(lookup.get(current));
      } else {
        out.append(current);
      }
    }
  }
  return out.toString();
}