List of usage examples for java.text.Normalizer.normalize
public static String normalize(CharSequence src, Form form)
From source file:nl.utwente.bigdata.bolts.NormalizerBolt.java
@Override public void execute(Tuple tuple, BasicOutputCollector collector) { Status tweet;// ww w . j a v a 2 s . c o m tweet = (Status) tuple.getValueByField("tweet"); // from: http://stackoverflow.com/questions/1008802/converting-symbols-accent-letters-to-english-alphabet Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+"); String nfdNormalizedString = ""; nfdNormalizedString = Normalizer.normalize(tweet.getText(), Normalizer.Form.NFD); String normalizedTweet = (String) pattern.matcher(nfdNormalizedString.toLowerCase()).replaceAll("") .replace("\n", "").replace("\r", ""); // Also remove prefixed with rt if (!normalizedTweet.startsWith("rt")) { collector.emit(new Values(tweet, normalizedTweet, tweet.getLang())); } }
From source file:org.uiautomation.ios.server.utils.PlistFileUtils.java
/**
 * Loads the content of a file into a JSON object.
 *
 * <p>The file is read as UTF-8, the resulting text is Unicode-normalized with the
 * form held in {@code LanguageDictionary.norme}, and the normalized text is handed
 * to the {@code JSONObject} constructor.
 *
 * @param from the file to read
 * @return the JSON object built from the normalized file content
 * @throws Exception if the file cannot be opened or read, or if building the
 *                   {@code JSONObject} fails
 */
private JSONObject readJSONFile(File from) throws Exception {
    StringWriter writer = new StringWriter();
    // try-with-resources guarantees the stream is closed even when the copy fails
    try (FileInputStream is = new FileInputStream(from)) {
        IOUtils.copy(is, writer, "UTF-8");
    }
    String content = Normalizer.normalize(writer.toString(), LanguageDictionary.norme);
    return new JSONObject(content);
}
From source file:com.geecko.QuickLyric.lyrics.Genius.java
/**
 * Builds the genius.com lyrics URL for the given artist and title and delegates the
 * download to {@code fromURL}.
 *
 * @param originalArtist artist name as supplied by the caller; passed on unchanged
 * @param originalTitle  track title as supplied by the caller; passed on unchanged
 * @return whatever {@code fromURL} returns for the generated URL
 */
@Reflection
public static Lyrics fromMetaData(String originalArtist, String originalTitle) {
    String urlArtist = geniusUrlPart(originalArtist);
    String urlTitle = geniusUrlPart(originalTitle);
    String url = String.format("http://genius.com/%s-%s-lyrics", urlArtist, urlTitle);
    return fromURL(url, originalArtist, originalTitle);
}

/** Converts an artist name or title into the dash-separated form used in the URL. */
private static String geniusUrlPart(String text) {
    String unaccented = Normalizer.normalize(text, Normalizer.Form.NFD)
            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    // '&' must become "and" BEFORE the filter below, which strips every '&' it sees.
    return unaccented.replace("&", "and")
            // keep only ASCII letters, digits, whitespace and '+'
            .replaceAll("[^a-zA-Z0-9\\s+]", "")
            .trim()
            // each single whitespace character or '+' becomes one dash
            .replaceAll("[\\s+]", "-");
}
From source file:nl.utwente.bigdata.PlayersTweets.java
/**
 * Strips accents from a string.
 *
 * <p>The input is decomposed to Unicode NFD and every run of characters from the
 * "Combining Diacritical Marks" block is removed. Characters without a canonical
 * decomposition are left untouched.
 *
 * @param str the text to process
 * @return {@code str} without combining diacritical marks
 */
public static String deAccent(String str) {
    return Pattern.compile("\\p{InCombiningDiacriticalMarks}+")
            .matcher(Normalizer.normalize(str, Normalizer.Form.NFD))
            .replaceAll("");
}
From source file:com.beligum.core.utils.Toolkit.java
/**
 * Reduces a string to its ASCII characters.
 *
 * <p>The input is decomposed to Unicode NFD first, so an accented letter contributes
 * its ASCII base letter; afterwards every non-ASCII character is deleted.
 *
 * @param input the text to reduce
 * @return the NFD-decomposed input with all non-ASCII characters removed
 */
public static String normalizeString(String input) {
    final String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD);
    final String asciiOnly = decomposed.replaceAll("[^\\p{ASCII}]", "");
    return asciiOnly;
}
From source file:net.sf.sprockets.database.sqlite.SQLite.java
/** * Remove diacritics from the string and convert it to upper case. *///from w w w . j a v a 2s . c o m public static String normalise(String s) { if (sDiacritics == null) { sDiacritics = Pattern.compile("\\p{InCombiningDiacriticalMarks}+"); } return sDiacritics.matcher(Normalizer.normalize(s, NFD)).replaceAll("").toUpperCase(US); }
From source file:org.drugis.addis.presentation.SMAASerializer.java
/**
 * Turns arbitrary text into a lower-case slug.
 *
 * <p>Every match of {@code WHITESPACE} is replaced by a dash, the result is
 * decomposed to Unicode NFD, every match of {@code NONLATIN} is removed, and the
 * remainder is lower-cased with {@link Locale#ENGLISH}.
 *
 * @param input the text to convert
 * @return the slug derived from {@code input}
 */
public static String toSlug(String input) {
    final String dashed = WHITESPACE.matcher(input).replaceAll("-");
    final String decomposed = Normalizer.normalize(dashed, Form.NFD);
    return NONLATIN.matcher(decomposed).replaceAll("").toLowerCase(Locale.ENGLISH);
}
From source file:com.github.bfour.fpliteraturecollector.service.FileStorageService.java
private String getFileNameForLiterature(Literature lit) { // take title, removing all special characters String name = Normalizer.normalize(lit.getTitle(), Normalizer.Form.NFD).replaceAll("[^\\p{ASCII}]", ""); name = name.replaceAll("[^A-z\\s]", ""); // remove unnecessary words name = name.replaceAll("\\sa\\s", " "); name = name.replaceAll("\\sthe\\s", " "); name = name.replaceAll("\\sA\\s", " "); name = name.replaceAll("\\sThe\\s", " "); // trim/*www . jav a2 s . c om*/ if (name.length() > 68) name = name.substring(0, 68); // add kind-of GUID name += "_" + Long.toHexString(new Date().getTime()); return name; }
From source file:com.joliciel.jochre.lexicon.DefaultLexiconWrapper.java
String toUpperCaseNoAccents(String string) { // decompose accents String decomposed = Normalizer.normalize(string, Form.NFD); // removing diacritics String removed = decomposed.replaceAll("\\p{InCombiningDiacriticalMarks}+", ""); String uppercase = removed.toUpperCase(JochreSession.getLocale()); return uppercase; }
From source file:com.joliciel.talismane.languageDetector.LanguageDetectorImpl.java
@Override public List<WeightedOutcome<Locale>> detectLanguages(String text) { MONITOR.startTask("detectLanguages"); try {//from w ww . j ava2 s. c o m if (LOG.isTraceEnabled()) { LOG.trace("Testing text: " + text); } text = text.toLowerCase(Locale.ENGLISH); text = Normalizer.normalize(text, Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", ""); List<FeatureResult<?>> featureResults = new ArrayList<FeatureResult<?>>(); for (LanguageDetectorFeature<?> feature : features) { RuntimeEnvironment env = this.featureService.getRuntimeEnvironment(); FeatureResult<?> featureResult = feature.check(text, env); if (featureResult != null) featureResults.add(featureResult); } if (LOG.isTraceEnabled()) { for (FeatureResult<?> result : featureResults) { LOG.trace(result.toString()); } } List<Decision<LanguageOutcome>> decisions = this.decisionMaker.decide(featureResults); if (LOG.isTraceEnabled()) { for (Decision<LanguageOutcome> decision : decisions) { LOG.trace(decision.getCode() + ": " + decision.getProbability()); } } List<WeightedOutcome<Locale>> results = new ArrayList<WeightedOutcome<Locale>>(); for (Decision<LanguageOutcome> decision : decisions) { Locale locale = Locale.forLanguageTag(decision.getOutcome().getCode()); results.add(new WeightedOutcome<Locale>(locale, decision.getProbability())); } return results; } finally { MONITOR.endTask(); } }