Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package Normalization; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.HashSet; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.net.URL; import java.nio.charset.Charset; import org.json.JSONException; import org.json.JSONObject; /** * * @author Behnam Rahdari - https://github.com/benrahdari */ public class TextNormalization { private final Set<String> stopWordsSet = new HashSet<>(); private String apiKey = ""; public void setApiKey(String apiKey) { this.apiKey = apiKey; } public String setStopWordList(String filePath) throws FileNotFoundException, IOException { FileInputStream fstream = new FileInputStream(filePath); try (BufferedReader br = new BufferedReader(new InputStreamReader(fstream))) { String strLine; while ((strLine = br.readLine()) != null) { this.stopWordsSet.add(strLine); } } return "StopWords list loaded successfully."; } public String removeEmojiFromString(String content) { String utf8tweet = ""; try { byte[] utf8Bytes = content.getBytes("UTF-8"); utf8tweet = new String(utf8Bytes, "UTF-8"); } catch (UnsupportedEncodingException e) { } Pattern unicodeOutliers = Pattern.compile( "[\ud83c\udc00-\ud83c\udfff]|[\ud83d\udc00-\ud83d\udfff]|[\u2600-\u27ff]", Pattern.UNICODE_CASE | Pattern.CANON_EQ | Pattern.CASE_INSENSITIVE); Matcher unicodeOutlierMatcher = unicodeOutliers.matcher(utf8tweet); utf8tweet = unicodeOutlierMatcher.replaceAll(""); return utf8tweet; } public String removeMentionsFromString(String content) { String utf8tweet = ""; try { byte[] utf8Bytes = content.getBytes("UTF-8"); utf8tweet = new String(utf8Bytes, "UTF-8"); } catch (UnsupportedEncodingException e) { } final String regex = "[@]\\w+"; final Pattern unicodeOutliers = Pattern.compile(regex, Pattern.MULTILINE | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); Matcher unicodeOutlierMatcher = unicodeOutliers.matcher(utf8tweet); utf8tweet = unicodeOutlierMatcher.replaceAll(""); return utf8tweet.replace("#", ""); } public String removeUrlsFromString(String content) { String utf8tweet = ""; try { byte[] utf8Bytes = content.getBytes("UTF-8"); utf8tweet = new String(utf8Bytes, "UTF-8"); } catch (UnsupportedEncodingException e) { } final String regex = "(https?|ftp|file|pic|www)[:|.][-A-Z0-9+&@#/%?=~_|!:,.;]*[-A-Z0-9+&@#/%=~_|]"; final Pattern unicodeOutliers = Pattern.compile(regex, Pattern.MULTILINE | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); Matcher unicodeOutlierMatcher = unicodeOutliers.matcher(utf8tweet); utf8tweet = unicodeOutlierMatcher.replaceAll(""); return utf8tweet; } public String removeSymbolsFromString(String content) { String utf8tweet = ""; try { byte[] utf8Bytes = content.getBytes("UTF-8"); utf8tweet = new String(utf8Bytes, "UTF-8"); } catch (UnsupportedEncodingException e) { } final String regex = "[\\./\\()\"':,.;<>~!$%^&*\\|+={}?\\-`1234567890_]"; final Pattern unicodeOutliers = Pattern.compile(regex, Pattern.MULTILINE | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); Matcher unicodeOutlierMatcher = unicodeOutliers.matcher(utf8tweet); utf8tweet = unicodeOutlierMatcher.replaceAll(" "); return utf8tweet; } public String removeSpacesFromString(String content) { String utf8tweet = ""; try { byte[] utf8Bytes = content.getBytes("UTF-8"); utf8tweet = new String(utf8Bytes, "UTF-8"); } catch (UnsupportedEncodingException e) { } final String regex = "\\s{2,}"; final Pattern unicodeOutliers = Pattern.compile(regex, Pattern.MULTILINE | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); Matcher unicodeOutlierMatcher = unicodeOutliers.matcher(utf8tweet); utf8tweet = unicodeOutlierMatcher.replaceAll(" "); return utf8tweet; } public String removeTwoLetterWordsFromString(String content) { String utf8tweet = ""; try { byte[] utf8Bytes = content.getBytes("UTF-8"); utf8tweet = new String(utf8Bytes, "UTF-8"); } catch (UnsupportedEncodingException e) { } final String regex = "((^|\\s)(\\w{1,2})(\\s|$))"; final Pattern unicodeOutliers = Pattern.compile(regex, Pattern.MULTILINE | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); Matcher unicodeOutlierMatcher = unicodeOutliers.matcher(utf8tweet); utf8tweet = unicodeOutlierMatcher.replaceAll(" "); return utf8tweet; } public String removeNonEnglishWordsFromString(String content) { String utf8tweet = ""; try { byte[] utf8Bytes = content.getBytes("UTF-8"); utf8tweet = new String(utf8Bytes, "UTF-8"); } catch (UnsupportedEncodingException e) { } final String regex = "[\\W]"; final Pattern unicodeOutliers = Pattern.compile(regex, Pattern.MULTILINE | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); Matcher unicodeOutlierMatcher = unicodeOutliers.matcher(utf8tweet); utf8tweet = unicodeOutlierMatcher.replaceAll(" "); return utf8tweet; } public String removeStopWordsFromString(String content) { String result = ""; String[] words = content.split(" "); ArrayList<String> wordsList = new ArrayList<>(); if (this.stopWordsSet.isEmpty()) { return "Stop word set not available !!!"; } else { for (String word : words) { String wordCompare = word.toUpperCase(); if (!stopWordsSet.contains(wordCompare)) { wordsList.add(word); } } // result = wordsList.stream().map((str) -> str + " ").reduce(result, String::concat); for (String str : wordsList) { result += str + " "; } return result; } } private static String readAll(Reader rd) throws IOException { StringBuilder sb = new StringBuilder(); int cp; while ((cp = rd.read()) != -1) { sb.append((char) cp); } return sb.toString(); } private static JSONObject readJsonFromUrl(String url) throws IOException, JSONException { try (InputStream is = new URL(url).openStream()) { BufferedReader rd = new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8"))); String jsonText = readAll(rd); JSONObject json = new JSONObject(jsonText); return json; } } public String detectLanguage(String content) throws IOException, JSONException { JSONObject json = readJsonFromUrl( "https://translate.yandex.net/api/v1.5/tr.json/detect?key=" + this.apiKey + "&text=" + content); //System.out.println(json.toString()); return (String) json.get("lang"); } public String translateString(String content) throws IOException, JSONException { JSONObject json = readJsonFromUrl("https://translate.yandex.net/api/v1.5/tr.json/translate?key=" + this.apiKey + "&text=" + content + "&lang=" + detectLanguage(content) + "-en&format=plain"); //System.out.println(json.toString()); String[] translate = json.toString().split("\""); return translate[5]; } public String StemString(String content) { String result = ""; String[] words = content.split(" "); WordStemming w = new WordStemming(); for (String word : words) { if (!(word.trim().isEmpty())) { result += w.steemWord(word) + " "; } } return result; } }