Normalization.TextNormalization.java Source code

Introduction

Here is the source code for Normalization.TextNormalization.java.
Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package Normalization;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.json.JSONException;
import org.json.JSONObject;

/**
 * Helpers for cleaning tweet-like text: stripping emoji, mentions, URLs,
 * symbols, short words and stop words, plus language detection and
 * translation through the Yandex Translate HTTP API and stemming through
 * {@code WordStemming}.
 *
 * <p>All {@code remove...} methods expect a non-null argument and throw
 * {@link NullPointerException} otherwise.
 *
 * @author Behnam Rahdari - https://github.com/benrahdari
 */
public class TextNormalization {

    /** Flags applied to every text-cleaning pattern except the emoji one. */
    private static final int TEXT_FLAGS =
            Pattern.MULTILINE | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    /**
     * Pictographic code points U+1F000..U+1FAFF, the U+2600..U+27FF symbol
     * range and the U+FE0F variation selector that trails many emoji. No case
     * or canonical-equivalence flags: none of these characters has a case
     * mapping or a canonical decomposition.
     */
    private static final Pattern EMOJI = Pattern.compile(
            "[\\x{1F000}-\\x{1FAFF}\\u2600-\\u27FF\\uFE0F]");

    /** An {@code @} followed by one or more word characters. */
    private static final Pattern MENTION = Pattern.compile("[@]\\w+", TEXT_FLAGS);

    /**
     * Link-like tokens. No word boundary is required before the scheme, so a
     * link glued to the preceding word is still removed.
     */
    private static final Pattern URL_LIKE = Pattern.compile(
            "(https?|ftp|file|pic|www)[:|.][-A-Z0-9+&@#/%?=~_|!:,.;]*[-A-Z0-9+&@#/%=~_|]",
            TEXT_FLAGS);

    /** Punctuation, ASCII digits and the underscore. */
    private static final Pattern SYMBOL_OR_DIGIT = Pattern.compile(
            "[\\./\\()\"':,.;<>~!$%^&*\\|+={}?\\-`1234567890_]", TEXT_FLAGS);

    /** Two or more consecutive whitespace characters. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s{2,}", TEXT_FLAGS);

    /**
     * A run of one or more 1-2 character words together with the separators
     * around it. Matching the whole run (instead of one word plus both of its
     * separators) is what lets consecutive short words all be removed: a
     * per-word match consumes the separator the next word would need.
     */
    private static final Pattern SHORT_WORD_RUN = Pattern.compile(
            "(?:^|\\s)\\w{1,2}(?:\\s\\w{1,2})*(?:\\s|$)", TEXT_FLAGS);

    /** Any character that is not an ASCII letter, digit or underscore. */
    private static final Pattern NON_WORD_CHAR = Pattern.compile("[\\W]", TEXT_FLAGS);

    /** Stop words, stored trimmed and upper-cased (see {@link #setStopWordList}). */
    private final Set<String> stopWordsSet = new HashSet<>();

    /** Key sent with every Yandex Translate request; empty until set. */
    private String apiKey = "";

    /**
     * Sets the API key used by {@link #detectLanguage} and {@link #translateString}.
     *
     * @param apiKey the key to send with each request
     */
    public void setApiKey(String apiKey) {
        this.apiKey = apiKey;
    }

    /**
     * Loads stop words from a text file, one word per line, and adds them to
     * the words already loaded. The file is read as UTF-8; each line is trimmed
     * and upper-cased so that it matches the upper-cased lookup performed by
     * {@link #removeStopWordsFromString} regardless of the file's letter case.
     *
     * @param filePath path of the stop-word file
     * @return a fixed confirmation message
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading fails
     */
    public String setStopWordList(String filePath) throws FileNotFoundException, IOException {
        try (InputStream in = new FileInputStream(filePath);
                BufferedReader br = new BufferedReader(
                        new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                this.stopWordsSet.add(line.trim().toUpperCase(Locale.ROOT));
            }
        }
        return "StopWords list loaded successfully.";
    }

    /**
     * Removes emoji and pictographic symbols.
     *
     * @param content text to clean
     * @return {@code content} without the matched characters
     */
    public String removeEmojiFromString(String content) {
        return EMOJI.matcher(utf8RoundTrip(content)).replaceAll("");
    }

    /**
     * Removes {@code @mention} tokens and every {@code #} character (the
     * hashtag word itself is kept).
     *
     * @param content text to clean
     * @return the cleaned text
     */
    public String removeMentionsFromString(String content) {
        return MENTION.matcher(utf8RoundTrip(content)).replaceAll("").replace("#", "");
    }

    /**
     * Removes link-like tokens starting with {@code http}, {@code https},
     * {@code ftp}, {@code file}, {@code pic} or {@code www}.
     *
     * @param content text to clean
     * @return the cleaned text
     */
    public String removeUrlsFromString(String content) {
        return URL_LIKE.matcher(utf8RoundTrip(content)).replaceAll("");
    }

    /**
     * Replaces each punctuation character, ASCII digit and underscore with a
     * single space.
     *
     * @param content text to clean
     * @return the cleaned text
     */
    public String removeSymbolsFromString(String content) {
        return SYMBOL_OR_DIGIT.matcher(utf8RoundTrip(content)).replaceAll(" ");
    }

    /**
     * Collapses every run of two or more whitespace characters into one space.
     *
     * @param content text to clean
     * @return the cleaned text
     */
    public String removeSpacesFromString(String content) {
        return WHITESPACE_RUN.matcher(utf8RoundTrip(content)).replaceAll(" ");
    }

    /**
     * Removes words of one or two word characters. Each run of such words,
     * together with the separators around it, is replaced by one space, so
     * {@code "I am at home"} becomes {@code " home"}.
     *
     * @param content text to clean
     * @return the cleaned text
     */
    public String removeTwoLetterWordsFromString(String content) {
        return SHORT_WORD_RUN.matcher(utf8RoundTrip(content)).replaceAll(" ");
    }

    /**
     * Replaces every character that is not an ASCII letter, digit or
     * underscore with a space.
     *
     * @param content text to clean
     * @return the cleaned text
     */
    public String removeNonEnglishWordsFromString(String content) {
        return NON_WORD_CHAR.matcher(utf8RoundTrip(content)).replaceAll(" ");
    }

    /**
     * Drops every space-separated token whose upper-cased form is in the
     * loaded stop-word set. Kept tokens are re-joined with a single space and
     * the result ends with a trailing space.
     *
     * @param content text to filter
     * @return the filtered text, or the literal message
     *         {@code "Stop word set not available !!!"} when no stop words
     *         have been loaded
     */
    public String removeStopWordsFromString(String content) {
        String[] words = content.split(" ");
        if (this.stopWordsSet.isEmpty()) {
            return "Stop word set not available !!!";
        }
        StringBuilder kept = new StringBuilder(content.length() + 1);
        for (String word : words) {
            // Locale.ROOT keeps the lookup independent of the JVM's default locale.
            if (!this.stopWordsSet.contains(word.toUpperCase(Locale.ROOT))) {
                kept.append(word).append(' ');
            }
        }
        return kept.toString();
    }

    /**
     * Re-encodes the text through UTF-8. Well-formed text comes back unchanged;
     * code units that cannot be encoded (such as unpaired surrogates) are
     * replaced by the charset's replacement character.
     */
    private static String utf8RoundTrip(String content) {
        return new String(content.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8);
    }

    /** Percent-encodes a value for use inside a URL query string. */
    private static String urlEncode(String value) throws UnsupportedEncodingException {
        return URLEncoder.encode(value, "UTF-8");
    }

    /** Reads the reader to exhaustion and returns everything as one string. */
    private static String readAll(Reader rd) throws IOException {
        StringBuilder sb = new StringBuilder();
        char[] buffer = new char[4096];
        int read;
        while ((read = rd.read(buffer)) != -1) {
            sb.append(buffer, 0, read);
        }
        return sb.toString();
    }

    /** Fetches the URL, decodes the body as UTF-8 and parses it as a JSON object. */
    private static JSONObject readJsonFromUrl(String url) throws IOException, JSONException {
        try (InputStream is = new URL(url).openStream();
                Reader rd = new BufferedReader(
                        new InputStreamReader(is, StandardCharsets.UTF_8))) {
            return new JSONObject(readAll(rd));
        }
    }

    /**
     * Asks the Yandex Translate {@code detect} endpoint for the language of
     * the text. The text and key are URL-encoded here, so pass raw text.
     *
     * @param content raw (not URL-encoded) text
     * @return the value of the response's {@code lang} field
     * @throws IOException if the request fails
     * @throws JSONException if the response is not the expected JSON
     */
    public String detectLanguage(String content) throws IOException, JSONException {
        JSONObject json = readJsonFromUrl(
                "https://translate.yandex.net/api/v1.5/tr.json/detect?key="
                + urlEncode(this.apiKey) + "&text=" + urlEncode(content));
        return json.getString("lang");
    }

    /**
     * Translates the text to English through the Yandex Translate
     * {@code translate} endpoint, using {@link #detectLanguage} to pick the
     * source language. The text and key are URL-encoded here, so pass raw text.
     *
     * @param content raw (not URL-encoded) text
     * @return the first element of the response's {@code text} array
     * @throws IOException if either request fails
     * @throws JSONException if a response is not the expected JSON
     */
    public String translateString(String content) throws IOException, JSONException {
        String sourceLang = detectLanguage(content);
        JSONObject json = readJsonFromUrl(
                "https://translate.yandex.net/api/v1.5/tr.json/translate?key="
                + urlEncode(this.apiKey) + "&text=" + urlEncode(content)
                + "&lang=" + urlEncode(sourceLang) + "-en&format=plain");
        // Read the field by name: splitting the serialized JSON on quotes depends
        // on key order and breaks when the translation itself contains a quote.
        return json.getJSONArray("text").getString(0);
    }

    /**
     * Stems every non-blank space-separated token with {@code WordStemming}.
     * Stemmed tokens are joined with a single space and the result ends with a
     * trailing space.
     *
     * @param content text to stem
     * @return the stemmed text
     */
    public String StemString(String content) {
        String[] words = content.split(" ");
        WordStemming stemmer = new WordStemming();
        StringBuilder stemmed = new StringBuilder(content.length() + 1);
        for (String word : words) {
            if (!word.trim().isEmpty()) {
                stemmed.append(stemmer.steemWord(word)).append(' ');
            }
        }
        return stemmed.toString();
    }
}