com.mythesis.userbehaviouranalysis.WebParser.java Source code

Introduction

Here is the source code for com.mythesis.userbehaviouranalysis.WebParser.java
Source

/*
 * Copyright 2015 Kostas Papangelou.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.mythesis.userbehaviouranalysis;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.json.simple.parser.*;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;

/**
 * get a url's content and apply postprocessing (removing stopwords, common internet words etc.)
 * @author Kostas Papangelou
 */
public class WebParser {

    private String pageContent = "";
    private String url;

    public WebParser(String url) {
        this.url = url;
    }

    /**
     * Get the text content of a url cleaned from stopwords,symbols etc. and lemmatized
     */
    public void parse() {

        if (url.contains("http://www.youtube.com/watch?")) {
            String ventry = url.substring(31);
            pageContent = GetYoutubeDetails(ventry).replace("\n", "").replace("\r", "");
        } else
            pageContent = cleanhtml(url);

        if (!"".equals(pageContent) && pageContent != null) {
            pageContent = removeChars(pageContent);
            pageContent = removeStopwords(pageContent);
            pageContent = removeCommonInternetWords(pageContent);
            pageContent = removeDomainWords(pageContent);
            pageContent = removeChars(pageContent);
            Lemmatizer lemmatizer = new Lemmatizer();
            List<String> contentList = lemmatizer.lemmatize(pageContent);
            pageContent = "";
            for (String contentListItem : contentList) {
                pageContent = pageContent + " " + contentListItem;
            }
        }
    }

    /**
     * Parse the url and get all the content
     * @param link the url to parse
     * @return The content parsed
     */
    private String cleanhtml(String link) {
        try {
            Document doc = Jsoup.connect(link).timeout(10 * 1000).get();
            String title = doc.title();
            String mainbody = doc.body().text();
            Elements links = doc.select("a[href]");
            Elements media = doc.select("[src]");
            //fix link html to remove https:// or http:// and simple /
            if (link.substring(link.length() - 1, link.length()).equalsIgnoreCase("/")) {
                link = link.substring(0, link.length() - 1);
            }
            if (link.substring(0, 5).equalsIgnoreCase("https")) {
                link = link.substring(8);
            } else if (link.substring(0, 4).equalsIgnoreCase("http")) {
                link = link.substring(7);
            }
            String anchortext = "";
            String alttext = "";
            //-----get the anchor text of internal links
            for (Element el : links) {
                String str_check = el.attr("abs:href");
                if (el.attr("abs:href").contains(link) && el.text().length() > 1) {
                    anchortext = anchortext + el.text() + " ";
                }
            }
            //-------get alt text to internal images links
            for (Element medi : media) {
                if (medi.getElementsByTag("img").attr("src").contains(link)) {
                    alttext = alttext + " " + medi.getElementsByTag("img").attr("alt");
                }
                if (medi.getElementsByTag("img").attr("src").startsWith("/")) {
                    alttext = alttext + " " + medi.getElementsByTag("img").attr("alt");
                }
            }
            String content = mainbody + title + anchortext + alttext;

            return content;

        } catch (IOException ex) {
            Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
            String check = null;
            return check;
        } catch (NullPointerException ex) {
            Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
            String check = null;
            return check;
        } catch (Exception ex) {
            Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
            String check = null;
            return check;
        }

    }

/**
 * Removes the characters from a string
 * @param str String to be cleaned from the characters
 * @return the String cleaned from characters
 */
private String removeChars(String str){
    if (str != null) {
        try {
            //str = str.replaceAll("(\r\n|\r|\n|\n\r)", " "); //Clear Paragraph escape sequences
            str = str.replaceAll("\\.", " "); //Clear dots
            str = str.replaceAll("\\-", " "); //
            str = str.replaceAll("\\_", " "); //
            str = str.replaceAll(":", " ");
            str = str.replaceAll("\\+", " ");
            str = str.replaceAll("\\/", " ");
            str = str.replaceAll("\\|", " ");
            str = str.replaceAll("\\[", " ");
            str = str.replaceAll("\\?", " ");
            str = str.replaceAll("\\#", " ");
            str = str.replaceAll("\\!", " ");
            str = str.replaceAll("'", " "); //Clear apostrophes
            str = str.replaceAll(",", " "); //Clear commas
            str = str.replaceAll("@", " "); //Clear @'s (optional)
            str = str.replaceAll("$", " "); //Clear $'s (optional)
            str = str.replaceAll("\\\\", "**&**"); //Clear special character backslash 4 \'s due to regexp format
            str = str.replaceAll("&amp;", "&"); //change &amp to &
            str = str.replaceAll("&lt;", "<"); //change &lt; to <
            str = str.replaceAll("&gt;", ">"); //change &gt; to >
            //      str = str.replaceAll("<[^<>]*>"," ");      //drop anything in <>
            str = str.replaceAll("&#\\d+;", " "); //change &#[digits]; to space
            str = str.replaceAll("&quot;", " "); //change &quot; to space
            //      str = str.replaceAll("http://[^ ]+ "," ");   //drop urls
            str = str.replaceAll("-", " "); //drop non-alphanumeric characters
            str = str.replaceAll("[^0-9a-zA-Z ]", " "); //drop non-alphanumeric characters
            str = str.replaceAll("&middot;", " ");
            str = str.replaceAll("\\>", " ");
            str = str.replaceAll("\\<", " ");
            str = str.replaceAll("<[^>]*>", "");
            str = str.replaceAll("\\d"," ");
            //str=str.replaceAll("\\<.*?\\>", "");
            str = str.replace('', ' ');
            str = str.replace('', ' ');
            str = str.replace('', ' ');
            str = str.replace(')', ' ');
            str = str.replace('(', ' ');
            str = str.replace('[', ' ');
            str = str.replace(']', ' ');
            str = str.replace('`', ' ');
            str = str.replace('~', ' ');
            str = str.replace('!', ' ');
            str = str.replace('#', ' ');
            str = str.replace('%', ' ');
            str = str.replace('^', ' ');
            str = str.replace('*', ' ');
            str = str.replace('&', ' ');
            str = str.replace('_', ' ');
            str = str.replace('=', ' ');
            str = str.replace('+', ' ');
            str = str.replace('|', ' ');
            str = str.replace('\\', ' ');
            str = str.replace('{', ' ');
            str = str.replace('}', ' ');
            str = str.replace(',', ' ');
            str = str.replace('.', ' ');
            str = str.replace('/', ' ');
            str = str.replace('?', ' ');
            str = str.replace('"', ' ');
            str = str.replace(':', ' ');
            str = str.replace('>', ' ');
            str = str.replace(';', ' ');
            str = str.replace('<', ' ');
            str = str.replace('$', ' ');
            str = str.replace('-', ' ');
            str = str.replace('@', ' ');
            str = str.replace('', ' ');
            //remove space
            InputStreamReader in = new InputStreamReader(IOUtils.toInputStream(str));
            BufferedReader br = new BufferedReader(in);
            Pattern p;
            Matcher m;
            String afterReplace = "";
            String strLine;
            String inputText = "";
            while ((strLine = br.readLine()) != null) {
                inputText = strLine;
                p = Pattern.compile("\\s+");
                m = p.matcher(inputText);
                afterReplace = afterReplace + m.replaceAll(" ");
            }
            br.close();
            str = afterReplace;
            return str;
        } catch (IOException ex) {
            Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
            str=null;
            return str;
        }
    } else {
        return str;
    }
}

    private String removeStopwords(String document) {
        String[] stopwords = {

                "a", "able", "and", "about", "above", "according", "accordingly", "across", "actually", "after",
                "afterwards", "again", "against", "all", "allow", "allows", "almost", "alone", "along", "already",
                "also", "although", "always", "am", "among", "amongst", "an", "and", "another", "any", "anybody",
                "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate",
                "appropriate", "are", "around", "as", "aside", "ask", "asking", "associated", "at", "available",
                "away", "awfully", "b", "B", "be", "became", "because", "become", "becomes", "becoming", "been",
                "before", "beforehand", "behind", "being", "believe", "below", "beside", "besides", "best",
                "better", "between", "beyond", "both", "brief", "but", "by", "c", "came", "can", "cannot", "cant",
                "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com", "come", "comes",
                "concerning", "consequently", "consider", "considering", "contain", "containing", "contains",
                "corresponding", "could", "course", "currently", "d", "definitely", "described", "despite", "did",
                "different", "do", "does", "doing", "done", "down", "downwards", "during", "e", "each", "edu", "eg",
                "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even",
                "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example",
                "except", "f", "far", "few", "fifth", "first", "five", "followed", "following", "follows", "for",
                "former", "formerly", "forth", "four", "from", "further", "furthermore", "g", "get", "gets",
                "getting", "given", "gives", "go", "goes", "going", "gone", "got", "gotten", "greetings", "h",
                "had", "happens", "hardly", "has", "have", "having", "he", "hello", "help", "hence", "her", "here",
                "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself", "his",
                "hither", "hopefully", "how", "howbeit", "however", "i", "ie", "if", "ignored", "immediate", "in",
                "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "inner", "insofar", "instead",
                "into", "inward", "is", "it", "its", "itself", "j", "just", "k", "keep", "keeps", "kept", "know",
                "knows", "known", "l", "last", "lately", "later", "latter", "latterly", "least", "less", "lest",
                "let", "like", "liked", "likely", "little", "ll", //added to avoid words like you'll,I'll etc.
                "look", "looking", "looks", "ltd", "m", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile",
                "merely", "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "n",
                "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither", "never",
                "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone", "nor", "normally",
                "not", "nothing", "novel", "now", "nowhere", "o", "obviously", "of", "off", "often", "oh", "ok",
                "okay", "old", "on", "once", "one", "ones", "only", "onto", "or", "other", "others", "otherwise",
                "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "own", "p", "particular",
                "particularly", "per", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably",
                "provides", "q", "que", "quite", "qv", "r", "rather", "rd", "re", "really", "reasonably",
                "regarding", "regardless", "regards", "relatively", "respectively", "right", "s", "said", "same",
                "saw", "say", "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming",
                "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several",
                "shall", "she", "should", "since", "six", "so", "some", "somebody", "somehow", "someone",
                "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified",
                "specify", "specifying", "still", "sub", "such", "sup", "sure", "t", "take", "taken", "tell",
                "tends", "th", "than", "thank", "thanks", "thanx", "that", "thats", "the", "their", "theirs",
                "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein",
                "theres", "thereupon", "these", "they", "think", "third", "this", "thorough", "thoroughly", "those",
                "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "took",
                "toward", "towards", "tried", "tries", "truly", "try", "trying", "twice", "two", "u", "un", "under",
                "unfortunately", "unless", "unlikely", "until", "unto", "up", "upon", "us", "use", "used", "useful",
                "uses", "using", "usually", "uucp", "v", "value", "various", "ve", //added to avoid words like I've,you've etc.
                "very", "via", "viz", "vs", "w", "want", "wants", "was", "way", "we", "welcome", "well", "went",
                "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas",
                "whereby", "wherein", "whereupon", "wherever", "whether", "which", "with", "while", "whither",
                "who", "whoever", "whole", "whom", "whose", "why", "will", "willing", "wish", "with", "within",
                "without", "wonder", "would", "would", "x", "y", "yes", "yet", "you", "your", "yours", "yourself",
                "yourselves", "z", "zero", "-", "_", "+", ".", "&", "|" };

        document = document.toLowerCase();
        String[] words = document.split(" ");
        for (int i = 0; i < stopwords.length; i++) {
            for (int j = 0; j < words.length; j++) {
                if (words[j].equals(stopwords[i]))
                    words[j] = " ";

            }
        }

        String output = "";

        for (int k = 0; k < words.length; k++) {
            output = output + words[k] + " ";

        }

        return output;
    }

    private String removeCommonInternetWords(String document) {
        String[] commonwords = { "account", "amazon", "browser", "cart", "chat", "comment", "edit", "email",
                "facebook", "faq", "free", "ftp", "gui", "html", "http", "https", "hyperlink", "ip", "irc", "link",
                "listen", "login", "mail", "path", "pdf", "post", "reply", "retweet", "send", "server", "sign",
                "site", "sftp", "share", "spam", "tag", "top", "tweet", "twitter", "uri", "url", "web", "website",
                "wikipedia", "www" };

        document = document.toLowerCase();
        String[] words = document.split(" ");
        for (int i = 0; i < commonwords.length; i++) {
            for (int j = 0; j < words.length; j++) {
                if (words[j].equals(commonwords[i]))
                    words[j] = " ";

            }
        }

        String output = "";

        for (int k = 0; k < words.length; k++) {
            output = output + words[k] + " ";

        }

        return output;

    }

    public String removeDomainWords(String document) {
        String[] topDomains = { "aero", "arpa", "be", "bf", "bg", "bh", "bi", "biz", "bj", "bm", "bn", "bo", "br",
                "bs", "bt", "bw", "by", "bz", "ca", "cc", "cd", "cf", "cg", "ch", "ci", "ck", "cl", "cm", "cn",
                "co", "com", "coop", "cr", "cu", "cv", "cx", "cy", "cz", "de", "dj", "dk", "dm", "do", "dz", "ec",
                "edu", "ee", "eg", "er", "es", "et", "eu", "fi", "fj", "fk", "fm", "fo", "fr", "ga", "gd", "ge",
                "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gov", "gp", "gq", "gr", "gs", "gt", "gu", "gw", "gy",
                "hk", "hm", "hn", "hr", "ht", "hu", "id", "ie", "il", "im", "in", "info", "int", "io", "iq", "ir",
                "is", "it", "je", "jm", "jo", "jobs", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw",
                "ky", "kz", "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly", "ma", "mc", "md",
                "me", "mg", "mh", "mil", "mk", "ml", "mm", "mn", "mo", "mobi", "mp", "mq", "mr", "ms", "mt", "mu",
                "mv", "mw", "mx", "my", "mz", "na", "name", "net", "nc", "ne", "nf", "ng", "ni", "nl", "no", "np",
                "nr", "nu", "nz", "om", "org", "pa", "pe", "pf", "pg", "ph", "pk", "pl", "pn", "pr", "pro", "ps",
                "pt", "pw", "py", "qa", "re", "ro", "root", "rs", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg",
                "sh", "si", "sk", "sl", "sm", "sn", "sr", "st", "su", "sv", "sy", "sz", "tc", "td", "tf", "tg",
                "th", "tj", "tk", "tl", "tm", "tn", "to", "tr", "tt", "tv", "tw", "tz", "ua", "ug", "uk", "us",
                "uy", "uz", "va", "vc", "ve", "vg", "vi", "vn", "vu", "wf", "ws", "ye", "za", "zm", "zw", };

        document = document.toLowerCase();
        String[] words = document.split(" ");
        for (int i = 0; i < topDomains.length; i++) {
            for (int j = 0; j < words.length; j++) {
                if (words[j].equals(topDomains[i]))
                    words[j] = " ";

            }
        }

        String output = "";

        for (int k = 0; k < words.length; k++) {
            output = output + words[k] + " ";

        }

        return output;
    }

    /**
     * Get meta info for a Youtube link
     * @param ventry the id of the Youtube video
     * @return a String with all the meta info about the youtube video
     */
    public String GetYoutubeDetails(String ventry) {
        try {
            String apikey = "AIzaSyDa18Hdo8Fky9HuxVZZP2uDhvpAGckmxSY";
            String output = "";
            URL link_ur = new URL("https://www.googleapis.com/youtube/v3/videos?id=" + ventry + "&key=" + apikey
                    + "&part=snippet");
            String line = "";
            try {
                HttpURLConnection httpCon = (HttpURLConnection) link_ur.openConnection();
                if (httpCon.getResponseCode() != 200) {
                    line = "fail";
                } else {
                    try (BufferedReader rd = new BufferedReader(new InputStreamReader(httpCon.getInputStream()))) {
                        StringBuilder sb = new StringBuilder();
                        while ((line = rd.readLine()) != null) {
                            sb.append(line);
                        }
                        line = sb.toString();
                    }
                }
            } catch (IOException ex) {
                Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
                line = "fail";
                return line;
            }
            JSONParser parser = new JSONParser();
            //Create the map
            Map json = (Map) parser.parse(line);
            // Get a set of the entries
            Set set = json.entrySet();
            Iterator iterator = set.iterator();
            Map.Entry entry = null;
            boolean flagfound = false;
            while (iterator.hasNext() && !flagfound) {
                entry = (Map.Entry) iterator.next();
                if (entry.getKey().toString().equalsIgnoreCase("items")) {
                    flagfound = true;
                }
            }
            JSONArray jsonarray = (JSONArray) entry.getValue();
            Iterator iteratorarray = jsonarray.iterator();
            flagfound = false;
            JSONObject get = null;
            while (iteratorarray.hasNext() && !flagfound) {
                JSONObject next = (JSONObject) iteratorarray.next();
                if (next.containsKey("snippet")) {
                    get = (JSONObject) next.get("snippet");
                    flagfound = true;
                }
            }
            String description = "";
            String title = "";
            if (flagfound) {
                if (get.containsKey("description")) {
                    description = get.get("description").toString();
                }
                if (get.containsKey("title")) {
                    title = get.get("title").toString();
                }
                output = description + " " + title;
            }
            output = removeStopwords(output);
            return output;
        } catch (IOException | ArrayIndexOutOfBoundsException | ParseException ex) {
            Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
            String output = null;
            return output;
        }
    }

    public String getContent() {
        return pageContent;
    }

}