Java tutorial
/* * Copyright 2015 Kostas Papangelou. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.mythesis.userbehaviouranalysis; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.io.IOUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.json.simple.parser.*; import org.json.simple.JSONArray; import org.json.simple.JSONObject; /** * get a url's content and apply postprocessing (removing stopwords, common internet words etc.) * @author Kostas Papangelou */ public class WebParser { private String pageContent = ""; private String url; public WebParser(String url) { this.url = url; } /** * Get the text content of a url cleaned from stopwords,symbols etc. and lemmatized */ public void parse() { if (url.contains("http://www.youtube.com/watch?")) { String ventry = url.substring(31); pageContent = GetYoutubeDetails(ventry).replace("\n", "").replace("\r", ""); } else pageContent = cleanhtml(url); if (!"".equals(pageContent) && pageContent != null) { pageContent = removeChars(pageContent); pageContent = removeStopwords(pageContent); pageContent = removeCommonInternetWords(pageContent); pageContent = removeDomainWords(pageContent); pageContent = removeChars(pageContent); Lemmatizer lemmatizer = new Lemmatizer(); List<String> contentList = lemmatizer.lemmatize(pageContent); pageContent = ""; for (String contentListItem : contentList) { pageContent = pageContent + " " + contentListItem; } } } /** * Parse the url and get all the content * @param link the url to parse * @return The content parsed */ private String cleanhtml(String link) { try { Document doc = Jsoup.connect(link).timeout(10 * 1000).get(); String title = doc.title(); String mainbody = doc.body().text(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); //fix link html to remove https:// or http:// and simple / if (link.substring(link.length() - 1, link.length()).equalsIgnoreCase("/")) { link = link.substring(0, link.length() - 1); } if (link.substring(0, 5).equalsIgnoreCase("https")) { link = link.substring(8); } else if (link.substring(0, 4).equalsIgnoreCase("http")) { link = link.substring(7); } String anchortext = ""; String alttext = ""; //-----get the anchor text of internal links for (Element el : links) { String str_check = el.attr("abs:href"); if (el.attr("abs:href").contains(link) && el.text().length() > 1) { anchortext = anchortext + el.text() + " "; } } //-------get alt text to internal images links for (Element medi : media) { if (medi.getElementsByTag("img").attr("src").contains(link)) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt"); } if (medi.getElementsByTag("img").attr("src").startsWith("/")) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt"); } } String content = mainbody + title + anchortext + alttext; return content; } catch (IOException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (NullPointerException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (Exception ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } } /** * Removes the characters from a string * @param str String to be cleaned from the characters * @return the String cleaned from characters */ private String removeChars(String str){ if (str != null) { try { //str = str.replaceAll("(\r\n|\r|\n|\n\r)", " "); //Clear Paragraph escape sequences str = str.replaceAll("\\.", " "); //Clear dots str = str.replaceAll("\\-", " "); // str = str.replaceAll("\\_", " "); // str = str.replaceAll(":", " "); str = str.replaceAll("\\+", " "); str = str.replaceAll("\\/", " "); str = str.replaceAll("\\|", " "); str = str.replaceAll("\\[", " "); str = str.replaceAll("\\?", " "); str = str.replaceAll("\\#", " "); str = str.replaceAll("\\!", " "); str = str.replaceAll("'", " "); //Clear apostrophes str = str.replaceAll(",", " "); //Clear commas str = str.replaceAll("@", " "); //Clear @'s (optional) str = str.replaceAll("$", " "); //Clear $'s (optional) str = str.replaceAll("\\\\", "**&**"); //Clear special character backslash 4 \'s due to regexp format str = str.replaceAll("&", "&"); //change & to & str = str.replaceAll("<", "<"); //change < to < str = str.replaceAll(">", ">"); //change > to > // str = str.replaceAll("<[^<>]*>"," "); //drop anything in <> str = str.replaceAll("&#\\d+;", " "); //change &#[digits]; to space str = str.replaceAll(""", " "); //change " to space // str = str.replaceAll("http://[^ ]+ "," "); //drop urls str = str.replaceAll("-", " "); //drop non-alphanumeric characters str = str.replaceAll("[^0-9a-zA-Z ]", " "); //drop non-alphanumeric characters str = str.replaceAll("·", " "); str = str.replaceAll("\\>", " "); str = str.replaceAll("\\<", " "); str = str.replaceAll("<[^>]*>", ""); str = str.replaceAll("\\d"," "); //str=str.replaceAll("\\<.*?\\>", ""); str = str.replace('', ' '); str = str.replace('', ' '); str = str.replace('', ' '); str = str.replace(')', ' '); str = str.replace('(', ' '); str = str.replace('[', ' '); str = str.replace(']', ' '); str = str.replace('`', ' '); str = str.replace('~', ' '); str = str.replace('!', ' '); str = str.replace('#', ' '); str = str.replace('%', ' '); str = str.replace('^', ' '); str = str.replace('*', ' '); str = str.replace('&', ' '); str = str.replace('_', ' '); str = str.replace('=', ' '); str = str.replace('+', ' '); str = str.replace('|', ' '); str = str.replace('\\', ' '); str = str.replace('{', ' '); str = str.replace('}', ' '); str = str.replace(',', ' '); str = str.replace('.', ' '); str = str.replace('/', ' '); str = str.replace('?', ' '); str = str.replace('"', ' '); str = str.replace(':', ' '); str = str.replace('>', ' '); str = str.replace(';', ' '); str = str.replace('<', ' '); str = str.replace('$', ' '); str = str.replace('-', ' '); str = str.replace('@', ' '); str = str.replace('', ' '); //remove space InputStreamReader in = new InputStreamReader(IOUtils.toInputStream(str)); BufferedReader br = new BufferedReader(in); Pattern p; Matcher m; String afterReplace = ""; String strLine; String inputText = ""; while ((strLine = br.readLine()) != null) { inputText = strLine; p = Pattern.compile("\\s+"); m = p.matcher(inputText); afterReplace = afterReplace + m.replaceAll(" "); } br.close(); str = afterReplace; return str; } catch (IOException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); str=null; return str; } } else { return str; } } private String removeStopwords(String document) { String[] stopwords = { "a", "able", "and", "about", "above", "according", "accordingly", "across", "actually", "after", "afterwards", "again", "against", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "around", "as", "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "b", "B", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "both", "brief", "but", "by", "c", "came", "can", "cannot", "cant", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com", "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "course", "currently", "d", "definitely", "described", "despite", "did", "different", "do", "does", "doing", "done", "down", "downwards", "during", "e", "each", "edu", "eg", "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "f", "far", "few", "fifth", "first", "five", "followed", "following", "follows", "for", "former", "formerly", "forth", "four", "from", "further", "furthermore", "g", "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got", "gotten", "greetings", "h", "had", "happens", "hardly", "has", "have", "having", "he", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how", "howbeit", "however", "i", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "inner", "insofar", "instead", "into", "inward", "is", "it", "its", "itself", "j", "just", "k", "keep", "keeps", "kept", "know", "knows", "known", "l", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "like", "liked", "likely", "little", "ll", //added to avoid words like you'll,I'll etc. "look", "looking", "looks", "ltd", "m", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile", "merely", "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "n", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone", "nor", "normally", "not", "nothing", "novel", "now", "nowhere", "o", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "own", "p", "particular", "particularly", "per", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably", "provides", "q", "que", "quite", "qv", "r", "rather", "rd", "re", "really", "reasonably", "regarding", "regardless", "regards", "relatively", "respectively", "right", "s", "said", "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "she", "should", "since", "six", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying", "still", "sub", "such", "sup", "sure", "t", "take", "taken", "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "thats", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "theres", "thereupon", "these", "they", "think", "third", "this", "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "twice", "two", "u", "un", "under", "unfortunately", "unless", "unlikely", "until", "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using", "usually", "uucp", "v", "value", "various", "ve", //added to avoid words like I've,you've etc. "very", "via", "viz", "vs", "w", "want", "wants", "was", "way", "we", "welcome", "well", "went", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "with", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "willing", "wish", "with", "within", "without", "wonder", "would", "would", "x", "y", "yes", "yet", "you", "your", "yours", "yourself", "yourselves", "z", "zero", "-", "_", "+", ".", "&", "|" }; document = document.toLowerCase(); String[] words = document.split(" "); for (int i = 0; i < stopwords.length; i++) { for (int j = 0; j < words.length; j++) { if (words[j].equals(stopwords[i])) words[j] = " "; } } String output = ""; for (int k = 0; k < words.length; k++) { output = output + words[k] + " "; } return output; } private String removeCommonInternetWords(String document) { String[] commonwords = { "account", "amazon", "browser", "cart", "chat", "comment", "edit", "email", "facebook", "faq", "free", "ftp", "gui", "html", "http", "https", "hyperlink", "ip", "irc", "link", "listen", "login", "mail", "path", "pdf", "post", "reply", "retweet", "send", "server", "sign", "site", "sftp", "share", "spam", "tag", "top", "tweet", "twitter", "uri", "url", "web", "website", "wikipedia", "www" }; document = document.toLowerCase(); String[] words = document.split(" "); for (int i = 0; i < commonwords.length; i++) { for (int j = 0; j < words.length; j++) { if (words[j].equals(commonwords[i])) words[j] = " "; } } String output = ""; for (int k = 0; k < words.length; k++) { output = output + words[k] + " "; } return output; } public String removeDomainWords(String document) { String[] topDomains = { "aero", "arpa", "be", "bf", "bg", "bh", "bi", "biz", "bj", "bm", "bn", "bo", "br", "bs", "bt", "bw", "by", "bz", "ca", "cc", "cd", "cf", "cg", "ch", "ci", "ck", "cl", "cm", "cn", "co", "com", "coop", "cr", "cu", "cv", "cx", "cy", "cz", "de", "dj", "dk", "dm", "do", "dz", "ec", "edu", "ee", "eg", "er", "es", "et", "eu", "fi", "fj", "fk", "fm", "fo", "fr", "ga", "gd", "ge", "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gov", "gp", "gq", "gr", "gs", "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr", "ht", "hu", "id", "ie", "il", "im", "in", "info", "int", "io", "iq", "ir", "is", "it", "je", "jm", "jo", "jobs", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw", "ky", "kz", "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly", "ma", "mc", "md", "me", "mg", "mh", "mil", "mk", "ml", "mm", "mn", "mo", "mobi", "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx", "my", "mz", "na", "name", "net", "nc", "ne", "nf", "ng", "ni", "nl", "no", "np", "nr", "nu", "nz", "om", "org", "pa", "pe", "pf", "pg", "ph", "pk", "pl", "pn", "pr", "pro", "ps", "pt", "pw", "py", "qa", "re", "ro", "root", "rs", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg", "sh", "si", "sk", "sl", "sm", "sn", "sr", "st", "su", "sv", "sy", "sz", "tc", "td", "tf", "tg", "th", "tj", "tk", "tl", "tm", "tn", "to", "tr", "tt", "tv", "tw", "tz", "ua", "ug", "uk", "us", "uy", "uz", "va", "vc", "ve", "vg", "vi", "vn", "vu", "wf", "ws", "ye", "za", "zm", "zw", }; document = document.toLowerCase(); String[] words = document.split(" "); for (int i = 0; i < topDomains.length; i++) { for (int j = 0; j < words.length; j++) { if (words[j].equals(topDomains[i])) words[j] = " "; } } String output = ""; for (int k = 0; k < words.length; k++) { output = output + words[k] + " "; } return output; } /** * Get meta info for a Youtube link * @param ventry the id of the Youtube video * @return a String with all the meta info about the youtube video */ public String GetYoutubeDetails(String ventry) { try { String apikey = "AIzaSyDa18Hdo8Fky9HuxVZZP2uDhvpAGckmxSY"; String output = ""; URL link_ur = new URL("https://www.googleapis.com/youtube/v3/videos?id=" + ventry + "&key=" + apikey + "&part=snippet"); String line = ""; try { HttpURLConnection httpCon = (HttpURLConnection) link_ur.openConnection(); if (httpCon.getResponseCode() != 200) { line = "fail"; } else { try (BufferedReader rd = new BufferedReader(new InputStreamReader(httpCon.getInputStream()))) { StringBuilder sb = new StringBuilder(); while ((line = rd.readLine()) != null) { sb.append(line); } line = sb.toString(); } } } catch (IOException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); line = "fail"; return line; } JSONParser parser = new JSONParser(); //Create the map Map json = (Map) parser.parse(line); // Get a set of the entries Set set = json.entrySet(); Iterator iterator = set.iterator(); Map.Entry entry = null; boolean flagfound = false; while (iterator.hasNext() && !flagfound) { entry = (Map.Entry) iterator.next(); if (entry.getKey().toString().equalsIgnoreCase("items")) { flagfound = true; } } JSONArray jsonarray = (JSONArray) entry.getValue(); Iterator iteratorarray = jsonarray.iterator(); flagfound = false; JSONObject get = null; while (iteratorarray.hasNext() && !flagfound) { JSONObject next = (JSONObject) iteratorarray.next(); if (next.containsKey("snippet")) { get = (JSONObject) next.get("snippet"); flagfound = true; } } String description = ""; String title = ""; if (flagfound) { if (get.containsKey("description")) { description = get.get("description").toString(); } if (get.containsKey("title")) { title = get.get("title").toString(); } output = description + " " + title; } output = removeStopwords(output); return output; } catch (IOException | ArrayIndexOutOfBoundsException | ParseException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String output = null; return output; } } public String getContent() { return pageContent; } }