Java tutorial: user-history profiling with ProfileAnalysis

The class below matches a user's web history against predefined interest profiles, ranks the matches, builds page and query suggestions, and stores the analysis results in MongoDB.
/*
 * Copyright 2015 Konstantinos Papangelou.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.mythesis.userbehaviouranalysis;

import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
import com.mongodb.util.JSON;
import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.apache.lucene.search.spell.JaroWinklerDistance;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

/**
 * This class performs the user's history analysis
 * @author Konstantinos Papangelou
 */
public class ProfileAnalysis {

    /**
     * finds the profiles that match the user's interests given his web history
     * @param userID the user's id
     * @param history the user's web history
     * @param input a txt file that contains the necessary parameters
     */
    public void perform(String userID, String[] history, File input) {
        System.out.println("total urls = " + history.length);

        //default parameters
        //number of random queries for each profile
        int numQueriesSuggestion = 5;
        //number of random webpages per query to suggest - total number of
        //suggestions = numQueriesSuggestion * pagesPerQuerySuggestion
        int pagesPerQuerySuggestion = 1;
        //number of random queries to return as examples for alternative profiles
        int numQueriesExample = 2;

        //get the current date/time
        DateTime current = new DateTime();
        DateTimeFormatter fmt = DateTimeFormat.forPattern("dd/MM/yyyy HH:mm");
        String timestamp = fmt.print(current);

        //update user info - store the timestamp of the latest analysis
        Mongo mongo = new Mongo("localhost", 27017);
        DB db = mongo.getDB("profileAnalysis");
        DBCollection userinfo = db.getCollection("userinfo");
        BasicDBObject newDocument = new BasicDBObject();
        newDocument.put("$set", new BasicDBObject().append("timestamp", timestamp));
        BasicDBObject searchQuery = new BasicDBObject();
        searchQuery.put("userID", userID);
        userinfo.update(searchQuery, newDocument, true, false);

        //read the necessary parameters
        Utils utils = new Utils();
        utils.readInput(input);
        HashMap<String, ArrayList<String>> wordvectors = utils.wordvectors;
        HashMap<String, String> crawlerOutputPaths = utils.crawlerOutputPaths;

        //get the urls' content
        ArrayList<String> webpages = new ArrayList<>();
        ArrayList<String> urls = new ArrayList<>();
        for (int i = 0; i < history.length; i++) {
            WebParser pageParser = new WebParser(history[i]);
            pageParser.parse();
            String content = pageParser.getContent();
            if (content == null || content.isEmpty())
                continue;
            webpages.add(content);
            urls.add(history[i]);
        }

        //calculate the urls' scores
        HashMap<String, double[]> historyScores = new HashMap<>();
        String[] webpagesArr = webpages.toArray(new String[webpages.size()]);
        String[] urlsArr = urls.toArray(new String[urls.size()]);
        for (String profile : wordvectors.keySet()) {
            Scorer scorer = new Scorer(webpagesArr, urlsArr, wordvectors.get(profile));
            double[] semanticScores = scorer.getSemanticScores();
            double[] relevanceScores = scorer.getRelevanceScores();
            double[] confidenceScores = scorer.getConfidenceScores();
            double[] scores = scoreFormula(semanticScores, relevanceScores, confidenceScores);
            historyScores.put(profile, scores);
        }

        //find the maximum score of every url and sum the scores for each profile
        HashMap<String, Double> userProfilesScore = new HashMap<>();
        for (int i = 0; i < webpages.size(); i++) {
            double max = 0.0;
            String info = "undefined";
            for (String profile : historyScores.keySet()) {
                if (historyScores.get(profile)[i] > max) {
                    max = historyScores.get(profile)[i];
                    info = profile;
                }
            }
            if (!"undefined".equals(info)) {
                Double prevscore = userProfilesScore.get(info);
                userProfilesScore.put(info, (prevscore == null) ? max : prevscore + max);
            }
        }

        //find which profile level has the maximum score e.g. if football/level=0 score is
        //greater than football/level=1 score then the user is better described as a
        //football/level=0 user
        HashMap<String, Double> userProfileScores = new HashMap<>();
        HashMap<String, String> userProfileLevels = new HashMap<>();
        for (String s : userProfilesScore.keySet()) {
            String[] info = s.split("/");
            String key = info[0] + "/" + info[1] + "/";
            Double prevscore = userProfileScores.get(key);
            if (prevscore == null || userProfilesScore.get(s) > prevscore) {
                userProfileScores.put(key, userProfilesScore.get(s));
                userProfileLevels.put(key, info[2]);
            }
        }

        //put the final profiles together in this simple form:
        //domain/profile/level of expertise, and rank them
        Double totalScore = 0.0;
        for (String s : userProfileScores.keySet())
            totalScore += userProfileScores.get(s);
        Map<String, Double> userProfiles = new HashMap<>();
        for (String s : userProfileLevels.keySet())
            userProfiles.put(s + userProfileLevels.get(s),
                    round(userProfileScores.get(s) * 100 / totalScore, 2));
        userProfiles = sortByValue(userProfiles);

        //find page suggestions for every profile
        HashMap<String, ArrayList<String>> pageSuggestions = new HashMap<>();
        for (String profile : userProfiles.keySet()) {
            String path = crawlerOutputPaths.get(profile);
            ArrayList<String> suggestions =
                    getSuggestions(path, numQueriesSuggestion, pagesPerQuerySuggestion, history);
            pageSuggestions.put(profile, suggestions);
        }

        //find alternative profiles for every profile and representative queries
        HashMap<String, HashMap<String, ArrayList<String>>> alternativeProfiles = new HashMap<>();
        for (String userProfile : userProfiles.keySet()) {
            String[] userProfileInfo = userProfile.split("/");
            HashMap<String, ArrayList<String>> profileQueries = new HashMap<>();
            for (String profile : wordvectors.keySet()) {
                String[] profileInfo = profile.split("/");
                if (profileInfo[0].equals(userProfileInfo[0])
                        && profileInfo[1].equals(userProfileInfo[1])
                        && !profileInfo[2].equals(userProfileInfo[2])) {
                    String path = crawlerOutputPaths.get(profile);
                    ArrayList<String> queries = getQueries(path, numQueriesExample);
                    //turn each query path into a readable query string
                    for (int i = 0; i < queries.size(); i++) {
                        String query = queries.get(i);
                        queries.set(i, query.substring(query.lastIndexOf("\\") + 1)
                                .replace("-query", "").replace("+", " "));
                    }
                    profileQueries.put(profile, queries);
                }
            }
            alternativeProfiles.put(userProfile, profileQueries);
        }

        //prepare the JSON response
        JSONObject response = new JSONObject();
        response.put("userID", userID);
        response.put("timestamp", timestamp);
        JSONArray list = new JSONArray();
        for (String profile : userProfiles.keySet()) {
            JSONObject profileInfo = new JSONObject();
            profileInfo.put("profile", profile);
            profileInfo.put("score", userProfiles.get(profile));
            JSONArray temp = new JSONArray();
            for (String s : pageSuggestions.get(profile))
                temp.add(s);
            profileInfo.put("suggestions", temp);
            JSONArray alternativesArray = new JSONArray();
            for (String s : alternativeProfiles.get(profile).keySet()) {
                JSONObject alternativeInfo = new JSONObject();
                alternativeInfo.put("alternative", s);
                JSONArray queriesArray = new JSONArray();
                for (String str : alternativeProfiles.get(profile).get(s))
                    queriesArray.add(str);
                alternativeInfo.put("queries", queriesArray);
                alternativesArray.add(alternativeInfo);
            }
            profileInfo.put("alternatives", alternativesArray);
            list.add(profileInfo);
        }
        response.put("profiles", list);
        System.out.println("JSON response is ready: " + response);

        //delete the previous analysis and store the new results
        DBCollection collection = db.getCollection("history");
        BasicDBObject previous = new BasicDBObject();
        previous.put("userID", userID);
        collection.remove(previous);
        DBObject dbObject = (DBObject) JSON.parse(response.toString());
        collection.insert(dbObject);
        System.out.println("I saved the analysis...");
    }

    /**
     * rounds a double value to a given number of decimal places
     * @param value the value to round
     * @param places the number of decimal places
     * @return the rounded value
     */
    public static double round(double value, int places) {
        if (places < 0)
            throw new IllegalArgumentException();
        BigDecimal bd = new BigDecimal(value);
        bd = bd.setScale(places, RoundingMode.HALF_UP);
        return bd.doubleValue();
    }

    /**
     * sorts a map by value in descending order; ties are broken by key
     * @param unsortedMap the map to sort
     * @return a LinkedHashMap holding the entries in sorted order
     */
    public static Map<String, Double> sortByValue(Map<String, Double> unsortedMap) {
        List<Map.Entry<String, Double>> entryList = new ArrayList<>(unsortedMap.entrySet());
        Collections.sort(entryList, new Comparator<Map.Entry<String, Double>>() {
            @Override
            public int compare(Map.Entry<String, Double> e1, Map.Entry<String, Double> e2) {
                if (!e1.getValue().equals(e2.getValue())) {
                    return e1.getValue().compareTo(e2.getValue()) * -1; // * -1 reverses the order
                } else {
                    return e1.getKey().compareTo(e2.getKey());
                }
            }
        });
        Map<String, Double> orderedMap = new LinkedHashMap<>();
        for (Map.Entry<String, Double> entry : entryList) {
            orderedMap.put(entry.getKey(), entry.getValue());
        }
        return orderedMap;
    }

    /**
     * a formula that computes the score of the urls of the given web history
     * @param semanticScore the semantic score of the urls
     * @param relevanceScore the relevance score of the urls
     * @param confidenceScore the confidence score of the urls
     * @return 0.6*relevanceScore + 0.3*semanticScore + 0.1*confidenceScore
     */
    private double[] scoreFormula(double[] semanticScore, double[] relevanceScore,
            double[] confidenceScore) {
        double[] score = new double[relevanceScore.length];
        for (int i = 0; i < score.length; i++)
            score[i] = 0.6 * relevanceScore[i] + 0.3 * semanticScore[i] + 0.1 * confidenceScore[i];
        return score;
    }

    /**
     * a method that returns a number of random urls
     * @param path SWebRank output directory
     * @param numOfQueries the number of queries
     * @param numOfpages the number of urls per query
     * @param history the user's web history, so that visited pages are not suggested again
     * @return a list of urls
     */
    private ArrayList<String> getSuggestions(String path, int numOfQueries, int numOfpages,
            String[] history) {
        List<String> historyUrls = Arrays.asList(history);
        ArrayList<String> randomQueries = getQueries(path, numOfQueries);
        //for each query select a number of random urls
        //for now it only works for the bing search engine (paths are Windows-style)
        ArrayList<String> suggestions = new ArrayList<>();
        for (String s : randomQueries) {
            File level = new File(s + "\\" + "bing" + "\\");
            File[] docPaths = level.listFiles();
            if (docPaths == null)
                continue;
            List<String> urls = new ArrayList<>();
            for (File f : docPaths) {
                String str = f.getAbsolutePath();
                if (StringUtils.isNumeric(str.substring(str.lastIndexOf("\\") + 1))) {
                    File webPagePath = new File(str + "\\current_url.txt");
                    try {
                        String url = FileUtils.readFileToString(webPagePath);
                        urls.add(url);
                    } catch (IOException ex) {
                        Logger.getLogger(ProfileAnalysis.class.getName()).log(Level.SEVERE, null, ex);
                    }
                }
            }
            //keep only the urls that are neither already suggested nor part of the history
            List<String> candidates = new ArrayList<>();
            for (String ur : urls) {
                if (!suggestions.contains(ur) && !historyUrls.contains(ur))
                    candidates.add(ur);
            }
            if (numOfpages >= candidates.size()) {
                suggestions.addAll(candidates);
                continue;
            }
            //pick distinct random candidates; removing each pick avoids an endless loop
            Random randomPage = new Random();
            for (int count = 0; count < numOfpages; count++)
                suggestions.add(candidates.remove(randomPage.nextInt(candidates.size())));
        }
        return suggestions;
    }

    /**
     * a method that returns a number of random queries
     * @param path SWebRank output directory
     * @param numOfQueries number of random queries
     * @return a list of paths for the queries
     */
    private ArrayList<String> getQueries(String path, int numOfQueries) {
        //find the output paths
        File root = new File(path);
        File[] contents = root.listFiles();
        List<String> sWebRanklevels = new ArrayList<>();
        for (File f : contents) {
            if (f.getAbsolutePath().contains("level"))
                sWebRanklevels.add(f.getAbsolutePath());
        }
        //find all query paths
        ArrayList<String> queries = new ArrayList<>();
        for (String s : sWebRanklevels) {
            File level = new File(s);
            File[] queriesFiles = level.listFiles();
            for (File f : queriesFiles) {
                if (!f.getAbsolutePath().contains("txt"))
                    queries.add(f.getAbsolutePath());
            }
        }
        if (numOfQueries > queries.size()) {
            return queries;
        }
        //select a number of random queries
        Random randomQuery = new Random();
        ArrayList<String> randomQueries = new ArrayList<>();
        int count = 0;
        while (count < numOfQueries) {
            String val = queries.get(randomQuery.nextInt(queries.size()));
            if (!randomQueries.contains(val)) {
                randomQueries.add(val);
                count++;
            }
        }
        return randomQueries;
    }

    /**
     * a method that stores queries given from users
     * @param profiles the alternative profiles that users selected
     * @param queries the queries given by the users
     * @param input a txt file that contains the necessary parameters
     */
    public void storeQueries(ArrayList<String> profiles, ArrayList<String> queries, File input) {
        System.out.println("I will store the queries...");
        Utils utils = new Utils();
        utils.readInput(input);
        HashMap<String, String> crawlerOutputPaths = utils.crawlerOutputPaths;
        for (int i = 0; i < profiles.size(); i++) {
            if (!"".equals(queries.get(i)))
                storeQuery(crawlerOutputPaths.get(profiles.get(i)), profiles.get(i), queries.get(i));
        }
        System.out.println("I stored the queries...");
    }

    /**
     * a method that stores a query that has been suggested by a user
     * @param crawlerOutputPath SWebRank output directory used to check if a relevant query already exists
     * @param profile the query's relevant profile
     * @param query the given query
     */
    public void storeQuery(String crawlerOutputPath, String profile, String query) {
        System.out.println(crawlerOutputPath);
        System.out.println(profile);
        System.out.println(query);
        //find the output paths
        File root = new File(crawlerOutputPath);
        File[] contents = root.listFiles();
        List<String> sWebRanklevels = new ArrayList<>();
        for (File f : contents) {
            if (f.getAbsolutePath().contains("level"))
                sWebRanklevels.add(f.getAbsolutePath());
        }
        //find all query paths
        List<String> queries = new ArrayList<>();
        for (String s : sWebRanklevels) {
            File level = new File(s);
            File[] queriesFiles = level.listFiles();
            for (File f : queriesFiles) {
                if (!f.getAbsolutePath().contains("txt")) {
                    String str = f.getAbsolutePath();
                    queries.add(str.substring(str.lastIndexOf("\\") + 1)
                            .replace("-query", "").replace("+", " "));
                }
            }
        }
        //check if a relevant query already exists on disk - using the Jaro-Winkler distance
        query = query.trim().replaceAll(" +", " ");
        for (String q : queries) {
            JaroWinklerDistance jwd = new JaroWinklerDistance();
            double distance = jwd.getDistance(q, query);
            if (distance > 0.9) { // threshold = 0.9
                return;
            }
        }
        Mongo mongo = new Mongo("localhost", 27017);
        DB db = mongo.getDB("profileAnalysis");
        DBCollection DBqueries = db.getCollection("newQueries");
        BasicDBObject searchQuery = new BasicDBObject();
        searchQuery.put("profile", profile);
        DBObject document = DBqueries.findOne(searchQuery);
        boolean flag = false;
        //check if a relevant query already exists in the database - using the Jaro-Winkler distance
        if (document != null) {
            flag = true;
            BasicDBList storedQueries = (BasicDBList) document.get("queries");
            for (Object quer : storedQueries) {
                JaroWinklerDistance jwd = new JaroWinklerDistance();
                double distance = jwd.getDistance((String) quer, query);
                if (distance > 0.9) { // threshold = 0.9
                    return;
                }
            }
        }
        if (flag) {
            //the document already exists, so push the new query onto its list
            DBqueries.update(searchQuery,
                    new BasicDBObject("$push", new BasicDBObject("queries", query)));
        } else {
            //otherwise create a new document
            BasicDBList dbl = new BasicDBList();
            dbl.add(query);
            BasicDBObject entry = new BasicDBObject("profile", profile).append("queries", dbl);
            DBqueries.insert(entry);
        }
    }
}
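To see how the class might be driven end to end, here is a minimal, hypothetical driver. It assumes the project's Utils, WebParser and Scorer classes are on the classpath, that MongoDB is listening on localhost:27017, and that the parameters file exists; the file name "input.txt", the user id, the URLs, the profile string and the query are illustrative placeholders, not part of the original code.

package com.mythesis.userbehaviouranalysis;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;

public class ProfileAnalysisDemo {

    public static void main(String[] args) {
        // hypothetical browsing history to analyse
        String[] history = {
                "http://example.com/articles/football-tactics",
                "http://example.com/articles/transfer-news"
        };
        // hypothetical parameters file read by Utils.readInput(...)
        File input = new File("input.txt");

        ProfileAnalysis analysis = new ProfileAnalysis();
        // runs the full pipeline: scoring, ranking, suggestions, MongoDB storage
        analysis.perform("user42", history, input);

        // store a user-suggested query for a selected alternative profile
        // (profile name and query are again placeholders)
        ArrayList<String> profiles = new ArrayList<>(Arrays.asList("sports/football/level=0"));
        ArrayList<String> queries = new ArrayList<>(Arrays.asList("football training drills"));
        analysis.storeQueries(profiles, queries, input);
    }
}

Note that the profile string follows the same domain/profile/level form that perform builds when it assembles the final profiles, and that storeQueries silently skips a query when a near-duplicate (Jaro-Winkler distance above 0.9) already exists on disk or in the newQueries collection.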