Java tutorial: user-history profiling with ProfileAnalysis

The class below matches a user's web history against predefined interest profiles, ranks the matches, builds page and query suggestions, and stores the analysis results in MongoDB.
/*
 * Copyright 2015 Konstantinos Papangelou.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.mythesis.userbehaviouranalysis;

import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
import com.mongodb.util.JSON;
import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.apache.lucene.search.spell.JaroWinklerDistance;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

/**
 * This class performs the user's history analysis
 * @author Konstantinos Papangelou
 */
public class ProfileAnalysis {

    /**
     * finds the profiles that match the user's interests given his web history
     * @param userID the user's id
     * @param history the user's web history
     * @param input a txt file that contains the necessary parameters
     */
    public void perform(String userID, String[] history, File input) {
        System.out.println("total urls = " + history.length);

        //default parameters
        //number of random queries for each profile
        int numQueriesSuggestion = 5;
        //number of random webpages per query to suggest - total number of
        //suggestions = numQueriesSuggestion * pagesPerQuerySuggestion
        int pagesPerQuerySuggestion = 1;
        //number of random queries to return as examples for alternative profiles
        int numQueriesExample = 2;

        //get the current date/time
        DateTime current = new DateTime();
        DateTimeFormatter fmt = DateTimeFormat.forPattern("dd/MM/yyyy HH:mm");
        String timestamp = fmt.print(current);

        //update user info - store the timestamp of the latest analysis
        Mongo mongo = new Mongo("localhost", 27017);
        DB db = mongo.getDB("profileAnalysis");
        DBCollection userinfo = db.getCollection("userinfo");
        BasicDBObject newDocument = new BasicDBObject();
        newDocument.put("$set", new BasicDBObject().append("timestamp", timestamp));
        BasicDBObject searchQuery = new BasicDBObject();
        searchQuery.put("userID", userID);
        userinfo.update(searchQuery, newDocument, true, false);

        //read the necessary parameters
        Utils utils = new Utils();
        utils.readInput(input);
        HashMap<String, ArrayList<String>> wordvectors = utils.wordvectors;
        HashMap<String, String> crawlerOutputPaths = utils.crawlerOutputPaths;

        //get the urls' content
        ArrayList<String> webpages = new ArrayList<>();
        ArrayList<String> urls = new ArrayList<>();
        for (int i = 0; i < history.length; i++) {
            WebParser pageParser = new WebParser(history[i]);
            pageParser.parse();
            String content = pageParser.getContent();
            if (content == null || content.isEmpty())
                continue;
            webpages.add(content);
            urls.add(history[i]);
        }

        //calculate the urls' scores
        HashMap<String, double[]> historyScores = new HashMap<>();
        String[] webpagesArr = webpages.toArray(new String[webpages.size()]);
        String[] urlsArr = urls.toArray(new String[urls.size()]);
        for (String profile : wordvectors.keySet()) {
            Scorer scorer = new Scorer(webpagesArr, urlsArr, wordvectors.get(profile));
            double[] semanticScores = scorer.getSemanticScores();
            double[] relevanceScores = scorer.getRelevanceScores();
            double[] confidenceScores = scorer.getConfidenceScores();
            double[] scores = scoreFormula(semanticScores, relevanceScores, confidenceScores);
            historyScores.put(profile, scores);
        }

        //find the maximum score of every url and sum the scores for each profile
        HashMap<String, Double> userProfilesScore = new HashMap<>();
        for (int i = 0; i < webpages.size(); i++) {
            double max = 0.0;
            String info = "undefined";
            for (String profile : historyScores.keySet()) {
                if (historyScores.get(profile)[i] > max) {
                    max = historyScores.get(profile)[i];
                    info = profile;
                }
            }
            if (!"undefined".equals(info)) {
                Double prevscore = userProfilesScore.get(info);
                userProfilesScore.put(info, (prevscore == null) ? max : prevscore + max);
            }
        }

        //find which profile level has the maximum score e.g. if football/level=0 score is
        //greater than football/level=1 score then the user is better described as a
        //football/level=0 user
        HashMap<String, Double> userProfileScores = new HashMap<>();
        HashMap<String, String> userProfileLevels = new HashMap<>();
        for (String s : userProfilesScore.keySet()) {
            String[] info = s.split("/");
            String key = info[0] + "/" + info[1] + "/";
            Double prevscore = userProfileScores.get(key);
            if (prevscore == null || userProfilesScore.get(s) > prevscore) {
                userProfileScores.put(key, userProfilesScore.get(s));
                userProfileLevels.put(key, info[2]);
            }
        }

        //put the final profiles together in this simple form:
        //domain/profile/level of expertise, and rank them
        Double totalScore = 0.0;
        for (String s : userProfileScores.keySet())
            totalScore += userProfileScores.get(s);
        Map<String, Double> userProfiles = new HashMap<>();
        for (String s : userProfileLevels.keySet())
            userProfiles.put(s + userProfileLevels.get(s),
                    round(userProfileScores.get(s) * 100 / totalScore, 2));
        userProfiles = sortByValue(userProfiles);

        //find page suggestions for every profile
        HashMap<String, ArrayList<String>> pageSuggestions = new HashMap<>();
        for (String profile : userProfiles.keySet()) {
            String path = crawlerOutputPaths.get(profile);
            ArrayList<String> suggestions =
                    getSuggestions(path, numQueriesSuggestion, pagesPerQuerySuggestion, history);
            pageSuggestions.put(profile, suggestions);
        }

        //find alternative profiles for every profile and representative queries
        HashMap<String, HashMap<String, ArrayList<String>>> alternativeProfiles = new HashMap<>();
        for (String userProfile : userProfiles.keySet()) {
            String[] userProfileInfo = userProfile.split("/");
            HashMap<String, ArrayList<String>> profileQueries = new HashMap<>();
            for (String profile : wordvectors.keySet()) {
                String[] profileInfo = profile.split("/");
                if (profileInfo[0].equals(userProfileInfo[0])
                        && profileInfo[1].equals(userProfileInfo[1])
                        && !profileInfo[2].equals(userProfileInfo[2])) {
                    String path = crawlerOutputPaths.get(profile);
                    ArrayList<String> queries = getQueries(path, numQueriesExample);
                    //turn each query path into a readable query string
                    for (int i = 0; i < queries.size(); i++) {
                        String query = queries.get(i);
                        queries.set(i, query.substring(query.lastIndexOf("\\") + 1)
                                .replace("-query", "").replace("+", " "));
                    }
                    profileQueries.put(profile, queries);
                }
            }
            alternativeProfiles.put(userProfile, profileQueries);
        }

        //prepare the JSON response
        JSONObject response = new JSONObject();
        response.put("userID", userID);
        response.put("timestamp", timestamp);
        JSONArray list = new JSONArray();
        for (String profile : userProfiles.keySet()) {
            JSONObject profileInfo = new JSONObject();
            profileInfo.put("profile", profile);
            profileInfo.put("score", userProfiles.get(profile));
            JSONArray temp = new JSONArray();
            for (String s : pageSuggestions.get(profile))
                temp.add(s);
            profileInfo.put("suggestions", temp);
            JSONArray alternativesArray = new JSONArray();
            for (String s : alternativeProfiles.get(profile).keySet()) {
                JSONObject alternativeInfo = new JSONObject();
                alternativeInfo.put("alternative", s);
                JSONArray queriesArray = new JSONArray();
                for (String str : alternativeProfiles.get(profile).get(s))
                    queriesArray.add(str);
                alternativeInfo.put("queries", queriesArray);
                alternativesArray.add(alternativeInfo);
            }
            profileInfo.put("alternatives", alternativesArray);
            list.add(profileInfo);
        }
        response.put("profiles", list);
        System.out.println("JSON response is ready: " + response);

        //delete the previous analysis and store the new results
        DBCollection collection = db.getCollection("history");
        BasicDBObject previous = new BasicDBObject();
        previous.put("userID", userID);
        collection.remove(previous);
        DBObject dbObject = (DBObject) JSON.parse(response.toString());
        collection.insert(dbObject);
        System.out.println("I saved the analysis...");
    }

    /**
     * rounds a double value to a given number of decimal places
     * @param value the value to round
     * @param places the number of decimal places
     * @return the rounded value
     */
    public static double round(double value, int places) {
        if (places < 0)
            throw new IllegalArgumentException();
        BigDecimal bd = new BigDecimal(value);
        bd = bd.setScale(places, RoundingMode.HALF_UP);
        return bd.doubleValue();
    }

    /**
     * sorts a map by value in descending order; ties are broken by key
     * @param unsortedMap the map to sort
     * @return a LinkedHashMap holding the entries in sorted order
     */
    public static Map<String, Double> sortByValue(Map<String, Double> unsortedMap) {
        List<Map.Entry<String, Double>> entryList = new ArrayList<>(unsortedMap.entrySet());
        Collections.sort(entryList, new Comparator<Map.Entry<String, Double>>() {
            @Override
            public int compare(Map.Entry<String, Double> e1, Map.Entry<String, Double> e2) {
                if (!e1.getValue().equals(e2.getValue())) {
                    return e1.getValue().compareTo(e2.getValue()) * -1; // * -1 reverses the order
                } else {
                    return e1.getKey().compareTo(e2.getKey());
                }
            }
        });
        Map<String, Double> orderedMap = new LinkedHashMap<>();
        for (Map.Entry<String, Double> entry : entryList) {
            orderedMap.put(entry.getKey(), entry.getValue());
        }
        return orderedMap;
    }

    /**
     * a formula that computes the score of the urls of the given web history
     * @param semanticScore the semantic score of the urls
     * @param relevanceScore the relevance score of the urls
     * @param confidenceScore the confidence score of the urls
     * @return 0.6*relevanceScore + 0.3*semanticScore + 0.1*confidenceScore
     */
    private double[] scoreFormula(double[] semanticScore, double[] relevanceScore,
            double[] confidenceScore) {
        double[] score = new double[relevanceScore.length];
        for (int i = 0; i < score.length; i++)
            score[i] = 0.6 * relevanceScore[i] + 0.3 * semanticScore[i] + 0.1 * confidenceScore[i];
        return score;
    }

    /**
     * a method that returns a number of random urls
     * @param path SWebRank output directory
     * @param numOfQueries the number of queries
     * @param numOfpages the number of urls per query
     * @param history the user's web history, so that visited pages are not suggested again
     * @return a list of urls
     */
    private ArrayList<String> getSuggestions(String path, int numOfQueries, int numOfpages,
            String[] history) {
        List<String> historyUrls = Arrays.asList(history);
        ArrayList<String> randomQueries = getQueries(path, numOfQueries);
        //for each query select a number of random urls
        //for now it only works for the bing search engine (paths are Windows-style)
        ArrayList<String> suggestions = new ArrayList<>();
        for (String s : randomQueries) {
            File level = new File(s + "\\" + "bing" + "\\");
            File[] docPaths = level.listFiles();
            if (docPaths == null)
                continue;
            List<String> urls = new ArrayList<>();
            for (File f : docPaths) {
                String str = f.getAbsolutePath();
                if (StringUtils.isNumeric(str.substring(str.lastIndexOf("\\") + 1))) {
                    File webPagePath = new File(str + "\\current_url.txt");
                    try {
                        String url = FileUtils.readFileToString(webPagePath);
                        urls.add(url);
                    } catch (IOException ex) {
                        Logger.getLogger(ProfileAnalysis.class.getName()).log(Level.SEVERE, null, ex);
                    }
                }
            }
            //keep only the urls that are neither already suggested nor part of the history
            List<String> candidates = new ArrayList<>();
            for (String ur : urls) {
                if (!suggestions.contains(ur) && !historyUrls.contains(ur))
                    candidates.add(ur);
            }
            if (numOfpages >= candidates.size()) {
                suggestions.addAll(candidates);
                continue;
            }
            //pick distinct random candidates; removing each pick avoids an endless loop
            Random randomPage = new Random();
            for (int count = 0; count < numOfpages; count++)
                suggestions.add(candidates.remove(randomPage.nextInt(candidates.size())));
        }
        return suggestions;
    }

    /**
     * a method that returns a number of random queries
     * @param path SWebRank output directory
     * @param numOfQueries number of random queries
     * @return a list of paths for the queries
     */
    private ArrayList<String> getQueries(String path, int numOfQueries) {
        //find the output paths
        File root = new File(path);
        File[] contents = root.listFiles();
        List<String> sWebRanklevels = new ArrayList<>();
        for (File f : contents) {
            if (f.getAbsolutePath().contains("level"))
                sWebRanklevels.add(f.getAbsolutePath());
        }
        //find all query paths
        ArrayList<String> queries = new ArrayList<>();
        for (String s : sWebRanklevels) {
            File level = new File(s);
            File[] queriesFiles = level.listFiles();
            for (File f : queriesFiles) {
                if (!f.getAbsolutePath().contains("txt"))
                    queries.add(f.getAbsolutePath());
            }
        }
        if (numOfQueries > queries.size()) {
            return queries;
        }
        //select a number of random queries
        Random randomQuery = new Random();
        ArrayList<String> randomQueries = new ArrayList<>();
        int count = 0;
        while (count < numOfQueries) {
            String val = queries.get(randomQuery.nextInt(queries.size()));
            if (!randomQueries.contains(val)) {
                randomQueries.add(val);
                count++;
            }
        }
        return randomQueries;
    }

    /**
     * a method that stores queries given from users
     * @param profiles the alternative profiles that users selected
     * @param queries the queries given by the users
     * @param input a txt file that contains the necessary parameters
     */
    public void storeQueries(ArrayList<String> profiles, ArrayList<String> queries, File input) {
        System.out.println("I will store the queries...");
        Utils utils = new Utils();
        utils.readInput(input);
        HashMap<String, String> crawlerOutputPaths = utils.crawlerOutputPaths;
        for (int i = 0; i < profiles.size(); i++) {
            if (!"".equals(queries.get(i)))
                storeQuery(crawlerOutputPaths.get(profiles.get(i)), profiles.get(i), queries.get(i));
        }
        System.out.println("I stored the queries...");
    }

    /**
     * a method that stores a query that has been suggested by a user
     * @param crawlerOutputPath SWebRank output directory used to check if a relevant query already exists
     * @param profile the query's relevant profile
     * @param query the given query
     */
    public void storeQuery(String crawlerOutputPath, String profile, String query) {
        System.out.println(crawlerOutputPath);
        System.out.println(profile);
        System.out.println(query);
        //find the output paths
        File root = new File(crawlerOutputPath);
        File[] contents = root.listFiles();
        List<String> sWebRanklevels = new ArrayList<>();
        for (File f : contents) {
            if (f.getAbsolutePath().contains("level"))
                sWebRanklevels.add(f.getAbsolutePath());
        }
        //find all query paths
        List<String> queries = new ArrayList<>();
        for (String s : sWebRanklevels) {
            File level = new File(s);
            File[] queriesFiles = level.listFiles();
            for (File f : queriesFiles) {
                if (!f.getAbsolutePath().contains("txt")) {
                    String str = f.getAbsolutePath();
                    queries.add(str.substring(str.lastIndexOf("\\") + 1)
                            .replace("-query", "").replace("+", " "));
                }
            }
        }
        //check if a relevant query already exists on disk - using the Jaro-Winkler distance
        query = query.trim().replaceAll(" +", " ");
        for (String q : queries) {
            JaroWinklerDistance jwd = new JaroWinklerDistance();
            double distance = jwd.getDistance(q, query);
            if (distance > 0.9) { // threshold = 0.9
                return;
            }
        }
        Mongo mongo = new Mongo("localhost", 27017);
        DB db = mongo.getDB("profileAnalysis");
        DBCollection DBqueries = db.getCollection("newQueries");
        BasicDBObject searchQuery = new BasicDBObject();
        searchQuery.put("profile", profile);
        DBObject document = DBqueries.findOne(searchQuery);
        boolean flag = false;
        //check if a relevant query already exists in the database - using the Jaro-Winkler distance
        if (document != null) {
            flag = true;
            BasicDBList storedQueries = (BasicDBList) document.get("queries");
            for (Object quer : storedQueries) {
                JaroWinklerDistance jwd = new JaroWinklerDistance();
                double distance = jwd.getDistance((String) quer, query);
                if (distance > 0.9) { // threshold = 0.9
                    return;
                }
            }
        }
        if (flag) {
            //the document already exists, so push the new query onto its list
            DBqueries.update(searchQuery,
                    new BasicDBObject("$push", new BasicDBObject("queries", query)));
        } else {
            //otherwise create a new document
            BasicDBList dbl = new BasicDBList();
            dbl.add(query);
            BasicDBObject entry = new BasicDBObject("profile", profile).append("queries", dbl);
            DBqueries.insert(entry);
        }
    }
}
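To see how the class might be driven end to end, here is a minimal, hypothetical driver. It assumes the project's Utils, WebParser and Scorer classes are on the classpath, that MongoDB is listening on localhost:27017, and that the parameters file exists; the file name "input.txt", the user id, the URLs, the profile string and the query are illustrative placeholders, not part of the original code.

package com.mythesis.userbehaviouranalysis;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;

public class ProfileAnalysisDemo {

    public static void main(String[] args) {
        // hypothetical browsing history to analyse
        String[] history = {
                "http://example.com/articles/football-tactics",
                "http://example.com/articles/transfer-news"
        };
        // hypothetical parameters file read by Utils.readInput(...)
        File input = new File("input.txt");

        ProfileAnalysis analysis = new ProfileAnalysis();
        // runs the full pipeline: scoring, ranking, suggestions, MongoDB storage
        analysis.perform("user42", history, input);

        // store a user-suggested query for a selected alternative profile
        // (profile name and query are again placeholders)
        ArrayList<String> profiles = new ArrayList<>(Arrays.asList("sports/football/level=0"));
        ArrayList<String> queries = new ArrayList<>(Arrays.asList("football training drills"));
        analysis.storeQueries(profiles, queries, input);
    }
}

Note that the profile string follows the same domain/profile/level form that perform builds when it assembles the final profiles, and that storeQueries silently skips a query when a near-duplicate (Jaro-Winkler distance above 0.9) already exists on disk or in the newQueries collection.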