ca.queensu.cs.sail.mailboxmina2.main.modules.ThreadsModule.java Source code

Java tutorial

Introduction

Here is the source code for ca.queensu.cs.sail.mailboxmina2.main.modules.ThreadsModule.java

Source

/*******************************************************************************
 * Copyright (c) 2012 Nicolas Bettenburg.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Public License v3.0
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/gpl.html
 * 
 * Contributors:
 *     Nicolas Bettenburg - initial API and implementation
 ******************************************************************************/
package ca.queensu.cs.sail.mailboxmina2.main.modules;

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.time.DateUtils;

import sun.tools.tree.LengthExpression;

import ca.queensu.cs.sail.mailboxmina2.common.Logger;
import ca.queensu.cs.sail.mailboxmina2.main.Main;
import ca.queensu.cs.sail.mailboxmina2.storage.ConnectorPSQL;
import ca.queensu.cs.sail.mailboxmina2.storage.MM2Message;

/**
 * Module that rebuilds e-mail thread information (parent and root relations
 * between messages) from the messages stored in the MM2 database.
 * <p>
 * Three heuristics are applied in order (in-reply-to header, references
 * header, subject plus time window); each one stores the parent relations it
 * finds through the database function <code>merge_parent</code>. Afterwards
 * the root of every message is derived from the stored parent relations and
 * written through <code>merge_root</code>.
 */
public class ThreadsModule implements IModule {

    // The offset we allow for the sliding window in the subjects heuristic (in months).
    // It is negative because it is added to the date of a reply to obtain the
    // earliest date a potential parent may have.
    private static final int OFFSET = -6;

    // Number of EXECUTE statements that are sent to the database in one go
    private static final int BATCH_SIZE = 1000;

    // Captures the part between angle brackets of something that looks like <local@domain>
    private static final Pattern MESSAGE_ID_PATTERN = Pattern.compile("<(.*?@.*?)>");

    // Captures the most common reply patterns
    // Fw: Re: Aw: Wg: Fwd: Wtr: Antwort: (optionally preceded by a "[list] " tag);
    // the last group is the remainder of the subject
    private static final Pattern REPLY_PATTERN = Pattern.compile(
            "^(\\[.*?\\] )?(([rR][eE]:)|([aA][wW]:)|([fF][wW]:)|([wW][gG]:)|([fF][wW][dD]:)|([wW][tT][rR]:)|([aA]ntwort:))(.*?)$");

    private static final String LINE_SEPARATOR = System.getProperty("line.separator");

    @Override
    public String getDescription() {
        return "The Threads module\n"
                + "----------------------------------------------------------------------------\n"
                + "Input:    Database connection credentials\n"
                + "Output:    Recreates Threads Information from MM2 database content.";
    }

    /**
     * Drops the existing thread associations, runs all heuristics and finally
     * recalculates the root of every message.
     * 
     * @param props
     *            the properties handed to {@link ConnectorPSQL} to open the
     *            database connection
     * @param logger
     *            the logger used for progress and error reporting
     * @return <code>false</code> if no connection could be opened or the root
     *         calculation failed, <code>true</code> otherwise. Failures inside
     *         a single heuristic are logged by that heuristic and do not
     *         change the result.
     */
    @Override
    public boolean run(Properties props, Logger logger) {
        boolean success = true;

        // Open a connection
        ConnectorPSQL connector = new ConnectorPSQL(props);
        Connection connection = connector.getConnection();
        if (connection == null) {
            logger.error(this, "Cannot open connection - aborting thread module!");
            return false;
        }
        // Retrieve Messages
        List<MM2Message> messages = connector.getMM2Messages(false, false);

        // Remove all existing threads information (best effort: a failure is only a warning)
        logger.debug(1, "Running thread generation...");
        try {
            logger.debug(1, "Dropping existing thread associations...");
            dropAssociations(connection);
        } catch (Exception e) {
            logger.warning("Could not delete existing thread data ", e);
        }

        // Run each of the heuristics
        logger.debug(1, "Running heuristic: in-reply...");
        heuristicInreply(messages, connection);

        logger.debug(1, "Running heuristic: references...");
        heuristicReferences(messages, connection);

        logger.debug(1, "Running heuristic: subjects...");
        heuristicSubject(messages, connection);

        // Calculate the roots
        logger.debug(1, "Calculating message roots...");
        try {
            calculateRoots(messages, connection);
        } catch (Exception e) {
            logger.error(this, "Error while calculating message roots!", e);
            // Without roots the thread information is incomplete - report the failure
            success = false;
        }

        // Done
        return success;
    }

    // ------------------------------------------------------------------------------------------------------------
    //                                 Here come the database helper functions
    //                           some time we should decouple the schema and the data!
    // ------------------------------------------------------------------------------------------------------------

    /**
     * Retrieve associations of message ids to messages' parents from the
     * database.
     * 
     * @param connection
     *            an open database connection
     * @return a table mapping msg_id to parent_id; rows in which either column
     *         is NULL are skipped. The table is empty (and the error is
     *         logged) if the query fails.
     */
    public Hashtable<String, String> getDBThreadsParents(Connection connection) {
        try {
            return readIdTable(connection, "SELECT msg_id, parent_id from parent", "parent_id");
        } catch (SQLException e) {
            Main.getLogger().error(this, "Error while receiving message parent ids", e);
            return new Hashtable<String, String>();
        }
    }

    /**
     * Retrieve associations of message ids to messages' roots from the
     * database.
     * 
     * @param connection
     *            an open database connection
     * @return a table mapping msg_id to root_id; rows in which either column
     *         is NULL are skipped. The table is empty (and the error is
     *         logged) if the query fails.
     */
    public Hashtable<String, String> getDBThreadsRoots(Connection connection) {
        // NOTE(review): this query reads from table "roots" while dropAssociations()
        // deletes from "root" - one of the two names is presumably wrong; verify
        // against the database schema.
        try {
            return readIdTable(connection, "SELECT msg_id, root_id from roots", "root_id");
        } catch (SQLException e) {
            Main.getLogger().error(this, "Error while receiving message root ids", e);
            return new Hashtable<String, String>();
        }
    }

    /**
     * Runs a query and collects the columns "msg_id" and valueColumn of every
     * row into a table. Rows containing NULL in one of the two columns are
     * skipped because a {@link Hashtable} accepts neither null keys nor null
     * values.
     * 
     * @param connection
     *            an open database connection
     * @param sqlText
     *            the query to run; it has to return the columns "msg_id" and
     *            valueColumn
     * @param valueColumn
     *            name of the column that is used as value of the table
     * @return the collected msg_id to value associations
     * @throws SQLException
     *             if the query fails
     */
    private Hashtable<String, String> readIdTable(Connection connection, String sqlText, String valueColumn)
            throws SQLException {
        Hashtable<String, String> table = new Hashtable<String, String>();
        Statement statement = null;
        ResultSet res = null;
        try {
            statement = connection.createStatement();
            res = statement.executeQuery(sqlText);
            while (res.next()) {
                String msgId = res.getString("msg_id");
                String relatedId = res.getString(valueColumn);
                if (msgId != null && relatedId != null) {
                    table.put(msgId, relatedId);
                }
            }
        } finally {
            closeQuietly(res);
            closeQuietly(statement);
        }
        return table;
    }

    /**
     * Executes the given SQL text and releases the statement afterwards, also
     * when the execution fails.
     * 
     * @throws SQLException
     *             if the statement cannot be created or executed
     */
    private void executeSql(Connection connection, String sqlText) throws SQLException {
        Statement statement = connection.createStatement();
        try {
            statement.execute(sqlText);
        } finally {
            closeQuietly(statement);
        }
    }

    /** Closes a statement; a failure to close is logged but not propagated. */
    private void closeQuietly(Statement statement) {
        if (statement == null) {
            return;
        }
        try {
            statement.close();
        } catch (SQLException e) {
            Main.getLogger().debug(4, "Could not close statement: " + e.getMessage());
        }
    }

    /** Closes a result set; a failure to close is logged but not propagated. */
    private void closeQuietly(ResultSet results) {
        if (results == null) {
            return;
        }
        try {
            results.close();
        } catch (SQLException e) {
            Main.getLogger().debug(4, "Could not close result set: " + e.getMessage());
        }
    }

    /**
     * This method stores associations of parents given as Hash Table
     * in the Database
     * 
     * @param parents
     *            a table that maps for every message having a parent this
     *            parent's ID
     * @throws SQLException
     *             if one of the batches cannot be executed
     */
    private void storeParents(Hashtable<String, String> parents, Connection connection) throws SQLException {
        storeRelations(parents, connection, "parentplan", "merge_parent", "parent");
    }

    /**
     * This method stores the root associations of a roots table into the database
     * 
     * @param roots
     *            a table that maps every message to the ID of its root
     * @throws SQLException
     *             if one of the batches cannot be executed
     */
    private void storeRoots(Hashtable<String, String> roots, Connection connection) throws SQLException {
        storeRelations(roots, connection, "rootplan", "merge_root", "root");
    }

    /**
     * Writes msg_id to related-id associations to the database in batches of
     * {@link #BATCH_SIZE} entries.
     * <p>
     * Queries use the pgpsql functions merge_parent(int msg_uid, int
     * parent_uid) and merge_root(int msg_uid, int root_uid) that take care of
     * insertion / update automatically. Every batch is one SQL text of the
     * form PREPARE plan; EXECUTE plan (key, value); ... DEALLOCATE plan;
     * <p>
     * Keys and values are inserted into the SQL text as they are, so they have
     * to be integer literals.
     * 
     * @param relations
     *            the associations to store
     * @param connection
     *            an open database connection
     * @param planName
     *            name used for the prepared statement
     * @param mergeFunction
     *            name of the database function that is called for every entry
     * @param label
     *            the word used for the relation in log messages
     * @throws SQLException
     *             if one of the batches cannot be executed
     */
    private void storeRelations(Hashtable<String, String> relations, Connection connection, String planName,
            String mergeFunction, String label) throws SQLException {
        String prepare = "PREPARE " + planName + " (int, int) AS SELECT " + mergeFunction + "($1, $2);";
        String deallocate = "DEALLOCATE " + planName + ";" + LINE_SEPARATOR;

        // Use a StringBuilder to create big insertion queries
        StringBuilder batch = new StringBuilder(prepare);
        int pending = 0;
        for (java.util.Map.Entry<String, String> relation : relations.entrySet()) {
            batch.append("EXECUTE ").append(planName).append(" (").append(relation.getKey()).append(", ")
                    .append(relation.getValue()).append(");").append(LINE_SEPARATOR);
            pending++;
            if (pending == BATCH_SIZE) {
                batch.append(deallocate);
                executeSql(connection, batch.toString());
                Main.getLogger().debug(4, "Stored " + BATCH_SIZE + " " + label + " relations!");
                // Start the next batch
                batch.setLength(0);
                batch.append(prepare);
                pending = 0;
            }
        }
        // Flush the last, incomplete batch
        if (pending > 0) {
            batch.append(deallocate);
            executeSql(connection, batch.toString());
            Main.getLogger().debug(4, "Stored " + pending + " " + label + " relations!");
        }
    }

    // ------------------------------------------------------------------------------------------------------------
    //                                 Here come the heuristics to re-build threads
    // ------------------------------------------------------------------------------------------------------------

    /**
     * Helper function for the in-reply-to heuristic
     * Among a list of strings it will return the longest one. The list itself
     * is not modified.
     * 
     * @param elements
     *            a {@link List} of {@link String} objects; null entries are
     *            ignored
     * @return the longest {@link String} contained in elements (the last one
     *         of them if several share the maximum length), or an empty
     *         {@link String} if elements is null or holds no strings.
     */
    public String getLongestString(List<String> elements) {
        String longest = "";
        if (elements == null) {
            return longest;
        }
        for (String candidate : elements) {
            // ">=" so that among equally long strings the last one wins
            if (candidate != null && candidate.length() >= longest.length()) {
                longest = candidate;
            }
        }
        return longest;
    }

    /**
     * Helper function for the in-reply-to heuristic.
     * Finds the longest of all matches to a given pattern and an input.
     * 
     * @param input
     *            a {@link String} to find all matches of {@link Pattern} p in;
     *            null is treated like an input without matches
     * @param p
     *            a {@link Pattern} to match against; it has to contain at
     *            least one capturing group, because the content of group 1 is
     *            what is collected for every match
     * @return the longest match found in input if exists, an empty
     *         {@link String} otherwise.
     */
    public String findLongestPatternMatch(String input, Pattern p) {
        List<String> matches = new ArrayList<String>();
        if (input == null) {
            return "";
        }
        Matcher matcher = p.matcher(input);
        while (matcher.find()) {
            matches.add(matcher.group(1));
        }
        return getLongestString(matches);
    }

    /**
     * Step one of the header based heuristics: if the message carries a usable
     * message-id header, remember which msg_id belongs to that message-id.
     * 
     * @param msg
     *            the message to inspect
     * @param message_id_to_msg_id
     *            the reverse lookup table (message-id ==> msg_id) to fill
     */
    private void registerMessageId(MM2Message msg, Hashtable<String, String> message_id_to_msg_id) {
        String h_message_id = msg.getHeaderEntry("message-id");
        // A missing header must not abort the heuristic for all other messages
        if (h_message_id == null || h_message_id.length() <= 2) {
            return;
        }
        // We try to identify the message identifier part
        String extracted_message_id = findLongestPatternMatch(h_message_id, MESSAGE_ID_PATTERN);
        if (!extracted_message_id.equalsIgnoreCase("")) {
            String msg_id = msg.getHeaderEntry("msg_id");
            // Add the information to the reverse lookup table that we will need later
            message_id_to_msg_id.put(extracted_message_id, msg_id);
            Main.getLogger().debug(5, this,
                    "I know that message " + msg_id + " has message-id " + extracted_message_id);
        }
    }

    /**
     * Step three of the header based heuristics: translate the message-id a
     * child refers to into the msg_id of the message carrying that message-id.
     * 
     * @param msg_id_to_message_id
     *            forward table: msg_id of a child ==> message-id of its parent
     * @param message_id_to_msg_id
     *            reverse table: message-id ==> msg_id
     * @return child msg_id ==> parent msg_id for every child whose parent
     *         message-id is known
     */
    private Hashtable<String, String> resolveParents(Hashtable<String, String> msg_id_to_message_id,
            Hashtable<String, String> message_id_to_msg_id) {
        Hashtable<String, String> msg_id_to_msg_id = new Hashtable<String, String>();
        for (java.util.Map.Entry<String, String> reply : msg_id_to_message_id.entrySet()) {
            String child_msg_id = reply.getKey();
            String parent_msg_id = message_id_to_msg_id.get(reply.getValue());
            // If we found an entry in the table
            if (parent_msg_id != null) {
                // Add this information to the parents table
                Main.getLogger().debug(5, this,
                        "I know that message " + child_msg_id + " has the parent " + parent_msg_id);
                msg_id_to_msg_id.put(child_msg_id, parent_msg_id);
            }
        }
        return msg_id_to_msg_id;
    }

    /**
     * This heuristic creates associations based on the in-reply-to header field
     * of messages. If the header contains several candidates, the longest one
     * is taken as the parent's message-id.
     */
    private void heuristicInreply(List<MM2Message> messages, Connection connection) {
        // This is the msg_id ==> in-reply-to
        Hashtable<String, String> msg_id_to_message_id = new Hashtable<String, String>();

        // This is message-id ==> msg_id
        Hashtable<String, String> message_id_to_msg_id = new Hashtable<String, String>();

        try {
            for (MM2Message msg : messages) {
                // Step One: check whether the message has a message-id header
                registerMessageId(msg, message_id_to_msg_id);

                // Step Two: check whether the message has an in-reply-to header
                String h_in_reply_to = msg.getHeaderEntry("in-reply-to");
                if (h_in_reply_to != null && h_in_reply_to.contains(">") && h_in_reply_to.contains("<")) {
                    // We try to identify the message identifier part
                    String extracted_in_reply_to_messageid = findLongestPatternMatch(h_in_reply_to,
                            MESSAGE_ID_PATTERN);
                    if (!extracted_in_reply_to_messageid.equalsIgnoreCase("")) {
                        String msg_id = msg.getHeaderEntry("msg_id");
                        // Add the information into the forward lookup table
                        Main.getLogger().debug(5, this, "I know that message " + msg_id + " is a reply to "
                                + extracted_in_reply_to_messageid);
                        msg_id_to_message_id.put(msg_id, extracted_in_reply_to_messageid);
                    }
                }
            }

            // Step Three: After we obtained the previous information, we will
            // resolve each in-reply-to id to a msg_id. so we know for each key in the
            // forward lookup table the msg_id it is a reply to:
            Hashtable<String, String> msg_id_to_msg_id = resolveParents(msg_id_to_message_id,
                    message_id_to_msg_id);

            Main.getLogger().debug(5, "message-ids resolved to msg_ids = " + message_id_to_msg_id.size());
            Main.getLogger().debug(5, "message_ids resolved from in-reply-to = " + msg_id_to_message_id.size());

            // Store the parents into the database
            Main.getLogger().log("The heuristic could resolve " + msg_id_to_msg_id.size() + " parent relations!");
            Main.getLogger().log("Storing associations found by in-reply-to heuristic in the database...");
            storeParents(msg_id_to_msg_id, connection);

        } catch (Exception e) {
            Main.getLogger().error("Error storing messages for heuristic in-reply!", e);
        }
    }

    /**
     * This heuristic creates associations based on the references header
     * fields. The references are processed in the order in which they appear
     * in the header and every one overwrites the previous one, so the last
     * reference of the header ends up as the parent's message-id.
     */
    private void heuristicReferences(List<MM2Message> messages, Connection connection) {
        // This is the msg_id ==> references-id
        Hashtable<String, String> msg_id_to_message_id = new Hashtable<String, String>();

        // This is message-id ==> msg_id
        Hashtable<String, String> message_id_to_msg_id = new Hashtable<String, String>();

        try {
            for (MM2Message msg : messages) {
                // Step One: check whether the message has a message-id header
                registerMessageId(msg, message_id_to_msg_id);

                // Step Two: check whether the message has a references header
                String h_references = msg.getHeaderEntry("references");
                if (h_references == null) {
                    continue;
                }
                // Split on any whitespace: folded headers may separate the ids by tabs or line breaks
                String[] split_refs = h_references.split("\\s+");
                for (String ref : split_refs) {
                    if (ref.contains(">") && ref.contains("<")) {
                        // We try to identify the message identifier part of THIS reference
                        String extracted_reference_messageid = findLongestPatternMatch(ref, MESSAGE_ID_PATTERN);
                        if (!extracted_reference_messageid.equalsIgnoreCase("")) {
                            String msg_id = msg.getHeaderEntry("msg_id");
                            // Add the information into the forward lookup table
                            Main.getLogger().debug(5, this, "I know that message " + msg_id + " is a reply to "
                                    + extracted_reference_messageid);
                            msg_id_to_message_id.put(msg_id, extracted_reference_messageid);
                        }
                    }
                }
            }

            // Step Three: After we obtained the previous information, we will
            // resolve each references id to a msg_id. so we know for each key in the
            // forward lookup table the msg_id it is a reply to:
            Hashtable<String, String> msg_id_to_msg_id = resolveParents(msg_id_to_message_id,
                    message_id_to_msg_id);

            Main.getLogger().debug(5, "message-ids resolved to msg_ids = " + message_id_to_msg_id.size());
            Main.getLogger().debug(5, "message_ids resolved from references = " + msg_id_to_message_id.size());

            // Store the parents into the database
            Main.getLogger().log("The heuristic could resolve " + msg_id_to_msg_id.size() + " parent relations!");
            Main.getLogger().log("Storing associations found by references heuristic in the database...");
            storeParents(msg_id_to_msg_id, connection);

        } catch (Exception e) {
            Main.getLogger().error("Error storing messages for heuristic references!", e);
        }
    }

    /**
     * This heuristic creates associations based on the subject and a time
     * window. The window is given by {@link #OFFSET}: a message whose subject
     * is not a reply is accepted as parent of a reply with the same subject if
     * it is dated no earlier than the reply's date plus OFFSET months.
     * <p>
     * Only one reply prefix is removed from a subject, and a "[list] " tag in
     * front of an original subject is kept, so such subjects only match if
     * they are equal after that processing.
     */
    private void heuristicSubject(List<MM2Message> messages, Connection connection) {

        // This is the msg_id ==> Date
        Hashtable<String, Date> msg_id_to_date = new Hashtable<String, Date>();

        // This is (original, trimmed) subject ==> msg_id
        Hashtable<String, String> subject_to_msg_id = new Hashtable<String, String>();

        // This is msg_id ==> (processed) subject
        Hashtable<String, String> msg_id_to_subject = new Hashtable<String, String>();

        // This is child ==> parent
        Hashtable<String, String> msg_id_to_msg_id = new Hashtable<String, String>();

        try {
            for (MM2Message msg : messages) {
                String msg_id = msg.getHeaderEntry("msg_id");
                // A Hashtable rejects null values: messages without a date are simply
                // not part of the date table and can never pass the window check below
                Date msg_date = msg.getMsg_date();
                if (msg_date != null) {
                    msg_id_to_date.put(msg_id, msg_date);
                }
                // We assume the subject to be at least ""
                String raw_subject = msg.getSubject();
                // Determine whether the subject describes a reply or an original posting
                Matcher matcher = REPLY_PATTERN.matcher(raw_subject);
                if (matcher.matches()) {
                    // The last group holds everything behind the reply prefix
                    String stripped_subject = matcher.group(matcher.groupCount()).trim();
                    Main.getLogger().debug(5, this,
                            "I think message is a reply and the original subject is: " + stripped_subject);
                    // Store the information in the forward table
                    msg_id_to_subject.put(msg_id, stripped_subject);
                } else {
                    // We think that this is not a reply - hence it must be an original posting ;-)
                    // Trimmed, because the subjects of replies are looked up in their trimmed form
                    String original_subject = raw_subject.trim();
                    subject_to_msg_id.put(original_subject, msg_id);
                    Main.getLogger().debug(5, this, "I think message is an original posting: " + original_subject);
                }
            }

            // Now we need to find parent relations by subject.
            // Still we will apply a sliding window approach using a given offset
            // to make sure, we don't capture events of people re-using old subject names

            for (java.util.Map.Entry<String, String> reply : msg_id_to_subject.entrySet()) {
                String child_msg_id = reply.getKey();
                String parent_msg_id = subject_to_msg_id.get(reply.getValue());
                // If we did not find an entry in the table there is nothing to do
                if (parent_msg_id == null) {
                    continue;
                }
                Date parent_date = msg_id_to_date.get(parent_msg_id);
                Date child_date = msg_id_to_date.get(child_msg_id);
                if (parent_date == null || child_date == null) {
                    // Without both dates the time window cannot be checked
                    continue;
                }
                // Check that the potential parent is not more than (OFFSET) months older than the child.
                // NOTE(review): a parent dated AFTER the child passes this check as well - confirm
                // whether that is intended.
                Date window_start = DateUtils.addMonths(child_date, OFFSET);
                if (parent_date.compareTo(window_start) >= 0) {
                    Main.getLogger().debug(5, this,
                            "I know that message " + child_msg_id + " has the parent " + parent_msg_id);
                    msg_id_to_msg_id.put(child_msg_id, parent_msg_id);
                }
            }

            Main.getLogger().debug(5, "original posting subjects resolved = " + subject_to_msg_id.size());
            Main.getLogger().debug(5, "subjects resolved replys = " + msg_id_to_subject.size());

            // Store the parents into the database
            Main.getLogger().log("The heuristic could resolve " + msg_id_to_msg_id.size() + " parent relations!");
            Main.getLogger().log("Storing associations found by subject heuristic in the database...");
            storeParents(msg_id_to_msg_id, connection);

        } catch (Exception e) {
            Main.getLogger().error("Error storing messages for heuristic subject!", e);
        }
    }

    /**
     * This method tries to delete all associations in root and parent tables
     * 
     * @throws SQLException
     *             if there was an error during deletion
     */
    private void dropAssociations(Connection connection) throws SQLException {
        executeSql(connection, "DELETE FROM parent WHERE TRUE; DELETE FROM root WHERE TRUE;");
    }

    /**
     * This method calculates all roots for the given messages and stores this
     * information in the database.
     * <p>
     * The parent relations are read back from the database. For every message
     * the chain of parents is followed until a message without parent is
     * reached; that message is the root. A message without parent is its own
     * root. If the chain runs into a cycle, the cycle is logged and the
     * message reached at that point is used as root.
     * 
     * @param messages
     *            a list of Messages
     * @throws SQLException
     *             if there was an error with the database
     */
    private void calculateRoots(List<MM2Message> messages, Connection connection) throws SQLException {
        // Build up parents table from database
        Hashtable<String, String> parentsTable = readIdTable(connection, "SELECT msg_id, parent_id FROM parent",
                "parent_id");
        Hashtable<String, String> rootsTable = new Hashtable<String, String>();

        Main.getLogger().debug(3, parentsTable.keySet().size() + " parent relations are known!");

        // Calculate all roots
        for (MM2Message msg : messages) {
            String msgUID = msg.getHeaderEntry("msg_id");
            String rootUID = msgUID;
            if (parentsTable.containsKey(msgUID)) {
                // Remembers the visited ancestors so that a cycle cannot make us loop forever
                Set<String> seenParents = new HashSet<String>();

                String myParent = parentsTable.get(msgUID);
                seenParents.add(myParent);
                while (myParent != null) {
                    rootUID = myParent;
                    myParent = parentsTable.get(myParent);
                    // add() returns false if we have been at this ancestor before
                    if (myParent != null && !seenParents.add(myParent)) {
                        Main.getLogger().log("Parents Cycle: " + rootUID + " " + myParent);
                        break;
                    }
                }
            }
            rootsTable.put(msgUID, rootUID);
        }
        Main.getLogger().log("Storing " + rootsTable.keySet().size() + " roots in database...");
        storeRoots(rootsTable, connection);
    }

}