fr.gouv.culture.vitam.eml.MailboxParser.java Source code

Introduction

Here is the source code for fr.gouv.culture.vitam.eml.MailboxParser.java
Source

/**
 * This file is part of Vitam Project.
 * 
 * Copyright 2010, Frederic Bregier, and individual contributors by the @author tags. See the
 * COPYRIGHT.txt in the distribution for a full listing of individual contributors.
 * 
 * All Vitam Project is free software: you can redistribute it and/or modify it under the terms of
 * the GNU General Public License as published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 * 
 * Vitam is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
 * Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with Vitam. If not, see
 * <http://www.gnu.org/licenses/>.
 */
package fr.gouv.culture.vitam.eml;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;

import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.Session;
import javax.mail.internet.MimeMessage;

import org.dom4j.Element;

import fr.gouv.culture.vitam.eml.StringUtils.EMAIL_FIELDS;
import fr.gouv.culture.vitam.utils.ConfigLoader;
import fr.gouv.culture.vitam.utils.VitamArgument;
import fr.gouv.culture.vitam.utils.XmlDom;

/**
 * Splits an mbox file into its individual e-mail messages and feeds each of them to
 * {@link EmlExtract} in order to build an XML description of the mailbox.
 * 
 * Inspired from https://github.com/nicbet/MailboxMiner
 * 
 * @author "Frederic Bregier"
 * 
 */
public class MailboxParser {
    /** When true, diagnostic details (line counts, stack traces...) are written to System.err. */
    static boolean debug = false;

    /** Candidate message separator line ("From " line). From Apache JAMES server. */
    private static final Pattern SEPARATOR_PATTERN = Pattern.compile("^From (.*?) (.*?):(.*?):(.*?)$");

    /** Header field line: a name without space or colon, followed by a colon. From RFC 5322 - Oct 2008. */
    private static final Pattern HEADER_PATTERN = Pattern.compile("^[\\x21-\\x39\\x3B-\\x7E]+:(.*)$");

    /** Line separator used when the lines of one message are glued back together. */
    private static final String LINE_SEPARATOR = System.getProperty("line.separator");

    /**
     * Minimum amount of header-like lines that must be found between a potential separator line
     * and the next potential separator (or the end of the file) before the potential separator is
     * considered for splitting. Note that the matching lines are counted over that whole range;
     * they are not required to be consecutive.
     */
    private int headerThreshold = 2;

    /**
     * Counts the number of messages skipped because an identical message was already seen (i.e.
     * counts duplicate e-mails).
     */
    private int collisions;

    /** Number of messages handled by the last parsing run. */
    private int numberEmails = 0;

    /**
     * Set whether to discard duplicate messages or not. Default behaviour is to silently get rid of
     * duplicates.
     */
    private boolean ignoreDuplicates = true;

    /**
     * Charsets that may be used to decode a mailbox file.
     */
    public static enum UsableCharset {
        UTF_8, ISO_8859_1, ISO_8859_15, US_ASCII;

        /** Charset name, i.e. the constant name with '_' replaced by '-' (e.g. "ISO-8859-15"). */
        public String name;

        private UsableCharset() {
            this.name = name().replace('_', '-');
        }
    }

    /**
     * Charset used both to decode the mailbox file and to re-encode each raw message before it is
     * handed to JavaMail, so that the bytes of the original message are preserved.
     */
    private String charset = UsableCharset.ISO_8859_15.name;

    VitamArgument argument;
    ConfigLoader config;
    File curPath;

    public int getCollisions() {
        return collisions;
    }

    public void setCollisions(int collisions) {
        this.collisions = collisions;
    }

    public boolean isIGNOREDUPLICATES() {
        return ignoreDuplicates;
    }

    public void setIGNOREDUPLICATES(boolean ignoreduplicates) {
        ignoreDuplicates = ignoreduplicates;
    }

    public int getHEADERTHRESHOLD() {
        return headerThreshold;
    }

    public void setHEADERTHRESHOLD(int headerthreshold) {
        headerThreshold = headerthreshold;
    }

    /**
     * Parse a given .mbox file, converting every distinct message found into a
     * {@link MimeMessage} and updating the message and collision counters.
     * 
     * The converted messages are deliberately not kept (to avoid storing the whole mailbox), so
     * the returned list is always empty.
     * 
     * @param filename
     *            a {@link String} containing the path to an .mbox file.
     * @return an empty {@link List}; see above.
     */
    private List<Message> parseMessages(String filename) {
        collisions = 0;
        numberEmails = 0;
        List<Message> messages = new ArrayList<Message>();

        String[] rawlines;
        try {
            rawlines = readLines(new File(filename));
        } catch (IOException e) {
            reportReadError(filename, e);
            return messages;
        }
        if (debug) {
            System.err.println("Split file into " + rawlines.length + " lines");
        }

        // Full texts (not just hash codes) so that two different messages are never confused
        HashSet<String> seenMessages = new HashSet<String>();
        List<Integer> separators = findSeparators(rawlines);
        for (int i = 0; i < separators.size() - 1; i++) {
            int startLine = separators.get(i);
            int endLine = separators.get(i + 1);
            if (debug) {
                System.err.println("Message from lines " + startLine + " - " + endLine + " ("
                        + (endLine - startLine + 1) + ")");
            }
            String rawMessageText = composeRawMessage(rawlines, startLine, endLine);
            if (seenMessages.contains(rawMessageText)) {
                if (debug) {
                    System.err.println("Duplicated message found");
                }
                collisions++;
                continue;
            }
            // XXX FIXME should take care one by one and not storing it
            convertTextToMimeMessage(rawMessageText, charset);
            numberEmails++;
            if (ignoreDuplicates) {
                seenMessages.add(rawMessageText);
            }
        }

        if (debug) {
            System.err.println("Split into " + numberEmails + " messages!");
        }
        return messages;
    }

    /**
     * Build the XML description of a mailbox file.
     * 
     * While the extraction runs, {@code argument.currentOutputDir} is temporarily pointed to the
     * output directory (and, when {@code config.extractFile} is set, to a newly created
     * "MBOX_&lt;file name&gt;" sub directory); its previous value is always restored before
     * returning.
     * 
     * @param mboxFile
     *            the mailbox file to analyze
     * @param argument
     *            the current arguments
     * @param config
     *            the current configuration
     * @return the root element describing the mailbox, or null if the file could not be read or
     *         contains no message
     */
    public static Element extractInfoEmail(File mboxFile, VitamArgument argument, ConfigLoader config) {
        Element root = XmlDom.factory.createElement(EMAIL_FIELDS.formatMBOX.name);
        root.addAttribute(EMAIL_FIELDS.filename.name, mboxFile.getPath());
        MailboxParser parser = new MailboxParser();
        parser.argument = argument;
        parser.config = config;
        File oldDir = argument.currentOutputDir;
        try {
            if (argument.currentOutputDir == null) {
                if (config.outputDir != null) {
                    argument.currentOutputDir = new File(config.outputDir);
                } else {
                    // Resolve to an absolute file first: a relative file without directory part
                    // has no parent
                    argument.currentOutputDir = mboxFile.getAbsoluteFile().getParentFile();
                }
            }
            if (config.extractFile) {
                parser.curPath = new File(argument.currentOutputDir, "MBOX_" + mboxFile.getName());
                parser.curPath.mkdirs();
                argument.currentOutputDir = parser.curPath;
            }
            return parser.extractInfoMbox(mboxFile, root);
        } finally {
            // Restore the caller's output directory even if the extraction fails
            argument.currentOutputDir = oldDir;
        }
    }

    /**
     * Split the mailbox file into messages and add one child element per distinct message to the
     * given root: either the description built by {@link EmlExtract}, or an element carrying an
     * error status when the message cannot be converted into a {@link MimeMessage}.
     * 
     * @param mboxFile
     *            the mailbox file to analyze
     * @param root
     *            the element that receives the description of each message
     * @return root, or null if the file could not be read or if no message was extracted (not a
     *         MBOX)
     */
    private Element extractInfoMbox(File mboxFile, Element root) {
        collisions = 0;
        numberEmails = 0;

        String[] rawlines;
        try {
            rawlines = readLines(mboxFile);
        } catch (IOException e) {
            reportReadError(mboxFile.getAbsolutePath(), e);
            return null;
        }
        if (debug) {
            System.err.println("Split file into " + rawlines.length + " lines");
        }

        // Full texts (not just hash codes) so that two different messages are never confused
        HashSet<String> seenMessages = new HashSet<String>();
        List<Integer> separators = findSeparators(rawlines);
        for (int i = 0; i < separators.size() - 1; i++) {
            int startLine = separators.get(i);
            int endLine = separators.get(i + 1);
            if (debug) {
                System.err.println("Message from lines " + startLine + " - " + endLine + " ("
                        + (endLine - startLine + 1) + ")");
            } else {
                System.out.print('.');
            }
            String rawMessageText = composeRawMessage(rawlines, startLine, endLine);
            if (seenMessages.contains(rawMessageText)) {
                if (debug) {
                    System.err.println("Duplicated message found");
                }
                collisions++;
                continue;
            }
            MimeMessage message = convertTextToMimeMessage(rawMessageText, charset);
            if (message == null) {
                Element newElt = XmlDom.factory.createElement(EMAIL_FIELDS.formatEML.name);
                String status = "Error during identification";
                newElt.addAttribute(EMAIL_FIELDS.status.name, status);
                root.add(newElt);
                continue;
            }
            numberEmails++;
            if (ignoreDuplicates) {
                seenMessages.add(rawMessageText);
            }
            Element emlroot = XmlDom.factory.createElement(EMAIL_FIELDS.formatEML.name);
            // <identity format="Internet Message Format" mime="message/rfc822" puid="fmt/278" extensions="eml"/>
            Element identification = XmlDom.factory.createElement("identification");
            Element identity = XmlDom.factory.createElement("identity");
            identity.addAttribute("format", "Internet Message Format");
            identity.addAttribute("mime", "message/rfc822");
            identity.addAttribute("puid", "fmt/278");
            identity.addAttribute("extensions", "eml");
            identification.add(identity);
            emlroot.add(identification);
            EmlExtract.extractInfoMessage(message, emlroot, argument, config);
            root.add(emlroot);
            // XXX FIXME when config.extractFile is set, should write rawMessageText to an eml
            // file (using the rank id of emlroot and the subject) in a sub directory
        }

        if (numberEmails == 0) {
            // not a MBOX
            return null;
        }
        System.err.println("Split into " + numberEmails + " messages!");
        // NOTE(review): the value comes from config.nbDoc and not from numberEmails; presumably
        // it is maintained by EmlExtract - to be confirmed
        root.addAttribute("nbEml", config.nbDoc.toString());
        return root;
    }

    /**
     * Read all the lines of a file, decoded with the charset of this parser.
     * 
     * @param file
     *            the file to read
     * @return the lines of the file, without their line terminators
     * @throws IOException
     *             if the file cannot be opened or read, or if the charset is not supported
     */
    private String[] readLines(File file) throws IOException {
        FileInputStream inputStream = new FileInputStream(file);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, charset));
            List<String> lines = new ArrayList<String>();
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
            return lines.toArray(new String[lines.size()]);
        } finally {
            // Closing the underlying stream releases the file even if the reader was never built
            try {
                inputStream.close();
            } catch (IOException ignored) {
                // the content is already read (or the read error is already being propagated)
            }
        }
    }

    /**
     * Find the lines that delimit the messages.
     * 
     * Every line matching the separator pattern is a candidate. A candidate is accepted when at
     * least {@link #getHEADERTHRESHOLD()} header-like lines are found before the next candidate
     * (or the end of the file); otherwise it is considered as a bogus separator and is escaped in
     * place by prefixing it with "&gt;". The end of the file is always added as the last
     * boundary.
     * 
     * @param rawlines
     *            the lines of the mailbox; rejected candidate lines are modified in place
     * @return the indexes of the accepted separator lines in increasing order, followed by
     *         {@code rawlines.length}
     */
    private List<Integer> findSeparators(String[] rawlines) {
        List<Integer> separators = new ArrayList<Integer>();
        int candidate = -1;
        int headerCount = 0;
        for (int lineNum = 0; lineNum < rawlines.length; lineNum++) {
            String currentLine = rawlines[lineNum];
            if (candidate != -1 && HEADER_PATTERN.matcher(currentLine).matches()) {
                headerCount++;
            }
            if (SEPARATOR_PATTERN.matcher(currentLine).matches()) {
                // A new candidate closes the previous one
                settleCandidate(rawlines, candidate, headerCount, separators);
                candidate = lineNum;
                headerCount = 0;
            }
        }
        settleCandidate(rawlines, candidate, headerCount, separators);
        // Treat the end of the file as separator
        separators.add(rawlines.length);
        return separators;
    }

    /**
     * Accept a candidate separator line as message boundary, or escape it if not enough
     * header-like lines were found after it.
     * 
     * @param rawlines
     *            the lines of the mailbox
     * @param candidate
     *            the index of the candidate line, or -1 if there is none
     * @param headerCount
     *            the number of header-like lines found after the candidate
     * @param separators
     *            the list receiving the accepted boundaries
     */
    private void settleCandidate(String[] rawlines, int candidate, int headerCount, List<Integer> separators) {
        if (candidate == -1) {
            return;
        }
        if (headerCount >= headerThreshold) {
            separators.add(candidate);
        } else {
            // Bogus separator line: it belongs to the body of the previous message
            rawlines[candidate] = ">" + rawlines[candidate];
        }
    }

    /**
     * Glue together the lines located strictly between two boundaries.
     * 
     * @param rawlines
     *            the lines of the mailbox
     * @param startLine
     *            the index of the separator line opening the message (excluded)
     * @param endLine
     *            the index of the next boundary (excluded)
     * @return the trimmed text of the message
     */
    private static String composeRawMessage(String[] rawlines, int startLine, int endLine) {
        StringBuilder rawMsgBuilder = new StringBuilder();
        for (int l = startLine + 1; l < endLine; l++) {
            rawMsgBuilder.append(rawlines[l]).append(LINE_SEPARATOR);
        }
        return rawMsgBuilder.toString().trim();
    }

    /**
     * Report on System.err that a mailbox file could not be read; the stack trace is only printed
     * in debug mode.
     * 
     * @param path
     *            the path of the file
     * @param e
     *            the read error
     */
    private static void reportReadError(String path, IOException e) {
        System.err.println("Error while trying to read file: " + path + " " + e.getMessage());
        if (debug) {
            System.err.println("-------- Stacktrace ----------");
            e.printStackTrace();
        }
    }

    /**
     * Parse a text block as an email and convert it into a mime message
     * 
     * @param emailBody
     *            The headers and body of an email.
     * @param charsetName
     *            The name of the charset used to turn the text back into bytes; it should be the
     *            charset the text was decoded with.
     * @return the mime message, or null if the text could not be converted
     */
    private static MimeMessage convertTextToMimeMessage(String emailBody, String charsetName) {
        MimeMessage mimeMessage = null;
        try {
            // Parse the mime message as we have the full message now (in string format)
            ByteArrayInputStream mb = new ByteArrayInputStream(emailBody.getBytes(charsetName));
            Properties props = System.getProperties();
            Session session = Session.getDefaultInstance(props);
            mimeMessage = new MimeMessage(session, mb);
        } catch (Exception e) {
            // Broad on purpose: whatever the failure is (unsupported charset, messaging error,
            // runtime error in the parser), a single message must not abort the whole mailbox
            System.err.println("Error converting raw message to MimeMessage");
            if (debug) {
                e.printStackTrace();
            }
        }
        return mimeMessage;
    }

    /**
     * Parse the mbox file given as single argument and print the number of duplicate messages.
     * 
     * @param args
     *            the path to an mbox file
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            System.out.println("Please supply a path to an mbox file to parse");
            return;
        }
        MailboxParser mailboxParser = new MailboxParser();
        mailboxParser.parseMessages(args[0]);
        System.out.println("MailBox collisions: " + mailboxParser.getCollisions());
    }

}