MacchiatoPlugin.java :  » Mail-Clients » columba-1.4 » org » columba » mail » spam » Java Open Source

Java Open Source » Mail Clients » columba 1.4 
columba 1.4 » org » columba » mail » spam » MacchiatoPlugin.java
// The contents of this file are subject to the Mozilla Public License Version
// 1.1
//(the "License"); you may not use this file except in compliance with the
//License. You may obtain a copy of the License at http://www.mozilla.org/MPL/
//
//Software distributed under the License is distributed on an "AS IS" basis,
//WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
//for the specific language governing rights and
//limitations under the License.
//
//The Original Code is "The Columba Project"
//
//The Initial Developers of the Original Code are Frederik Dietz and Timo
// Stich.
//Portions created by Frederik Dietz and Timo Stich are Copyright (C) 2003.
//
//All Rights Reserved.
package org.columba.mail.spam;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.logging.Logger;

import javax.swing.JOptionPane;

import org.columba.core.config.DefaultConfigDirectory;
import org.columba.core.gui.frame.FrameManager;
import org.columba.core.io.CloneStreamMaster;
import org.columba.core.logging.Logging;
import org.columba.mail.folder.IMailbox;
import org.columba.mail.spam.command.CommandHelper;
import org.columba.mail.spam.rules.RuleList;
import org.columba.ristretto.message.Header;
import org.macchiato.DBWrapper;
import org.macchiato.Message;
import org.macchiato.SpamFilter;
import org.macchiato.SpamFilterImpl;
import org.macchiato.db.FrequencyDB;
import org.macchiato.db.MD5SumHelper;
import org.macchiato.db.berkleydb.BerkleyFrequencyDBImpl;
import org.macchiato.log.MacchiatoLogger;
import org.macchiato.maps.ProbabilityMap;

/**
 * Built-in spam filter using the Macchiato library.
 * <p>
 * Note that it is necessary to train this filter with a few hundred messages
 * before it starts to work. The original author usually starts with around
 * 1000 messages and keeps the filter up-to-date with the messages that were
 * scored wrongly.
 * <p>
 * If training mode is enabled, the spam filter automatically adds messages to
 * its frequency database.
 *
 * @author fdietz
 */
public class MacchiatoPlugin implements ISpamPlugin {

  /**
   * JDK 1.4+ logging framework logger. Uses the same logger name that the
   * constructor installs as parent of the Macchiato logger.
   */
  private static final Logger LOG = Logger
      .getLogger("org.columba.mail.spam");

  /**
   * Delete messages from DB, if DB size > THRESHOLD
   */
  public static final int THRESHOLD = 200000;

  /**
   * Delete messages from DB after 7 days, if they don't affect the scoring
   * process because of low occurrences.
   */
  public static final int AGE = 7;

  /**
   * Minimum score at which {@link #scoreMessage(IMailbox, Object)} reports a
   * message as spam.
   */
  private static final float SPAM_SCORE = 0.9f;

  /**
   * spam filter in Macchiato library doing the actual work
   */
  private final SpamFilter filter;

  /**
   * database of tokens, storing occurrences of tokens, etc.
   */
  private final FrequencyDB db;

  /**
   * directory used to store the token database
   */
  private final File file;

  /**
   * dirty flag for database changes
   */
  private boolean hasChanged = false;

  /**
   * is cache already loaded?
   * <p>
   * NOTE(review): nothing in this class ever sets this flag to
   * <code>true</code> (see {@link #load()}), so the guarded part of
   * {@link #save()} is currently unreachable.
   */
  private boolean alreadyLoaded = false;

  /**
   * Creates the plugin: makes sure the directory
   * <code>&lt;config-folder&gt;/mail/spamdb</code> exists, creates the
   * frequency database and the filter on top of it, and hooks the Macchiato
   * logger into the Columba logger hierarchy.
   */
  public MacchiatoPlugin() {
    File configDirectory = DefaultConfigDirectory.getInstance().getCurrentPath();
    File mailDirectory = new File(configDirectory, "mail");
    file = new File(mailDirectory, "spamdb");

    // mkdirs() instead of mkdir(): the "mail" parent may not exist yet, in
    // which case mkdir() would fail without any notice
    if (!file.exists() && !file.mkdirs()) {
      LOG.severe("Could not create spam database directory "
          + file.getAbsolutePath());
    }

    db = new DBWrapper(new BerkleyFrequencyDBImpl(file));

    filter = new SpamFilterImpl(db);

    // make Columba logger parent of macchiato logger
    MacchiatoLogger.setParentLogger(Logger
        .getLogger("org.columba.mail.spam"));
  }

  /**
   * Score message. Using a threshold of 90% here. Every message with at least
   * 90% is spam. This value should be increased in the future.
   *
   * @param mailbox
   *            mailbox containing the message
   * @param uid
   *            UID of the message
   * @return <code>true</code> if the score is at least 0.9
   * @throws Exception
   *             if reading the message or scoring it fails
   *
   * @see org.columba.mail.spam.ISpamPlugin#scoreMessage(org.columba.mail.folder.IMailbox,
   *      java.lang.Object)
   */
  public boolean scoreMessage(IMailbox mailbox, Object uid) throws Exception {
    load();

    // get inputstream of message body
    InputStream istream = CommandHelper.getBodyPart(mailbox, uid);

    // according to the original author's note the raw stream is closed by
    // CloneStreamMaster; we only work on (and have to close) the clone
    CloneStreamMaster master = new CloneStreamMaster(istream);
    InputStream bodyStream = master.getClone();

    try {
      // apply additional handcrafted rules
      ProbabilityMap map = RuleList.getInstance().getProbabilities(mailbox,
          uid);

      float score = filter.scoreMessage(new Message(bodyStream), map);

      return score >= SPAM_SCORE;
    } finally {
      // the clone used to be left open; a failing close must not turn a
      // successful scoring into an error, hence only logged
      closeQuietly(bodyStream);
    }
  }

  /**
   * Adds the message to the database as spam, or corrects its token data if
   * it was learned before.
   *
   * @param mailbox
   *            mailbox containing the message
   * @param uid
   *            UID of the message
   * @throws Exception
   *             if the message cannot be retrieved or the filter fails;
   *             I/O errors while processing the body are only logged
   *
   * @see org.columba.mail.spam.ISpamPlugin#trainMessageAsSpam(org.columba.mail.folder.IMailbox,
   *      java.lang.Object)
   */
  public void trainMessageAsSpam(IMailbox mailbox, Object uid)
      throws Exception {
    train(mailbox, uid, true);
  }

  /**
   * Adds the message to the database as ham (non-spam), or corrects its
   * token data if it was learned before.
   *
   * @param mailbox
   *            mailbox containing the message
   * @param uid
   *            UID of the message
   * @throws Exception
   *             if the message cannot be retrieved or the filter fails;
   *             I/O errors while processing the body are only logged
   *
   * @see org.columba.mail.spam.ISpamPlugin#trainMessageAsHam(org.columba.mail.folder.IMailbox,
   *      java.lang.Object)
   */
  public void trainMessageAsHam(IMailbox mailbox, Object uid)
      throws Exception {
    train(mailbox, uid, false);
  }

  /**
   * Closes the database if there are unsaved changes. On failure the user is
   * asked whether to try again; the attempt is repeated as long as the
   * answer is "yes".
   * <p>
   * NOTE(review): <code>alreadyLoaded</code> is never set in this class, so
   * the cleanup/close below does not run at present. Confirm whether the
   * guard should be reduced to <code>hasChanged</code>.
   *
   * @see org.columba.mail.spam.ISpamPlugin#save()
   */
  public void save() {
    boolean retry;

    // loop instead of recursion: repeated failures do not grow the stack
    do {
      retry = false;

      try {
        // only save if changes exist
        if (alreadyLoaded && hasChanged) {
          // cleanup DB -> remove old tokens
          db.cleanupDB(THRESHOLD);

          // close DB
          db.close();
        }
      } catch (Exception e) {
        if (Logging.DEBUG) {
          e.printStackTrace();
        }
        // TODO (@author fdietz): i18n
        int value = JOptionPane.showConfirmDialog(FrameManager.getInstance()
            .getActiveFrame(),
            "An error occured while saving the spam database.\n"
                + "Try again?", "Error saving database",
            JOptionPane.YES_NO_OPTION, JOptionPane.WARNING_MESSAGE);

        retry = (value == JOptionPane.YES_OPTION);
      }
    } while (retry);
  }

  /**
   * Does nothing. The database object is created in the constructor; the
   * former implementation, which read the database from a file on first use
   * and fell back to an empty in-memory database on I/O errors, was disabled
   * and has been removed here.
   *
   * @see org.columba.mail.spam.ISpamPlugin#load()
   */
  public void load() {
    // intentionally empty
  }

  /**
   * Shared implementation of both training methods.
   *
   * @param mailbox
   *            mailbox containing the message
   * @param uid
   *            UID of the message
   * @param spam
   *            <code>true</code> to learn the message as spam,
   *            <code>false</code> to learn it as ham
   * @throws Exception
   *             if the message cannot be retrieved or the filter fails
   */
  private void train(IMailbox mailbox, Object uid, boolean spam)
      throws Exception {
    // get inputstream of message body
    InputStream istream = CommandHelper.getBodyPart(mailbox, uid);

    // get headers and put their values in a list
    Header h = mailbox.getHeaderFields(uid, Message.HEADERFIELDS);
    List list = collectHeaderValues(h);

    load();

    try {
      CloneStreamMaster master = new CloneStreamMaster(istream);

      // first pass over the body: checksum identifying the message
      InputStream digestStream = master.getClone();
      byte[] md5sum;
      try {
        md5sum = MD5SumHelper.createMD5(digestStream);
      } finally {
        digestStream.close();
      }

      // second pass over the body: the actual training
      InputStream messageStream = master.getClone();
      try {
        Message message = new Message(messageStream, list, md5sum);

        if (db.MD5SumExists(md5sum)) {
          // message was already learned --> correct token data
          if (spam) {
            filter.correctMessageAsSpam(message);
          } else {
            filter.correctMessageAsHam(message);
          }
        } else {
          // new message
          if (spam) {
            filter.trainMessageAsSpam(message);
          } else {
            filter.trainMessageAsHam(message);
          }
        }
      } finally {
        // logged only, so that a close failure cannot hide an exception
        // thrown by the filter
        closeQuietly(messageStream);
      }

      // set dirty flag
      hasChanged = true;
    } catch (IOException e1) {
      logFailure(e1);
    } catch (NoSuchAlgorithmException nsae) {
      // not expected to occur, but must not vanish silently if it does
      logFailure(nsae);
    }
  }

  /**
   * Returns the values of all fields of the given header, in the order in
   * which the header enumerates its keys.
   */
  private static List collectHeaderValues(Header header) {
    List values = new ArrayList();
    Enumeration keys = header.getKeys();

    while (keys.hasMoreElements()) {
      String key = (String) keys.nextElement();
      values.add(header.get(key));
    }

    return values;
  }

  /**
   * Closes the stream (if any) and logs a failure instead of throwing it.
   */
  private static void closeQuietly(InputStream stream) {
    if (stream == null) {
      return;
    }

    try {
      stream.close();
    } catch (IOException e) {
      logFailure(e);
    }
  }

  /**
   * Logs the exception message as severe; prints the stack trace in debug
   * mode.
   */
  private static void logFailure(Exception e) {
    LOG.severe(e.getMessage());
    if (Logging.DEBUG) {
      e.printStackTrace();
    }
  }

}
java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.