mitm.common.dlp.impl.MimeMessageTextExtractorImpl.java Source code

Introduction

Here is the source code for mitm.common.dlp.impl.MimeMessageTextExtractorImpl.java
Source

/*
 * Copyright (c) 2010-2011, Martijn Brinkers, Djigzo.
 * 
 * This file is part of Djigzo email encryption.
 *
 * Djigzo is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License 
 * version 3, 19 November 2007 as published by the Free Software 
 * Foundation.
 *
 * Djigzo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public 
 * License along with Djigzo. If not, see <http://www.gnu.org/licenses/>
 *
 * Additional permission under GNU AGPL version 3 section 7
 * 
 * If you modify this Program, or any covered work, by linking or 
 * combining it with aspectjrt.jar, aspectjweaver.jar, tyrex-1.0.3.jar, 
 * freemarker.jar, dom4j.jar, mx4j-jmx.jar, mx4j-tools.jar, 
 * spice-classman-1.0.jar, spice-loggerstore-0.5.jar, spice-salt-0.8.jar, 
 * spice-xmlpolicy-1.0.jar, saaj-api-1.3.jar, saaj-impl-1.3.jar, 
 * wsdl4j-1.6.1.jar (or modified versions of these libraries), 
 * containing parts covered by the terms of Eclipse Public License, 
 * tyrex license, freemarker license, dom4j license, mx4j license,
 * Spice Software License, Common Development and Distribution License
 * (CDDL), Common Public License (CPL) the licensors of this Program grant 
 * you additional permission to convey the resulting work.
 */
package mitm.common.dlp.impl;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import javax.mail.Header;
import javax.mail.MessagingException;
import javax.mail.Part;
import javax.mail.internet.MimeMessage;

import mitm.common.dlp.MimeMessageTextExtractor;
import mitm.common.extractor.AutoDetectFullTextExtractor;
import mitm.common.extractor.ExtractedPart;
import mitm.common.extractor.MimeTypeDetector;
import mitm.common.extractor.TextExtractorContext;
import mitm.common.extractor.TextExtractorException;
import mitm.common.extractor.TextExtractorFactoryRegistry;
import mitm.common.extractor.impl.ExtractedPartImpl;
import mitm.common.extractor.impl.TextExtractorContextImpl;
import mitm.common.mail.BodyPartUtils;
import mitm.common.mail.HeaderUtils;
import mitm.common.mail.MimeTypes;
import mitm.common.mail.MimeUtils;
import mitm.common.mail.PartException;
import mitm.common.mail.PartScanner;
import mitm.common.mail.PartScanner.PartListener;
import mitm.common.util.MiscStringUtils;
import mitm.common.util.RewindableInputStream;
import mitm.common.util.SizeUtils;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.CharEncoding;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.commons.lang.text.StrBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Default implementation of MimeMessageTextExtractor.
 * <p>
 * A message is walked with a {@link PartScanner}. Every MIME part is handed to the text extraction
 * of the super class, attached message/rfc822 parts are extracted and scanned recursively. When
 * {@link #isExtractMetaInfo()} is true, the headers of the message (and of every attached message)
 * are added as an additional extracted part named "headers".
 * 
 * @author Martijn Brinkers
 *
 */
public class MimeMessageTextExtractorImpl extends AutoDetectFullTextExtractor implements MimeMessageTextExtractor {
    private static final Logger logger = LoggerFactory.getLogger(MimeMessageTextExtractorImpl.class);

    /* 
     * Size at which the RewindableInputStream used for the headers part will start writing to disk
     */
    private static final int MEM_THRESHOLD = SizeUtils.MB;

    /* 
     * maximum recursive depth for MIME parts.
     */
    private int maxMimeDepth = 8;

    /*
     * If true, and the MIME maxDepth is reached a TextExtractorException exception will be thrown.
     */
    private boolean exceptionOnMaxDepthReached = true;

    /*
     * If true, meta info like headers will be extracted
     */
    private boolean extractMetaInfo;

    /*
     * The names of headers to skip (stored in lowercase). Null when no headers should be skipped.
     */
    private Set<String> skipHeaders;

    /*
     * Listener called for each MIME part in the message
     */
    private final PartListener partListener = new PartListenerImpl();

    @Override
    protected Logger getLogger() {
        return logger;
    }

    /*
     * Forwards every part reported by the PartScanner to the onPart method of the enclosing instance.
     */
    private class PartListenerImpl implements PartScanner.PartListener {
        @Override
        public boolean onPart(Part parent, Part part, Object context) throws PartException {
            return MimeMessageTextExtractorImpl.this.onPart(parent, part, context);
        }
    }

    /**
     * Creates the extractor. Both arguments are passed on unchanged to the super class.
     * 
     * @param detector the MIME type detector handed to the super class
     * @param factories the registry of text extractor factories handed to the super class
     */
    public MimeMessageTextExtractorImpl(MimeTypeDetector detector, TextExtractorFactoryRegistry factories) {
        super(detector, factories);
    }

    /*
     * Extracts the attached mime message and scans the message. The attached message is scanned
     * with a new PartScanner that uses the same listener, max depth and partContext, so the parts
     * of the attached message end up in the same context as the parts of the enclosing message.
     */
    private void handleRFC822(Part part, PartContext partContext)
            throws IOException, MessagingException, PartException {
        MimeMessage attachedMessage = BodyPartUtils.extractFromRFC822(part);

        if (extractMetaInfo) {
            extractMimeMessageMetaInfo(attachedMessage, partContext);
        }

        PartScanner partScanner = new PartScanner(partListener, maxMimeDepth);

        partScanner.setExceptionOnMaxDepthReached(exceptionOnMaxDepthReached);

        partScanner.scanPart(attachedMessage, partContext);
    }

    /*
     * Returns the filename of the part or "body.txt" if the part has no (non-blank) filename.
     */
    private String getPartName(Part part) {
        String name = MimeUtils.getFilenameQuietly(part);

        if (StringUtils.isBlank(name)) {
            name = "body.txt";
        }

        return name;
    }

    /*
     * Extracts the text from a non message/rfc822 part. The content stream of the part is wrapped
     * in a RewindableInputStream which is always closed when the extraction has finished.
     */
    private void handlePart(Part part, PartContext partContext) throws IOException, MessagingException {
        RewindableInputStream input = new RewindableInputStream(part.getInputStream(), getThreshold());

        try {
            extractText(input, getPartName(part), partContext);
        } finally {
            IOUtils.closeQuietly(input);
        }
    }

    /**
     * Logs the exception at warn level. The stack trace is only included when debug logging is
     * enabled, otherwise only the root cause message is appended to the log message.
     * 
     * @param message the message to log
     * @param e the exception to report
     */
    protected void logException(String message, Exception e) {
        /*
         * Only log exception when debug level is enabled. The reason for this is that
         * when scanning office documents, a lot of warnings can be reported and logging
         * the exception will clutter the log too much.
         */
        if (getLogger().isDebugEnabled()) {
            getLogger().warn(message, e);
        } else {
            getLogger().warn(message + " Message: " + ExceptionUtils.getRootCauseMessage(e));
        }
    }

    /*
     * Is called for each MIME part of the message. Always returns true. If isFailOnException()
     * is true any exception is rethrown as a PartException, otherwise the exception is logged
     * and the part is skipped.
     */
    private boolean onPart(Part parent, Part part, Object context) throws PartException {
        try {
            PartContext partContext = (PartContext) context;

            /*
             * If the part is a RFC822 message (ie. an attached MIME message we need to extract the message and
             * parse the message
             */
            if (part.isMimeType(MimeTypes.MESSAGE_RFC822)) {
                handleRFC822(part, partContext);
            } else {
                handlePart(part, partContext);
            }
        } catch (Exception e) {
            if (isFailOnException()) {
                /*
                 * Do not wrap a PartException in another PartException
                 */
                if (e instanceof PartException) {
                    throw (PartException) e;
                }

                throw new PartException(e);
            } else {
                logException("Exception while handling part. Part will be skipped.", e);
            }
        }

        return true;
    }

    /*
     * Extract the headers of the message and add them to the context as a part named "headers".
     * Headers whose lowercase name is in skipHeaders are left out. If the parsed headers cannot
     * be retrieved, the raw header lines are used instead.
     */
    private void extractMimeMessageMetaInfo(MimeMessage message, PartContext context) throws MessagingException {
        TextExtractorContext extractorContext = new TextExtractorContextImpl();

        /*
         * The header bytes are created below with MiscStringUtils.toUTF8Bytes, so the encoding of
         * the part must be UTF-8. Declaring US-ASCII would misrepresent any non-ASCII characters
         * in decoded header values. For pure ASCII headers UTF-8 and US-ASCII bytes are identical.
         */
        extractorContext.setEncoding(CharEncoding.UTF_8);
        extractorContext.setName("headers");

        StrBuilder sb = new StrBuilder(4096);

        try {
            for (Enumeration<?> headerEnum = message.getAllHeaders(); headerEnum.hasMoreElements();) {
                Header header = (Header) headerEnum.nextElement();

                if (header == null) {
                    continue;
                }

                if (skipHeaders != null && skipHeaders.contains(StringUtils.lowerCase(header.getName()))) {
                    continue;
                }

                sb.append(header.getName()).append(": ").appendln(HeaderUtils.decodeTextQuietly(header.getValue()));
            }
        } catch (MessagingException e) {
            /*
             * Fallback to raw headers. Note: skipHeaders is not applied to the raw header lines.
             */
            for (Enumeration<?> headerEnum = message.getAllHeaderLines(); headerEnum.hasMoreElements();) {
                sb.appendln(headerEnum.nextElement());
            }
        }

        byte[] headerBytes = MiscStringUtils.toUTF8Bytes(sb.toString());

        RewindableInputStream input = new RewindableInputStream(new ByteArrayInputStream(headerBytes),
                MEM_THRESHOLD);

        ExtractedPart part = new ExtractedPartImpl(extractorContext, input, headerBytes.length);

        try {
            context.update(part, true /* add */);
        } catch (IOException e) {
            throw new MessagingException("Error adding part to context.", e);
        }
    }

    /**
     * Extracts the text from all parts of the message.
     * <p>
     * If {@link #isExtractMetaInfo()} is true the headers of the message are extracted first.
     * A PartException or IOException thrown while scanning is wrapped in a TextExtractorException
     * when isFailOnException() is true. Otherwise the exception is logged and the parts collected
     * so far are returned.
     * 
     * @param message the message to extract the text from
     * @return the text parts collected while scanning the message
     * @throws MessagingException if the headers of the message cannot be extracted
     * @throws TextExtractorException if scanning fails and isFailOnException() is true
     */
    @Override
    public List<ExtractedPart> extractText(MimeMessage message) throws MessagingException, TextExtractorException {
        PartScanner partScanner = new PartScanner(partListener, maxMimeDepth);

        partScanner.setExceptionOnMaxDepthReached(exceptionOnMaxDepthReached);

        PartContext context = new PartContext();

        if (extractMetaInfo) {
            extractMimeMessageMetaInfo(message, context);
        }

        try {
            partScanner.scanPart(message, context);
        } catch (PartException e) {
            if (isFailOnException()) {
                throw new TextExtractorException(e);
            }
            logException("PartException while extracting text.", e);
        } catch (IOException e) {
            if (isFailOnException()) {
                throw new TextExtractorException(e);
            }
            logException("IOException while extracting text.", e);
        }

        return context.getTextParts();
    }

    /**
     * @return the max depth passed to the PartScanner (default 8)
     */
    public int getMaxMimeDepth() {
        return maxMimeDepth;
    }

    /**
     * @param maxMimeDepth the max depth passed to the PartScanner
     */
    public void setMaxMimeDepth(int maxMimeDepth) {
        this.maxMimeDepth = maxMimeDepth;
    }

    /**
     * @return the value passed to PartScanner#setExceptionOnMaxDepthReached (default true)
     */
    public boolean isExceptionOnMaxDepthReached() {
        return exceptionOnMaxDepthReached;
    }

    /**
     * @param exceptionOnMaxDepthReached the value passed to PartScanner#setExceptionOnMaxDepthReached
     */
    public void setExceptionOnMaxDepthReached(boolean exceptionOnMaxDepthReached) {
        this.exceptionOnMaxDepthReached = exceptionOnMaxDepthReached;
    }

    /**
     * @param value if true, the message headers are extracted as an additional part
     */
    @Override
    public void setExtractMetaInfo(boolean value) {
        this.extractMetaInfo = value;
    }

    /**
     * @return true if the message headers are extracted as an additional part
     */
    @Override
    public boolean isExtractMetaInfo() {
        return extractMetaInfo;
    }

    /**
     * Sets the names of the headers that should not be extracted. The names are trimmed and
     * converted to lowercase, blank names are ignored. A null collection leaves the current
     * setting untouched.
     * <p>
     * Setting the headers should only be done at startup because setting skipHeaders is not
     * synchronized.
     * 
     * @param headersToSkip the names of the headers to skip, may be null
     */
    public void setSkipHeaders(Collection<String> headersToSkip) {
        if (headersToSkip == null) {
            return;
        }

        /*
         * Fill a local set first and assign the field once the set is complete, so the field
         * never refers to a partially filled set.
         */
        Set<String> normalized = Collections.synchronizedSet(new HashSet<String>());

        for (String header : headersToSkip) {
            header = StringUtils.trimToNull(header);

            if (header == null) {
                continue;
            }

            normalized.add(header.toLowerCase());
        }

        skipHeaders = normalized;

        logger.info("Skip headers: {}", StringUtils.join(normalized, ","));
    }
}