Java tutorial
/* * Copyright (c) 2010-2011, Martijn Brinkers, Djigzo. * * This file is part of Djigzo email encryption. * * Djigzo is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License * version 3, 19 November 2007 as published by the Free Software * Foundation. * * Djigzo is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public * License along with Djigzo. If not, see <http://www.gnu.org/licenses/> * * Additional permission under GNU AGPL version 3 section 7 * * If you modify this Program, or any covered work, by linking or * combining it with aspectjrt.jar, aspectjweaver.jar, tyrex-1.0.3.jar, * freemarker.jar, dom4j.jar, mx4j-jmx.jar, mx4j-tools.jar, * spice-classman-1.0.jar, spice-loggerstore-0.5.jar, spice-salt-0.8.jar, * spice-xmlpolicy-1.0.jar, saaj-api-1.3.jar, saaj-impl-1.3.jar, * wsdl4j-1.6.1.jar (or modified versions of these libraries), * containing parts covered by the terms of Eclipse Public License, * tyrex license, freemarker license, dom4j license, mx4j license, * Spice Software License, Common Development and Distribution License * (CDDL), Common Public License (CPL) the licensors of this Program grant * you additional permission to convey the resulting work. */ package mitm.common.dlp.impl; import java.io.ByteArrayInputStream; import java.io.IOException; import java.util.Collection; import java.util.Collections; import java.util.Enumeration; import java.util.HashSet; import java.util.List; import java.util.Set; import javax.mail.Header; import javax.mail.MessagingException; import javax.mail.Part; import javax.mail.internet.MimeMessage; import mitm.common.dlp.MimeMessageTextExtractor; import mitm.common.extractor.AutoDetectFullTextExtractor; import mitm.common.extractor.ExtractedPart; import mitm.common.extractor.MimeTypeDetector; import mitm.common.extractor.TextExtractorContext; import mitm.common.extractor.TextExtractorException; import mitm.common.extractor.TextExtractorFactoryRegistry; import mitm.common.extractor.impl.ExtractedPartImpl; import mitm.common.extractor.impl.TextExtractorContextImpl; import mitm.common.mail.BodyPartUtils; import mitm.common.mail.HeaderUtils; import mitm.common.mail.MimeTypes; import mitm.common.mail.MimeUtils; import mitm.common.mail.PartException; import mitm.common.mail.PartScanner; import mitm.common.mail.PartScanner.PartListener; import mitm.common.util.MiscStringUtils; import mitm.common.util.RewindableInputStream; import mitm.common.util.SizeUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.CharEncoding; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.exception.ExceptionUtils; import org.apache.commons.lang.text.StrBuilder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Default implementation of MimeMessageTextExtractor. * * @author Martijn Brinkers * */ public class MimeMessageTextExtractorImpl extends AutoDetectFullTextExtractor implements MimeMessageTextExtractor { private final static Logger logger = LoggerFactory.getLogger(MimeMessageTextExtractorImpl.class); /* * Size at which the RewindableInputStream will start writing to disk */ private final int MEM_THRESHOLD = SizeUtils.MB; /* * maximum recursive depth for MIME parts. */ private int maxMimeDepth = 8; /* * If true, and the MIME maxDepth is reached a TextExtractorException exception will be thrown. */ private boolean exceptionOnMaxDepthReached = true; /* * If true, meta info like headers will be extracted */ private boolean extractMetaInfo; /* * The names of headers to skip */ private Set<String> skipHeaders; /* * Listener called for each MIME part in the message */ private PartListener partListener = new PartListenerImpl(); @Override protected Logger getLogger() { return logger; } private class PartListenerImpl implements PartScanner.PartListener { @Override public boolean onPart(Part parent, Part part, Object context) throws PartException { return MimeMessageTextExtractorImpl.this.onPart(parent, part, context); } } public MimeMessageTextExtractorImpl(MimeTypeDetector detector, TextExtractorFactoryRegistry factories) { super(detector, factories); } /* * Extracts the attached mime message and scans the message */ private void handleRFC822(Part part, PartContext partContext) throws IOException, MessagingException, PartException { MimeMessage attachedMessage = BodyPartUtils.extractFromRFC822(part); if (extractMetaInfo) { extractMimeMessageMetaInfo(attachedMessage, partContext); } PartScanner partScanner = new PartScanner(partListener, maxMimeDepth); partScanner.setExceptionOnMaxDepthReached(exceptionOnMaxDepthReached); partScanner.scanPart(attachedMessage, partContext); } private String getPartName(Part part) { String name = MimeUtils.getFilenameQuietly(part); if (StringUtils.isBlank(name)) { name = "body.txt"; } return name; } private void handlePart(Part part, PartContext partContext) throws IOException, MessagingException { RewindableInputStream input = new RewindableInputStream(part.getInputStream(), getThreshold()); try { extractText(input, getPartName(part), partContext); } finally { IOUtils.closeQuietly(input); } } protected void logException(String message, Exception e) { /* * Only log exception when debug level is enabled. The reason for this is that * when scanning office documents, a log of warnings can be reported and logging * the exception will clutter the log too much. */ if (getLogger().isDebugEnabled()) { getLogger().warn(message, e); } else { getLogger().warn(message + " Message: " + ExceptionUtils.getRootCauseMessage(e)); } } /* * Is called for each MIME part of the message */ private boolean onPart(Part parent, Part part, Object context) throws PartException { try { PartContext partContext = (PartContext) context; /* * If the part is a RFC822 message (ie. an attached MIME message we need to extract the message and * parse the message */ if (part.isMimeType(MimeTypes.MESSAGE_RFC822)) { handleRFC822(part, partContext); } else { handlePart(part, partContext); } } catch (Exception e) { if (isFailOnException()) { if (e instanceof PartException) { throw (PartException) e; } throw new PartException(e); } else { logException("Exception while handling part. Part will be skipped.", e); } } return true; } /* * Extract the headers of the message */ private void extractMimeMessageMetaInfo(MimeMessage message, PartContext context) throws MessagingException { TextExtractorContext extractorContext = new TextExtractorContextImpl(); extractorContext.setEncoding(CharEncoding.US_ASCII); extractorContext.setName("headers"); StrBuilder sb = new StrBuilder(4096); try { for (Enumeration<?> headerEnum = message.getAllHeaders(); headerEnum.hasMoreElements();) { Header header = (Header) headerEnum.nextElement(); if (header == null) { continue; } if (skipHeaders != null && skipHeaders.contains(StringUtils.lowerCase(header.getName()))) { continue; } sb.append(header.getName()).append(": ").appendln(HeaderUtils.decodeTextQuietly(header.getValue())); } } catch (MessagingException e) { /* * Fallback to raw headers */ for (Enumeration<?> headerEnum = message.getAllHeaderLines(); headerEnum.hasMoreElements();) { sb.appendln(headerEnum.nextElement()); } } byte[] headerBytes = MiscStringUtils.toUTF8Bytes(sb.toString()); RewindableInputStream input = new RewindableInputStream(new ByteArrayInputStream(headerBytes), MEM_THRESHOLD); ExtractedPart part = new ExtractedPartImpl(extractorContext, input, headerBytes.length); try { context.update(part, true /* add */); } catch (IOException e) { throw new MessagingException("Error adding part to context.", e); } } @Override public List<ExtractedPart> extractText(MimeMessage message) throws MessagingException, TextExtractorException { PartScanner partScanner = new PartScanner(partListener, maxMimeDepth); partScanner.setExceptionOnMaxDepthReached(exceptionOnMaxDepthReached); PartContext context = new PartContext(); if (extractMetaInfo) { extractMimeMessageMetaInfo(message, context); } try { partScanner.scanPart(message, context); } catch (PartException e) { if (isFailOnException()) { throw new TextExtractorException(e); } logException("PartException while extracting text.", e); } catch (IOException e) { if (isFailOnException()) { throw new TextExtractorException(e); } logException("IOException while extracting text.", e); } return context.getTextParts(); } public int getMaxMimeDepth() { return maxMimeDepth; } public void setMaxMimeDepth(int maxMimeDepth) { this.maxMimeDepth = maxMimeDepth; } public boolean isExceptionOnMaxDepthReached() { return exceptionOnMaxDepthReached; } public void setExceptionOnMaxDepthReached(boolean exceptionOnMaxDepthReached) { this.exceptionOnMaxDepthReached = exceptionOnMaxDepthReached; } @Override public void setExtractMetaInfo(boolean value) { this.extractMetaInfo = value; } @Override public boolean isExtractMetaInfo() { return extractMetaInfo; } /* * Setting the headers should only be done at startup because setting skipHeaders is not * synchronized. */ public void setSkipHeaders(Collection<String> headersToSkip) { if (headersToSkip == null) { return; } skipHeaders = Collections.synchronizedSet(new HashSet<String>()); for (String header : headersToSkip) { header = StringUtils.trimToNull(header); if (header == null) { continue; } skipHeaders.add(header.toLowerCase()); } logger.info("Skip headers: " + StringUtils.join(skipHeaders, ",")); } }