org.alfresco.repo.content.transform.EMLParser.java Source code

Java tutorial

Introduction

Here is the source code for org.alfresco.repo.content.transform.EMLParser.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional information regarding
 * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable
 * law or agreed to in writing, software distributed under the License is distributed on an "AS IS"
 * BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License
 * for the specific language governing permissions and limitations under the License.
 */
package org.alfresco.repo.content.transform;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.UUID;

import javax.mail.Address;
import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.Multipart;
import javax.mail.Part;
import javax.mail.Session;
import javax.mail.internet.AddressException;
import javax.mail.internet.MimeMessage;
import javax.mail.internet.MimeUtility;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.util.Pair;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * Defines a EML document content extractor.
 */
public class EMLParser implements AlternativeContentParser {

    /** The Constant MULTIPART_ALTERNATIVE. */
    private static final String MULTIPART_ALTERNATIVE = "multipart/alternative";

    /** The Constant serialVersionUID. */
    private static final long serialVersionUID = -8969240499669909899L;

    /** The Constant SUPPORTED_TYPES. */
    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
            new HashSet<MediaType>(Arrays.asList(MediaType.application(MimetypeMap.MIMETYPE_RFC822))));
    /** The ref cache. */
    private final Map<String, String> referencesCache = new HashMap<String, String>();

    /** The base dir. */
    private File workingDirectory;

    /** The base file. */
    private final Map<String, Pair<File, String>> parsedContent = new HashMap<String, Pair<File, String>>();
    private Logger logger = Logger.getLogger(getClass());

    /**
     * default construct with base file to store temp files.
     *
     * @param baseDir
     *            is where to store temp files.
     */
    public EMLParser(File baseDir) {
        this.workingDirectory = baseDir;
    }

    /**
     * Extracts properties and text from an EML Document input stream.
     *
     * @param stream
     *            the stream
     * @param handler
     *            the handler
     * @param metadata
     *            the metadata
     * @param context
     *            the context
     * @throws IOException
     *             Signals that an I/O exception has occurred.
     * @throws SAXException
     *             the sAX exception
     * @throws TikaException
     *             the tika exception
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        Properties props = System.getProperties();
        Session mailSession = Session.getDefaultInstance(props, null);

        try {
            MimeMessage message = new MimeMessage(mailSession, stream);

            String subject = message.getSubject();
            String from = this.convertAddressesToString(message.getFrom());
            // Recipients :
            String messageException = "";
            String to = "";
            String cc = "";
            String bcc = "";
            try {
                // QVIDMS-2004 Added because of bug in Mail Api
                to = this.convertAddressesToString(message.getRecipients(Message.RecipientType.TO));
                cc = this.convertAddressesToString(message.getRecipients(Message.RecipientType.CC));
                bcc = this.convertAddressesToString(message.getRecipients(Message.RecipientType.BCC));
            } catch (AddressException e) {
                e.printStackTrace();
                messageException = e.getRef();
                if (messageException.indexOf("recipients:") != -1) {
                    to = messageException.substring(0, messageException.indexOf(":"));
                }
            }
            metadata.set(Office.AUTHOR, from);
            metadata.set(DublinCore.TITLE, subject);
            metadata.set(DublinCore.SUBJECT, subject);

            xhtml.element("h1", subject);

            xhtml.startElement("dl");
            header(xhtml, "From", MimeUtility.decodeText(from));
            header(xhtml, "To", MimeUtility.decodeText(to.toString()));
            header(xhtml, "Cc", MimeUtility.decodeText(cc.toString()));
            header(xhtml, "Bcc", MimeUtility.decodeText(bcc.toString()));

            // // Parse message
            // if (message.getContent() instanceof MimeMultipart) {
            // // Multipart message, call matching method
            // MimeMultipart multipart = (MimeMultipart) message.getContent();
            // this.extractMultipart(xhtml, multipart, context);

            List<String> attachmentList = new ArrayList<String>();
            // prepare attachments
            prepareExtractMultipart(xhtml, message, null, context, attachmentList);
            if (attachmentList.size() > 0) {
                // TODO internationalization
                header(xhtml, "Attachments", attachmentList.toString());
            }
            xhtml.endElement("dl");

            // a supprimer si pb et a remplacer par ce qui est commenT
            adaptedExtractMultipart(xhtml, message, null, context);

            xhtml.endDocument();
        } catch (Exception e) {
            throw new TikaException("Error while processing message", e);
        }
    }

    /**
     * Gets the content handler.
     *
     * @param targetMimeType
     *            the target mime type
     * @param output
     *            the output
     * @return the content handler
     * @throws TransformerConfigurationException
     *             the transformer configuration exception
     */
    protected static ContentHandler getContentHandler(String targetMimeType, Writer output)
            throws TransformerConfigurationException {
        if (MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimeType)) {
            return new BodyContentHandler(output);
        }

        SAXTransformerFactory factory = (SAXTransformerFactory) TransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(new StreamResult(output));

        if (MimetypeMap.MIMETYPE_HTML.equals(targetMimeType)) {
            handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        } else if (MimetypeMap.MIMETYPE_XHTML.equals(targetMimeType)
                || MimetypeMap.MIMETYPE_XML.equals(targetMimeType)) {
            handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
        } else {

        }
        return handler;
    }

    /**
     * Parses the data.
     *
     * @param stream
     *            the stream
     * @param handler
     *            the handler
     * @param metadata
     *            the metadata
     * @throws IOException
     *             Signals that an I/O exception has occurred.
     * @throws SAXException
     *             the sAX exception
     * @throws TikaException
     *             the tika exception
     * @deprecated This method will be removed in Apache Tika 1.0.
     */
    @Deprecated
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
            throws IOException, SAXException, TikaException {
        parse(stream, handler, metadata, new ParseContext());
    }

    /**
     * Creates header part.
     *
     * @param xhtml
     *            the xhtml
     * @param key
     *            the key
     * @param value
     *            the value
     * @throws SAXException
     *             the sAX exception
     */
    private void header(XHTMLContentHandler xhtml, String key, String value) throws SAXException {
        if (value.length() > 0) {
            xhtml.element("dt", key);
            xhtml.element("dd", value);
        }
    }

    // Convert list of addresses into String
    /**
     * Convert addresses to string.
     *
     * @param addresses
     *            the addresses
     * @return the string
     */
    private String convertAddressesToString(Address[] addresses) {
        StringBuilder result = new StringBuilder();
        if (addresses != null) {
            String addressToAdd;
            for (int i = 0; i < addresses.length; i++) {
                addressToAdd = addresses[i].toString().replaceAll("<", "").replaceAll(">", "");
                result.append(addressToAdd).append("; ");
            }
            int resultLength = result.length();
            if (resultLength > 1)
                result.delete(resultLength - 2, resultLength);
        }
        return result.toString();
    }

    /**
     * Adapted extract multipart is the recusrsive parser that splits the data and apend it to the
     * final xhtml file.
     *
     * @param xhtml
     *            the xhtml
     * @param part
     *            the part
     * @param parentPart
     *            the parent part
     * @param context
     *            the context
     * @throws MessagingException
     *             the messaging exception
     * @throws IOException
     *             Signals that an I/O exception has occurred.
     * @throws SAXException
     *             the sAX exception
     * @throws TikaException
     *             the tika exception
     */
    public void adaptedExtractMultipart(XHTMLContentHandler xhtml, Part part, Part parentPart, ParseContext context)
            throws MessagingException, IOException, SAXException, TikaException {

        String disposition = part.getDisposition();
        if ((disposition != null && disposition.contains(Part.ATTACHMENT))) {
            return;
        }

        if (part.isMimeType("text/plain")) {
            if (parentPart != null && parentPart.isMimeType(MULTIPART_ALTERNATIVE)) {
                return;
            } else {
                // add file
                String data = part.getContent().toString();
                writeContent(part, data, MimetypeMap.MIMETYPE_TEXT_PLAIN, "txt");
            }
        } else if (part.isMimeType("multipart/*")) {
            Multipart mp = (Multipart) part.getContent();
            Part parentPartLocal = part;
            if (parentPart != null && parentPart.isMimeType(MULTIPART_ALTERNATIVE)) {
                parentPartLocal = parentPart;
            }
            int count = mp.getCount();
            for (int i = 0; i < count; i++) {
                adaptedExtractMultipart(xhtml, mp.getBodyPart(i), parentPartLocal, context);
            }
        } else if (part.isMimeType("message/rfc822")) {
            adaptedExtractMultipart(xhtml, (Part) part.getContent(), part, context);
        } else if (part.isMimeType("text/html")) {

            if ((parentPart != null && parentPart.isMimeType(MULTIPART_ALTERNATIVE))
                    || (part.getDisposition() == null || !part.getDisposition().contains(Part.ATTACHMENT))) {
                Object data = part.getContent();
                String htmlFileData = prepareString(new String(data.toString()));
                writeContent(part, htmlFileData, MimetypeMap.MIMETYPE_HTML, "html");
            }

        } else if (part.isMimeType("image/*")) {

            String[] encoded = part.getHeader("Content-Transfer-Encoding");
            if (isContained(encoded, "base64")) {
                if (part.getDisposition() != null && part.getDisposition().contains(Part.ATTACHMENT)) {
                    InputStream stream = part.getInputStream();
                    byte[] binaryData = new byte[part.getSize()];
                    stream.read(binaryData, 0, part.getSize());
                    String encodedData = new String(Base64.encodeBase64(binaryData));
                    String[] split = part.getContentType().split(";");
                    String src = "data:" + split[0].trim() + ";base64," + encodedData;
                    AttributesImpl attributes = new AttributesImpl();
                    attributes.addAttribute(null, "src", "src", "String", src);
                    xhtml.startElement("img", attributes);
                    xhtml.endElement("img");
                }
            }

        } else {
            Object content = part.getContent();
            if (content instanceof String) {
                xhtml.element("div", prepareString(part.getContent().toString()));
            } else if (content instanceof InputStream) {
                InputStream fileContent = part.getInputStream();

                Parser parser = new AutoDetectParser();
                Metadata attachmentMetadata = new Metadata();

                BodyContentHandler handlerAttachments = new BodyContentHandler();
                parser.parse(fileContent, handlerAttachments, attachmentMetadata, context);

                xhtml.element("div", handlerAttachments.toString());

            }
        }
    }

    /**
     * Write content of part to file.
     *
     * @param part
     *            the part to process
     * @param data
     *            the data to write as string
     * @param type
     *            the mimetype
     * @param extension
     *            the extension to create file with
     * @throws UnsupportedEncodingException
     *             the unsupported encoding exception
     * @throws FileNotFoundException
     *             the file not found exception
     * @throws IOException
     *             Signals that an I/O exception has occurred.
     */
    private void writeContent(Part part, String data, String type, String extension)
            throws UnsupportedEncodingException, FileNotFoundException, IOException {
        final File createdFile = new File(workingDirectory, UUID.randomUUID().toString() + "." + extension);
        final String charset = getCharset(part);
        final BufferedWriter outStream = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(createdFile), charset));
        final char[] charArray = data.toCharArray();
        outStream.write(charArray, 0, charArray.length);
        IOUtils.closeQuietly(outStream);
        parsedContent.put(type, new Pair<File, String>(createdFile, charset));
    }

    /**
     * Gets the charset from content type.
     *
     * @param part
     *            the part to process
     * @return the charset if found or UTF-8 as default
     */
    private String getCharset(Part part) {
        String charset = UTF_8;
        String contentType = null;
        try {
            contentType = part.getContentType();
            int indexOf = contentType.indexOf("charset=");
            if (indexOf != -1) {
                charset = (contentType.substring(indexOf) + ";").replaceAll("charset=\"?", "")
                        .replaceAll("\"?;(\\s*.*)", "");
            }
            Charset.forName(charset);
        } catch (Exception e) {
            logger.error("Error processing content type: " + contentType, e);
            charset = UTF_8;
        }
        return charset;
    }

    /**
     * Prepare string.
     *
     * @param htmlFileData
     *            the html file data
     * @return the string
     */
    private String prepareString(String htmlFileData) {
        String tempData = htmlFileData;
        Iterator<String> iterator = referencesCache.keySet().iterator();
        while (iterator.hasNext()) {
            String cid = iterator.next();
            tempData = tempData.replace("cid:" + cid, referencesCache.get(cid));
        }
        return tempData;
    }

    /**
     * Prepare extract multipart.
     *
     * @param xhtml
     *            the xhtml
     * @param part
     *            the part
     * @param parentPart
     *            the parent part
     * @param context
     *            the context
     * @param attachmentList
     *            is list with attachments to fill
     * @throws MessagingException
     *             the messaging exception
     * @throws IOException
     *             Signals that an I/O exception has occurred.
     * @throws SAXException
     *             the sAX exception
     * @throws TikaException
     *             the tika exception
     */
    private void prepareExtractMultipart(XHTMLContentHandler xhtml, Part part, Part parentPart,
            ParseContext context, List<String> attachmentList)
            throws MessagingException, IOException, SAXException, TikaException {

        String disposition = part.getDisposition();
        if ((disposition != null && disposition.contains(Part.ATTACHMENT))) {
            String fileName = part.getFileName();
            if (fileName != null && fileName.startsWith("=?")) {
                fileName = MimeUtility.decodeText(fileName);
            }
            attachmentList.add(fileName);
        }

        String[] header = part.getHeader("Content-ID");
        String key = null;
        if (header != null) {
            for (String string : header) {
                key = string;
            }
        }

        if (part.isMimeType("multipart/*")) {
            Multipart mp = (Multipart) part.getContent();
            int count = mp.getCount();
            for (int i = 0; i < count; i++) {
                prepareExtractMultipart(xhtml, mp.getBodyPart(i), part, context, attachmentList);
            }
        } else if (part.isMimeType(MimetypeMap.MIMETYPE_RFC822)) {
            prepareExtractMultipart(xhtml, (Part) part.getContent(), part, context, attachmentList);
        } else {

            if (key == null) {
                return;
            }
            // if ((disposition != null && disposition.contains(Part.INLINE))) {
            InputStream stream = part.getInputStream();

            File file = new File(workingDirectory, System.currentTimeMillis() + "");
            FileOutputStream fileOutputStream = new FileOutputStream(file);
            IOUtils.copy(stream, fileOutputStream);
            IOUtils.closeQuietly(fileOutputStream);
            String src = file.getName();
            String replace = key.replace("<", "").replace(">", "");
            referencesCache.put(replace, src);
            // }

        }
    }

    /*
     * (non-Javadoc)
     * @see org.alfresco.repo.content.transform.AlternativeContentParser#getAlternatives()
     */
    @Override
    public Map<String, Pair<File, String>> getAlternatives() {
        return parsedContent;
    }

    /**
     * Clear.
     */
    public void clear() {
    }

    /*
     * (non-Javadoc)
     * @see org.apache.tika.parser.Parser#getSupportedTypes(org.apache.tika.parser .ParseContext)
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Checks if value is contained in array of values.
     *
     * @param arr
     *            the arr to check
     * @param value
     *            the value
     * @return true, if is contained
     */
    private boolean isContained(String[] arr, String value) {
        if (arr != null && arr.length > 0 && arr[0].contains(value)) {
            return true;
        }
        return false;
    }
}