Example usage for org.apache.poi.hsmf MAPIMessage getMainChunks

Introduction

On this page you can find example usages of the getMainChunks() method of org.apache.poi.hsmf.MAPIMessage.

Prototype

public Chunks getMainChunks()

Source Link

Document

Gets the main, core details chunks

Usage

From source file:org.alfresco.repo.content.transform.MSGParser.java

License:Apache License

/**
 * Copies the header details of a message (sender, recipients, subject, dates) into the
 * metadata and writes the subject heading plus a {@code dl} list of header values to
 * the XHTML output.
 *
 * @param msg
 *            the message whose header details are read
 * @param metadata
 *            the metadata that receives the extracted values
 * @param xhtml
 *            the handler the heading and the header list are written to
 * @throws Exception
 *             if reading the message or writing the XHTML output fails
 */
private void processHeader(MAPIMessage msg, Metadata metadata, XHTMLContentHandler xhtml) throws Exception {
    // The subject chunk is a plain field and may be unset (presumably when the message
    // carries no subject), so every access to it is guarded.
    StringChunk subjectChunk = msg.getMainChunks().subjectChunk;
    if (subjectChunk != null && msg.has7BitEncodingStrings()) {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(subjectChunk.getRawValue());
        CharsetMatch detect = detector.detect();
        // Guard against a missing match before reading its confidence.
        if (detect != null && detect.getConfidence() >= 20) {
            subjectChunk.set7BitEncoding(detect.getName());
        }
    }
    String subject = subjectChunk != null ? subjectChunk.getValue() : null;
    String from = msg.getDisplayFrom();

    metadata.set(DublinCore.CREATOR, from);
    metadata.set(Metadata.MESSAGE_FROM, from);
    metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
    metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
    metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());

    if (subject != null) {
        metadata.set(DublinCore.TITLE, subject);
    }
    metadata.set(DublinCore.SUBJECT, msg.getConversationTopic());

    try {
        for (String recipientAddress : msg.getRecipientEmailAddressList()) {
            if (recipientAddress != null) {
                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
            }
        }
    } catch (ChunkNotFoundException ignored) {
        // Best effort: no recipient chunks available. Will be fixed in POI 3.7 Final
    }

    // Date - try two ways to find it
    // First try via the proper chunk
    if (msg.getMessageDate() != null) {
        metadata.set(DublinCore.DATE, msg.getMessageDate().getTime());
        metadata.set(Office.CREATION_DATE, msg.getMessageDate().getTime());
        metadata.set(Office.SAVE_DATE, msg.getMessageDate().getTime());
    } else {
        try {
            // Failing that try via the raw headers
            String[] headers = msg.getHeaders();
            if (headers != null && headers.length > 0) {
                for (String header : headers) {
                    // Case-insensitive prefix test that does not depend on the default locale
                    if (header.regionMatches(true, 0, "date:", 0, 5)) {
                        String date = header.substring(header.indexOf(':') + 1).trim();

                        // See if we can parse it as a normal mail date
                        try {
                            Date d = MboxParser.parseDate(date);
                            metadata.set(DublinCore.DATE, d);
                            metadata.set(Office.CREATION_DATE, d);
                            metadata.set(Office.SAVE_DATE, d);
                        } catch (ParseException e) {
                            // Store it as-is, and hope for the best...
                            metadata.set(DublinCore.DATE, date);
                            metadata.set(Office.CREATION_DATE, date);
                            metadata.set(Office.SAVE_DATE, date);
                        }
                        // Only the first date header is used
                        break;
                    }
                }
            }
        } catch (ChunkNotFoundException ignored) {
            // We can't find the date, sorry...
        }
    }

    if (subject != null) {
        xhtml.element("h1", subject);
    }

    // Output the from and to details in text, as you
    // often want them in text form for searching
    xhtml.startElement("dl");
    if (from != null) {
        header(xhtml, "From", from);
    }
    header(xhtml, "To", msg.getDisplayTo());
    header(xhtml, "Cc", msg.getDisplayCC());
    header(xhtml, "Bcc", msg.getDisplayBCC());
    try {
        header(xhtml, "Recipients", msg.getRecipientEmailAddress());
    } catch (ChunkNotFoundException ignored) {
        // Best effort: the recipients line is simply left out when the chunk is missing
    }

    // Collect the attachment names so they can be listed with the other header values.
    // NOTE(review): 'message' is not a parameter of this method; presumably a field of
    // the enclosing class - confirm that it refers to the same message as 'msg'.
    List<String> attachmentList = new ArrayList<String>();
    prepareExtractMultipart(xhtml, message, attachmentList);
    if (!attachmentList.isEmpty()) {
        header(xhtml, "Attachments", attachmentList.toString());
    }
    xhtml.endElement("dl");
}

From source file:org.alfresco.repo.content.transform.MSGParser.java

License:Apache License

/**
 * Adapted extract multipart is parser that extracts the html body if exists, rtf body if exists
 * or at least plain text. The html or rtf file could be obtained as alternative.
 *
 * @param xhtml/*from  w w w. j  a  v  a2  s .  co  m*/
 *            the xhtml
 * @param msg
 *            the message part
 * @param context
 *            the context
 * @throws MessagingException
 *             the messaging exception
 * @throws IOException
 *             Signals that an I/O exception has occurred.
 * @throws SAXException
 *             the sAX exception
 * @throws TikaException
 *             the tika exception
 */
public void adaptedExtractMultipart(XHTMLContentHandler xhtml, MAPIMessage msg, ParseContext context)
        throws MessagingException, IOException, SAXException, TikaException {
    // Get the message body. Preference order is: html, rtf, text
    Chunk htmlChunk = null;
    Chunk rtfChunk = null;
    Chunk textChunk = null;
    for (Chunk chunk : msg.getMainChunks().getAll()) {
        if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
            htmlChunk = chunk;
        }
        if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
            rtfChunk = chunk;
        }
        if (chunk.getChunkId() == MAPIProperty.BODY.id) {
            textChunk = chunk;
        }
    }

    boolean doneBody = false;
    if (htmlChunk != null) {
        byte[] data = null;
        if (htmlChunk instanceof ByteChunk) {
            data = ((ByteChunk) htmlChunk).getValue();
        } else if (htmlChunk instanceof StringChunk) {
            data = ((StringChunk) htmlChunk).getRawValue();
        }
        File tempHtmlFile = new File(workingDirectory, System.currentTimeMillis() + ".html");
        BufferedOutputStream rtfOutStream = new BufferedOutputStream(new FileOutputStream(tempHtmlFile));
        byte[] preparedStringData = referencesCache.size() > 0 ? prepareHTMLString(new String(data)).getBytes()
                : data;
        IOUtils.copy(new ByteArrayInputStream(preparedStringData), rtfOutStream);
        IOUtils.closeQuietly(rtfOutStream);
        parsedContent.put(MimetypeMap.MIMETYPE_HTML, new Pair<File, String>(tempHtmlFile, encoding));
        doneBody = true;

    }
    if (rtfChunk != null && !doneBody) {
        ByteChunk chunk = (ByteChunk) rtfChunk;

        MAPIProperty property = MAPIProperty.RTF_COMPRESSED;
        int type = Types.BINARY.getId();
        byte[] data = chunk.getValue();
        MAPIRtfAttribute rtf = new MAPIRtfAttribute(property, type, data);

        File tempRtfFile = new File(workingDirectory, System.currentTimeMillis() + ".rtf");
        BufferedOutputStream rtfOutStream = new BufferedOutputStream(new FileOutputStream(tempRtfFile));

        byte[] preparedStringData = referencesCache.size() > 0
                ? prepareRTFString(new String(rtf.getData())).getBytes()
                : rtf.getData();
        IOUtils.copy(new ByteArrayInputStream(preparedStringData), rtfOutStream);
        IOUtils.closeQuietly(rtfOutStream);

        parsedContent.put(MIMETYPE_RTF, new Pair<File, String>(tempRtfFile, encoding));
        doneBody = true;
    }
    if (textChunk != null && !doneBody) {
        xhtml.element("p", ((StringChunk) textChunk).getValue());
    }

}

From source file:org.apache.tika.parser.microsoft.OutlookExtractor.java

License:Apache License

/**
 * Tries to identify the correct encoding for 7-bit (non-unicode)
 *  strings in the file./*from w w  w . jav a  2  s.  com*/
 * <p>Many messages store their strings as unicode, which is
 *  nice and easy. Some use one-byte encodings for their
 *  strings, but don't always store the encoding anywhere
 *  helpful in the file.</p>
 * <p>This method checks for codepage properties, and failing that
 *  looks at the headers for the message, and uses these to
 *  guess the correct encoding for your file.</p>
 * <p>Bug #49441 has more on why this is needed</p>
 * <p>This is taken verbatim from POI (TIKA-1238)
 * as a temporary workaround to prevent unsupported encoding exceptions</p>
 */
private void guess7BitEncoding(MAPIMessage msg) {
    Chunks mainChunks = msg.getMainChunks();
    //sanity check
    if (mainChunks == null) {
        return;
    }

    Map<MAPIProperty, List<PropertyValue>> props = mainChunks.getProperties();
    if (props != null) {
        // First choice is a codepage property
        for (MAPIProperty prop : new MAPIProperty[] { MAPIProperty.MESSAGE_CODEPAGE,
                MAPIProperty.INTERNET_CPID }) {
            List<PropertyValue> val = props.get(prop);
            if (val != null && val.size() > 0) {
                int codepage = ((PropertyValue.LongPropertyValue) val.get(0)).getValue();
                String encoding = null;
                try {
                    encoding = CodePageUtil.codepageToEncoding(codepage, true);
                } catch (UnsupportedEncodingException e) {
                    //swallow
                }
                if (tryToSet7BitEncoding(msg, encoding)) {
                    return;
                }
            }
        }
    }

    // Second choice is a charset on a content type header
    try {
        String[] headers = msg.getHeaders();
        if (headers != null && headers.length > 0) {
            // Look for a content type with a charset
            Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?",
                    Pattern.CASE_INSENSITIVE);

            for (String header : headers) {
                if (header.startsWith("Content-Type")) {
                    Matcher m = p.matcher(header);
                    if (m.matches()) {
                        // Found it! Tell all the string chunks
                        String charset = m.group(1);
                        if (tryToSet7BitEncoding(msg, charset)) {
                            return;
                        }
                    }
                }
            }
        }
    } catch (ChunkNotFoundException e) {
    }

    // Nothing suitable in the headers, try HTML
    // TODO: do we need to replicate this in Tika? If we wind up
    // parsing the html version of the email, this is duplicative??
    // Or do we need to reset the header strings based on the html
    // meta header if there is no other information?
    try {
        String html = msg.getHtmlBody();
        if (html != null && html.length() > 0) {
            Charset charset = null;
            try {
                charset = detector.detect(new ByteArrayInputStream(html.getBytes(UTF_8)), EMPTY_METADATA);
            } catch (IOException e) {
                //swallow
            }
            if (charset != null && tryToSet7BitEncoding(msg, charset.name())) {
                return;
            }
        }
    } catch (ChunkNotFoundException e) {
    }

    //absolute last resort, try charset detector
    StringChunk text = mainChunks.textBodyChunk;
    if (text != null) {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(text.getRawValue());
        CharsetMatch match = detector.detect();
        if (match != null && match.getConfidence() > 35 && tryToSet7BitEncoding(msg, match.getName())) {
            return;
        }
    }
}

From source file:org.silverpeas.core.mail.MsgMailExtractorIntegrationTest.java

License:Open Source License

/**
 * Extracts the reception date from an email by parsing the value that follows the
 * first occurrence of {@code "Date: "} in the message headers.
 *
 * @param msg the message to read the headers from
 * @return the parsed date, or {@code null} when the message has no headers chunk or the
 *         headers contain no {@code "Date: "} entry
 * @throws Exception if the date value cannot be parsed
 */
protected static Date extractDateOfReception(MAPIMessage msg) throws Exception {
    if (msg.getMainChunks().messageHeaders == null) {
        return null;
    }
    String headers = msg.getMainChunks().messageHeaders.getValue();
    // NOTE(review): this finds the first "Date: " anywhere in the headers, so a header
    // whose name merely ends in "Date" (e.g. "Resent-Date: ") would match as well.
    int dateIdx = headers.indexOf("Date: ");
    if (dateIdx < 0) {
        return null;
    }
    int lineEnd = headers.indexOf("\n", dateIdx);
    if (lineEnd < 0) {
        // The date header is the last line and has no trailing line break
        lineEnd = headers.length();
    }
    // 6 is the length of "Date: "
    String dateText = headers.substring(dateIdx + 6, lineEnd).replaceAll("[\r\n]", "");
    return DATE_MAIL_FORMAT.parse(dateText);
}

From source file:org.silverpeas.core.mail.MsgMailExtractorIT.java

License:Open Source License

/**
 * Extracts the reception date from an email by parsing the value that follows the
 * first occurrence of {@code "Date: "} in the message headers.
 *
 * @param msg the message to read the headers from
 * @return the parsed date, or {@code null} when the message has no headers chunk or the
 *         headers contain no {@code "Date: "} entry
 * @throws Exception if the date value cannot be parsed
 */
protected static Date extractDateOfReception(MAPIMessage msg) throws Exception {
    if (msg.getMainChunks().getMessageHeaders() == null) {
        return null;
    }
    String headers = msg.getMainChunks().getMessageHeaders().getValue();
    // NOTE(review): this finds the first "Date: " anywhere in the headers, so a header
    // whose name merely ends in "Date" (e.g. "Resent-Date: ") would match as well.
    int dateIdx = headers.indexOf("Date: ");
    if (dateIdx < 0) {
        return null;
    }
    int lineEnd = headers.indexOf("\n", dateIdx);
    if (lineEnd < 0) {
        // The date header is the last line and has no trailing line break
        lineEnd = headers.length();
    }
    // 6 is the length of "Date: "
    String dateText = headers.substring(dateIdx + 6, lineEnd).replaceAll("[\r\n]", "");
    return DATE_MAIL_FORMAT.parse(dateText);
}