List of usage examples for org.apache.poi.hsmf MAPIMessage getMainChunks
public Chunks getMainChunks()
From source file:org.alfresco.repo.content.transform.MSGParser.java
License:Apache License
/** * Process header.// w w w . ja v a 2s . com * * @param msg * the msg * @param metadata * the metadata * @param xhtml * the xhtml * @throws Exception * the exception */ private void processHeader(MAPIMessage msg, Metadata metadata, XHTMLContentHandler xhtml) throws Exception { StringChunk subjectChunk = msg.getMainChunks().subjectChunk; if (msg.has7BitEncodingStrings()) { CharsetDetector detector = new CharsetDetector(); detector.setText(subjectChunk.getRawValue()); CharsetMatch detect = detector.detect(); if (detect.getConfidence() >= 20) { subjectChunk.set7BitEncoding(detect.getName()); } } String subject = subjectChunk.getValue(); String from = msg.getDisplayFrom(); metadata.set(DublinCore.CREATOR, from); metadata.set(Metadata.MESSAGE_FROM, from); metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo()); metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC()); metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC()); metadata.set(DublinCore.TITLE, subject); metadata.set(DublinCore.SUBJECT, msg.getConversationTopic()); try { for (String recipientAddress : msg.getRecipientEmailAddressList()) { if (recipientAddress != null) metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress); } } catch (ChunkNotFoundException he) { } // Will be fixed in POI 3.7 Final // Date - try two ways to find it // First try via the proper chunk if (msg.getMessageDate() != null) { metadata.set(DublinCore.DATE, msg.getMessageDate().getTime()); metadata.set(Office.CREATION_DATE, msg.getMessageDate().getTime()); metadata.set(Office.SAVE_DATE, msg.getMessageDate().getTime()); } else { try { // Failing that try via the raw headers String[] headers = msg.getHeaders(); if (headers != null && headers.length > 0) { for (String header : headers) { if (header.toLowerCase().startsWith("date:")) { String date = header.substring(header.indexOf(':') + 1).trim(); // See if we can parse it as a normal mail date try { Date d = MboxParser.parseDate(date); metadata.set(DublinCore.DATE, d); 
metadata.set(Office.CREATION_DATE, d); metadata.set(Office.SAVE_DATE, d); } catch (ParseException e) { // Store it as-is, and hope for the best... metadata.set(DublinCore.DATE, date); metadata.set(Office.CREATION_DATE, date); metadata.set(Office.SAVE_DATE, date); } break; } } } } catch (ChunkNotFoundException he) { // We can't find the date, sorry... } } xhtml.element("h1", subject); // Output the from and to details in text, as you // often want them in text form for searching xhtml.startElement("dl"); if (from != null) { header(xhtml, "From", from); } header(xhtml, "To", msg.getDisplayTo()); header(xhtml, "Cc", msg.getDisplayCC()); header(xhtml, "Bcc", msg.getDisplayBCC()); try { header(xhtml, "Recipients", msg.getRecipientEmailAddress()); } catch (ChunkNotFoundException e) { } List<String> attachmentList = new ArrayList<String>(); // // prepare attachments prepareExtractMultipart(xhtml, message, attachmentList); if (attachmentList.size() > 0) { header(xhtml, "Attachments", attachmentList.toString()); } xhtml.endElement("dl"); }
From source file:org.alfresco.repo.content.transform.MSGParser.java
License:Apache License
/** * Adapted extract multipart is parser that extracts the html body if exists, rtf body if exists * or at least plain text. The html or rtf file could be obtained as alternative. * * @param xhtml/*from w w w. j a v a2 s . co m*/ * the xhtml * @param msg * the message part * @param context * the context * @throws MessagingException * the messaging exception * @throws IOException * Signals that an I/O exception has occurred. * @throws SAXException * the sAX exception * @throws TikaException * the tika exception */ public void adaptedExtractMultipart(XHTMLContentHandler xhtml, MAPIMessage msg, ParseContext context) throws MessagingException, IOException, SAXException, TikaException { // Get the message body. Preference order is: html, rtf, text Chunk htmlChunk = null; Chunk rtfChunk = null; Chunk textChunk = null; for (Chunk chunk : msg.getMainChunks().getAll()) { if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) { htmlChunk = chunk; } if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) { rtfChunk = chunk; } if (chunk.getChunkId() == MAPIProperty.BODY.id) { textChunk = chunk; } } boolean doneBody = false; if (htmlChunk != null) { byte[] data = null; if (htmlChunk instanceof ByteChunk) { data = ((ByteChunk) htmlChunk).getValue(); } else if (htmlChunk instanceof StringChunk) { data = ((StringChunk) htmlChunk).getRawValue(); } File tempHtmlFile = new File(workingDirectory, System.currentTimeMillis() + ".html"); BufferedOutputStream rtfOutStream = new BufferedOutputStream(new FileOutputStream(tempHtmlFile)); byte[] preparedStringData = referencesCache.size() > 0 ? 
prepareHTMLString(new String(data)).getBytes() : data; IOUtils.copy(new ByteArrayInputStream(preparedStringData), rtfOutStream); IOUtils.closeQuietly(rtfOutStream); parsedContent.put(MimetypeMap.MIMETYPE_HTML, new Pair<File, String>(tempHtmlFile, encoding)); doneBody = true; } if (rtfChunk != null && !doneBody) { ByteChunk chunk = (ByteChunk) rtfChunk; MAPIProperty property = MAPIProperty.RTF_COMPRESSED; int type = Types.BINARY.getId(); byte[] data = chunk.getValue(); MAPIRtfAttribute rtf = new MAPIRtfAttribute(property, type, data); File tempRtfFile = new File(workingDirectory, System.currentTimeMillis() + ".rtf"); BufferedOutputStream rtfOutStream = new BufferedOutputStream(new FileOutputStream(tempRtfFile)); byte[] preparedStringData = referencesCache.size() > 0 ? prepareRTFString(new String(rtf.getData())).getBytes() : rtf.getData(); IOUtils.copy(new ByteArrayInputStream(preparedStringData), rtfOutStream); IOUtils.closeQuietly(rtfOutStream); parsedContent.put(MIMETYPE_RTF, new Pair<File, String>(tempRtfFile, encoding)); doneBody = true; } if (textChunk != null && !doneBody) { xhtml.element("p", ((StringChunk) textChunk).getValue()); } }
From source file:org.apache.tika.parser.microsoft.OutlookExtractor.java
License:Apache License
/** * Tries to identify the correct encoding for 7-bit (non-unicode) * strings in the file./*from w w w . jav a 2 s. com*/ * <p>Many messages store their strings as unicode, which is * nice and easy. Some use one-byte encodings for their * strings, but don't always store the encoding anywhere * helpful in the file.</p> * <p>This method checks for codepage properties, and failing that * looks at the headers for the message, and uses these to * guess the correct encoding for your file.</p> * <p>Bug #49441 has more on why this is needed</p> * <p>This is taken verbatim from POI (TIKA-1238) * as a temporary workaround to prevent unsupported encoding exceptions</p> */ private void guess7BitEncoding(MAPIMessage msg) { Chunks mainChunks = msg.getMainChunks(); //sanity check if (mainChunks == null) { return; } Map<MAPIProperty, List<PropertyValue>> props = mainChunks.getProperties(); if (props != null) { // First choice is a codepage property for (MAPIProperty prop : new MAPIProperty[] { MAPIProperty.MESSAGE_CODEPAGE, MAPIProperty.INTERNET_CPID }) { List<PropertyValue> val = props.get(prop); if (val != null && val.size() > 0) { int codepage = ((PropertyValue.LongPropertyValue) val.get(0)).getValue(); String encoding = null; try { encoding = CodePageUtil.codepageToEncoding(codepage, true); } catch (UnsupportedEncodingException e) { //swallow } if (tryToSet7BitEncoding(msg, encoding)) { return; } } } } // Second choice is a charset on a content type header try { String[] headers = msg.getHeaders(); if (headers != null && headers.length > 0) { // Look for a content type with a charset Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE); for (String header : headers) { if (header.startsWith("Content-Type")) { Matcher m = p.matcher(header); if (m.matches()) { // Found it! 
Tell all the string chunks String charset = m.group(1); if (tryToSet7BitEncoding(msg, charset)) { return; } } } } } } catch (ChunkNotFoundException e) { } // Nothing suitable in the headers, try HTML // TODO: do we need to replicate this in Tika? If we wind up // parsing the html version of the email, this is duplicative?? // Or do we need to reset the header strings based on the html // meta header if there is no other information? try { String html = msg.getHtmlBody(); if (html != null && html.length() > 0) { Charset charset = null; try { charset = detector.detect(new ByteArrayInputStream(html.getBytes(UTF_8)), EMPTY_METADATA); } catch (IOException e) { //swallow } if (charset != null && tryToSet7BitEncoding(msg, charset.name())) { return; } } } catch (ChunkNotFoundException e) { } //absolute last resort, try charset detector StringChunk text = mainChunks.textBodyChunk; if (text != null) { CharsetDetector detector = new CharsetDetector(); detector.setText(text.getRawValue()); CharsetMatch match = detector.detect(); if (match != null && match.getConfidence() > 35 && tryToSet7BitEncoding(msg, match.getName())) { return; } } }
From source file:org.silverpeas.core.mail.MsgMailExtractorIntegrationTest.java
License:Open Source License
/**
 * Extracts the reception date from an email.
 * <p>Finds the first {@code "Date: "} occurrence in the message headers chunk and parses the
 * rest of that line with {@code DATE_MAIL_FORMAT}.</p>
 *
 * @param msg the message whose headers chunk is inspected
 * @return the parsed date, or {@code null} when the message has no headers chunk or the headers
 *         contain no {@code "Date: "} text
 * @throws Exception if the message cannot be read or the date text cannot be parsed
 */
protected static Date extractDateOfReception(MAPIMessage msg) throws Exception {
    if (msg.getMainChunks().messageHeaders == null) {
        return null;
    }
    String chunkContent = msg.getMainChunks().messageHeaders.getValue();
    String marker = "Date: ";
    // NOTE(review): indexOf also matches the tail of headers such as "Resent-Date: " -
    // confirm that the first occurrence is always the wanted one.
    int dateIdx = chunkContent.indexOf(marker);
    if (dateIdx < 0) {
        return null;
    }
    int valueStart = dateIdx + marker.length();
    int lineEnd = chunkContent.indexOf("\n", valueStart);
    if (lineEnd < 0) {
        // The Date header is the last line and has no trailing newline: read to the end
        // instead of failing with a StringIndexOutOfBoundsException.
        lineEnd = chunkContent.length();
    }
    String dateText = chunkContent.substring(valueStart, lineEnd).replaceAll("[\r\n]", "");
    return DATE_MAIL_FORMAT.parse(dateText);
}
From source file:org.silverpeas.core.mail.MsgMailExtractorIT.java
License:Open Source License
/**
 * Extracts the reception date from an email.
 * <p>Finds the first {@code "Date: "} occurrence in the message headers chunk and parses the
 * rest of that line with {@code DATE_MAIL_FORMAT}.</p>
 *
 * @param msg the message whose headers chunk is inspected
 * @return the parsed date, or {@code null} when the message has no headers chunk or the headers
 *         contain no {@code "Date: "} text
 * @throws Exception if the message cannot be read or the date text cannot be parsed
 */
protected static Date extractDateOfReception(MAPIMessage msg) throws Exception {
    if (msg.getMainChunks().getMessageHeaders() == null) {
        return null;
    }
    String chunkContent = msg.getMainChunks().getMessageHeaders().getValue();
    String marker = "Date: ";
    // NOTE(review): indexOf also matches the tail of headers such as "Resent-Date: " -
    // confirm that the first occurrence is always the wanted one.
    int dateIdx = chunkContent.indexOf(marker);
    if (dateIdx < 0) {
        return null;
    }
    int valueStart = dateIdx + marker.length();
    int lineEnd = chunkContent.indexOf("\n", valueStart);
    if (lineEnd < 0) {
        // The Date header is the last line and has no trailing newline: read to the end
        // instead of failing with a StringIndexOutOfBoundsException.
        lineEnd = chunkContent.length();
    }
    String dateText = chunkContent.substring(valueStart, lineEnd).replaceAll("[\r\n]", "");
    return DATE_MAIL_FORMAT.parse(dateText);
}