List of usage examples for org.dom4j Element add
void add(Namespace namespace);
Adds the given Namespace to this element.
From source file:fr.gouv.culture.vitam.eml.EmlExtract.java
License:Open Source License
private static final String addSubIdentities(Element identification, BodyPart bp, InputStream inputStream, VitamArgument argument, ConfigLoader config) { Element newElt = XmlDom.factory.createElement(EMAIL_FIELDS.subidentity.name); String filename = null;//from w ww.j a v a 2 s. co m String result = ""; try { filename = bp.getFileName(); filename = StringUtils.toFileName(filename); if (filename != null) { Element elt = XmlDom.factory.createElement(EMAIL_FIELDS.filename.name); elt.setText(filename); newElt.add(elt); } else { filename = "eml.eml"; } } catch (MessagingException e) { } try { int size = bp.getSize(); if (size > 0) { Element elt = XmlDom.factory.createElement(EMAIL_FIELDS.attSize.name); elt.setText(Integer.toString(size)); newElt.add(elt); } } catch (MessagingException e) { } try { String description = bp.getDescription(); if (description != null) { Element elt = XmlDom.factory.createElement(EMAIL_FIELDS.description.name); elt.setText(description); newElt.add(elt); } } catch (MessagingException e) { } try { String disposition = bp.getDisposition(); if (disposition != null) { Element elt = XmlDom.factory.createElement(EMAIL_FIELDS.disposition.name); elt.setText(disposition); newElt.add(elt); } } catch (MessagingException e) { } File filetemp = null; FileOutputStream outputStream = null; try { // Force out to analysis if (config.extractFile) { filetemp = new File(argument.currentOutputDir, filename); } else { filetemp = File.createTempFile(StaticValues.PREFIX_TEMPFILE, filename); } byte[] buffer = new byte[8192]; int read = 0; outputStream = new FileOutputStream(filetemp); while ((read = inputStream.read(buffer)) >= 0) { outputStream.write(buffer, 0, read); } outputStream.close(); outputStream = null; } catch (IOException e1) { if (filetemp != null && !config.extractFile) { filetemp.delete(); } if (outputStream != null) { try { outputStream.close(); } catch (IOException e) { } } String status = "Error during access to attachment"; 
newElt.addAttribute(EMAIL_FIELDS.status.name, status); identification.add(newElt); return ""; } try { Commands.addFormatIdentification(newElt, filename, filetemp, config, argument); if (argument.extractKeyword) { // get back keyword in the main list Element keyw = (Element) newElt.selectSingleNode(EMAIL_FIELDS.keywords.name); if (keyw != null) { StringBuilder builder = new StringBuilder(); @SuppressWarnings("unchecked") List<Element> elts = (List<Element>) keyw.selectNodes(EMAIL_FIELDS.keywordRank.name); for (Element elt : elts) { String value = elt.attributeValue(EMAIL_FIELDS.keywordOccur.name); int occur = Integer.parseInt(value) / 2 + 1; @SuppressWarnings("unchecked") List<Element> words = (List<Element>) elt.selectNodes(EMAIL_FIELDS.keywordWord.name); for (Element eword : words) { String word = eword.attributeValue(EMAIL_FIELDS.keywordValue.name) + " "; for (int i = 0; i < occur; i++) { builder.append(word); } } } result = builder.toString().trim(); } } } catch (Exception e) { String status = "Error during identification"; e.printStackTrace(); config.addRankId(newElt); newElt.addAttribute(EMAIL_FIELDS.status.name, status); } if (filetemp != null && !config.extractFile) { filetemp.delete(); } identification.add(newElt); return result; }
From source file:fr.gouv.culture.vitam.eml.EmlExtract.java
License:Open Source License
private static final String handleMessageRecur(Message message, Element identification, String id, VitamArgument argument, ConfigLoader config) throws IOException, MessagingException { Object content = message.getContent(); String result = ""; if (content instanceof String) { String[] cte = message.getHeader("Content-Transfer-Encoding"); String[] aresult = null;//from w w w . j a va 2s .c o m if (cte != null && cte.length > 0) { aresult = extractContentType(message.getContentType(), cte[0]); } else { aresult = extractContentType(message.getContentType(), null); } Element emlroot = XmlDom.factory.createElement("body"); // <identity format="Internet Message Format" mime="message/rfc822" puid="fmt/278" extensions="eml"/> Element subidenti = XmlDom.factory.createElement("identification"); Element identity = XmlDom.factory.createElement("identity"); identity.addAttribute("format", "Internet Message Body Format"); identity.addAttribute("mime", aresult[0] != null ? aresult[0] : "unknown"); identity.addAttribute("extensions", aresult[3] != null ? aresult[3].substring(1) : "unknown"); if (aresult[1] != null) { identity.addAttribute("charset", aresult[1]); } identification.add(identity); emlroot.add(subidenti); identification.add(emlroot); //result += " " + saveBody((String) content.toString(), aresult, id, argument, config); result += " " + saveBody(message.getInputStream(), aresult, id, argument, config); // ignore string } else if (content instanceof Multipart) { Multipart mp = (Multipart) content; if (argument.extractKeyword) { result = handleMultipartRecur(mp, identification, id, argument, config); } else { handleMultipartRecur(mp, identification, id, argument, config); } // handle multi part } return result; }
From source file:fr.gouv.culture.vitam.eml.EmlExtract.java
License:Open Source License
private static final String handleMultipartRecur(Multipart mp, Element identification, String id, VitamArgument argument, ConfigLoader config) throws MessagingException, IOException { int count = mp.getCount(); String result = ""; for (int i = 0; i < count; i++) { BodyPart bp = mp.getBodyPart(i); Object content = bp.getContent(); if (content instanceof String) { String[] cte = bp.getHeader("Content-Transfer-Encoding"); String[] aresult = null; if (cte != null && cte.length > 0) { aresult = extractContentType(bp.getContentType(), cte[0]); } else { aresult = extractContentType(bp.getContentType(), null); }/*from ww w . j a va 2s .com*/ Element emlroot = XmlDom.factory.createElement("body"); // <identity format="Internet Message Format" mime="message/rfc822" puid="fmt/278" extensions="eml"/> Element subidenti = XmlDom.factory.createElement("identification"); Element identity = XmlDom.factory.createElement("identity"); identity.addAttribute("format", "Internet Message Body Format"); identity.addAttribute("mime", aresult[0] != null ? aresult[0] : "unknown"); identity.addAttribute("extensions", aresult[3] != null ? 
aresult[3].substring(1) : "unknown"); if (aresult[1] != null) { identity.addAttribute("charset", aresult[1]); } identification.add(identity); emlroot.add(subidenti); identification.add(emlroot); //result += " " + saveBody((String) content.toString(), aresult, id, argument, config); result += " " + saveBody(bp.getInputStream(), aresult, id, argument, config); // ignore string } else if (content instanceof InputStream) { // handle input stream if (argument.extractKeyword) { result += " " + addSubIdentities(identification, bp, (InputStream) content, argument, config); } else { addSubIdentities(identification, bp, (InputStream) content, argument, config); } } else if (content instanceof Message) { Message message = (Message) content; if (argument.extractKeyword) { result += " " + handleMessageRecur(message, identification, id + "_" + i, argument, config); } else { handleMessageRecur(message, identification, id + "_" + i, argument, config); } } else if (content instanceof Multipart) { Multipart mp2 = (Multipart) content; if (argument.extractKeyword) { result += " " + handleMultipartRecur(mp2, identification, id + "_" + i, argument, config); } else { handleMultipartRecur(mp2, identification, id + "_" + i, argument, config); } } } return result; }
From source file:fr.gouv.culture.vitam.eml.MailboxParser.java
License:Open Source License
private Element extractInfoMbox(File mboxFile, Element root) { collisions = 0;//from www. ja v a 2 s .com HashSet<Integer> seenMessages = new HashSet<Integer>(); // Open the file for reading BufferedReader reader = null; try { StringBuilder inputBuilder = new StringBuilder(); String line = ""; FileInputStream inputStream = new FileInputStream(mboxFile); reader = new BufferedReader(new InputStreamReader(inputStream, charset)); // Read the mbox file line by line while ((line = reader.readLine()) != null) { inputBuilder.append(line); inputBuilder.append(System.getProperty("line.separator")); } String text = inputBuilder.toString(); inputBuilder = null; String[] rawlines = null;//text.split("(\n\r)|(\n)|(\r)"); rawlines = text.split("\r?\n|\r"); if (debug) System.err.println("Split file into " + rawlines.length + " lines"); Pattern seperatorPattern = Pattern.compile("^From (.*?) (.*?):(.*?):(.*?)$"); // From // Apache // JAMES // server Pattern headerPattern = Pattern.compile("^[\\x21-\\x39\\x3B-\\x7E]+:(.*)$"); // From RFC // 5322 // - Oct // 2008 String ssep = System.getProperty("line.separator"); // Here comes the big ugly loop ... int lastFoundSepLine = -1; Map<Integer, Integer> separatorsMap = new HashMap<Integer, Integer>(); for (int line_num = 0; line_num < rawlines.length; line_num++) { String currentLine = rawlines[line_num]; // If we found a header name line if (headerPattern.matcher(currentLine).matches()) { /*if (debug) System.err.println("HEADER MATCH! " + line_num);*/ if (lastFoundSepLine != -1) { if (separatorsMap.containsKey(lastFoundSepLine)) { int numHeaders = separatorsMap.get(lastFoundSepLine); numHeaders++; separatorsMap.put(lastFoundSepLine, numHeaders); } } } // If we found a separator line if (seperatorPattern.matcher(currentLine).matches()) { /*if (debug) System.err.println("SEP MATCH! 
" + line_num);*/ lastFoundSepLine = line_num; separatorsMap.put(lastFoundSepLine, 0); } } // Treat the end of the file as potential separator ;-) separatorsMap.put(rawlines.length, HEADERTHRESHOLD); // Compose the messages // If we read at least HEADERTHRESHOLD many headers after the separator List<Integer> separators = new ArrayList<Integer>(); for (Integer x : separatorsMap.keySet()) { if (separatorsMap.get(x) >= HEADERTHRESHOLD) { separators.add(x); } else { // Line x is a bogus header line and should be escaped!! rawlines[x.intValue()] = ">" + rawlines[x.intValue()]; } } Collections.sort(separators); for (int i = 0; i < separators.size() - 1; i++) { int startLine = separators.get(i); int endLine = separators.get(i + 1); if (debug) System.err.println("Message from lines " + startLine + " - " + endLine + " (" + (endLine - startLine + 1) + ")"); else System.out.print('.'); // compose a raw message StringBuilder rawMsgBuilder = new StringBuilder(); for (int l = startLine + 1; l < endLine; l++) { rawMsgBuilder.append(rawlines[l] + ssep); } String rawMessageText = rawMsgBuilder.toString().trim(); int hashKey = rawMessageText.hashCode(); if (!seenMessages.contains(hashKey)) { MimeMessage message = convertTextToMimeMessage(rawMessageText); if (message == null) { Element newElt = XmlDom.factory.createElement(EMAIL_FIELDS.formatEML.name); String status = "Error during identification"; newElt.addAttribute(EMAIL_FIELDS.status.name, status); root.add(newElt); } else { numberEmails++; if (IGNOREDUPLICATES) { seenMessages.add(hashKey); } Element emlroot = XmlDom.factory.createElement(EMAIL_FIELDS.formatEML.name); // <identity format="Internet Message Format" mime="message/rfc822" puid="fmt/278" extensions="eml"/> Element identification = XmlDom.factory.createElement("identification"); Element identity = XmlDom.factory.createElement("identity"); identity.addAttribute("format", "Internet Message Format"); identity.addAttribute("mime", "message/rfc822"); 
identity.addAttribute("puid", "fmt/278"); identity.addAttribute("extensions", "eml"); identification.add(identity); emlroot.add(identification); EmlExtract.extractInfoMessage(message, emlroot, argument, config); root.add(emlroot); /* if (config.extractFile) { File old = argument.currentOutputDir; String id = emlroot.attributeValue(EMAIL_FIELDS.rankId.name); if (config.extractFile) { File newOutDir = new File(argument.currentOutputDir, id); newOutDir.mkdirs(); argument.currentOutputDir = newOutDir; } // XXX FIXME should write rawMessageText to eml file using id+"_"+message.getSubject()+".eml" System.out.println("should write rawMessageText to eml file using "+id+" and subdir .eml"); argument.currentOutputDir = old; } */ } } else { if (debug) System.err.println("Duplicated message found"); collisions++; } } // end compose the last message if that one was valid if (numberEmails == 0) { // not a MBOX root = null; } else { System.err.println("Split into " + numberEmails + " messages!"); } } catch (IOException e) { System.err.println( "Error while trying to read file: " + mboxFile.getAbsolutePath() + " " + e.getMessage()); if (debug) System.err.println("-------- Stacktrace ----------"); if (debug) e.printStackTrace(); return null; } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { } } } if (root != null) { root.addAttribute("nbEml", config.nbDoc.toString()); } return root; }
From source file:fr.gouv.culture.vitam.eml.MsgExtract2.java
License:Open Source License
/**
 * Appends to {@code root} an element named {@code entry} describing one e-mail address.
 * <p>
 * The part returned by {@code StringUtils.selectChevron} is stored as the address element; what is
 * left of {@code address} once {@code <that part>} has been removed is stored as the name element.
 * Nothing is appended when neither part is present.
 * <p>
 * Changes compared to the previous version: a {@code null} address is ignored instead of throwing,
 * and the name is trimmed so that leftover blanks (for instance the space in front of the
 * chevrons, or after a comma in a recipient list) no longer produce a padded or whitespace-only
 * name element.
 *
 * @param root element receiving the new entry
 * @param entry name of the element to create
 * @param address raw address text, may be {@code null}
 */
private static void addAddress(Element root, String entry, String address) {
    if (address == null) {
        return;
    }
    Element val = XmlDom.factory.createElement(entry);
    String ad = StringUtils.selectChevron(address);
    if (ad == null) {
        ad = "";
    }
    String nams = address.replace('<' + ad + '>', "").trim();
    if (nams.length() > 0) {
        Element name = XmlDom.factory.createElement(EMAIL_FIELDS.emailName.name);
        name.setText(StringUtils.unescapeHTML(nams, true, false));
        val.add(name);
    }
    if (ad.length() > 0) {
        Element addresse = XmlDom.factory.createElement(EMAIL_FIELDS.emailAddress.name);
        addresse.setText(StringUtils.unescapeHTML(ad, true, false));
        val.add(addresse);
    }
    if (val.hasContent()) {
        root.add(val);
    }
}
From source file:fr.gouv.culture.vitam.eml.MsgExtract2.java
License:Open Source License
private static String extractInfoSubEmail(MAPIMessage msg, File curDir, Element root, VitamArgument argument, ConfigLoader config) {//from w ww.j a v a 2 s. c o m File curPath = null; Element keywords = XmlDom.factory.createElement(EMAIL_FIELDS.keywords.name); Element metadata = XmlDom.factory.createElement(EMAIL_FIELDS.metadata.name); String id = config.addRankId(root); curPath = new File(curDir, "MSG_" + id); //System.out.println("start of "+id); String[] values = new String[Keywords.values().length]; for (int i = 0; i < Keywords.values().length; i++) { values[i] = null; } String[] test = null; try { test = msg.getHeaders(); } catch (ChunkNotFoundException e4) { // TODO Auto-generated catch block e4.printStackTrace(); } int lastRank = -1; for (String string : test) { if (string.startsWith(Keywords.NextOne.name) && lastRank >= 0) { String recv = string.substring(Keywords.NextOne.name.length()); if (values[lastRank] == null) { values[lastRank] = recv; } else { values[lastRank] += (Keywords.Received.ordinal() == lastRank ? 
"\n" : " ") + recv; } } else { if (string.startsWith(Keywords.Date.name)) { values[Keywords.Date.ordinal()] = string.substring(Keywords.Date.name.length()); lastRank = -1; } else if (string.startsWith(Keywords.XOriginalArrivalTime.name)) { values[Keywords.XOriginalArrivalTime.ordinal()] = string .substring(Keywords.XOriginalArrivalTime.name.length()); int pos = values[Keywords.XOriginalArrivalTime.ordinal()].indexOf(" FILETIME="); if (pos > 0) { values[Keywords.XOriginalArrivalTime .ordinal()] = values[Keywords.XOriginalArrivalTime.ordinal()].substring(0, pos); } lastRank = -1; } else if (string.startsWith(Keywords.MessageId.name)) { values[Keywords.MessageId.ordinal()] = string.substring(Keywords.MessageId.name.length()); values[Keywords.MessageId.ordinal()] = StringUtils .removeChevron( StringUtils.unescapeHTML(values[Keywords.MessageId.ordinal()], true, false)) .trim(); lastRank = -1; } else if (string.startsWith(Keywords.InReplyTo.name)) { String reply = StringUtils.removeChevron(StringUtils .unescapeHTML(string.substring(Keywords.InReplyTo.name.length()), true, false)); if (values[Keywords.InReplyTo.ordinal()] == null) { values[Keywords.InReplyTo.ordinal()] = reply; } else { values[Keywords.InReplyTo.ordinal()] += " " + reply; } lastRank = Keywords.InReplyTo.ordinal(); } else if (string.startsWith(Keywords.Received.name)) { String recv = string.substring(Keywords.Received.name.length()); if (values[Keywords.Received.ordinal()] == null) { values[Keywords.Received.ordinal()] = recv; } else { values[Keywords.Received.ordinal()] += "\n" + recv; } lastRank = Keywords.Received.ordinal(); } else if (string.startsWith(Keywords.From.name)) { values[Keywords.From.ordinal()] = string.substring(Keywords.From.name.length()); lastRank = -1; } else if (string.startsWith(Keywords.To.name)) { if (values[Keywords.To.ordinal()] == null) { values[Keywords.To.ordinal()] = string.substring(Keywords.To.name.length()); } else { values[Keywords.To.ordinal()] += " " + 
string.substring(Keywords.To.name.length()); } lastRank = Keywords.To.ordinal(); } else if (string.startsWith(Keywords.Cc.name)) { if (values[Keywords.Cc.ordinal()] == null) { values[Keywords.Cc.ordinal()] = string.substring(Keywords.Cc.name.length()); } else { values[Keywords.Cc.ordinal()] += " " + string.substring(Keywords.Cc.name.length()); } lastRank = Keywords.Cc.ordinal(); } else if (string.startsWith(Keywords.Bcc.name)) { if (values[Keywords.Bcc.ordinal()] == null) { values[Keywords.Bcc.ordinal()] = string.substring(Keywords.Bcc.name.length()); } else { values[Keywords.Bcc.ordinal()] += " " + string.substring(Keywords.Bcc.name.length()); } lastRank = Keywords.Bcc.ordinal(); } else if (string.startsWith(Keywords.ReturnPath.name)) { if (values[Keywords.ReturnPath.ordinal()] == null) { values[Keywords.ReturnPath.ordinal()] = string.substring(Keywords.ReturnPath.name.length()); } else { values[Keywords.ReturnPath.ordinal()] += " " + string.substring(Keywords.ReturnPath.name.length()); } lastRank = Keywords.ReturnPath.ordinal(); } else if (string.startsWith(Keywords.Importance.name)) { values[Keywords.Importance.ordinal()] = string.substring(Keywords.Importance.name.length()); lastRank = -1; } else if (string.startsWith(Keywords.Priority.name)) { values[Keywords.Priority.ordinal()] = string.substring(Keywords.Priority.name.length()); lastRank = -1; } else if (string.startsWith(Keywords.XFolder.name)) { values[Keywords.XFolder.ordinal()] = string.substring(Keywords.XFolder.name.length()); lastRank = -1; } else if (string.startsWith(Keywords.XSDOC.name)) { values[Keywords.XSDOC.ordinal()] = string.substring(Keywords.XSDOC.name.length()); lastRank = -1; } else if (string.startsWith(Keywords.Sensitivity.name)) { values[Keywords.Sensitivity.ordinal()] = string.substring(Keywords.Sensitivity.name.length()); lastRank = -1; } else { lastRank = -1; } } } /*for (int i = 0; i < Keywords.values().length; i++) { System.out.println(Keywords.values()[i].name()+": "+values[i]); 
}*/ if (values[Keywords.XFolder.ordinal()] != null) { Element sub = XmlDom.factory.createElement(EMAIL_FIELDS.folder.name); sub.addAttribute(EMAIL_FIELDS.folderName.name, values[Keywords.XFolder.ordinal()]); metadata.add(sub); } String fromEmail = values[Keywords.From.ordinal()]; if (fromEmail != null) { Element sub = XmlDom.factory.createElement(EMAIL_FIELDS.from.name); addAddress(sub, EMAIL_FIELDS.fromUnit.name, fromEmail); String fromEmail2 = values[Keywords.ReturnPath.ordinal()]; if (fromEmail2 != null && !fromEmail.contains(fromEmail2)) { addAddress(sub, EMAIL_FIELDS.fromUnit.name, fromEmail2); } metadata.add(sub); } else { String fromEmail2 = values[Keywords.ReturnPath.ordinal()]; if (fromEmail2 != null) { Element sub = XmlDom.factory.createElement(EMAIL_FIELDS.from.name); addAddress(sub, EMAIL_FIELDS.fromUnit.name, fromEmail2); metadata.add(sub); } } fromEmail = values[Keywords.To.ordinal()]; if (fromEmail != null) { Element sub = XmlDom.factory.createElement(EMAIL_FIELDS.toRecipients.name); String[] to = fromEmail.split(","); for (String string2 : to) { addAddress(sub, EMAIL_FIELDS.toUnit.name, string2); } metadata.add(sub); } fromEmail = values[Keywords.Cc.ordinal()]; if (fromEmail != null) { Element sub = XmlDom.factory.createElement(EMAIL_FIELDS.ccRecipients.name); String[] to = fromEmail.split(","); for (String string2 : to) { addAddress(sub, EMAIL_FIELDS.ccUnit.name, string2); } metadata.add(sub); } fromEmail = values[Keywords.Bcc.ordinal()]; if (fromEmail != null) { Element sub = XmlDom.factory.createElement(EMAIL_FIELDS.bccRecipients.name); String[] to = fromEmail.split(","); for (String string2 : to) { addAddress(sub, EMAIL_FIELDS.bccUnit.name, string2); } metadata.add(sub); } String subject = null; try { subject = msg.getSubject(); } catch (ChunkNotFoundException e3) { // TODO Auto-generated catch block e3.printStackTrace(); } if (subject != null) { Element sub = XmlDom.factory.createElement(EMAIL_FIELDS.subject.name); 
sub.setText(StringUtils.unescapeHTML(subject, true, false)); metadata.add(sub); } subject = null; try { subject = msg.getConversationTopic(); } catch (ChunkNotFoundException e3) { //System.err.println(e3.getMessage()); } if (subject != null) { Element sub = XmlDom.factory.createElement(EMAIL_FIELDS.conversationTopic.name); sub.setText(StringUtils.unescapeHTML(subject, true, false)); metadata.add(sub); } if (values[Keywords.Date.ordinal()] != null) { Element sub = XmlDom.factory.createElement(EMAIL_FIELDS.sentDate.name); sub.setText(values[Keywords.Date.ordinal()]); metadata.add(sub); } if (values[Keywords.XOriginalArrivalTime.ordinal()] != null) { Element sub = XmlDom.factory.createElement(EMAIL_FIELDS.receivedDate.name); sub.setText(values[Keywords.XOriginalArrivalTime.ordinal()]); metadata.add(sub); } if (values[Keywords.Received.ordinal()] != null) { Element sub = XmlDom.factory.createElement(EMAIL_FIELDS.receptionTrace.name); String[] traces = values[Keywords.Received.ordinal()].split("\n"); for (String string : traces) { Element sub3 = XmlDom.factory.createElement(EMAIL_FIELDS.trace.name); sub3.setText(string); sub.add(sub3); } metadata.add(sub); } if (values[Keywords.XSDOC.ordinal()] != null) { Element sub = XmlDom.factory.createElement(EMAIL_FIELDS.emailSize.name); sub.setText(values[Keywords.XSDOC.ordinal()]); metadata.add(sub); } String messageId = values[Keywords.MessageId.ordinal()]; if (messageId != null) { messageId = StringUtils.removeChevron(StringUtils.unescapeHTML(messageId, true, false)).trim(); if (messageId.length() > 1) { Element sub = XmlDom.factory.createElement(EMAIL_FIELDS.messageId.name); sub.setText(messageId); metadata.add(sub); } } String InReplyToId = values[Keywords.InReplyTo.ordinal()]; if (InReplyToId != null) { InReplyToId = StringUtils.removeChevron(StringUtils.unescapeHTML(InReplyToId, true, false)).trim(); if (InReplyToId.length() > 1) { Element sub = XmlDom.factory.createElement(EMAIL_FIELDS.inReplyTo.name); 
sub.setText(InReplyToId); if (messageId != null && messageId.length() > 1) { String old = EmlExtract.filEmls.get(InReplyToId); if (old == null) { old = messageId; } else { old += "," + messageId; } EmlExtract.filEmls.put(InReplyToId, old); } metadata.add(sub); } InReplyToId = null; } Element prop = XmlDom.factory.createElement(EMAIL_FIELDS.properties.name); String imp = values[Keywords.Importance.ordinal()]; if (imp != null && imp.length() > 0) { try { int Priority = Integer.parseInt(imp); switch (Priority) { case 5: imp = "LOWEST"; break; case 4: imp = "LOW"; break; case 3: imp = "NORMAL"; break; case 2: imp = "HIGH"; break; case 1: imp = "HIGHEST"; break; default: imp = "LEV" + Priority; } } catch (NumberFormatException e) { // ignore since imp will be used as returned } prop.addAttribute(EMAIL_FIELDS.importance.name, imp); } imp = values[Keywords.Priority.ordinal()]; if (imp != null && imp.length() > 0) { try { int Priority = Integer.parseInt(imp); switch (Priority) { case 5: imp = "LOWEST"; break; case 4: imp = "LOW"; break; case 3: imp = "NORMAL"; break; case 2: imp = "HIGH"; break; case 1: imp = "HIGHEST"; break; default: imp = "LEV" + Priority; } } catch (NumberFormatException e) { // ignore since imp will be used as returned } prop.addAttribute(EMAIL_FIELDS.priority.name, imp); } if (values[Keywords.Sensitivity.ordinal()] != null) { prop.addAttribute(EMAIL_FIELDS.sensitivity.name, values[Keywords.Sensitivity.ordinal()]); } AttachmentChunks[] files = msg.getAttachmentFiles(); boolean Attachments = (files != null && files.length > 0); prop.addAttribute(EMAIL_FIELDS.hasAttachment.name, Boolean.toString(Attachments)); metadata.add(prop); String result = ""; Element identification = null; if (Attachments) { File oldPath = curPath; if (config.extractFile) { File newDir = new File(curPath, id); newDir.mkdir(); curPath = newDir; } identification = XmlDom.factory.createElement(EMAIL_FIELDS.attachments.name); // get the number of attachments for this message int 
NumberOfAttachments = files.length; identification.addAttribute(EMAIL_FIELDS.attNumber.name, Integer.toString(NumberOfAttachments)); // get a specific attachment from this email. for (int attachmentNumber = 0; attachmentNumber < NumberOfAttachments; attachmentNumber++) { AttachmentChunks attachment = files[attachmentNumber]; if (argument.extractKeyword) { result += " " + extractInfoAttachment(attachment, identification, argument, config, curPath); } else { extractInfoAttachment(attachment, identification, argument, config, curPath); } } curPath = oldPath; } // Plain text e-mail body String body = ""; if (argument.extractKeyword || config.extractFile) { try { body = msg.getTextBody(); } catch (ChunkNotFoundException e2) { // TODO Auto-generated catch block e2.printStackTrace(); } boolean isTxt = true; boolean isHttp = false; if (body == null || body.isEmpty()) { isTxt = false; try { body = msg.getHtmlBody(); } catch (ChunkNotFoundException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } isHttp = true; if (body == null || body.isEmpty()) { isHttp = false; try { body = msg.getRtfBody(); } catch (ChunkNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } if (body != null && !body.isEmpty()) { if (config.extractFile) { // XXX FIXME could saved email from HTML Body (clearer) if possible // use curRank in name, and attachment will be under directory named // add currank in field File newDir = new File(curPath, id); newDir.mkdir(); String filenamebody = messageId; if (filenamebody == null || !filenamebody.isEmpty()) { filenamebody = id; } String html = null; if (isHttp) { html = body; } String rtf = null; if (!isTxt && !isHttp) { rtf = body; } if (isTxt) { FileOutputStream output = null; try { output = new FileOutputStream(new File(newDir, filenamebody + ".txt")); byte[] bb = body.getBytes(); output.write(bb, 0, bb.length); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); 
} finally { if (output != null) { try { output.close(); } catch (IOException e) { } } } try { html = msg.getHtmlBody(); } catch (ChunkNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } } if (html != null && !html.isEmpty()) { FileOutputStream output = null; try { output = new FileOutputStream(new File(newDir, filenamebody + ".html")); byte[] bb = html.getBytes(); output.write(bb, 0, bb.length); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (output != null) { try { output.close(); } catch (IOException e) { } } } html = null; } if (isTxt || isHttp) { try { rtf = msg.getRtfBody(); } catch (ChunkNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } } if (rtf != null && !rtf.isEmpty()) { FileOutputStream output = null; try { output = new FileOutputStream(new File(newDir, filenamebody + ".rtf")); byte[] bb = rtf.getBytes(); output.write(bb, 0, bb.length); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (output != null) { try { output.close(); } catch (IOException e) { } } } rtf = null; } } } } if (metadata.hasContent()) { root.add(metadata); } if (identification != null && identification.hasContent()) { root.add(identification); } if (argument.extractKeyword) { result = body + " " + result; body = null; ExtractInfo.exportMetadata(keywords, result, "", config, null); if (keywords.hasContent()) { root.add(keywords); } } root.addAttribute(EMAIL_FIELDS.status.name, "ok"); //System.out.println("end of "+id); return result; }
From source file:fr.gouv.culture.vitam.eml.MsgExtract2.java
License:Open Source License
private static String extractInfoAttachment(AttachmentChunks fatt, Element identification, VitamArgument argument, ConfigLoader config, File curPath) { Element newElt = XmlDom.factory.createElement(EMAIL_FIELDS.subidentity.name); String filename = null;/*from w w w.jav a 2s . c om*/ String result = ""; byte[] bytes = fatt.attachData.getValue(); long size = bytes.length; // Check file filename = fatt.attachLongFileName.toString(); if (filename.isEmpty()) { filename = fatt.attachFileName.toString(); } filename = StringUtils.toFileName(filename); FileOutputStream out = null; File filetemp = null; try { String tempfilename = filename.isEmpty() ? (config.nbDoc.get() + 1) + "_unknownAttachment.msg" : filename; // Force out as eml if (config.extractFile) { filetemp = new File(curPath, tempfilename); } else { filetemp = File.createTempFile(StaticValues.PREFIX_TEMPFILE, tempfilename); } out = new FileOutputStream(filetemp); out.write(bytes); out.close(); bytes = null; // Now check file against Droid or more try { Commands.addFormatIdentification(newElt, filename, filetemp, config, argument); if (argument.extractKeyword) { // get back keyword in the main list Element keyw = (Element) newElt.selectSingleNode(EMAIL_FIELDS.keywords.name); if (keyw != null) { StringBuilder builder = new StringBuilder(); @SuppressWarnings("unchecked") List<Element> elts = (List<Element>) keyw.selectNodes(EMAIL_FIELDS.keywordRank.name); for (Element elt : elts) { String value = elt.attributeValue(EMAIL_FIELDS.keywordOccur.name); int occur = Integer.parseInt(value) / 2 + 1; @SuppressWarnings("unchecked") List<Element> words = (List<Element>) elt.selectNodes(EMAIL_FIELDS.keywordWord.name); for (Element eword : words) { String word = eword.attributeValue(EMAIL_FIELDS.keywordValue.name) + " "; for (int i = 0; i < occur; i++) { builder.append(word); } } } result = builder.toString().trim(); } } } catch (Exception e) { config.addRankId(newElt); // String id = 
Long.toString(config.nbDoc.incrementAndGet()); // newElt.addAttribute(EMAIL_FIELDS.rankId.name, id); String status = "Error during identification"; e.printStackTrace(); newElt.addAttribute(EMAIL_FIELDS.status.name, status); return ""; } // then clear if (!config.extractFile) { filetemp.delete(); } } catch (IOException e) { config.addRankId(newElt); // String id = Long.toString(config.nbDoc.incrementAndGet()); // newElt.addAttribute(EMAIL_FIELDS.rankId.name, id); e.printStackTrace(); String status = "Error during access to attachment"; newElt.addAttribute(EMAIL_FIELDS.status.name, status); identification.add(newElt); return ""; } finally { if (filetemp != null && !config.extractFile) { filetemp.delete(); } if (out != null) { try { out.close(); } catch (IOException e2) { } } } if (filename != null) { Element elt = XmlDom.factory.createElement(EMAIL_FIELDS.filename.name); elt.setText(filename); newElt.add(elt); } if (size > 0) { Element elt = XmlDom.factory.createElement(EMAIL_FIELDS.attSize.name); elt.setText(Long.toString(size)); newElt.add(elt); } String mimetag = fatt.attachMimeTag.toString(); if (mimetag != null) { Element elt = XmlDom.factory.createElement("attchmentMimeType"); elt.setText(mimetag); newElt.add(elt); } identification.add(newElt); return result; }
From source file:fr.gouv.culture.vitam.eml.PstExtract.java
License:Open Source License
private void extractInfoFolder(PSTFolder folder) { depth++;//from ww w . ja va 2 s . co m Element curdepth = currentRoot; // the root folder doesn't have a display name if (depth > 0) { printDepth(); System.out.println(folder.getDisplayName()); } // go through the folders... if (folder.hasSubfolders()) { Vector<PSTFolder> childFolders; try { childFolders = folder.getSubFolders(); for (PSTFolder childFolder : childFolders) { Element nextdepth = XmlDom.factory.createElement(EMAIL_FIELDS.folder.name); nextdepth.addAttribute(EMAIL_FIELDS.folderName.name, childFolder.getDisplayName()); File pastDir = curPath; if (config.extractFile || (extractSeparateXmlFolder && writer != null)) { // XXX FIXME multiple output curPath = new File(curPath, childFolder.getDisplayName()); curPath.mkdirs(); argument.currentOutputDir = curPath; nextdepth.addAttribute(EMAIL_FIELDS.folderFile.name, curPath.getPath()); } currentRoot = nextdepth; long before = config.nbDoc.get(); extractInfoFolder(childFolder); long after = config.nbDoc.get(); currentRoot.addAttribute("nbSubMsg", Long.toString(after - before)); currentRoot.addAttribute(EMAIL_FIELDS.status.name, "ok"); if (extractSeparateXmlFolder && writer != null) { File separate = new File(curPath, "info_" + childFolder.getDisplayName() + ".xml"); FileOutputStream out2 = null; try { out2 = new FileOutputStream(separate); writer.setOutputStream(out2); writer.write(nextdepth); writer.flush(); nextdepth = XmlDom.factory.createElement(EMAIL_FIELDS.folder.name); nextdepth.addAttribute(EMAIL_FIELDS.folderName.name, childFolder.getDisplayName()); nextdepth.addAttribute(EMAIL_FIELDS.filename.name, separate.getPath()); nextdepth.addAttribute("nbSubMsg", Long.toString(after - before)); nextdepth.addAttribute(EMAIL_FIELDS.status.name, "ok"); } catch (UnsupportedEncodingException e) { } catch (IOException e) { } finally { try { if (out2 != null) { out2.close(); } } catch (IOException e) { } } } curdepth.add(nextdepth); // XXX FIXME if multiple files as 
output => curPath + metadata.xml = // currentRoot to duplicate (single node) and detach to save curPath = pastDir; argument.currentOutputDir = curPath; currentRoot = curdepth; } } catch (PSTException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } // and now the emails for this folder if (folder.getContentCount() > 0) { depth++; currentRoot = curdepth; PSTMessage email; try { email = (PSTMessage) folder.getNextChild(); while (email != null) { System.out.print('.'); extractInfoMessage(email); email = (PSTMessage) folder.getNextChild(); } } catch (PSTException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } depth--; System.out.println(); } depth--; }
From source file:fr.gouv.culture.vitam.eml.PstExtract.java
License:Open Source License
private String extractInfoAttachment(PSTAttachment attachment, Element identification) { Element newElt = XmlDom.factory.createElement(EMAIL_FIELDS.subidentity.name); String filename = null;//from w w w . ja v a 2 s. com String result = ""; Date creationTime = attachment.getCreationTime(); if (creationTime != null) { Element elt = XmlDom.factory.createElement(EMAIL_FIELDS.creationTime.name); elt.setText(creationTime.toString()); newElt.add(elt); } boolean isMsg = false; try { PSTMessage msg = attachment.getEmbeddedPSTMessage(); if (msg != null) { Element cur = currentRoot; currentRoot = newElt; if (argument.extractKeyword) { result = extractInfoMessage(msg); } else { extractInfoMessage(msg); } currentRoot = cur; isMsg = true; } } catch (IOException e) { } catch (PSTException e) { } if (!isMsg) { // Check file filename = attachment.getLongFilename(); if (filename.isEmpty()) { filename = attachment.getFilename(); } filename = StringUtils.toFileName(filename); long size = 0; if (true) { InputStream attachmentStream = null; FileOutputStream out = null; File filetemp = null; try { attachmentStream = attachment.getFileInputStream(); String tempfilename = filename.isEmpty() ? 
(config.nbDoc.get() + 1) + "_unknownAttachment.eml" : filename; // Force out as eml if (config.extractFile) { filetemp = new File(curPath, tempfilename); } else { filetemp = File.createTempFile(StaticValues.PREFIX_TEMPFILE, tempfilename); } out = new FileOutputStream(filetemp); // 8176 is the block size used internally and should give the best performance int bufferSize = 8176; byte[] buffer = new byte[bufferSize]; int count; do { count = attachmentStream.read(buffer); if (count >= 0) { out.write(buffer, 0, count); size += count; } } while (count == bufferSize); out.close(); attachmentStream.close(); // Now check file against Droid or more try { Commands.addFormatIdentification(newElt, filename, filetemp, config, argument); if (argument.extractKeyword) { // get back keyword in the main list Element keyw = (Element) newElt.selectSingleNode(EMAIL_FIELDS.keywords.name); if (keyw != null) { StringBuilder builder = new StringBuilder(); @SuppressWarnings("unchecked") List<Element> elts = (List<Element>) keyw .selectNodes(EMAIL_FIELDS.keywordRank.name); for (Element elt : elts) { String value = elt.attributeValue(EMAIL_FIELDS.keywordOccur.name); int occur = Integer.parseInt(value) / 2 + 1; @SuppressWarnings("unchecked") List<Element> words = (List<Element>) elt .selectNodes(EMAIL_FIELDS.keywordWord.name); for (Element eword : words) { String word = eword.attributeValue(EMAIL_FIELDS.keywordValue.name) + " "; for (int i = 0; i < occur; i++) { builder.append(word); } } } result = builder.toString().trim(); } } } catch (Exception e) { config.addRankId(newElt); //String id = Long.toString(config.nbDoc.incrementAndGet()); //newElt.addAttribute(EMAIL_FIELDS.rankId.name, id); String status = "Error during identification"; e.printStackTrace(); newElt.addAttribute(EMAIL_FIELDS.status.name, status); return ""; } // then clear if (!config.extractFile) { filetemp.delete(); } } catch (IOException e) { config.addRankId(newElt); //String id = 
Long.toString(config.nbDoc.incrementAndGet()); //newElt.addAttribute(EMAIL_FIELDS.rankId.name, id); e.printStackTrace(); String status = "Error during access to attachment"; newElt.addAttribute(EMAIL_FIELDS.status.name, status); identification.add(newElt); return ""; } catch (PSTException e) { config.addRankId(newElt); //String id = Long.toString(config.nbDoc.incrementAndGet()); //newElt.addAttribute(EMAIL_FIELDS.rankId.name, id); e.printStackTrace(); String status = "Error during access to attachment"; newElt.addAttribute(EMAIL_FIELDS.status.name, status); identification.add(newElt); return ""; } finally { if (filetemp != null && !config.extractFile) { filetemp.delete(); } if (out != null) { try { out.close(); } catch (IOException e2) { } } if (attachmentStream != null) { try { attachmentStream.close(); } catch (IOException e2) { } } } } if (filename != null) { Element elt = XmlDom.factory.createElement(EMAIL_FIELDS.filename.name); elt.setText(filename); newElt.add(elt); } if (size == 0) { size = attachment.getAttachSize(); } if (size > 0) { Element elt = XmlDom.factory.createElement(EMAIL_FIELDS.attSize.name); elt.setText(Long.toString(size)); newElt.add(elt); } } identification.add(newElt); return result; }
From source file:fr.gouv.culture.vitam.eml.PstExtract.java
License:Open Source License
/**
 * Appends an address entry to {@code root}.
 * <p>
 * A new element named {@code entry} is always created and added to {@code root}. It receives a
 * name child and an address child, each only when the corresponding value is non-null and
 * non-empty; both texts go through {@code StringUtils.unescapeHTML(value, true, false)} first.
 *
 * @param root    element receiving the new entry
 * @param entry   name of the entry element to create
 * @param name    display name, may be {@code null} or empty
 * @param address address, may be {@code null} or empty
 */
private static void addAddress(Element root, String entry, String name, String address) {
    final Element holder = XmlDom.factory.createElement(entry);

    final boolean hasName = name != null && !name.isEmpty();
    if (hasName) {
        final Element nameElt = XmlDom.factory.createElement(EMAIL_FIELDS.emailName.name);
        final String cleanName = StringUtils.unescapeHTML(name, true, false);
        nameElt.setText(cleanName);
        holder.add(nameElt);
    }

    final boolean hasAddress = address != null && !address.isEmpty();
    if (hasAddress) {
        final Element addressElt = XmlDom.factory.createElement(EMAIL_FIELDS.emailAddress.name);
        final String cleanAddress = StringUtils.unescapeHTML(address, true, false);
        addressElt.setText(cleanAddress);
        holder.add(addressElt);
    }

    root.add(holder);
}