Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.dataimport;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;

import javax.mail.Address;
import javax.mail.FetchProfile;
import javax.mail.Flags;
import javax.mail.Folder;
import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.Multipart;
import javax.mail.Part;
import javax.mail.internet.AddressException;
import javax.mail.internet.ContentType;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeMessage;
import javax.mail.search.ComparisonTerm;
import javax.mail.search.ReceivedDateTerm;
import javax.mail.search.SearchTerm;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.tika.Tika;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.code.samples.oauth2.OAuth2Authenticator;
import com.sun.mail.imap.IMAPStore;

/**
 * An {@link EntityProcessor} instance which can index emails along with their attachments
 * from a Gmail IMAP source using a Service Account, so you don't need to know passwords
 * for individual user accounts.
 *
 * <p>Entity attributes read by {@link #init(Context)}:
 * <ul>
 *   <li>{@code serviceAccountId} (required)</li>
 *   <li>{@code serviceAccountPkFile} (required) - path of the service account key file</li>
 *   <li>{@code domain} (required) - appended to every user as {@code user@domain}</li>
 *   <li>{@code users} (required) - comma separated list of users</li>
 *   <li>{@code timestampFile} (required) - file holding one {@code user=timestamp} line per user</li>
 *   <li>{@code ignoreFrom} (optional) - comma separated, case-insensitive sender substrings;
 *       a message whose first From address contains one of them is skipped</li>
 *   <li>{@code oldestDate} (optional, {@code yyyy-MM-dd}) - start date for users that have
 *       no saved timestamp</li>
 * </ul>
 *
 * This code borrows many parts from existing Solr's {@link MailEntityProcessor}. Too bad
 * MailEntityProcessor is not written with extension in mind - otherwise only authentication
 * mechanism and multi-user support could be added
 *
 * @author Dmitri Maximovich maxim@maximdim.com
 *
 */
public class GmailServiceUserMailEntityProcessor extends EntityProcessorBase {
  static final Logger LOG = LoggerFactory.getLogger(GmailServiceUserMailEntityProcessor.class);

  // Fields To Index
  // single valued
  private static final String USER_ID = "userId";
  private static final String MESSAGE_ID = "messageId";
  private static final String SUBJECT = "subject";
  private static final String FROM = "from";
  private static final String FROM_CLEAN = "from_clean";
  private static final String SENT_DATE = "sentDate";
  private static final String RECEIVED_DATE = "receivedDate";
  private static final String XMAILER = "xMailer";
  // first 'To' address - need it for sorting. Sort is impossible for multivalued fields
  private static final String TO = "to";
  private static final String TO_CLEAN = "to_clean";
  // hash of the important fields
  private static final String HASH = "hash";

  // multi valued
  private static final String TO_CC_BCC = "allTo";
  private static final String TO_CC_BCC_CLEAN = "allTo_clean";
  private static final String FLAGS = "flags";
  private static final String CONTENT = "content";
  private static final String ATTACHMENT = "attachment";
  private static final String ATTACHMENT_NAMES = "attachmentNames";

  // flag values
  private static final String FLAG_ANSWERED = "answered";
  private static final String FLAG_DELETED = "deleted";
  private static final String FLAG_DRAFT = "draft";
  private static final String FLAG_FLAGGED = "flagged";
  private static final String FLAG_RECENT = "recent";
  private static final String FLAG_SEEN = "seen";

  private Tika tika = new Tika();

  private String serviceAccountId;
  private File serviceAccountPkFile;
  private String domain;
  private File timestampFile;
  // <User, Date>
  private Map<String, Date> userTimestamps;
  private List<String> users;
  // lower-cased, non-empty sender substrings to skip
  private List<String> ignoreFrom;

  // index into 'users' of the user currently being processed
  private int userIndex;
  // messages of the current user; null until the user's mailbox has been opened
  private UserMessagesIterator userMessagesIterator;

  @Override
  protected void firstInit(Context context) {
    //LOG.info("gmail firstInit()");
    super.firstInit(context);
    OAuth2Authenticator.initialize();
  }

  /**
   * Resets the iteration state and reads the entity attributes.
   *
   * @throws DataImportHandlerException if a required attribute is missing or blank, or if
   *         {@code oldestDate} cannot be parsed
   */
  @Override
  public void init(Context context) {
    //LOG.info("gmail init()");
    super.init(context);

    // init counters; release a mailbox left open by a previous run of this instance
    closeUserMessagesIterator();
    this.userIndex = 0;

    // read config values
    this.serviceAccountId = getRequiredStringFromContext("serviceAccountId");
    this.serviceAccountPkFile = new File(getRequiredStringFromContext("serviceAccountPkFile"));
    this.domain = getRequiredStringFromContext("domain");
    this.users = splitList(getRequiredStringFromContext("users"), false);
    // optional; lower-cased because it is matched against the lower-cased sender
    this.ignoreFrom = splitList(getStringFromContext("ignoreFrom", null), true);
    this.timestampFile = new File(getRequiredStringFromContext("timestampFile"));
    Date oldestDate = getDate(getStringFromContext("oldestDate", DEFAULT_OLDEST_DATE));

    this.userTimestamps = loadTimestamp(this.timestampFile, oldestDate);

    LOG.info("serviceAccountId: " + this.serviceAccountId);
    LOG.info("serviceAccountPkFile: " + this.serviceAccountPkFile);
    LOG.info("domain: " + this.domain);
    LOG.info("users: " + this.users);
    LOG.info("ignoreFrom: " + this.ignoreFrom);
    LOG.info("timestampFile: " + this.timestampFile);
    LOG.info("oldestDate: " + oldestDate);
  }

  /**
   * Releases the IMAP connection of the current user (if any) and writes the per-user
   * timestamps back to the timestamp file.
   */
  @Override
  public void close() {
    closeUserMessagesIterator();
    // both are null when init() never completed
    if (this.userTimestamps != null && this.timestampFile != null) {
      saveTimestamp(this.userTimestamps, this.timestampFile);
    }
    super.close();
  }

  /**
   * Returns the next message as a row, walking the configured users one after another.
   * Messages that cannot be converted (or whose sender is ignored) are skipped.
   *
   * @return the next row, or {@code null} when all users have been processed
   * @throws DataImportHandlerException if the mailbox of a user cannot be opened
   */
  @Override
  public Map<String, Object> nextRow() {
    //LOG.info("gmail nextRow()");
    while (true) {
      if (this.userIndex >= this.users.size()) {
        // no more users
        return null;
      }
      String user = this.users.get(this.userIndex);
      String email = user + "@" + this.domain;

      if (this.userMessagesIterator == null) {
        try {
          LOG.info("Starting processing " + email);
          IMAPStore store = getStore(email);
          Date from = this.userTimestamps.get(user);
          // the iterator owns the store from here on and closes it, also on failure
          this.userMessagesIterator = new UserMessagesIterator(store, from);
          continue;
        } catch (Exception e) {
          throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
              "Message retreival failed for " + email, e);
        }
      }

      if (!this.userMessagesIterator.hasNext()) {
        // no more messages for this user
        closeUserMessagesIterator();
        this.userIndex++;
        continue;
      }

      Message message = this.userMessagesIterator.next();
      Map<String, Object> row = getDocumentFromMail(message);
      if (row == null) {
        continue;
      }
      // save user
      row.put(USER_ID, user);
      updateMessageId(row);
      // update timestamp
      if (row.containsKey(RECEIVED_DATE)) {
        this.userTimestamps.put(user, (Date) row.get(RECEIVED_DATE));
      }
      return row;
    }
  }

  /** Closes and forgets the iterator of the current user, if there is one. */
  private void closeUserMessagesIterator() {
    if (this.userMessagesIterator != null) {
      this.userMessagesIterator.close();
      this.userMessagesIterator = null;
    }
  }

  // modify messageId to include user and time (not sure if messageId is unique across many users)
  private void updateMessageId(Map<String, Object> row) {
    String user = (String) row.get(USER_ID);
    String id = (String) row.get(MESSAGE_ID);
    Date receivedDate = (Date) row.get(RECEIVED_DATE);
    SimpleDateFormat df = new SimpleDateFormat(ID_DATE_TIME_FORMAT);
    String date = receivedDate != null ? df.format(receivedDate) : "unknown";
    row.put(MESSAGE_ID, user + ":" + date + ":" + id);
  }

  /** Obtains a token for {@code email} and connects to imap.gmail.com:993 with it. */
  private IMAPStore getStore(String email) throws Exception {
    String authToken = OAuth2Authenticator.getToken(this.serviceAccountPkFile, this.serviceAccountId, email);
    LOG.info("authToken OK");

    IMAPStore store = OAuth2Authenticator.connectToImap("imap.gmail.com", 993, email, authToken, false);
    LOG.info("imapStore for [" + email + "] OK");
    return store;
  }

  /**
   * Iterates over the messages of the "[Gmail]/All Mail" folder received after a given date.
   * The folder is kept open while iterating; call {@link #close()} when done to release the
   * folder and the store.
   */
  static class UserMessagesIterator implements Iterator<Message>, Closeable {
    private final IMAPStore store;
    private final List<Message> messages;
    // set as soon as the folder is obtained so close() can release it
    private Folder folder;
    private int index;

    /**
     * @param store connected store; closed by {@link #close()}, and also when this
     *        constructor fails
     * @param fetchFrom only messages received after this date are returned
     */
    public UserMessagesIterator(IMAPStore store, Date fetchFrom) throws MessagingException {
      this.store = store;
      List<Message> found = null;
      try {
        found = getMessages(store, fetchFrom);
      } finally {
        if (found == null) {
          // search failed: do not leave the connection open
          close();
        }
      }
      this.messages = found;
    }

    @Override
    public boolean hasNext() {
      LOG.info(this.index + "/" + this.messages.size());
      return this.index < this.messages.size();
    }

    @Override
    public Message next() {
      return this.messages.get(this.index++);
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }

    /** Closes the folder (without expunging) and the store; failures are only logged. */
    @Override
    public void close() {
      if (this.folder != null) {
        try {
          if (this.folder.isOpen()) {
            this.folder.close(false);
          }
        } catch (MessagingException e) {
          LOG.warn("Error closing IMAP folder: " + e.getMessage(), e);
        }
      }
      if (this.store != null) {
        try {
          if (this.store.isConnected()) {
            this.store.close();
          }
        } catch (MessagingException e) {
          LOG.warn("Error closing IMAP store: " + e.getMessage(), e);
        }
      }
    }

    private List<Message> getMessages(IMAPStore store, Date fetchFrom) throws MessagingException {
      Folder allMail = store.getFolder("[Gmail]/All Mail");
      this.folder = allMail;
      allMail.open(Folder.READ_ONLY);
      LOG.info("imap folder open OK");

      int totalMessages = allMail.getMessageCount();
      LOG.info("Total messages: " + totalMessages);

      // IMAP search command disregards time, only date is used
      SearchTerm st = new ReceivedDateTerm(ComparisonTerm.GE, fetchFrom);
      Message[] found = allMail.search(st);
      LOG.info("Search returned: " + found.length);

      // Fetch profile
      FetchProfile fp = new FetchProfile();
      fp.add(FetchProfile.Item.ENVELOPE);
      fp.add("X-mailer");
      allMail.fetch(found, fp);

      // the search has day granularity, so filter by the exact time here
      List<Message> result = new ArrayList<Message>();
      for (Message m : found) {
        Date received = m.getReceivedDate();
        if (received != null && received.after(fetchFrom)) {
          result.add(m);
        }
      }
      LOG.info("Result filtered to: " + result.size());
      return result;
    }
  }

  /**
   * Converts a message into a row.
   *
   * @return the row, or {@code null} if the message was rejected or conversion failed
   *         (the failure is logged)
   */
  private Map<String, Object> getDocumentFromMail(Message mail) {
    Map<String, Object> row = new HashMap<String, Object>();
    try {
      if (addPartToDocument(mail, row, true)) {
        return row;
      }
      return null;
    } catch (Exception e) {
      LOG.error(e.getMessage(), e);
      return null;
    }
  }

  /**
   * Adds the content of {@code part} to {@code row}, recursing into multipart and
   * message/rfc822 parts. Text is extracted with Tika; parts with disposition "attachment"
   * go to the attachment fields, everything else to the content field.
   *
   * @param part part to add
   * @param row row being built
   * @param outerMost {@code true} for the message itself; the envelope fields and the hash
   *        are added only in that case
   * @return {@code false} if the message was rejected by the envelope check,
   *         {@code true} otherwise
   */
  public boolean addPartToDocument(Part part, Map<String, Object> row, boolean outerMost) throws Exception {
    if (outerMost && part instanceof Message) {
      if (!addEnvelopToDocument(part, row)) {
        return false;
      }
      // store hash
      row.put(HASH, DigestUtils.md5Hex((String) row.get(FROM_CLEAN) + "" + (String) row.get(SUBJECT)));
    }

    String ct = part.getContentType();
    ContentType ctype = new ContentType(ct);
    if (part.isMimeType("multipart/*")) {
      Multipart mp = (Multipart) part.getContent();
      int count = mp.getCount();
      if (part.isMimeType("multipart/alternative")) {
        // only the first alternative is indexed; min() keeps an empty multipart at zero
        count = Math.min(count, 1);
      }
      for (int i = 0; i < count; i++) {
        addPartToDocument(mp.getBodyPart(i), row, false);
      }
    } else if (part.isMimeType("message/rfc822")) {
      addPartToDocument((Part) part.getContent(), row, false);
    } else {
      String disp = part.getDisposition();
      @SuppressWarnings("resource") // Tika will close stream
      InputStream is = part.getInputStream();
      String fileName = part.getFileName();
      Metadata md = new Metadata();
      md.set(HttpHeaders.CONTENT_TYPE, ctype.getBaseType().toLowerCase(Locale.ROOT));
      md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);
      String content = this.tika.parseToString(is, md);
      if (disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT)) {
        appendValue(row, ATTACHMENT, content);
        appendValue(row, ATTACHMENT_NAMES, fileName);
      } else {
        appendValue(row, CONTENT, content);
      }
    }
    return true;
  }

  /** Appends {@code value} to the list stored under {@code field}, creating the list if needed. */
  @SuppressWarnings("unchecked") // these fields are only ever written by this method
  private static void appendValue(Map<String, Object> row, String field, String value) {
    List<String> values = (List<String>) row.get(field);
    if (values == null) {
      values = new ArrayList<String>();
      row.put(field, values);
    }
    values.add(value);
  }

  /**
   * Adds the envelope fields (from, recipients, message id, subject, dates, flags, X-Mailer).
   *
   * @return {@code false} if the message has no From address or its sender matches one of
   *         the {@code ignoreFrom} entries
   */
  private boolean addEnvelopToDocument(Part part, Map<String, Object> row) throws MessagingException {
    MimeMessage mail = (MimeMessage) part;

    Address[] addresses = mail.getFrom();
    if (addresses == null || addresses.length == 0) {
      return false;
    }
    String from = addresses[0].toString();
    // check if we should ignore this sender (ignoreFrom entries are already lower-cased)
    String fromLower = from.toLowerCase(Locale.ROOT);
    for (String ignore : this.ignoreFrom) {
      if (fromLower.contains(ignore)) {
        LOG.info("Ignoring email from " + from);
        return false;
      }
    }
    row.put(FROM, from);
    row.put(FROM_CLEAN, cleanAddress(from));

    List<String> to = new ArrayList<String>();
    if ((addresses = mail.getRecipients(Message.RecipientType.TO)) != null) {
      addAddressToList(addresses, to);
    }
    if ((addresses = mail.getRecipients(Message.RecipientType.CC)) != null) {
      addAddressToList(addresses, to);
    }
    if ((addresses = mail.getRecipients(Message.RecipientType.BCC)) != null) {
      addAddressToList(addresses, to);
    }
    if (!to.isEmpty()) {
      row.put(TO_CC_BCC, to);
      List<String> cleanAddresses = cleanAddresses(to);
      row.put(TO_CC_BCC_CLEAN, cleanAddresses);
      // save first TO address into separate field
      row.put(TO, to.get(0));
      // empty when no recipient looks like an email address
      if (!cleanAddresses.isEmpty()) {
        row.put(TO_CLEAN, cleanAddresses.get(0));
      }
    }

    row.put(MESSAGE_ID, mail.getMessageID());
    row.put(SUBJECT, mail.getSubject());

    Date sent = mail.getSentDate();
    if (sent != null) {
      row.put(SENT_DATE, sent);
    }
    Date received = mail.getReceivedDate();
    if (received != null) {
      row.put(RECEIVED_DATE, received);
    }

    List<String> flags = new ArrayList<String>();
    for (Flags.Flag flag : mail.getFlags().getSystemFlags()) {
      if (flag == Flags.Flag.ANSWERED) {
        flags.add(FLAG_ANSWERED);
      } else if (flag == Flags.Flag.DELETED) {
        flags.add(FLAG_DELETED);
      } else if (flag == Flags.Flag.DRAFT) {
        flags.add(FLAG_DRAFT);
      } else if (flag == Flags.Flag.FLAGGED) {
        flags.add(FLAG_FLAGGED);
      } else if (flag == Flags.Flag.RECENT) {
        flags.add(FLAG_RECENT);
      } else if (flag == Flags.Flag.SEEN) {
        flags.add(FLAG_SEEN);
      }
    }
    flags.addAll(Arrays.asList(mail.getFlags().getUserFlags()));
    row.put(FLAGS, flags);

    String[] hdrs = mail.getHeader("X-Mailer");
    if (hdrs != null) {
      row.put(XMAILER, hdrs[0]);
    }
    return true;
  }

  /** Adds every address to {@code to}; for group addresses the members are added as well. */
  private void addAddressToList(Address[] adresses, List<String> to) throws AddressException {
    for (Address address : adresses) {
      to.add(address.toString());
      if (!(address instanceof InternetAddress)) {
        continue;
      }
      InternetAddress ia = (InternetAddress) address;
      if (ia.isGroup()) {
        InternetAddress[] group = ia.getGroup(false);
        if (group != null) {
          for (InternetAddress member : group) {
            to.add(member.toString());
          }
        }
      }
    }
  }

  private static final String EMAIL_PATTERN = "^[_A-Za-z0-9-\\+]+(\\.[_A-Za-z0-9-]+)*@"
      + "[A-Za-z0-9-]+(\\.[A-Za-z0-9]+)*(\\.[A-Za-z]{2,})$";
  private static final Pattern emailPattern = Pattern.compile(EMAIL_PATTERN);

  /**
   * Extracts the lower-cased address from {@code "Name <address>"}. Input without angle
   * brackets is returned trimmed and lower-cased without further validation.
   *
   * @return 'clean' email address, or null for blank input or when the text between the
   *         angle brackets doesn't look like an email address
   */
  private static String cleanAddress(String a) {
    if (a == null || a.trim().length() == 0) {
      return null;
    }
    int i = a.indexOf('<');
    int j = a.indexOf('>', i);
    if (i >= 0 && j > i) {
      String mail = a.substring(i + 1, j);
      if (emailPattern.matcher(mail).matches()) {
        return mail.toLowerCase(Locale.ROOT);
      }
    } else {
      return a.trim().toLowerCase(Locale.ROOT);
    }
    return null;
  }

  /** Applies {@link #cleanAddress(String)} to every entry and drops the nulls. */
  private static List<String> cleanAddresses(List<String> aa) {
    List<String> result = new ArrayList<String>();
    for (String a : aa) {
      String email = cleanAddress(a);
      if (email != null) {
        result.add(email);
      }
    }
    return result;
  }

  /**
   * Splits a comma separated attribute value, trimming entries and dropping empty ones.
   *
   * @param csv attribute value, may be null
   * @param lowerCase whether to lower-case the entries
   * @return the entries; empty if {@code csv} is null or contains no entry
   */
  private static List<String> splitList(String csv, boolean lowerCase) {
    List<String> result = new ArrayList<String>();
    if (csv == null) {
      return result;
    }
    for (String token : csv.split(",")) {
      String entry = token.trim();
      if (entry.length() == 0) {
        continue;
      }
      result.add(lowerCase ? entry.toLowerCase(Locale.ROOT) : entry);
    }
    return result;
  }

  /**
   * Parses a {@code yyyy-MM-dd} date.
   *
   * @throws DataImportHandlerException if {@code d} is not in that format
   */
  private Date getDate(String d) {
    SimpleDateFormat df = new SimpleDateFormat(DATE_FORMAT);
    try {
      return df.parse(d);
    } catch (ParseException e) {
      throw new DataImportHandlerException(DataImportHandlerException.SEVERE, "getFetchDate() failed", e);
    }
  }

  /**
   * @return the entity attribute {@code prop} with tokens replaced, or {@code ifNull} when
   *         the attribute is not set
   */
  private String getStringFromContext(String prop, String ifNull) {
    String v = ifNull;
    String val = context.getEntityAttribute(prop);
    if (val != null) {
      val = context.replaceTokens(val);
      v = val;
    }
    return v;
  }

  /**
   * Like {@link #getStringFromContext(String, String)} for attributes that must be set.
   *
   * @throws DataImportHandlerException if the attribute is missing or blank
   */
  private String getRequiredStringFromContext(String prop) {
    String v = getStringFromContext(prop, null);
    if (v == null || v.trim().length() == 0) {
      throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
          "Missing required attribute '" + prop + "'");
    }
    return v;
  }

  private static final String DATE_FORMAT = "yyyy-MM-dd";
  // must be parseable with DATE_FORMAT
  private static final String DEFAULT_OLDEST_DATE = "2012-01-01";
  private static final String DATE_TIME_FORMAT = DATE_FORMAT + "'T'HH:mm:ssZ";
  private static final String ID_DATE_TIME_FORMAT = "yyyyMMddHHmmss";

  /**
   * Loads the saved timestamp file (if available). Lines that cannot be understood are
   * logged and skipped; a read error is logged and whatever was read so far is kept.
   *
   * @param f file with one {@code user=timestamp} line per user
   * @param defaultDate timestamp for configured users that have no entry in the file
   * @return timestamp per user, containing at least every configured user
   */
  private Map<String, Date> loadTimestamp(File f, Date defaultDate) {
    Map<String, Date> result = new HashMap<String, Date>();
    if (f.exists() && f.canRead()) {
      BufferedReader br = null;
      try {
        br = new BufferedReader(new FileReader(f));
        String line = null;
        SimpleDateFormat df = new SimpleDateFormat(DATE_TIME_FORMAT);
        while ((line = br.readLine()) != null) {
          String[] ss = line.split("=");
          if (ss.length != 2) {
            LOG.warn("Don't understand line [" + line + "]");
            continue;
          }
          try {
            result.put(ss[0].trim(), df.parse(ss[1].trim()));
          } catch (ParseException e) {
            LOG.warn("Unable to parse date [" + ss[1] + "]");
            continue;
          }
        }
      } catch (IOException e) {
        LOG.error("Error loading user timestamps from " + f.getAbsolutePath() + ": " + e.getMessage(), e);
      } finally {
        close(br);
      }
    }
    // fill with defaults
    for (String user : this.users) {
      if (!result.containsKey(user)) {
        result.put(user, defaultDate);
      }
    }
    // log
    for (Map.Entry<String, Date> me : result.entrySet()) {
      LOG.info(me.getKey() + "=" + me.getValue());
    }
    return result;
  }

  /** Writes one {@code user=timestamp} line per entry to {@code f}; errors are only logged. */
  private void saveTimestamp(Map<String, Date> data, File f) {
    BufferedWriter bw = null;
    try {
      bw = new BufferedWriter(new FileWriter(f));
      SimpleDateFormat df = new SimpleDateFormat(DATE_TIME_FORMAT);
      for (Map.Entry<String, Date> me : data.entrySet()) {
        String line = me.getKey() + "=" + df.format(me.getValue());
        bw.write(line + "\n");
        LOG.info(line);
      }
      bw.flush();
    } catch (IOException e) {
      LOG.error("Error saving user timestamps to " + f.getAbsolutePath() + ": " + e.getMessage(), e);
    } finally {
      close(bw);
    }
  }

  /** Closes {@code c} if it is not null, logging (not propagating) a failure. */
  private void close(Closeable c) {
    if (c != null) {
      try {
        c.close();
      } catch (IOException e) {
        LOG.warn("Error closing Closeable: " + e.getMessage(), e);
      }
    }
  }
}