org.apache.solr.handler.dataimport.FsMailEntityProcessor.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.solr.handler.dataimport.FsMailEntityProcessor.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.dataimport;

import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;

import javax.mail.Address;
import javax.mail.Flags;
import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.Multipart;
import javax.mail.Part;
import javax.mail.Session;
import javax.mail.internet.AddressException;
import javax.mail.internet.ContentType;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeMessage;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang.time.DateUtils;
import org.apache.tika.Tika;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An {@link EntityProcessor} instance which can index emails along with their attachments 
 * from mail messages stored in local FS.
 * 
 * This code borrows many parts from Solr's existing {@link MailEntityProcessor}. Unfortunately
 * MailEntityProcessor was not written with extension in mind; otherwise only the authentication
 * mechanism and multi-user support would have needed to be added.
 * 
 * @author Dmitri Maximovich maxim@maximdim.com
 *
 */
public class FsMailEntityProcessor extends EntityProcessorBase {
    /** Logger used throughout this processor. */
    static final Logger LOG = LoggerFactory.getLogger(FsMailEntityProcessor.class);

    // Fields To Index
    // single valued
    private static final String USER_ID = "userId";
    private static final String MESSAGE_ID = "messageId";
    private static final String SUBJECT = "subject";
    private static final String FROM = "from";
    private static final String FROM_CLEAN = "from_clean";
    private static final String SENT_DATE = "sentDate";
    private static final String RECEIVED_DATE = "receivedDate";
    private static final String XMAILER = "xMailer";
    // first 'To' address - need it for sorting. Sort is impossible for multivalued fields
    private static final String TO = "to";
    private static final String TO_CLEAN = "to_clean";
    // hash of the important fields
    private static final String HASH = "hash";
    // multi valued
    private static final String TO_CC_BCC = "allTo";
    private static final String TO_CC_BCC_CLEAN = "allTo_clean";
    private static final String FLAGS = "flags";
    private static final String CONTENT = "content";
    private static final String ATTACHMENT = "attachment";
    private static final String ATTACHMENT_NAMES = "attachmentNames";
    // flag values
    private static final String FLAG_ANSWERED = "answered";
    private static final String FLAG_DELETED = "deleted";
    private static final String FLAG_DRAFT = "draft";
    private static final String FLAG_FLAGGED = "flagged";
    private static final String FLAG_RECENT = "recent";
    private static final String FLAG_SEEN = "seen";

    // Extracts plain text from leaf MIME parts (message bodies and attachments).
    private final Tika tika = new Tika();
    // Mail session handed to the MimeMessage constructor when a file is parsed.
    private final Session session = Session.getDefaultInstance(new Properties(), null);

    // Root directory that is scanned for mail files; taken from the 'dataDir' entity attribute.
    private File dataDir;
    // Substrings checked against the lower-cased sender; a match causes the message to be skipped.
    private List<String> ignoreFrom;

    // Absolute paths of the files selected in init(); consumed one by one in nextRow().
    private Iterator<String> fileNames;

    /**
     * Reads the entity configuration and collects the mail files that have to be indexed.
     *
     * <p>Entity attributes used:
     * <ul>
     *   <li>{@code dataDir} (required) - root directory scanned recursively for mail files;</li>
     *   <li>{@code ignoreFrom} (optional) - comma separated list of substrings; a message whose
     *       sender contains one of them (case-insensitively) is skipped.</li>
     * </ul>
     *
     * @param context the import context supplied by the data import handler
     * @throws IllegalArgumentException if the {@code dataDir} attribute is not configured
     */
    @Override
    public void init(Context context) {
        super.init(context);

        String dataDirName = getStringFromContext("dataDir", null);
        if (dataDirName == null) {
            // new File(null) would only produce an anonymous NullPointerException
            throw new IllegalArgumentException("Required entity attribute 'dataDir' is not set");
        }
        this.dataDir = new File(dataDirName);
        this.ignoreFrom = parseIgnoreFrom(getStringFromContext("ignoreFrom", ""));

        LOG.info("datadir: " + this.dataDir);
        LOG.info("ignoreFrom: " + this.ignoreFrom);

        // We don't distinguish between full and delta import here. Always need to be triggered
        // as FULL dump but we would look at date from 'dataimport.properties' to determine last index time
        // and only process delta changes
        LOG.info("Current process: " + context.currentProcess());
        Date since = getSince(context);
        LOG.info("Since: " + since);

        List<String> files = new ArrayList<String>();
        getFolderFiles(dataDir, since, files);
        LOG.info("Files to process: " + files.size());

        this.fileNames = files.iterator();
    }

    /**
     * Splits the comma separated {@code ignoreFrom} setting into normalized entries.
     *
     * <p>Entries are trimmed and lower-cased because they are compared against the lower-cased
     * sender. Empty entries are dropped: {@code String.contains("")} is always true, so an empty
     * entry would silently discard every message.
     */
    private static List<String> parseIgnoreFrom(String setting) {
        List<String> result = new ArrayList<String>();
        if (setting == null) {
            return result;
        }
        for (String entry : setting.split(",")) {
            String normalized = entry.trim().toLowerCase(Locale.ROOT);
            if (normalized.length() > 0) {
                result.add(normalized);
            }
        }
        return result;
    }

    /**
     * Determines the point in time after which mail files are considered new.
     *
     * <p>The value is resolved from the {@code ${dataimporter.last_index_time}} token and moved
     * two minutes into the past (see the comment in the body).
     *
     * @param c the import context used to resolve the token
     * @return the adjusted last index time, or {@code null} when no usable value is available,
     *         in which case callers apply no date filtering
     */
    private Date getSince(Context c) {
        // perhaps there is a better way to get last index time?
        String sinceStr = c.replaceTokens("${dataimporter.last_index_time}");
        if (sinceStr == null || sinceStr.trim().length() == 0) {
            return null;
        }
        if (sinceStr.contains("1969")) {
            // if there are no last delta time (e.g. file removed) date in 1969 is returned
            return null;
        }
        // Locale.ROOT pins the Gregorian calendar; with the default locale the year could be
        // interpreted in another calendar system (e.g. Buddhist on th_TH).
        SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT);
        try {
            Date date = df.parse(sinceStr);
            // it seems that last updated time is saved at the end of the run so there is a window
            // when some new files could be written which would fall into the crack so we would
            // move this time few minutes earlier. Not particularly nice hack
            Calendar cal = Calendar.getInstance();
            cal.setTime(date);
            cal.add(Calendar.MINUTE, -2);
            return cal.getTime();
        } catch (ParseException e) {
            LOG.error("Failed to parse date: [" + sinceStr + "]", e);
            return null;
        }
    }

    /**
     * Produces the next document by walking through the remaining mail files.
     *
     * <p>Files whose name cannot be decoded, that cannot be read as a message, or that yield no
     * document are skipped silently (apart from logging) and the next file is tried.
     *
     * @return the row for the next indexable message, or {@code null} once all files are consumed
     */
    @Override
    public Map<String, Object> nextRow() {
        while (this.fileNames.hasNext()) {
            final File mailFile = new File(this.fileNames.next());

            final FileInfo info;
            try {
                info = new FileInfo(mailFile);
            } catch (InvalidFileException e) {
                LOG.error("We should be able to parse this FileInfo here: " + e.getMessage() + ": "
                        + mailFile.getName());
                continue;
            }

            final Message message = readMessage(session, mailFile);
            final Map<String, Object> row = (message == null) ? null : getDocumentFromMail(message);
            if (row != null) {
                // use file name as id - it's guaranteed to be unique
                row.put(MESSAGE_ID, info.id);
                row.put(USER_ID, info.user);
                LOG.info("Processed " + mailFile.getAbsolutePath());
                return row;
            }
        }
        return null;
    }

    /**
     * Parses the given file as a MIME message.
     *
     * @param session mail session passed to the {@link MimeMessage} constructor
     * @param f       file holding the raw message
     * @return the parsed message, or {@code null} if reading or parsing failed (the failure is logged)
     */
    @SuppressWarnings("resource")
    private Message readMessage(Session session, File f) {
        InputStream in = null;
        Message parsed = null;
        try {
            in = new BufferedInputStream(new FileInputStream(f));
            parsed = new MimeMessage(session, in);
        } catch (Exception e) {
            LOG.error("Error indexing message from " + f.getAbsolutePath() + ": " + e.getMessage(), e);
        } finally {
            // the stream is released on both the success and the failure path
            close(in);
        }
        return parsed;
    }

    /**
     * Converts a mail message into a row of index fields.
     *
     * @param mail the message to convert
     * @return the populated row, or {@code null} if the message was rejected or an error occurred
     *         (errors are logged, never propagated)
     */
    private Map<String, Object> getDocumentFromMail(Message mail) {
        final Map<String, Object> doc = new HashMap<String, Object>();
        boolean accepted;
        try {
            accepted = addPartToDocument(mail, doc, true);
        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
            accepted = false;
        }
        return accepted ? doc : null;
    }

    /**
     * Adds the content of a MIME part, and recursively of all its sub-parts, to the row.
     *
     * <p>For the outermost message the envelope fields are added first, together with a hash of
     * the cleaned sender and the subject. Multipart containers and embedded {@code message/rfc822}
     * parts are descended into; of a {@code multipart/alternative} only the first alternative is
     * indexed. Leaf parts are converted to text and stored under the attachment or content fields.
     *
     * @param part      the part to process
     * @param row       the row being populated
     * @param outerMost {@code true} only for the top-level message
     * @return {@code false} if the envelope check rejected the message, {@code true} otherwise
     * @throws Exception if the part cannot be read or its text cannot be extracted
     */
    public boolean addPartToDocument(Part part, Map<String, Object> row, boolean outerMost) throws Exception {
        if (outerMost && part instanceof Message) {
            if (!addEnvelopToDocument(part, row)) {
                return false;
            }
            // store hash
            row.put(HASH, DigestUtils.md5Hex((String) row.get(FROM_CLEAN) + "" + (String) row.get(SUBJECT)));
        }

        if (part.isMimeType("multipart/*")) {
            Multipart mp = (Multipart) part.getContent();
            int count = mp.getCount();
            if (part.isMimeType("multipart/alternative")) {
                // only the first alternative is indexed; Math.min keeps an empty multipart
                // from triggering an out-of-range getBodyPart(0)
                count = Math.min(count, 1);
            }
            for (int i = 0; i < count; i++) {
                addPartToDocument(mp.getBodyPart(i), row, false);
            }
        } else if (part.isMimeType("message/rfc822")) {
            addPartToDocument((Part) part.getContent(), row, false);
        } else {
            addLeafPartToDocument(part, row);
        }
        return true;
    }

    /**
     * Extracts the text of a leaf part and stores it as attachment (text plus file name) when
     * the part's disposition is "attachment", or as message content otherwise.
     */
    private void addLeafPartToDocument(Part part, Map<String, Object> row) throws Exception {
        ContentType ctype = new ContentType(part.getContentType());
        String disp = part.getDisposition();
        String fileName = part.getFileName();

        Metadata md = new Metadata();
        md.set(HttpHeaders.CONTENT_TYPE, ctype.getBaseType().toLowerCase(Locale.ROOT));
        md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);

        // The stream is opened last so that a failure while reading the headers above
        // cannot leave it open.
        @SuppressWarnings("resource") // Tika will close stream
        InputStream is = part.getInputStream();
        String content = this.tika.parseToString(is, md);

        if (disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT)) {
            appendValue(row, ATTACHMENT, content);
            appendValue(row, ATTACHMENT_NAMES, fileName);
        } else {
            appendValue(row, CONTENT, content);
        }
    }

    /**
     * Appends a value to the multi-valued field stored under {@code key}, creating the backing
     * list on first use.
     */
    @SuppressWarnings("unchecked") // within this class these keys only ever hold a List<String>
    private static void appendValue(Map<String, Object> row, String key, String value) {
        List<String> values = (List<String>) row.get(key);
        if (values == null) {
            values = new ArrayList<String>();
            row.put(key, values);
        }
        values.add(value);
    }

    /**
     * Copies the envelope information of a message (sender, recipients, ids, dates, flags and
     * the X-Mailer header) into the row.
     *
     * @param part the message; must be a {@link MimeMessage}
     * @param row  the row being populated
     * @return {@code false} if the message has no sender or the sender matches one of the
     *         {@code ignoreFrom} entries, {@code true} otherwise
     * @throws MessagingException if a header cannot be read
     */
    private boolean addEnvelopToDocument(Part part, Map<String, Object> row) throws MessagingException {
        MimeMessage mail = (MimeMessage) part;

        Address[] senders = mail.getFrom();
        if (senders == null || senders.length == 0) {
            return false;
        }
        String from = senders[0].toString();
        // check if we should ignore this sender; Locale.ROOT keeps the comparison independent of
        // the JVM default locale
        String fromLower = from.toLowerCase(Locale.ROOT);
        for (String ignore : this.ignoreFrom) {
            // an empty entry would match every sender, so it is never treated as a match
            if (ignore.length() > 0 && fromLower.contains(ignore)) {
                LOG.info("Ignoring email from " + from);
                return false;
            }
        }
        row.put(FROM, from);
        row.put(FROM_CLEAN, cleanAddress(from));

        List<String> to = new ArrayList<String>();
        Message.RecipientType[] recipientTypes = {
            Message.RecipientType.TO, Message.RecipientType.CC, Message.RecipientType.BCC
        };
        for (Message.RecipientType type : recipientTypes) {
            Address[] recipients = mail.getRecipients(type);
            if (recipients != null) {
                addAddressToList(recipients, to);
            }
        }
        if (!to.isEmpty()) {
            row.put(TO_CC_BCC, to);
            List<String> cleanAddresses = cleanAddresses(to);
            row.put(TO_CC_BCC_CLEAN, cleanAddresses);
            // save first TO address into separate field
            row.put(TO, to.get(0));
            // The cleaned list can be empty when none of the recipients looks like an email
            // address; an unguarded get(0) would then abort indexing of the whole message.
            if (!cleanAddresses.isEmpty()) {
                row.put(TO_CLEAN, cleanAddresses.get(0));
            }
        }

        row.put(MESSAGE_ID, mail.getMessageID());
        row.put(SUBJECT, mail.getSubject());

        Date sent = mail.getSentDate();
        if (sent != null) {
            row.put(SENT_DATE, sent);
        }
        Date received = mail.getReceivedDate();
        if (received != null) {
            row.put(RECEIVED_DATE, received);
        }

        Flags mailFlags = mail.getFlags();
        List<String> flags = new ArrayList<String>();
        for (Flags.Flag flag : mailFlags.getSystemFlags()) {
            if (flag == Flags.Flag.ANSWERED) {
                flags.add(FLAG_ANSWERED);
            } else if (flag == Flags.Flag.DELETED) {
                flags.add(FLAG_DELETED);
            } else if (flag == Flags.Flag.DRAFT) {
                flags.add(FLAG_DRAFT);
            } else if (flag == Flags.Flag.FLAGGED) {
                flags.add(FLAG_FLAGGED);
            } else if (flag == Flags.Flag.RECENT) {
                flags.add(FLAG_RECENT);
            } else if (flag == Flags.Flag.SEEN) {
                flags.add(FLAG_SEEN);
            }
        }
        flags.addAll(Arrays.asList(mailFlags.getUserFlags()));
        row.put(FLAGS, flags);

        String[] hdrs = mail.getHeader("X-Mailer");
        if (hdrs != null && hdrs.length > 0) {
            row.put(XMAILER, hdrs[0]);
        }
        return true;
    }

    /**
     * Appends the string form of every address to the list; for group addresses the individual
     * group members are appended as well, directly after the group itself.
     *
     * @param adresses the addresses to add
     * @param to       the list receiving the address strings
     * @throws AddressException if the members of a group address cannot be parsed
     */
    private void addAddressToList(Address[] adresses, List<String> to) throws AddressException {
        for (Address address : adresses) {
            to.add(address.toString());
            // only internet addresses can be groups; other Address types are added as they are
            // instead of failing with a ClassCastException
            if (!(address instanceof InternetAddress)) {
                continue;
            }
            InternetAddress ia = (InternetAddress) address;
            if (ia.isGroup()) {
                InternetAddress[] group = ia.getGroup(false);
                if (group != null) {
                    for (InternetAddress member : group) {
                        to.add(member.toString());
                    }
                }
            }
        }
    }

    private static final String EMAIL_PATTERN = "^[_A-Za-z0-9-\\+]+(\\.[_A-Za-z0-9-]+)*@"
            + "[A-Za-z0-9-]+(\\.[A-Za-z0-9]+)*(\\.[A-Za-z]{2,})$";

    private static final Pattern emailPattern = Pattern.compile(EMAIL_PATTERN);

    /**
     * Reduces an address string to a bare, lower-cased email address.
     *
     * <ul>
     *   <li>If the input contains an {@code <...>} section, the text between the brackets is
     *       returned in lower case, provided it matches {@link #EMAIL_PATTERN}; otherwise
     *       {@code null} is returned.</li>
     *   <li>If the input has no {@code <...>} section, the trimmed, lower-cased input is returned
     *       without being validated against the pattern.</li>
     * </ul>
     *
     * Lower-casing uses {@link Locale#ROOT} so that the result does not depend on the JVM default
     * locale (with a Turkish locale {@code "I"} would otherwise become a dotless {@code "ı"}).
     *
     * @param a the raw address, e.g. {@code "John Doe <John@Example.com>"}; may be {@code null}
     * @return the cleaned address, or {@code null} for null/blank input or a bracketed section
     *         that does not look like an email address
     */
    private static String cleanAddress(String a) {
        if (a == null) {
            return null;
        }
        String trimmed = a.trim();
        if (trimmed.length() == 0) {
            return null;
        }
        int open = a.indexOf('<');
        int close = a.indexOf('>', open);
        if (open < 0 || close <= open) {
            // no bracketed section: the whole input is taken as the address
            return trimmed.toLowerCase(Locale.ROOT);
        }
        String mail = a.substring(open + 1, close);
        return emailPattern.matcher(mail).matches() ? mail.toLowerCase(Locale.ROOT) : null;
    }

    /**
     * Cleans every address of the given list with {@code cleanAddress}, keeping the original
     * order and leaving out the entries for which no clean form exists.
     *
     * @param aa raw address strings
     * @return a new list with the cleaned addresses; possibly shorter than the input
     */
    private static List<String> cleanAddresses(List<String> aa) {
        final List<String> cleaned = new ArrayList<String>();
        for (Iterator<String> it = aa.iterator(); it.hasNext();) {
            final String candidate = cleanAddress(it.next());
            if (candidate == null) {
                continue;
            }
            cleaned.add(candidate);
        }
        return cleaned;
    }

    /**
     * Looks up an entity attribute and resolves the tokens it contains.
     *
     * @param prop   name of the entity attribute
     * @param ifNull value returned when the attribute is not present
     * @return the attribute value after token replacement, or {@code ifNull} if it is absent
     */
    private String getStringFromContext(String prop, String ifNull) {
        final String raw = context.getEntityAttribute(prop);
        return (raw == null) ? ifNull : context.replaceTokens(raw);
    }

    /**
     * Closes the given resource quietly: {@code null} is ignored and an {@link IOException}
     * raised by {@code close()} is logged as a warning instead of being propagated.
     *
     * @param c the resource to close; may be {@code null}
     */
    private void close(Closeable c) {
        if (c == null) {
            return;
        }
        try {
            c.close();
        } catch (IOException e) {
            LOG.warn("Error closing Closeable: " + e.getMessage(), e);
        }
    }

    /**
     * Recursively collects the absolute paths of all mail files below {@code dir} that should be
     * indexed.
     *
     * <p>Sub-directories whose path identifies a day before {@code since} are not descended into;
     * files are selected with {@link #shouldAcceptFile(File, Date)}.
     *
     * @param dir       directory to scan
     * @param since     only files newer than this are collected; {@code null} collects everything
     * @param fileNames list receiving the absolute paths of the accepted files
     */
    void getFolderFiles(File dir, final Date since, final List<String> fileNames) {
        File[] children = dir.listFiles();
        if (children == null) {
            // listFiles() returns null if dir is not a directory or cannot be read
            LOG.warn("Unable to list directory: " + dir);
            return;
        }
        for (File child : children) {
            if (child.isDirectory()) {
                if (shouldAcceptDirectory(child, since)) {
                    getFolderFiles(child, since, fileNames);
                }
            } else if (shouldAcceptFile(child, since)) {
                fileNames.add(child.getAbsolutePath());
            }
        }
    }

    /**
     * Decides whether a directory has to be scanned.
     *
     * <p>Optimization because path to data file has full date, e.g.:
     * foo.com/2013/05/11/user_20130611T092053.mail
     * A directory is skipped only if the last ten characters of its path are exactly such a
     * date and that day lies before the day of {@code since}; in every other case it is scanned.
     */
    private static boolean shouldAcceptDirectory(File dir, Date since) {
        if (since == null) {
            return true;
        }
        String name = dir.getAbsolutePath();
        if (name.length() < 10) {
            return true;
        }
        String suffix = name.substring(name.length() - 10);
        // Locale.ROOT pins the Gregorian calendar regardless of the JVM default locale
        SimpleDateFormat df = new SimpleDateFormat(
                "yyyy" + File.separator + "MM" + File.separator + "dd", Locale.ROOT);
        df.setLenient(false);
        Date dirDate;
        try {
            dirDate = df.parse(suffix);
        } catch (ParseException e) {
            return true;
        }
        // parse() accepts a matching prefix (e.g. "3/12/05/11"); formatting the result back
        // makes sure that the whole suffix really is a yyyy/MM/dd date
        if (!df.format(dirDate).equals(suffix)) {
            return true;
        }
        Date sinceDay = DateUtils.truncate(since, Calendar.DAY_OF_MONTH);
        boolean oldDir = dirDate.before(sinceDay);
        if (oldDir) {
            LOG.info("Skipping old directory: " + dir);
        }
        return !oldDir;
    }

    /**
     * Tells whether a file has to be indexed.
     *
     * <p>Directories are never accepted. Without a {@code since} date every file is accepted;
     * otherwise the date encoded in the file name must be after {@code since}. Files whose name
     * cannot be decoded are rejected and the reason is logged.
     *
     * @param f     the candidate file
     * @param since lower bound (exclusive) for the file date, or {@code null} for no bound
     * @return {@code true} if the file should be indexed
     */
    boolean shouldAcceptFile(File f, Date since) {
        if (f.isDirectory()) {
            return false;
        }

        boolean accepted = true;
        if (since != null) {
            try {
                accepted = new FileInfo(f).date.after(since);
            } catch (InvalidFileException e) {
                LOG.error(e.getMessage());
                accepted = false;
            }
        }
        return accepted;
    }

    /**
     * Information decoded from the name of a mail file.
     *
     * <p>The constructor accepts names of the form {@code user_yyyyMMdd'T'HHmmss_hash.mail},
     * i.e. exactly three parts separated by {@code '_'} followed by the {@code .mail} extension.
     * (The files are presumably produced by gmailbackup - confirm against the producer.)
     */
    static class FileInfo {
        // SimpleDateFormat is not thread-safe, so every use is guarded by synchronized (DF).
        // Locale.ROOT pins the Gregorian calendar independently of the JVM default locale.
        private final static SimpleDateFormat DF = new SimpleDateFormat("yyyyMMdd'T'HHmmss", Locale.ROOT);
        /** File name without the {@code .mail} extension. */
        final String id;
        /** First part of the file name. */
        final String user;
        /** Date parsed from the second part of the file name. */
        final Date date;
        /** Third part of the file name. */
        final String hash;

        /**
         * Decodes the name of the given file; the file itself is not accessed.
         *
         * @param f the mail file
         * @throws InvalidFileException if the name does not end with {@code .mail}, does not
         *         consist of exactly three parts, or its date part cannot be parsed
         */
        FileInfo(File f) throws InvalidFileException {
            String name = f.getName();
            if (!name.endsWith(".mail")) {
                throw new InvalidFileException("File name not recognized [" + name + "]");
            }
            // remove extension
            name = name.substring(0, name.lastIndexOf('.'));
            this.id = name;
            // split into user, date and hash
            String[] parts = name.split("_");
            if (parts.length != 3) {
                throw new InvalidFileException("Not parseable file name [" + name + "]");
            }
            this.user = parts[0];
            this.hash = parts[2];
            try {
                synchronized (DF) {
                    this.date = DF.parse(parts[1]);
                }
            } catch (ParseException e) {
                // keep the cause so that the offending input can be diagnosed
                throw new InvalidFileException("Unable to parse date from file [" + name + "]", e);
            }
        }
    }

    /**
     * Signals that a file name could not be interpreted as the name of a mail file.
     */
    static class InvalidFileException extends Exception {
        private static final long serialVersionUID = 1L;

        /**
         * @param message description of the problem
         * @param cause   the underlying failure
         */
        public InvalidFileException(String message, Throwable cause) {
            super(message, cause);
        }

        /**
         * @param message description of the problem
         */
        public InvalidFileException(String message) {
            super(message);
        }
    }

}