Java tutorial
package it.unimi.di.big.mg4j.document; /* * MG4J: Managing Gigabytes for Java (big) * * Copyright (C) 2005-2015 Paolo Boldi and Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 3 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ import it.unimi.di.big.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap; import it.unimi.dsi.fastutil.objects.Reference2ObjectMap; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.NullReader; import it.unimi.dsi.io.WordReader; import it.unimi.dsi.lang.MutableString; import it.unimi.dsi.util.Properties; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.Serializable; import java.io.StringReader; import java.lang.reflect.InvocationTargetException; import java.nio.charset.Charset; import java.text.ParseException; import java.util.Date; import javax.mail.Address; import javax.mail.Folder; import javax.mail.Message; import javax.mail.MessagingException; import javax.mail.Session; import javax.mail.Store; import javax.mail.URLName; import javax.mail.internet.AddressException; import javax.mail.internet.MailDateFormat; import org.apache.commons.configuration.ConfigurationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.martiansoftware.jsap.FlaggedOption; import com.martiansoftware.jsap.JSAP; import com.martiansoftware.jsap.JSAPException; import com.martiansoftware.jsap.JSAPResult; import com.martiansoftware.jsap.Parameter; import com.martiansoftware.jsap.SimpleJSAP; import com.martiansoftware.jsap.UnflaggedOption; /** A {@link it.unimi.di.big.mg4j.document.DocumentCollection} corresponding to * a Javamail {@link javax.mail.Store}. * * <p>This class is very simple: for instance, it will not understand correctly * multipart MIME messages, which will seen as without content. You are invited * to extend it. * * <p>This implementation is an example of a document collection that does not use a * factory: more precisely, there is an internal class that act as a wired factory. This * structure is made necessary by the fact that Javamail provide no means to parse messages * starting from an {@link java.io.InputStream}, which makes a separate implementation * of {@link it.unimi.di.big.mg4j.document.DocumentFactory#getDocument(InputStream,Reference2ObjectMap)} * impossible. * * <p>Note that to be able to use this class you must configure properly Javamail: * this involves setting up a <samp>javamail.properties</samp> file describing the * providers you want to use for the various access schemes. GNU Javamail, for instance, contains * providers for files, IMAP, POP, etc. */ public class JavamailDocumentCollection extends AbstractDocumentCollection implements Serializable { private final static Logger LOGGER = LoggerFactory.getLogger(JavamailDocumentCollection.class); /** A special date (actually, 1 January 1970) representing no date. */ public final static Date NO_DATE = new Date(0); private static final long serialVersionUID = 2L; /** Our only session . */ private final static Session SESSION = Session.getDefaultInstance(new java.util.Properties()); /** The number of messages. */ private final int numberOfMessages; /** The factory to be used by this collection. */ private final JavamailDocumentFactory factory; /** The URL for the store. */ private final String storeUrl; /** The folder name. */ private final String folderName; /** The javamail store we are reading. */ private final transient Store store; /** The javamail folder we are reading. */ private final transient Folder folder; /** Builds a document collection corresponding to a given store URL and folder name. * * <p><strong>Beware.</strong> This class is not suited for large mbox files! * * @param storeUrl the javamail URL of the store. * @param folderName the folder name. * @param factory the factory that will be used to create documents. * @throws MessagingException */ protected JavamailDocumentCollection(final String storeUrl, final String folderName, final JavamailDocumentFactory factory) throws MessagingException { this.storeUrl = storeUrl; this.folderName = folderName; this.factory = factory; this.store = SESSION.getStore(new URLName(storeUrl)); store.connect(); this.folder = store.getDefaultFolder().getFolder(folderName); folder.open(Folder.READ_ONLY); this.numberOfMessages = folder.getMessageCount(); } public JavamailDocumentCollection(final String storeUrl, final String folderName) throws MessagingException { this(storeUrl, folderName, new JavamailDocumentFactory()); } public JavamailDocumentCollection(final String storeUrl, final String folderName, final Properties properties) throws MessagingException, ConfigurationException { this(storeUrl, folderName, new JavamailDocumentFactory(properties)); } public JavamailDocumentCollection(final String storeUrl, final String folderName, final String[] property) throws MessagingException, ConfigurationException { this(storeUrl, folderName, new JavamailDocumentFactory(property)); } public JavamailDocumentCollection(final String storeUrl, final String folderName, final Reference2ObjectMap<Enum<?>, Object> defaultMetadata) throws MessagingException { this(storeUrl, folderName, new JavamailDocumentFactory(defaultMetadata)); } public JavamailDocumentCollection copy() { try { return new JavamailDocumentCollection(storeUrl, folderName, factory.copy()); } catch (MessagingException e) { throw new RuntimeException(e); } } private final static class JavamailDocumentFactory extends PropertyBasedDocumentFactory { private static final long serialVersionUID = 1L; /** The field names (each also corresponds to a header, except for the 0-th). */ private static final String[] FIELD_NAME = { "body", "subject", "from", "to", "date", "cc", "bcc", "content-type" }; /** The field types. */ private static final FieldType[] FIELD_TYPE = { FieldType.TEXT, FieldType.TEXT, FieldType.TEXT, FieldType.TEXT, FieldType.DATE, FieldType.TEXT, FieldType.TEXT, FieldType.TEXT }; /** The map from field names to field indices. */ private static final Object2IntOpenHashMap<String> FIELD2INDEX; static { FIELD2INDEX = new Object2IntOpenHashMap<String>(FIELD_NAME.length, .5f); FIELD2INDEX.defaultReturnValue(-1); for (int i = 0; i < FIELD_NAME.length; i++) FIELD2INDEX.put(FIELD_NAME[i], i); } /** The word reader used for all documents. */ private WordReader wordReader = new FastBufferedReader(); protected boolean parseProperty(final String key, final String[] values, final Reference2ObjectMap<Enum<?>, Object> metadata) throws ConfigurationException { if (sameKey(MetadataKeys.ENCODING, key)) { metadata.put(MetadataKeys.ENCODING, Charset.forName(ensureJustOne(key, values)).toString()); return true; } return super.parseProperty(key, values, metadata); } public JavamailDocumentFactory() { init(); } public JavamailDocumentFactory(final Properties properties) throws ConfigurationException { super(properties); init(); } public JavamailDocumentFactory(final Reference2ObjectMap<Enum<?>, Object> defaultMetadata) { super(defaultMetadata); init(); } public JavamailDocumentFactory(final String[] property) throws ConfigurationException { super(property); init(); } private void init() { wordReader = new FastBufferedReader(); } public JavamailDocumentFactory copy() { return new JavamailDocumentFactory(defaultMetadata); } public int numberOfFields() { return FIELD_NAME.length; } public String fieldName(final int field) { ensureFieldIndex(field); return FIELD_NAME[field]; } public FieldType fieldType(final int field) { ensureFieldIndex(field); return FIELD_TYPE[field]; } public int fieldIndex(final String fieldName) { return FIELD2INDEX.getInt(fieldName); } public Document getDocument(final InputStream rawContent, final Reference2ObjectMap<Enum<?>, Object> metadata) { throw new UnsupportedOperationException(); } } public DocumentFactory factory() { return factory; } public long size() { return numberOfMessages; } public void close() throws IOException { super.close(); try { folder.close(false); store.close(); } catch (MessagingException e) { throw new IOException(e.toString()); } } private Object readResolve() throws MessagingException, IOException { super.close(); // To avoid spurious warnings about unclosed collected objects. return new JavamailDocumentCollection(storeUrl, folderName, factory); } public Document document(final long index) throws IOException { try { return new AbstractDocument() { // Can you believe that? Javamail numbers messages from 1... final Message message = folder.getMessage((int) (index + 1)); public CharSequence title() { final String subject; try { subject = message.getSubject(); } catch (MessagingException e) { throw new RuntimeException(e.toString()); } if (subject == null) return (CharSequence) factory.resolve(MetadataKeys.TITLE, factory.defaultMetadata); else return subject; } public CharSequence uri() { try { return folder.getURLName() + "#" + message.getMessageNumber(); } catch (MessagingException e) { throw new RuntimeException(e); } } private Reader joinAddresses(final Address address[]) { if (address == null) return NullReader.getInstance(); final MutableString s = new MutableString(); if (address != null) { for (int i = 0; i < address.length; i++) { if (i > 0) s.append(", "); s.append(address[i]); } } return new FastBufferedReader(s); } public Object content(final int field) throws IOException { factory.ensureFieldIndex(field); try { switch (field) { case 0: // body // TODO: analyze multipart messages Object content = null; try { content = message.getContent(); } catch (Exception e) { LOGGER.warn("Message " + message.getMessageNumber() + " cannot be decoded; content will be empty", e); } if (content != null && content instanceof String) return new StringReader((String) content); return NullReader.getInstance(); case 1: // subject return message.getSubject() == null ? NullReader.getInstance() : new StringReader(message.getSubject()); case 2: // from return joinAddresses(message.getFrom()); case 3: // to return joinAddresses(message.getRecipients(Message.RecipientType.TO)); case 4: // date final String[] date = message.getHeader("date"); if (date == null || date.length == 0) return NO_DATE; final MailDateFormat mailDateFormat = new MailDateFormat(); try { return mailDateFormat.parse(date[0]); } catch (ParseException e) { LOGGER.warn("Error parsing date " + date[0]); return NO_DATE; } case 5: // cc return joinAddresses(message.getRecipients(Message.RecipientType.CC)); case 6: // bcc return joinAddresses(message.getRecipients(Message.RecipientType.BCC)); case 7: // content-type return new StringReader(message.getContentType()); } } catch (MessagingException e) { // A simple error if (e instanceof AddressException) { LOGGER.warn("Error while parsing address", e); return NullReader.getInstance(); } throw new IOException(e.toString()); } throw new IllegalStateException(); } public WordReader wordReader(final int field) { factory.ensureFieldIndex(field); return factory.wordReader; } }; } catch (MessagingException e) { throw new IOException(e.toString()); } } public Reference2ObjectMap<Enum<?>, Object> metadata(final long index) { ensureDocumentIndex(index); final Reference2ObjectArrayMap<Enum<?>, Object> metadata = new Reference2ObjectArrayMap<Enum<?>, Object>(2); metadata.put(MetadataKeys.TITLE, "Message #" + index); metadata.put(MetadataKeys.URI, storeUrl + folder + "#" + index); return metadata; } public InputStream stream(final long index) throws IOException { ensureDocumentIndex(index); try { // Can you believe that? Javamail numbers messages from 1... return folder.getMessage((int) (index + 1)).getInputStream(); } catch (MessagingException e) { throw new IOException(e.toString()); } } public static void main(final String[] arg) throws IOException, JSAPException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, InstantiationException, MessagingException, ConfigurationException { SimpleJSAP jsap = new SimpleJSAP(JavamailDocumentCollection.class.getName(), "Saves a serialised mbox collection based on a given mbox file.", new Parameter[] { new FlaggedOption("property", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p', "property", "A 'key=value' specification, or the name of a property file") .setAllowMultipleDeclarations(true), new UnflaggedOption("collection", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename for the serialised collection."), new UnflaggedOption("storeUrl", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The javamail store."), new UnflaggedOption("folder", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The folder to be read.") }); JSAPResult jsapResult = jsap.parse(arg); if (jsap.messagePrinted()) return; BinIO.storeObject(new JavamailDocumentCollection(jsapResult.getString("storeUrl"), jsapResult.getString("folder"), jsapResult.getStringArray("property")), jsapResult.getString("collection")); } }