/*
 * Copyright (C) 2012 The Stanford MobiSocial Laboratory
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.stanford.muse.index;

import com.google.common.collect.LinkedHashMultimap;
import com.google.common.collect.Multimap;
import edu.stanford.muse.Config;
import edu.stanford.muse.datacache.Blob;
import edu.stanford.muse.datacache.BlobStore;
import edu.stanford.muse.email.*;
import edu.stanford.muse.groups.SimilarGroup;
import edu.stanford.muse.ie.AuthorityMapper;
import edu.stanford.muse.ie.NameInfo;
import edu.stanford.muse.ie.variants.EntityMapper;
import edu.stanford.muse.ner.NER;
import edu.stanford.muse.ner.model.NEType;
import edu.stanford.muse.util.EmailUtils;
import edu.stanford.muse.util.Pair;
import edu.stanford.muse.util.Span;
import edu.stanford.muse.util.Util;
import edu.stanford.muse.webapp.EmailRenderer;
import edu.stanford.muse.webapp.ModeConfig;
import edu.stanford.muse.webapp.SimpleSessions;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.document.Field;
import org.apache.lucene.queryparser.classic.ParseException;

import java.io.*;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;

/**
 * Core data structure that represents an archive. Conceptually, an archive is a
 * collection of indexed messages (which can be incrementally updated), along
 * with a blob store. It also has address books, a group assigner, etc., which are
 * second-order properties -- they may be updated independently of the docs (in
 * the future). allDocs is the list of indexed docs, NOT the ones in the current
 * filter... this still needs to be worked out. An archive should be capable of
 * being loaded in multiple sessions simultaneously. One current problem is that
 * the summarizer is stored in the indexer -- we should pull it out into
 * per-session state.
 */
public class Archive implements Serializable {

    private static Log log = LogFactory.getLog(Archive.class);
    private final static long serialVersionUID = 1L;

    // the archive structure: the archive's top-level dir has these subdirs
    public static final String BLOBS_SUBDIR = "blobs";
    public static final String INDEXES_SUBDIR = "indexes";
    // original idea was that there would be different sessions on the same archive (index), but in practice we only have one session
    public static final String SESSIONS_SUBDIR = "sessions";
    private static final String LEXICONS_SUBDIR = "lexicons";
    private static final String FEATURES_SUBDIR = "mixtures";
    public static final String IMAGES_SUBDIR = "images";

    // this is the default for Muse; EpaddInitializer will set it differently, so don't make it final
    public static String[] LEXICONS = new String[] { "default.english.lex.txt" };
    //////////// CACHE variables ///////////////
    // these 5 variables cache the list of all entities/blob names/annotations/folders/email sources in the archive.
    // of these, folders, email sources and blob names generally don't change;
    // however, entities and annotations must be recomputed any time there is a change.
    // for good measure, we invalidate all of them when close() is called on the archive
    private transient Set<String> allEntities, allBlobNames, allAnnotations, allFolders, allEmailSources;
    //////////// END CACHE variables ///////////////

    /* all of the following don't change based on the current filter */
    public Indexer indexer;
    private IndexOptions indexOptions;
    public BlobStore blobStore;
    public AddressBook addressBook;
    public GroupAssigner groupAssigner;
    transient private Map<String, Lexicon> lexiconMap = null;
    private List<Document> allDocs; // this is the equivalent of fullEmailDocs earlier
    transient private Set<Document> allDocsAsSet = null;

    private Set<FolderInfo> fetchedFolderInfos = new LinkedHashSet<FolderInfo>(); // keep this private since it's updated in a controlled way
    transient private LinkedHashMap<String, FolderInfo> fetchedFolderInfosMap = null;

    public Set<String> ownerNames = new LinkedHashSet<String>(), ownerEmailAddrs = new LinkedHashSet<String>();
    private EntityMapper entityMapper;
    public AuthorityMapper authorityMapper;

    /* transient because this is saved and loaded separately */
    transient private Map<String, NameInfo> nameMap;

    public ProcessingMetadata processingMetadata = new ProcessingMetadata();
    public List<String> allAccessions = new ArrayList<String>();
    public List<FetchStats> allStats = new ArrayList<FetchStats>(); // multiple stats because usually there is 1 per import

    public String archiveTitle; // this is the name of this archive

    public synchronized AuthorityMapper getAuthorityMapper() throws IOException, ParseException, ClassNotFoundException {
        // the authority mapper is transient, so it may have to be created each time; it will be loaded from a file if one already exists
        if (authorityMapper == null)
            authorityMapper = AuthorityMapper.createAuthorityMapper(this);
        return authorityMapper;
    }

    /** recreates the authority mapper; call this, e.g., if the address book changes */
    public synchronized void recreateAuthorityMapper() throws IOException, ParseException, ClassNotFoundException {
        authorityMapper = AuthorityMapper.createAuthorityMapper(this);
    }

    public synchronized EntityMapper getEntityMapper() {
        if (entityMapper == null)
            entityMapper = new EntityMapper();
        return entityMapper;
    }

    /*
     * baseDir is used loosely... it may not be fully reliable, e.g. when the
     * archive moves.
     */
    public String baseDir;

    public SentimentStats stats = new SentimentStats();

    // clusters are somewhat ephemeral and not necessarily a core part of the
    // Archive struct. consider moving it elsewhere.
    private List<MultiDoc> docClusters;

    /** @return all the links extracted from the archive content */
    // This is a better location for this than Indexer, I (@vihari) think
    public List<LinkInfo> getLinks() {
        return indexer.links;
    }

    public Set<Blob> blobsForQuery(String term) {
        return indexer.blobsForQuery(term);
    }

    public Collection<edu.stanford.muse.index.Document> docsForQuery(String term, int cluster, int threshold, Indexer.QueryType qt) {
        Indexer.QueryOptions options = new Indexer.QueryOptions();
        options.setQueryType(qt);
        options.setCluster(cluster);
        options.setThreshold(threshold);
        return indexer.docsForQuery(term, options);
    }

    public Collection<edu.stanford.muse.index.Document> docsForQuery(String term, int cluster, int threshold) {
        Indexer.QueryOptions options = new Indexer.QueryOptions();
        options.setCluster(cluster);
        options.setThreshold(threshold);
        return indexer.docsForQuery(term, options);
    }

    public Collection<edu.stanford.muse.index.Document> docsForQuery(String term, int cluster, Indexer.QueryType qt) {
        Indexer.QueryOptions options = new Indexer.QueryOptions();
        options.setQueryType(qt);
        options.setCluster(cluster);
        return indexer.docsForQuery(term, options);
    }

    public Collection<Document> docsForQuery(int cluster, Indexer.QueryType qt) {
        Indexer.QueryOptions options = new Indexer.QueryOptions();
        options.setCluster(cluster);
        options.setQueryType(qt);
        return indexer.docsForQuery(null, options);
    }

    public Collection<EmailDocument> convertToED(Collection<Document> docs) {
        return indexer.convertToED(docs);
    }

    public Collection<edu.stanford.muse.index.Document> docsForQuery(String term, Indexer.QueryType qt) {
        Indexer.QueryOptions options = new Indexer.QueryOptions();
        options.setQueryType(qt);
        return indexer.docsForQuery(term, options);
    }

    /** VIP method: the main way to search for documents in the archive containing the given term (other constraints are embedded in options) */
    public Collection<Document> docsForQuery(String term, Indexer.QueryOptions options) {
        return indexer.docsForQuery(term, options);
    }

    /**
     * @param q - query
     * @param qt - query type
     * @return number of hits for the query
     */
    public int countHitsForQuery(String q, Indexer.QueryType qt) {
        return indexer.countHitsForQuery(q, qt);
    }

    public int countHitsForQuery(String q) {
        return indexer.countHitsForQuery(q, Indexer.QueryType.FULL);
    }
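    /*
     * Illustrative usage sketch for the query API (not part of the original
     * source; the archive instance and search term are hypothetical):
     *
     *   Indexer.QueryOptions options = new Indexer.QueryOptions();
     *   options.setQueryType(Indexer.QueryType.FULL);
     *   Collection<Document> hits = archive.docsForQuery("stanford", options);
     *   int nHits = archive.countHitsForQuery("stanford");
     */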
    public Pair<String, String> getContentsOfAttachment(String fileName) {
        return indexer.getContentsOfAttachment(fileName);
    }

    public EmailDocument docForId(String id) {
        return indexer.docForId(id);
    }

    public String getTitle(org.apache.lucene.document.Document doc) {
        return indexer.getTitle(doc);
    }

    public Indexer.IndexStats getIndexStats() {
        return indexer.stats;
    }

    // these fields are used in the library setting
    static public class ProcessingMetadata implements java.io.Serializable {
        private final static long serialVersionUID = 6304656466358754945L; // compatibility
        public String institution, repository, collectionTitle, collectionID, accessionID, findingAidLink, catalogRecordLink, contactEmail, rights, notes;
        public long timestamp;
        public TimeZone tz;
        public int nDocs, nIncomingMessages, nOutgoingMessages, nHackyDates; // note: a message can be both incoming and outgoing
        public int nBlobs, nUniqueBlobs, nImageBlobs, nDocBlobs, nOtherBlobs;
        // this is just a cache so we don't have to read the archive
        public String ownerName, about;
        // will be set by the method that computes epadd-ner
        public Map<Short, Integer> entityCounts;
        public int numPotentiallySensitiveMessages = -1;
        public Date firstDate, lastDate;

        private static String mergeField(String a, String b) {
            if (a == null)
                return b;
            if (b == null)
                return a;
            if (a.equals(b))
                return a;
            else
                return a + "+" + b;
        }

        public void merge(ProcessingMetadata other) {
            this.institution = mergeField(this.institution, other.institution);
            this.repository = mergeField(this.repository, other.repository);
            this.collectionTitle = mergeField(this.collectionTitle, other.collectionTitle);
            this.collectionID = mergeField(this.collectionID, other.collectionID);
            this.accessionID = mergeField(this.accessionID, other.accessionID);
            this.findingAidLink = mergeField(this.findingAidLink, other.findingAidLink);
            this.catalogRecordLink = mergeField(this.catalogRecordLink, other.catalogRecordLink);
            this.contactEmail = mergeField(this.contactEmail, other.contactEmail);
            this.rights = mergeField(this.rights, other.rights);
            this.notes = mergeField(this.notes, other.notes);
            // mergeField(this.tz, other.tz);
        }
    }

    /**
     * sets the base dir of the archive; this is the place where the entire archive cache is dumped
     */
    public void setBaseDir(String dir) {
        baseDir = dir;
        blobStore.setDir(dir + File.separator + BLOBS_SUBDIR);
    }

    /**
     * Internal, please do not use!
     */
    // is being used in types.jsp -> can we get rid of types.jsp or this call?
    public void setNameMap(Map<String, NameInfo> nameMap) {
        this.nameMap = nameMap;
    }

    public class SentimentStats implements Serializable {
        // this is a placeholder right now... it's essentially storing archive cluer's stats
        private final static long serialVersionUID = 1L;
        public Map<String, Integer> sentimentCounts;
    }

    private void setBlobStore(BlobStore blobStore) {
        this.blobStore = blobStore;
    }

    // TODO: this should not be public; it is being used in doSimpleFlow. At least add some documentation.
    public void setGroupAssigner(GroupAssigner groupAssigner) {
        this.groupAssigner = groupAssigner;
    }

    // TODO: this should not be public; it is being used in doSimpleFlow.
    public void setAddressBook(AddressBook ab) {
        addressBook = ab;
    }

    public BlobStore getBlobStore() {
        return blobStore;
    }

    public AddressBook getAddressBook() {
        return addressBook;
    }

    /** private constructor -- always use createArchive() instead */
    private Archive() {
    }

    public static Archive createArchive() {
        return createArchive("");
    }

    private static Archive createArchive(String title) {
        Archive archive = new Archive();
        archive.archiveTitle = title;
        return archive;
    }
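    /*
     * Illustrative lifecycle sketch (not part of the original source; the base
     * dir and blob store instance are hypothetical):
     *
     *   Archive archive = Archive.createArchive();
     *   archive.setup(baseDir, blobStore, new String[0]); // see setup() below
     *   // ... addDoc() each message, then postProcess() ...
     *   archive.close();
     */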
    public synchronized void openForRead() {
        log.info("Opening archive read only");
        indexer.setupForRead();
    }

    public synchronized void openForWrite() throws IOException {
        log.info("Opening archive for write");

        indexer.setupForWrite();
        if (allDocs != null) {
            // we already have some docs in the index; verify to make
            // sure the archive's idea of #docs is the same as the index's
            int docsInIndex = indexer.nDocsInIndex();
            log.info(docsInIndex + " doc(s) in index, " + allDocs.size() + " doc(s) in archive");
            Util.warnIf(indexer.nDocsInIndex() != allDocs.size(),
                    "Warning: archive nDocsInIndex is not the same as Archive alldocs (possible if docs have been deleted?)", log);
        }
    }

    public synchronized void close() {
        log.info("Closing archive");
        if (indexer != null)
            indexer.close();
        try {
            if (blobStore != null)
                blobStore.pack(); // ideally, do this only if it's dirty
        } catch (Exception e) {
            Util.print_exception(e, log);
        }
        // clear all the caches, so they will be recomputed at next use
        allEntities = allBlobNames = allFolders = allEmailSources = allAnnotations = null;
    }

    // create a new/empty archive.
    // baseDir is for specifying the base location of the Indexer's file-based directories
    /**
     * Sets up an archive.
     * @param baseDir - base dir of the archive
     * @param blobStore - attachment blob store
     * @param args - options for loading, @see edu.stanford.muse.webapp.JSPHelper.preparedArchive; set to null or an empty array for defaults
     */
    public void setup(String baseDir, BlobStore blobStore, String args[]) throws IOException {
        prepareBaseDir(baseDir);
        lexiconMap = createLexiconMap(baseDir);
        indexOptions = new IndexOptions();
        indexOptions.parseArgs(args);
        log.info("Index options are: " + indexOptions);
        indexer = new Indexer(baseDir, indexOptions);
        if (blobStore != null)
            setBlobStore(blobStore);
    }

    /**
     * clears all fields; use when the indexer needs to be completely cleared
     */
    public void clear() {
        if (indexer != null)
            indexer.clear();
        if (allDocs != null)
            allDocs.clear();
        if (allDocsAsSet != null)
            allDocsAsSet.clear();
        groupAssigner = null;
        ownerEmailAddrs.clear();
        ownerNames.clear();
        addressBook = null;
    }

    @Override
    public int hashCode() {
        return super.hashCode();
    }

    /*
     * should happen rarely, only while exporting a session. fragile operation,
     * make sure blobStore etc. are updated consistently
     */
    public void setAllDocs(List<Document> docs) {
        log.info("Updating archive's allDocs to new list of " + docs.size() + " docs");
        allDocs = docs;
        allDocsAsSet = null;
        // reset all these fields; they will be computed afresh
        allEntities = allBlobNames = allAnnotations = allFolders = allEmailSources = null;
    }

    public NameInfo nameLookup(String name) {
        String ctitle = name.toLowerCase().replaceAll(" ", "_");
        if (nameMap != null)
            return nameMap.get(ctitle);
        else
            return null;
    }

    public void addOwnerName(String name) {
        ownerNames.add(name);
        processingMetadata.ownerName = name;
    }

    public void addOwnerEmailAddrs(Collection<String> emailAddrs) {
        ownerEmailAddrs.addAll(emailAddrs);
    }

    public void addOwnerEmailAddr(String emailAddr) {
        ownerEmailAddrs.add(emailAddr);
    }
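    /*
     * For orientation (drawn from the *_SUBDIR constants above; the exact
     * contents of each directory are an assumption): an archive's base
     * directory is laid out roughly as
     *
     *   baseDir/
     *     blobs/     -- attachment blob store
     *     indexes/   -- lucene indexes
     *     sessions/  -- the saved session (.session file, archive metadata)
     *     lexicons/  -- *.lex.txt lexicons
     *     images/    -- images associated with the archive
     *     mixtures/  -- internal disambiguation cache
     */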
    /**
     * This should be the only place that creates the cache dir.
     */
    public static void prepareBaseDir(String dir) {
        dir = dir + File.separatorChar + LEXICONS_SUBDIR;
        File f_dir = new File(dir);
        f_dir.mkdirs();

        // copy lexicons over to the muse dir.
        // unfortunately hard-coded, because we load them as ClassLoader resources and not as files, so we can't use Util.filesWithSuffix().
        // we have a different set of lexicons for epadd and muse, which will have been set up in LEXICONS by the time we reach here
        log.info("copying " + LEXICONS.length + " lexicons to " + dir);
        for (String l : LEXICONS) {
            try {
                if (new File(dir + File.separator + l).exists()) {
                    log.info("Skipping lexicon " + l + " because it already exists");
                    continue;
                }

                InputStream is = EmailUtils.class.getClassLoader().getResourceAsStream("lexicon/" + l);
                if (is == null) {
                    log.warn("lexicon lexicon/" + l + " not found");
                    continue;
                }
                log.info("copying " + l + " to " + dir);
                Util.copy_stream_to_file(is, dir + File.separator + l);
            } catch (Exception e) {
                Util.print_exception(e, log);
            }
        }
    }

    public static void clearCache(String baseDir, String rootDir) {
        log.info("Clearing archive with baseDir: " + baseDir + " rootDir: " + rootDir);
        if (!Util.nullOrEmpty(baseDir)) {
            // delete only indexes, blobs, sessions; keep sentiment stuff around
            Util.deleteDir(baseDir);
            /*
            Util.deleteDir(baseDir + File.separatorChar + INDEXES_SUBDIR);
            Util.deleteDir(baseDir + File.separatorChar + SESSIONS_SUBDIR); // could
            Util.deleteDir(baseDir + File.separatorChar + LEXICONS_SUBDIR); // could
            Util.deleteDir(baseDir + File.separatorChar + MODELS_SUBDIR);   // could also call sessions.deleteallsessions, but lazy...
            */
            // prepare cache dir anew
            prepareBaseDir(baseDir);
        }

        // rootDir is used only for webapp/<user> (piclens etc.); we'll get rid of it in the future
        if (!Util.nullOrEmpty(rootDir)) {
            Util.deleteDir(rootDir);
            new File(rootDir + File.separator).mkdirs();
        }
    }

    /**
     * returns the final, sorted, deduped version of allDocs that this driver
     * worked on in its last run
     */
    public List<Document> getAllDocs() {
        if (allDocs == null) {
            synchronized (this) {
                if (allDocs == null) {
                    allDocs = new ArrayList<Document>();
                    allDocsAsSet = new LinkedHashSet<Document>();
                }
            }
        }
        return allDocs;
    }

    public Set<Document> getAllDocsAsSet() {
        // allDocsAsSet is lazily computed
        if (allDocsAsSet == null) {
            synchronized (this) {
                if (allDocsAsSet == null) {
                    allDocsAsSet = new LinkedHashSet<Document>(getAllDocs());
                    Util.softAssert(allDocs.size() == allDocsAsSet.size());
                }
            }
        }
        return allDocsAsSet;
    }

    public int nDocsInCluster(int i) {
        if (i < 0 || i >= docClusters.size())
            return -1;
        return docClusters.get(i).getDocs().size();
    }

    public int nClusters() {
        return docClusters.size();
    }

    // work in progress - status provider
    public StatusProvider getStatusProvider() {
        return indexer;
    }

    public Map<String, Collection<Document>> getSentimentMap(Lexicon lex, boolean originalContentOnly, String... captions) {
        if (lex == null) {
            log.warn("Warning: lexicon is null!");
            return new LinkedHashMap<>();
        }
        return lex.getEmotions(indexer, getAllDocsAsSet(), false /* doNota */, originalContentOnly, captions);
    }
    /**
     * gets original content only!
     */
    public String getContents(Document d, boolean originalContentOnly) {
        return indexer.getContents(d, originalContentOnly);
    }

    public String getContents(org.apache.lucene.document.Document ldoc, boolean originalContentOnly) {
        return indexer.getContents(ldoc, originalContentOnly);
    }

    private void setupAddressBook(List<Document> docs) {
        // in this case, we don't care whether email addrs are incoming or outgoing,
        // so the ownAddrs can be just a null string
        if (addressBook == null)
            addressBook = new AddressBook(new String[0], new String[0]);
        log.info("Setting up address book for " + docs.size() + " messages (indexing driver)");
        for (Document d : docs)
            if (d instanceof EmailDocument)
                addressBook.processContactsFromMessage((EmailDocument) d);

        addressBook.organizeContacts();
    }

    public List<LinkInfo> extractLinks(Collection<Document> docs) throws Exception {
        prepareAllDocs(docs, indexOptions);
        indexer.clear();
        indexer.extractLinks(docs);
        return EmailUtils.getLinksForDocs(docs);
    }

    public Collection<DatedDocument> docsInDateRange(Date start, Date end) {
        List<DatedDocument> result = new ArrayList<DatedDocument>();
        if (Util.nullOrEmpty(allDocs))
            return result;
        for (Document d : allDocs) {
            try {
                DatedDocument dd = (DatedDocument) d;
                if ((dd.date.after(start) && dd.date.before(end)) || dd.date.equals(start) || dd.date.equals(end))
                    result.add(dd);
            } catch (Exception e) {
                Util.print_exception(e, log);
            }
        }
        return result;
    }

    public boolean containsDoc(Document doc) {
        return getAllDocsAsSet().contains(doc);
    }

    /**
     * use with caution. pseudo-adds a doc to the archive, but without any
     * subject and without any contents. useful only for quick screening
     * of emails, for memory tests, etc.
     */
    public synchronized boolean addDocWithoutContents(Document doc) {
        if (containsDoc(doc))
            return false;

        getAllDocsAsSet().add(doc);
        getAllDocs().add(doc);

        String subject = "", contents = "";

        indexer.indexSubdoc(subject, contents, doc, blobStore);

        if (getAllDocs().size() % 100 == 0)
            log.info("Memory status after " + getAllDocs().size() + " emails: " + Util.getMemoryStats());

        return true;
    }

    /**
     * core method, adds a single doc to the archive. remember to call
     * postProcess at the end of any series of calls to add docs
     */
    public synchronized boolean addDoc(Document doc, String contents) {
        if (containsDoc(doc))
            return false;

        getAllDocsAsSet().add(doc);
        getAllDocs().add(doc);

        String subject = doc.getSubjectWithoutTitle();
        subject = EmailUtils.cleanupSubjectLine(subject);

        indexer.indexSubdoc(subject, contents, doc, blobStore);

        if (getAllDocs().size() % 100 == 0)
            log.info("Memory status after " + getAllDocs().size() + " emails: " + Util.getMemoryStats());

        return true;
    }
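    /*
     * Illustrative ingestion sketch (not part of the original source; the
     * message list and bodyTextOf() helper are hypothetical stand-ins for
     * however a caller obtains messages and their text):
     *
     *   for (EmailDocument ed : fetchedMessages)
     *       archive.addDoc(ed, bodyTextOf(ed));
     *   archive.postProcess(); // finalize clusters, stats, etc. after adding docs
     */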
    /**
     * prepares all docs for indexing, incl. applying filters, removing dups
     * and sorting
     *
     * @throws Exception
     */
    private void prepareAllDocs(Collection<Document> docs, IndexOptions io) throws Exception {
        allDocs = new ArrayList<Document>();
        allDocs.addAll(docs);
        allDocs = EmailUtils.removeDupsAndSort(allDocs);
        log.info(allDocs.size() + " documents after removing duplicates");

        if (addressBook == null && !io.noRecipients) {
            log.warn("no address book previously set up!");
            setupAddressBook(allDocs); // set up without the benefit of ownAddrs
        }

        if (io.filter != null && addressBook != null) {
            Contact ownCI = addressBook.getContactForSelf(); // may return null if we don't have own info
            io.filter.setOwnContactInfo(ownCI);
        }

        // if no filter, accept doc (default)
        List<Document> newAllDocs = new ArrayList<Document>();
        for (Document d : allDocs)
            if (io.filter == null || io.filter.matches(d))
                newAllDocs.add(d);

        EmailUtils.cleanDates(newAllDocs);

        log.info(newAllDocs.size() + " documents after filtering");

        allDocs = newAllDocs;
        Collections.sort(allDocs); // may not be essential
        allDocsAsSet = null;
    }

    /**
     * sets up doc clusters by group or by time
     */
    private void prepareDocClusters(List<SimilarGroup<String>> groups) {
        // by default, we only use month-based clusters right now
        if (indexOptions.categoryBased) {
            docClusters = IndexUtils.partitionDocsByCategory(allDocs);
        } else {
            if (groups != null) {
                Map<String, Set<EmailDocument>> groupsToDocsMap = IndexUtils.partitionDocsByGroup((Collection) allDocs, groups, addressBook, true);
                docClusters = new ArrayList<MultiDoc>(); // must be initialized before clusters are added below
                int i = 0;
                for (String groupName : groupsToDocsMap.keySet()) {
                    MultiDoc md = new MultiDoc(Integer.toString(i++), groupName);
                    docClusters.add(md);
                    for (EmailDocument d : groupsToDocsMap.get(groupName))
                        md.add(d);
                }
            } else
                docClusters = IndexUtils.partitionDocsByInterval((List) allDocs, indexOptions.monthsNotYears);
        }

        log.info(docClusters.size() + " clusters of documents");
        // outputPrefix = io.outputPrefix;
        log.info(allDocs.size() + " documents in " + docClusters.size() + " time clusters, " + indexer.nonEmptyTimeClusterMap.size() + " non-empty");
    }

    private String getFolderInfosMapKey(String accountKey, String longName) {
        return accountKey + "..." + longName;
    }

    private void setupFolderInfosMap() {
        if (fetchedFolderInfosMap == null)
            fetchedFolderInfosMap = new LinkedHashMap<String, FolderInfo>();
        for (FolderInfo fi : fetchedFolderInfos) {
            fetchedFolderInfosMap.put(getFolderInfosMapKey(fi.accountKey, fi.longName), fi);
        }
    }
    /**
     * adds a collection of FolderInfos to the archive, updating existing ones
     * as needed
     */
    public void addFetchedFolderInfos(Collection<FolderInfo> fis) {
        // if a folderinfo with the same accountKey and longName already exists,
        // its lastSeenUID may need to be updated.

        // first organize a key -> folder info map in case we have a large # of folders
        setupFolderInfosMap();

        for (FolderInfo fi : fis) {
            String key = getFolderInfosMapKey(fi.accountKey, fi.longName);
            FolderInfo existing_fi = fetchedFolderInfosMap.get(key);
            if (existing_fi != null) {
                if (existing_fi.lastSeenUID < fi.lastSeenUID)
                    existing_fi.lastSeenUID = fi.lastSeenUID;
            } else {
                fetchedFolderInfos.add(fi);
                fetchedFolderInfosMap.put(key, fi);
            }
        }
    }

    private FolderInfo getFetchedFolderInfo(String accountID, String fullFolderName) {
        setupFolderInfosMap();
        return fetchedFolderInfosMap.get(getFolderInfosMapKey(accountID, fullFolderName));
    }

    /**
     * returns the last seen UID for the specified folder, or -1 if it has not
     * been seen before
     */
    public long getLastUIDForFolder(String accountID, String fullFolderName) {
        FolderInfo existing_fi = getFetchedFolderInfo(accountID, fullFolderName);
        if (existing_fi != null)
            return existing_fi.lastSeenUID;
        else {
            return -1L;
        }
    }
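    /*
     * Illustrative incremental-fetch sketch (not part of the original source;
     * the account key format, folder name, and updatedFolderInfo are
     * hypothetical): a fetcher can ask where the last run stopped and resume
     * from the next UID.
     *
     *   long lastUID = archive.getLastUIDForFolder(accountKey, "INBOX");
     *   // fetch messages with UID > lastUID, addDoc() them, then record progress:
     *   archive.addFetchedFolderInfos(Collections.singletonList(updatedFolderInfo));
     */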
    /***/
    public List<LinkInfo> postProcess() {
        return postProcess(allDocs, null);
    }

    /**
     * should be called at the end of a series of calls to add docs to the
     * archive. returns links. splits clusters by groups if not null, otherwise by time.
     *
     * @throws Exception
     */
    // does not make sense to have it public.
    private synchronized List<LinkInfo> postProcess(Collection<Document> docs, List<SimilarGroup<String>> groups) {
        // should we sort the messages by time here?
        log.info(indexer.computeStats());
        log.info(getLinks().size() + " links");
        // prepareAllDocs(docs, io);
        prepareDocClusters(groups);
        // TODO: should we recomputeCards? call nukeCards for now to invalidate
        // cards since the archive may have been modified.
        indexer.summarizer.nukeCards();

        List<LinkInfo> links = getLinks();
        return links;
    }

    // replace description with extracted names
    private static void replaceDescriptionWithNames(Collection<? extends Document> allDocs, Archive archive) throws Exception {
        for (Document d : allDocs) {
            if (!Util.nullOrEmpty(d.description)) {
                // log.info("Replacing description for docId = " + d.getUniqueId());
                // List<String> names = Indexer.extractNames(d.description);
                // Collections.sort(names);
                // d.description = Util.join(names, Indexer.NAMES_FIELD_DELIMITER);
                d.description = IndexUtils.retainOnlyNames(d.description, archive.getLuceneDoc(d.getUniqueId()));
            }
        }
    }

    /**
     * export archive with just the given docs to prepare for public mode.
     * docsToExport should be a subset of what's already in the archive. returns
     * true if successful.
     */
    /*
    public boolean trimArchive(Collection<EmailDocument> docsToRetain) throws Exception {
        if (docsToRetain == null)
            return true; // return without doing anything

        // exports messages in current filter (allEmailDocs)
        // HttpSession session = request.getSession();
        Collection<Document> fullEmailDocs = this.getAllDocs();
        Indexer indexer = this.indexer;

        // compute which docs to remove vs. keep
        Set<Document> docsToKeep = new LinkedHashSet<Document>(docsToRetain);
        Set<Document> docsToRemove = new LinkedHashSet<Document>();
        for (Document d : fullEmailDocs)
            if (!docsToKeep.contains(d))
                docsToRemove.add(d);

        // remove unneeded docs from the index
        indexer.removeEmailDocs(docsToRemove); // CAUTION: permanently changes the index!
        this.setAllDocs(new ArrayList<Document>(docsToRetain));
        return true;
    }
    */

    /**
     * a fresh archive is created under out_dir. name is the name of the session
     * under it. blobs are exported into this archive dir. destructive! but
     * should be so only in memory. original files on disk should be unmodified.
     *
     * @param retainedDocs
     * @throws Exception
     */
    public synchronized String export(Collection<? extends Document> retainedDocs, final boolean exportInPublicMode, String out_dir, String name) throws Exception {
        if (Util.nullOrEmpty(out_dir))
            return null;
        File dir = new File(out_dir);
        if (dir.exists() && dir.isDirectory()) {
            log.warn("Deleting and overwriting existing directory '" + out_dir + "'");
            FileUtils.deleteDirectory(dir);
        } else if (!dir.mkdirs()) {
            log.warn("Unable to create directory: " + out_dir);
            return null;
        }
        Archive.prepareBaseDir(out_dir);
        if (!exportInPublicMode && new File(baseDir + File.separator + LEXICONS_SUBDIR).exists())
            FileUtils.copyDirectory(new File(baseDir + File.separator + LEXICONS_SUBDIR), new File(out_dir + File.separator + LEXICONS_SUBDIR));
        if (new File(baseDir + File.separator + IMAGES_SUBDIR).exists())
            FileUtils.copyDirectory(new File(baseDir + File.separator + IMAGES_SUBDIR), new File(out_dir + File.separator + IMAGES_SUBDIR));
        // internal disambiguation cache
        if (new File(baseDir + File.separator + FEATURES_SUBDIR).exists())
            FileUtils.copyDirectory(new File(baseDir + File.separator + FEATURES_SUBDIR), new File(out_dir + File.separator + FEATURES_SUBDIR));
        if (new File(baseDir + File.separator + edu.stanford.muse.Config.AUTHORITY_ASSIGNER_FILENAME).exists())
            FileUtils.copyFile(new File(baseDir + File.separator + edu.stanford.muse.Config.AUTHORITY_ASSIGNER_FILENAME),
                    new File(out_dir + File.separator + edu.stanford.muse.Config.AUTHORITY_ASSIGNER_FILENAME));

        // save the states that may get modified
        List<Document> savedAllDocs = allDocs;

        allDocs = new ArrayList<>(retainedDocs);
        if (exportInPublicMode)
            replaceDescriptionWithNames(allDocs, this);

        // copy the index, and for public mode, also redact the body and remove title fields
        final boolean redact_body_instead_of_remove = true;
        Set<String> docIdSet = new LinkedHashSet<String>();
        for (Document d : allDocs)
            docIdSet.add(d.getUniqueId());
        final Set<String> retainedDocIds = docIdSet;
        Indexer.FilterFunctor emailFilter = new Indexer.FilterFunctor() {
            @Override
            public boolean filter(org.apache.lucene.document.Document doc) {
                if (!retainedDocIds.contains(doc.get("docId")))
                    return false;

                if (exportInPublicMode) {
                    String text = null;
                    if (redact_body_instead_of_remove) {
                        text = doc.get("body");
                    }
                    doc.removeFields("body");
                    doc.removeFields("body_original");

                    if (text != null) {
                        String redacted_text = IndexUtils.retainOnlyNames(text, doc);
                        // this uses the standard analyzer, not stemming, because redacted bodies only have names
                        doc.add(new Field("body", redacted_text, Indexer.full_ft));
                    }
                    String title = doc.get("title");
                    doc.removeFields("title");
                    if (title != null) {
                        String redacted_title = IndexUtils.retainOnlyNames(title, doc);
                        doc.add(new Field("title", redacted_title, Indexer.full_ft));
                    }
                }
                return true;
            }
        };

        Indexer.FilterFunctor attachmentFilter = new Indexer.FilterFunctor() {
            @Override
            public boolean filter(org.apache.lucene.document.Document doc) {
                if (exportInPublicMode) {
                    return false;
                }
                String docId = doc.get("emailDocId");
                if (docId == null) {
                    Integer di = Integer.parseInt(doc.get("docId"));
                    // don't want to print too many messages
                    if (di == null || di < 10)
                        log.error("Looks like this is an old archive, filtering all the attachments!!\n"
                                + "Consider re-indexing with the latest version for a proper export.");
                    return false;
                }
                return retainedDocIds.contains(docId);
            }
        };
        if (exportInPublicMode) {
            List<Document> docs = this.getAllDocs();
            List<EmailDocument> eds = new ArrayList<EmailDocument>();
            for (Document doc : docs)
                eds.add((EmailDocument) doc);

            EmailUtils.maskEmailDomain(eds, this.addressBook);
        }

        indexer.copyDirectoryWithDocFilter(out_dir, emailFilter, attachmentFilter);
        log.info("Completed exporting indexes");

        // save the blobs in a new blob store
        if (!exportInPublicMode) {
            log.info("Starting to export blobs, old blob store is: " + blobStore);
            Set<Blob> blobsToKeep = new LinkedHashSet<Blob>();
            for (Document d : allDocs)
                if (d instanceof EmailDocument)
                    if (!Util.nullOrEmpty(((EmailDocument) d).attachments))
                        blobsToKeep.addAll(((EmailDocument) d).attachments);
            String blobsDir = out_dir + File.separatorChar + BLOBS_SUBDIR;
            new File(blobsDir).mkdirs();
            BlobStore newBlobStore = blobStore.createCopy(blobsDir, blobsToKeep);
            log.info("Completed exporting blobs, newBlobStore in dir: " + blobsDir + " is: " + newBlobStore);
            // switch to the new blob store (important -- the urls and indexes in the new blob store are different from the old one!)
            blobStore = newBlobStore;
        }

        // write out the archive file
        SimpleSessions.saveArchive(out_dir, name, this); // save .session file
        log.info("Completed saving archive object");

        // restore state
        allDocs = savedAllDocs;

        return out_dir;
    }
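    /*
     * Illustrative export sketch (not part of the original source; the output
     * path and session name are hypothetical):
     *
     *   // write a public-mode copy (bodies redacted to names only) of the
     *   // currently retained docs into a fresh archive directory:
     *   archive.export(archive.getAllDocs(), true, "/tmp/epadd-public", "default");
     */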
    public List<Document> docsWithThreadId(long threadID) {
        List<Document> result = new ArrayList<Document>();
        for (Document ed : allDocs) {
            if (((EmailDocument) ed).threadID == threadID)
                result.add(ed);
        }
        return result;
    }

    public String getStats() {
        // note: this is a legacy method that does not use the archive stats object above
        StringBuilder sb = new StringBuilder(allDocs.size() + " original docs with " + ownerEmailAddrs.size() + " email addresses "
                + ownerNames.size() + " names for owner ");
        if (addressBook != null)
            sb.append(addressBook.getStats() + "\n");
        sb.append(indexer.computeStats() + "\n" + getLinks().size() + " links");
        return sb.toString();
    }

    /**
     * @return html for the given content, with terms highlighted by the indexer.
     * if IA_links is set, links point to the Internet Archive's version of the page.
     * docId is used to initialize a new view created by clicking on a link within this message;
     * date is used to create the link to the IA.
     *
     * @param ldoc - lucene doc corresponding to the content
     * @param s - content of the doc
     * @param docId - unique doc id of the email document
     * @param highlightTerms - terms to highlight in the content (e.g. lexicon hits, entities); generally names that one doesn't wish to be stemmed
     * @param entitiesWithId - authorised authorities, for annotation
     * @param showDebugInfo - enables display of debug info
     */
    private String annotate(org.apache.lucene.document.Document ldoc, String s, Date date, String docId, String regexToHighlight,
                            Set<String> highlightTerms, Map<String, EmailRenderer.Entity> entitiesWithId, boolean IA_links, boolean showDebugInfo) {
        getAllDocs();
        try {
            Summarizer summarizer = new Summarizer(indexer);
            s = Highlighter.getHTMLAnnotatedDocumentContents(s, (IA_links ? date : null), docId, regexToHighlight, highlightTerms,
                    entitiesWithId, summarizer.importantTermsCanonical, false);
            // indexer.getHTMLAnnotatedDocumentContents(s, (IA_links ? date : null), docId, searchTerms, isRegexSearch, highlightTermsStemmed, highlightTermsUnstemmed, entitiesWithId);
        } catch (Exception e) {
            e.printStackTrace();
            log.warn("indexer failed to annotate doc contents " + Util.stackTrace(e));
        }

        return s;
    }

    public String annotate(String s, Date date, String docId, String regexToHighlight, Set<String> highlightTerms,
                           Map<String, EmailRenderer.Entity> entitiesWithId, boolean IA_links, boolean showDebugInfo) {
        return annotate(null, s, date, docId, regexToHighlight, highlightTerms, entitiesWithId, IA_links, showDebugInfo);
    }
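    /*
     * Illustrative highlighting sketch (not part of the original source; the
     * document and terms are hypothetical):
     *
     *   Set<String> terms = new LinkedHashSet<>(Arrays.asList("Stanford", "grant"));
     *   String html = archive.annotate(archive.getContents(doc, false), ((EmailDocument) doc).date,
     *           doc.getUniqueId(), null, terms, null, false, false);
     */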
    public Pair<StringBuilder, Boolean> getHTMLForContents(Document d, Date date, String docId, String regexToHighlight, Set<String> highlightTerms,
                                                           Map<String, Map<String, Short>> authorisedEntities, boolean IA_links, boolean inFull,
                                                           boolean showDebugInfo) throws Exception {
        org.apache.lucene.document.Document ldoc = indexer.getDoc(d);
        String contents = indexer.getContents(d, false);
        if (ldoc == null) {
            System.err.println("Lucene doc is null for: " + d.getUniqueId() + " but the content is " + (contents == null ? "null" : "not null"));
            return null;
        }
        Span[] names = getAllNamesInLuceneDoc(ldoc, true);
        Set<String> acrs = Util.getAcronyms(contents);

        // contains all entities, with an id if the entity is authorised, else null
        Map<String, EmailRenderer.Entity> entitiesWithId = new HashMap<>();
        // we annotate three specially recognized types
        Map<Short, String> recMap = new HashMap<>();
        recMap.put(NEType.Type.PERSON.getCode(), "cp");
        recMap.put(NEType.Type.PLACE.getCode(), "cl");
        recMap.put(NEType.Type.ORGANISATION.getCode(), "co");
        Arrays.asList(names).stream()
                .filter(n -> recMap.keySet().contains(NEType.getCoarseType(n.type).getCode()))
                .forEach(n -> {
                    Set<String> types = new HashSet<>();
                    types.add(recMap.get(NEType.getCoarseType(n.type).getCode()));
                    entitiesWithId.put(n.text, new EmailRenderer.Entity(n.text, authorisedEntities == null ? null : authorisedEntities.get(n.text), types));
                });
        acrs.forEach(acr -> {
            Set<String> types = new HashSet<>();
            types.add("acr");
            entitiesWithId.put(acr, new EmailRenderer.Entity(acr, authorisedEntities == null ? null : authorisedEntities.get(acr), types));
        });

        // don't want the "more" button anymore
        boolean overflow = false;

        String htmlContents;
        if (contents.length() > Config.MAX_TEXT_SIZE_TO_ANNOTATE)
            // don't try to annotate extraordinarily long messages; probably bad data, as discovered on the RF archive
            htmlContents = Util.escapeHTML(contents);
        else
            htmlContents = annotate(ldoc, contents, date, docId, regexToHighlight, highlightTerms, entitiesWithId, IA_links, showDebugInfo);

        if (ModeConfig.isPublicMode())
            htmlContents = Util.maskEmailDomain(htmlContents);

        StringBuilder sb = new StringBuilder();
        sb.append(htmlContents);
        return new Pair<>(sb, overflow);
    }

    /*
     * break up docs into clusters, based on existing docClusters.
     * Note: clustering types MONTHLY and YEARLY are not supported
     */
    public List<MultiDoc> clustersForDocs(Collection<? extends Document> docs, MultiDoc.ClusteringType ct) {
        // TODO: what's the right thing to do when docClusters is null?
        if (docClusters == null || (ct == MultiDoc.ClusteringType.NONE)) {
            List<MultiDoc> new_mDocs = new ArrayList<>();
            MultiDoc md = new MultiDoc(0, "all");
            docs.forEach(md::add);
            new_mDocs.add(md);
            return new_mDocs;
        }

        Map<Document, Integer> map = new LinkedHashMap<>();
        int i = 0;
        for (MultiDoc mdoc : docClusters) {
            for (Document d : mdoc.docs)
                map.put(d, i);
            i++;
        }

        List<MultiDoc> new_mDocs = new ArrayList<>();
        for (MultiDoc md : docClusters)
            new_mDocs.add(null);

        for (Document d : docs) {
            int x = map.get(d);
            MultiDoc new_mDoc = new_mDocs.get(x);
            if (new_mDoc == null) {
                MultiDoc original = docClusters.get(x);
                new_mDoc = new MultiDoc(original.getUniqueId(), original.description);
                new_mDocs.set(x, new_mDoc);
            }
            new_mDoc.add(d);
        }

        List<MultiDoc> result = new ArrayList<>();
        for (MultiDoc md : new_mDocs)
            if (md != null)
                result.add(md);

        return result;
    }

    public String toString() {
        // be defensive here -- some of the fields may be null
        StringBuilder sb = new StringBuilder();
        if (allDocs != null)
            sb.append("Archive with #docs: " + allDocs.size() + " address book: " + addressBook + " " + getStats() + " ");
        else
            sb.append("Null docs");
        if (indexer != null) {
            if (indexer.stats != null)
                sb.append(Util.fieldsToString(indexer.stats, false));
            else
                sb.append("Null indexer-stats");
        } else
            sb.append("Null indexer");
        return sb.toString();
    }

    // TODO: retain only one of the two methods below
    public org.apache.lucene.document.Document getLuceneDoc(String docId) throws IOException {
        return indexer.getLDoc(docId);
    }

    public org.apache.lucene.document.Document getLuceneDoc(String docId, Set<String> fieldsToLoad) throws IOException {
        return indexer.getLDoc(docId, fieldsToLoad);
    }

    private Set<String> getNames(edu.stanford.muse.index.Document d, Indexer.QueryType qt) {
        try {
            return new LinkedHashSet<>(getNamesForDocId(d.getUniqueId(), qt));
        } catch (Exception e) {
            Util.print_exception(e, log);
            return new LinkedHashSet<>();
        }
    }

    /* body = true => in message body, false => in subject */
    public Span[] getEntitiesInDoc(Document d, boolean body) {
        try {
            return edu.stanford.muse.ner.NER.getNames(d, body, this);
        } catch (Exception e) {
            Util.print_exception(e, log);
            return new Span[] {};
        }
    }

    /** returns all entities in the given doc, both in body and subject */
    public synchronized Set<String> getEntitiesInDoc(Document d) {
        Set<String> entities = new LinkedHashSet<>();
        Stream.of(getEntitiesInDoc(d, false)).map(Span::getText).forEach(entities::add);
        Stream.of(getEntitiesInDoc(d, true)).map(Span::getText).forEach(entities::add);
        return entities;
    }
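    /*
     * Illustrative entity-browsing sketch (not part of the original source):
     *
     *   for (Document d : archive.getAllDocs())
     *       for (String entity : archive.getEntitiesInDoc(d))
     *           System.out.println(d.getUniqueId() + ": " + entity);
     */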
    public synchronized Set<String> getAllEntities() {
        if (allEntities == null) {
            allEntities = new LinkedHashSet<>();
            for (Document d : getAllDocs()) {
                try {
                    Stream.of(getEntitiesInDoc(d, true)).map(Span::getText).forEach(allEntities::add);
                } catch (Exception e) {
                    Util.print_exception("exception reading fine grained entities", e, log);
                }
            }
        }
        return allEntities;
    }

    transient private Multimap<Short, Document> entityTypeToDocs; // entity type code -> docs containing it; computed lazily

    private synchronized void computeEntityTypeToDocMap() {
        if (entityTypeToDocs != null)
            return;
        entityTypeToDocs = LinkedHashMultimap.create();
        for (Document doc : this.getAllDocs()) {
            Span[] es = this.getEntitiesInDoc(doc, true);
            Set<Short> seenInThisDoc = new LinkedHashSet<>(); // type -> docs: one value should contain a doc only once
            double theta = 0.001;
            for (Span sp : es) {
                if (sp.typeScore < theta)
                    continue;
                if (seenInThisDoc.contains(sp.type))
                    continue;
                seenInThisDoc.add(sp.type);
                entityTypeToDocs.put(sp.type, doc);
            }
        }
    }

    public synchronized Collection<Document> getDocsWithEntityType(short code) {
        computeEntityTypeToDocMap();
        return entityTypeToDocs.get(code);
    }

    public synchronized Set<NEType.Type> getEntityTypes() {
        Set<NEType.Type> result = new LinkedHashSet<>();
        computeEntityTypeToDocMap();
        for (short t : entityTypeToDocs.keys()) {
            result.add(NEType.getTypeForCode(t));
        }
        return result;
    }

    // returns a map of names recognised by NER to frequency
    private Map<String, Integer> countNames() {
        Map<String, Integer> name_count = new LinkedHashMap<>();
        for (Document d : getAllDocs()) {
            Set<String> names = getNames(d, Indexer.QueryType.FULL);
            // log.info("Names = " + Util.joinSort(names, "|"));
            for (String n : names) {
                n = n.trim();
                if (n.length() == 0)
                    continue;
                if (name_count.containsKey(n))
                    name_count.put(n, name_count.get(n) + 1);
                else
                    name_count.put(n, 1);
            }
        }
        // for (Map.Entry<String, Integer> e : entries) {
        //     log.info("NameCount:" + e.getKey() + "|" + e.getValue());
        // }
        return name_count;
    }

    public List<String> getNamesForDocId(String id, Indexer.QueryType qt) throws IOException {
        return indexer.getNamesForDocId(id, qt);
    }

    public List<List<String>> getAllNames(Collection<String> ids, Indexer.QueryType qt) throws IOException {
        List<List<String>> result = new ArrayList<>();
        for (String id : ids)
            result.add(getNamesForDocId(id, qt));
        return result;
    }

    /**
     * Assigns ids to threads; this can help in making out whether two emails belong to the same thread.
     * The subject/title of a message can also be used for the same purpose.
     * @return one more than the maximum thread id value assigned to any thread in the archive
     */
    public int assignThreadIds() {
        Collection<Collection<EmailDocument>> threads = EmailUtils.threadEmails((Collection) allDocs);
        int thrId = 1; // note: valid thread ids must be > 0
        for (Collection<EmailDocument> thread : threads) {
            for (EmailDocument doc : thread)
                doc.threadID = thrId;
            thrId++;
        }
        return thrId;
    }
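    /*
     * Illustrative threading sketch (not part of the original source):
     *
     *   archive.assignThreadIds();
     *   EmailDocument ed = (EmailDocument) archive.getAllDocs().get(0);
     *   List<Document> sameThread = archive.docsWithThreadId(ed.threadID);
     */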
    public void postDeserialized(String baseDir, boolean readOnly) throws IOException {
        if (ModeConfig.isPublicMode())
            setGroupAssigner(null);

        log.info(indexer.computeStats());

        indexer.setBaseDir(baseDir);
        openForRead();
        if (!readOnly)
            indexer.setupForWrite();
        if (addressBook != null) {
            // addressBook.reassignContactIds();
            addressBook.organizeContacts(); // is this idempotent?
        }

        if (lexiconMap == null) {
            lexiconMap = createLexiconMap(baseDir);
        }

        // recompute... sometimes the processing metadata may be stale, because some messages have been redacted at export.
        // processingMetadata.numPotentiallySensitiveMessages = numMatchesPresetQueries();
    }

    public void merge(Archive other) {
        other.getAllDocs().stream()
                .filter(doc -> !this.containsDoc(doc))
                .forEach(doc -> this.addDoc(doc, other.getContents(doc, /* originalContentOnly */ false)));

        addressBook.merge(other.addressBook);
        this.processingMetadata.merge(other.processingMetadata);
    }

    private static Map<String, Lexicon> createLexiconMap(String baseDir) throws IOException {
        String lexDir = baseDir + File.separatorChar + LEXICONS_SUBDIR;
        Map<String, Lexicon> map = new LinkedHashMap<String, Lexicon>();
        File lexDirFile = new File(lexDir);
        if (!lexDirFile.exists()) {
            log.warn("'lexicons' directory is missing from archive");
        } else {
            for (File f : lexDirFile.listFiles(new Util.MyFilenameFilter(null, Lexicon.LEXICON_SUFFIX))) {
                String name = Lexicon.lexiconNameFromFilename(f.getName());
                if (!map.containsKey(name.toLowerCase())) {
                    map.put(name.toLowerCase(), new Lexicon(lexDir, name));
                }
            }
        }
        return map;
    }

    public Lexicon getLexicon(String lexName) {
        // the lexicon map could be stale; re-read it
        try {
            lexiconMap = createLexiconMap(baseDir);
        } catch (Exception e) {
            Util.print_exception("Error trying to read list of lexicons", e, log);
        }
        return lexiconMap.get(lexName.toLowerCase());
    }

    public Set<String> getAvailableLexicons() {
        // the lexicon map could be stale; re-read it
        try {
            lexiconMap = createLexiconMap(baseDir);
        } catch (Exception e) {
            Util.print_exception("Error trying to read list of lexicons", e, log);
        }
        if (lexiconMap == null)
            return new LinkedHashSet<String>();
        return Collections.unmodifiableSet(lexiconMap.keySet());
    }

    public void addStats(FetchStats as) {
        allStats.add(as);
    }

    public Collection<String> getDataErrors() {
        Collection<String> result = new LinkedHashSet<String>();
        for (FetchStats as : allStats) {
            Collection<String> asErrors = as.dataErrors;
            if (asErrors != null)
                result.addAll(asErrors);
        }
        return result;
    }

    /** replaces the document in the index with the supplied document */
    public void updateDocument(org.apache.lucene.document.Document doc) {
        indexer.updateDocument(doc);
    }

    public void setupForWrite() throws IOException {
        indexer.setupForWrite();
    }

    public Span[] getOriginalNamesOfATypeInDoc(edu.stanford.muse.index.Document doc, short type) throws IOException {
        Span[] spans = getAllOriginalNamesInDoc(doc);
        List<Span> req = Arrays.asList(spans).stream().filter(sp -> sp.type == type).collect(Collectors.toList());
        return req.toArray(new Span[req.size()]);
    }

    public Span[] getAllOriginalNamesInDoc(edu.stanford.muse.index.Document doc) throws IOException {
        Span[] spans = getAllNamesInDoc(doc, true);
        String oc = getContents(doc, true);
        List<Span> req = Arrays.asList(spans).stream().filter(sp -> sp.end < oc.length()).collect(Collectors.toList());
        return req.toArray(new Span[req.size()]);
    }

    /** @return a list of names filtered to remove dictionary matches */
    public Span[] getNamesOfATypeInDoc(edu.stanford.muse.index.Document d, boolean body, short type) throws IOException {
        return getNamesOfATypeInLuceneDoc(getLuceneDoc(d.getUniqueId()), body, type);
    }

    /** @return list of all names in the lucene doc without filtering dictionary words */
    private static Span[] getNamesOfATypeInLuceneDoc(org.apache.lucene.document.Document ldoc, boolean body, short type) {
        Span[] allNames = NER.getNames(ldoc, body);
        List<Span> req = Arrays.asList(allNames).stream().filter(s -> type == s.type).collect(Collectors.toList());
        return req.toArray(new Span[req.size()]);
    }
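    /*
     * Illustrative NER lookup sketch (not part of the original source; doc is a
     * hypothetical document in the archive):
     *
     *   short personType = NEType.Type.PERSON.getCode();
     *   Span[] people = archive.getNamesOfATypeInDoc(doc, true, personType);
     */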
    public Span[] getAllNamesInDoc(edu.stanford.muse.index.Document d, boolean body) throws IOException {
        return NER.getNames(d, body, this);
    }

    public static Span[] getAllNamesInLuceneDoc(org.apache.lucene.document.Document ldoc, boolean body) {
        return NER.getNames(ldoc, body);
    }

    public Span[] getAllNamesMapToInDoc(edu.stanford.muse.index.Document d, boolean body, short coarseType) throws IOException {
        Span[] allNames = getAllNamesInDoc(d, body);
        List<Span> req = Arrays.asList(allNames).stream()
                .filter(n -> NEType.getCoarseType(n.type).getCode() == coarseType)
                .collect(Collectors.toList());
        return req.toArray(new Span[req.size()]);
    }

    /** @return set of all email sources */
    public synchronized Set<String> getAllEmailSources() {
        if (allEmailSources == null) {
            allEmailSources = new LinkedHashSet<>();
            Collection<EmailDocument> docs = (Collection) getAllDocs();
            for (EmailDocument d : docs)
                if (!Util.nullOrEmpty(d.emailSource))
                    allEmailSources.add(d.emailSource);
        }
        return allEmailSources;
    }

    /** @return set of all folders */
    public synchronized Set<String> getAllFolders() {
        if (allFolders == null) {
            allFolders = new LinkedHashSet<>();
            Collection<EmailDocument> docs = (Collection) getAllDocs();
            for (EmailDocument d : docs)
                if (!Util.nullOrEmpty(d.folderName))
                    allFolders.add(d.folderName);
        }
        return allFolders;
    }

    // invalidate the cache of all annotations. this should be called any time an annotation changes
    public void clearAllAnnotationsCache() {
        allAnnotations = null;
    }

    /** @return set of all annotations */
    public synchronized Set<String> getAllAnnotations() {
        if (allAnnotations == null) {
            allAnnotations = new LinkedHashSet<>();
            for (Document d : getAllDocs())
                if (!Util.nullOrEmpty(d.comment))
                    allAnnotations.add(d.comment);
        }
        return allAnnotations;
    }

    /** @return set of all blob (attachment) names */
    public synchronized Set<String> getAllBlobNames() {
        if (allBlobNames == null) {
            allBlobNames = new LinkedHashSet<>();
            Collection<EmailDocument> docs = (Collection) getAllDocs();
            for (EmailDocument d : docs)
                if (!Util.nullOrEmpty(d.attachments)) {
                    List<Blob> blobs = d.attachments;
                    for (Blob b : blobs) {
                        if (!Util.nullOrEmpty(b.getName()))
                            allBlobNames.add(b.getName());
                    }
                }
        }
        return allBlobNames;
    }

    /**
     * Recognises names in the supplied text with OpenNLP NER
     * @deprecated
     */
    @Deprecated
    public static Set<String> extractNamesOpenNLP(String text) throws Exception {
        List<Pair<String, Float>> pairs = edu.stanford.muse.index.NER.namesFromText(text);
        Set<String> names = new LinkedHashSet<String>();
        for (Pair<String, ?> p : pairs)
            names.add(p.getFirst());

        return Util.scrubNames(names);
    }

    // maps between docId (String, edu.stanford.muse.index.Document) and ldocId (int) using indexer storage structures
    public Integer getLDocIdForContentDocId(String docId) {
        return indexer.contentDocIds.entrySet().stream()
                .filter(e -> docId.equals(e.getValue()))
                .findAny().map(e -> e.getKey()).orElse(0);
    }

    public String getDocIdForContentLDocId(Integer ldocId) {
        return indexer.contentDocIds.get(ldocId);
    }

    public Integer getLDocIdForBlobDocId(String docId) {
        return indexer.blobDocIds.entrySet().stream()
                .filter(e -> docId.equals(e.getValue()))
                .findAny().map(e -> e.getKey()).orElse(0);
    }

    public String getDocIdForBlobLDocId(Integer ldocId) {
        return indexer.blobDocIds.get(ldocId);
    }
    /** transfers actions from one archive to another. returns a user-displayable status message */
    public String transferActionsFrom(String otherArchiveDir) throws ClassNotFoundException, IOException {
        return "Error: transfer actions disabled in v3!";
        /*
        String file = otherArchiveDir + File.separator + SESSIONS_SUBDIR + File.separator + "default.archive.v1"; // note that this is v1!
        if (!new File(file).exists()) {
            return "Error: no archive found in " + file;
        }

        ObjectInputStream ois = new ObjectInputStream(new GZIPInputStream(new FileInputStream(file)));
        Object discard = ois.readObject();
        Archive otherArchive = (Archive) ois.readObject();

        LinkedHashMap otherMap = new LinkedHashMap();
        Iterator nMatched = otherArchive.allDocs.iterator();
        while (nMatched.hasNext()) {
            Document nChanged = (Document) nMatched.next();
            EmailDocument ed = (EmailDocument) nChanged;
            if (ed.date != null && !ed.date.equals(EmailFetcherThread.INVALID_DATE)) {
                String d = ed.getSignature();
                otherMap.put(d, ed);
            }
        }

        int matchedMessages = 0;
        int changedMessages = 0;
        Iterator it = this.allDocs.iterator();
        while (it.hasNext()) {
            Document d = (Document) it.next();
            EmailDocument ed1 = (EmailDocument) d;
            String edSig = ed1.getSignature();
            EmailDocument otherEd = (EmailDocument) otherMap.get(edSig);
            if (otherEd != null) {
                ++matchedMessages;
                boolean changed = false;
                if (otherEd.doNotTransfer != ed1.doNotTransfer) {
                    ed1.doNotTransfer = otherEd.doNotTransfer;
                    changed = true;
                }
                if (otherEd.transferWithRestrictions != ed1.transferWithRestrictions) {
                    ed1.transferWithRestrictions = otherEd.transferWithRestrictions;
                    changed = true;
                }
                if (otherEd.reviewed != ed1.reviewed) {
                    ed1.reviewed = otherEd.reviewed;
                    changed = true;
                }
                if (otherEd.addedToCart != ed1.addedToCart) {
                    ed1.addedToCart = otherEd.addedToCart;
                    changed = true;
                }
                if (otherEd.comment != null && !otherEd.comment.equals(ed1.comment)) {
                    ed1.comment = otherEd.comment;
                    changed = true;
                }
                if (changed) {
                    ++changedMessages;
                }
            }
        }

        // transfer authorities
        String authorityTransferStatus = "";
        try {
            Map<String, Authority> existingMap = getAuthorities();
            int existingSize = (Util.nullOrEmpty(existingMap)) ? 0 : existingMap.size();
            Map<String, Authority> otherAuthorities = otherArchive.getAuthorities();
            if (!Util.nullOrEmpty(otherAuthorities)) {
                // may be better to add to the existing map? what to do if there are conflicts?
                this.setAuthorities(otherAuthorities);
                authorityTransferStatus += otherAuthorities.size() + " authorities transferred, overwriting " + existingSize + " existing authorities";
            }
        } catch (Exception e2) {
            Util.print_exception("Unable to read existing authorities file", e2, log);
        }
        log.info("Authority transfer status: " + authorityTransferStatus);

        return "Changes applied to " + Util.pluralize(changedMessages, "message") + " from "
                + "archive format v1 (" + Util.pluralize(otherArchive.allDocs.size(), "message") + ") in " + otherArchiveDir
                + " to current archive in format v2 (" + Util.pluralize(this.allDocs.size(), "message") + ").\n"
                + Util.pluralize(matchedMessages, "message") + " matched" + "\n"
                + authorityTransferStatus;
        */
    }

    public static void main(String[] args) {
        try {
            String userDir = System.getProperty("user.home") + File.separator + "epadd-appraisal" + File.separator + "user";
            Archive archive = SimpleSessions.readArchiveIfPresent(userDir);
            List<Document> docs = archive.getAllDocs();
            int i = 0;
            archive.assignThreadIds();
            NER.NERStats stats = new NER.NERStats();
            for (Document doc : docs) {
                EmailDocument ed = (EmailDocument) doc;
                stats.update(archive.getAllNamesInDoc(ed, true));
                System.out.println(Arrays.asList(archive.getAllNamesInDoc(ed, true)));
                if (i++ > 20)
                    break;
                // List<Document> threads = archive.docsWithThreadId(ed.threadID);
                // if (threads.size() > 0) {
                //     int numSent = 0;
                //     for (Document d : threads) {
                //         EmailDocument thread = (EmailDocument) d;
                //         int sent = thread.sentOrReceived(archive.addressBook) & EmailDocument.SENT_MASK;
                //         if (sent > 0)
                //             numSent++;
                //     }
                //     if (threads.size() != numSent || threads.size() > 2) {
                //         System.err.println("Found a thread with " + numSent + " sent and " + threads.size() + " docs in a thread: " + ed.getSubject());
                //         break;
                //     }
                //     if (i % 100 == 0)
                //         System.err.println("Scanned: " + i + " docs");
                // }
                // i++;
            }
            System.out.println(stats.counts);
            System.out.println(stats.all);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}