com.ikanow.infinit.e.harvest.HarvestController.java Source code

Introduction

Here is the source code for com.ikanow.infinit.e.harvest.HarvestController.java
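
Before the listing itself, here is a minimal usage sketch. It is not part of the original file: the wrapper class HarvestControllerUsageSketch and the way the SourcePojo is obtained are hypothetical, and only public methods that appear in the listing below are used. The idea is simply: construct the controller, optionally switch it into standalone test mode, then harvest a single source into add/update/remove lists.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.harvest.HarvestController;

public class HarvestControllerUsageSketch {
    // Illustrative driver only: "source" is assumed to be a fully configured SourcePojo
    // obtained elsewhere (e.g. loaded from the source configuration store).
    public static void harvestOnce(SourcePojo source) throws IOException {
        HarvestController hc = new HarvestController();
        hc.setStandaloneMode(10); // optional: cap at 10 docs and use the standalone dedup/status managers

        List<DocumentPojo> toAdd = new ArrayList<DocumentPojo>();
        List<DocumentPojo> toUpdate = new ArrayList<DocumentPojo>();
        List<DocumentPojo> toRemove = new ArrayList<DocumentPojo>();

        // Extracts documents from the source and then enriches them into the three lists
        hc.harvestSource(source, toAdd, toUpdate, toRemove);

        // Log the aggregate counters (sources/docs/entities extracted, error counts)
        HarvestController.logHarvesterStats();
    }
}

As the listing below shows, the controller itself decides which harvester (feed/file/database/...) and which text/entity extractors to apply, based on the source configuration and the PropertiesManager defaults.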

Source

/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project.
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
/**
 * 
 */
package com.ikanow.infinit.e.harvest;

import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;

import org.apache.log4j.Logger;
import org.bson.types.ObjectId;

import com.ikanow.infinit.e.data_model.Globals;
import com.ikanow.infinit.e.data_model.InfiniteEnums;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDocumentLevelException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorSourceLevelException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDailyLimitExceededException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorSourceLevelMajorException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorSourceLevelTransientException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.HarvestEnum;
import com.ikanow.infinit.e.data_model.interfaces.harvest.EntityExtractorEnum;
import com.ikanow.infinit.e.data_model.interfaces.harvest.IEntityExtractor;
import com.ikanow.infinit.e.data_model.interfaces.harvest.ITextExtractor;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.ikanow.infinit.e.data_model.store.social.sharing.SharePojo;
import com.ikanow.infinit.e.data_model.store.social.sharing.SharePojo.ShareCommunityPojo;
import com.ikanow.infinit.e.data_model.utils.GeoOntologyMapping;
import com.ikanow.infinit.e.data_model.utils.IkanowSecurityManager;
import com.ikanow.infinit.e.data_model.utils.TrustManagerManipulator;
import com.ikanow.infinit.e.harvest.enrichment.custom.StructuredAnalysisHarvester;
import com.ikanow.infinit.e.harvest.enrichment.custom.UnstructuredAnalysisHarvester;
import com.ikanow.infinit.e.harvest.enrichment.legacy.TextRankExtractor;
import com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi.ExtractorAlchemyAPI;
import com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi.ExtractorAlchemyAPI_Metadata;
import com.ikanow.infinit.e.harvest.enrichment.legacy.opencalais.ExtractorOpenCalais;
import com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager;
import com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated;
import com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Standalone;
import com.ikanow.infinit.e.harvest.extraction.document.HarvestStatus;
import com.ikanow.infinit.e.harvest.extraction.document.HarvestStatus_Integrated;
import com.ikanow.infinit.e.harvest.extraction.document.HarvestStatus_Standalone;
import com.ikanow.infinit.e.harvest.extraction.document.HarvesterInterface;
import com.ikanow.infinit.e.harvest.extraction.document.database.DatabaseHarvester;
import com.ikanow.infinit.e.harvest.extraction.document.distributed.DistributedHarvester;
import com.ikanow.infinit.e.harvest.extraction.document.file.FileHarvester;
import com.ikanow.infinit.e.harvest.extraction.document.logstash.LogstashHarvester;
import com.ikanow.infinit.e.harvest.extraction.document.rss.FeedHarvester;
import com.ikanow.infinit.e.harvest.extraction.text.boilerpipe.TextExtractorBoilerpipe;
import com.ikanow.infinit.e.harvest.extraction.text.externalscript.TextExtractorExternalScript;
import com.ikanow.infinit.e.harvest.extraction.text.legacy.TextExtractorTika;
import com.ikanow.infinit.e.harvest.utils.AuthUtils;
import com.ikanow.infinit.e.harvest.utils.HarvestExceptionUtils;
import com.ikanow.infinit.e.harvest.utils.PropertiesManager;
import com.mongodb.BasicDBObject;
import com.mongodb.gridfs.GridFSDBFile;

/**
 * Used to process all incoming sources in the system
 * 
 * @author cmorgan
 */
public class HarvestController implements HarvestContext {
    private HarvestControllerPipeline procPipeline = null;
    private IkanowSecurityManager _securityManager = null;

    public IkanowSecurityManager getSecurityManager() {
        return _securityManager;
    }

    private PropertiesManager pm = new PropertiesManager();
    private IEntityExtractor default_entity_extractor = null;
    private ITextExtractor default_text_extractor = null;
    private ArrayList<HarvesterInterface> harvesters = new ArrayList<HarvesterInterface>();
    private static Set<String> urlsThatError = new TreeSet<String>(); // (URLs that previously failed extraction - skipped on later attempts, except in standalone/pipeline mode)
    private static final Logger logger = Logger.getLogger(HarvestController.class);

    private HashMap<String, IEntityExtractor> entity_extractor_mappings = null;
    private HashMap<String, ITextExtractor> text_extractor_mappings = null;
    private HashSet<String> failedDynamicExtractors = null;
    private static HashMap<String, Class<?>> dynamicExtractorClassCache = null;

    private int _nMaxDocs = Integer.MAX_VALUE;
    private DuplicateManager _duplicateManager = new DuplicateManager_Integrated();
    private HarvestStatus _harvestStatus = new HarvestStatus_Integrated(); // (can either be standalone or integrated, defaults to integrated)

    public DuplicateManager getDuplicateManager() {
        return _duplicateManager;
    }

    public HarvestStatus getHarvestStatus() {
        return _harvestStatus;
    }

    boolean _bIsStandalone = false;

    public boolean isStandalone() {
        return _bIsStandalone;
    }

    public void setStandaloneMode(int nMaxDocs) {
        setStandaloneMode(nMaxDocs, false); // (by default don't dedup, however you may want to test updates)
    }

    public void setStandaloneMode(int nMaxDocs, boolean bRealDedup) {
        _bIsStandalone = true;
        urlsThatError.clear(); // (for api testing, obviously don't want to stop trying if we get an error)
        if (nMaxDocs >= 0) {
            _nMaxDocs = nMaxDocs;
        }
        if (!bRealDedup) {
            _duplicateManager = new DuplicateManager_Standalone();
        }
        _harvestStatus = new HarvestStatus_Standalone();

        if (null != dynamicExtractorClassCache) { // (standalone so don't cache extractors)
            dynamicExtractorClassCache.clear();
        }
    }

    public int getStandaloneMaxDocs() {
        return _nMaxDocs;
    }

    private long nBetweenFeedDocs_ms = 10000; // (default 10s)

    //statistics variables
    private static AtomicInteger num_sources_harvested = new AtomicInteger(0);
    private static AtomicInteger num_docs_extracted = new AtomicInteger(0);
    private static AtomicInteger num_errors_source = new AtomicInteger(0);
    private static AtomicInteger num_error_url = new AtomicInteger(0);
    private static AtomicInteger num_error_px = new AtomicInteger(0);
    private static AtomicInteger num_ent_extracted = new AtomicInteger(0);
    private static AtomicInteger num_event_extracted = new AtomicInteger(0);

    private int nUrlErrorsThisSource = 0;

    /**
     * Used to find out whether the source's harvest of information was successful
     * @return true if the harvest was successful
     */
    public boolean isSuccessful() {
        return true;
    }

    // Handle clean shutdown of harvester
    private static boolean bIsKilled = false;

    public static void killHarvester() {
        bIsKilled = true;
    }

    public static boolean isHarvestKilled() {
        return bIsKilled;
    }

    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    // TOP LEVEL LOGIC

    // Utility objects for loading custom text and entity extractors across all threads just once
    @SuppressWarnings("rawtypes")
    private static HashMap<String, Class> customExtractors = null;
    private static ClassLoader customExtractorClassLoader = HarvestController.class.getClassLoader();

    /**
     *  Constructor for Harvest Controller class
     *  
     * @throws IOException 
     */
    public HarvestController() throws IOException {
        this(false);
    }

    private static boolean _initializedSSL = false;

    @SuppressWarnings("rawtypes")
    public HarvestController(boolean overrideTypeSettings) throws IOException {
        if (!_initializedSSL) {
            _initializedSSL = true;
            try {
                // Ensure we don't have any self-signed cert debacles:
                TrustManagerManipulator.allowAllSSL();
            } finally {
            }
        }

        PropertiesManager props = new PropertiesManager();
        String sTypes = props.getHarvesterTypes();
        if (overrideTypeSettings) { // (override API settings in test mode)
            sTypes = "Feed,File,Database,Logstash,Distributed,Post_processing";
            //(the post_processor isn't needed for harvester testing - but is needed for actual harvesting,... 
            //...so it's included here for consistency - custom type scheduling is set up at publish time, so "custom" isn't needed)
            //(similar comments apply for logstash)
        }
        String sType[] = sTypes.split("\\s*,\\s*");

        // Add a harvester for each data type
        for (String s : sType) {
            if (s.equalsIgnoreCase("distributed")) { // (custom + distributed + post processing)
                try {
                    this.harvesters.add(new DistributedHarvester());
                } catch (Exception e) {
                    logger.error(s + " not supported: " + e.getMessage());
                } catch (NoClassDefFoundError e) {
                    logger.error(s + " not supported: " + e.getMessage());
                }
            } else if (s.equalsIgnoreCase("database")) {
                try {
                    this.harvesters.add(new DatabaseHarvester());
                } catch (Exception e) {
                    logger.error(s + " not supported: " + e.getMessage());
                } catch (NoClassDefFoundError e) {
                    logger.error(s + " not supported: " + e.getMessage());
                }
            } else if (s.equalsIgnoreCase("logstash")) {
                try {
                    this.harvesters.add(new LogstashHarvester());
                } catch (Exception e) {
                    logger.error(s + " not supported: " + e.getMessage());
                } catch (NoClassDefFoundError e) {
                    logger.error(s + " not supported: " + e.getMessage());
                }
            } else if (s.equalsIgnoreCase("file")) {

                // According to http://www.ryanchapin.com/fv-b-4-648/java-lang-OutOfMemoryError--unable-to-create-new-native-thread-Exception-When-Using-SmbFileInputStream.html
                // this is needed to avoid java.lang.OutOfMemoryError (intermittent - for me at least, it's happened for exactly 1 source, but consistently when it does)
                System.setProperty("jcifs.resolveOrder", "DNS");
                System.setProperty("jcifs.smb.client.dfs.disabled", "true");

                try {
                    this.harvesters.add(new FileHarvester());
                } catch (Exception e) {
                    logger.error(s + " not supported: " + e.getMessage());
                } catch (NoClassDefFoundError e) {
                    logger.error(s + " not supported: " + e.getMessage());
                }
            } else if (s.equalsIgnoreCase("feed")) {
                try {
                    this.harvesters.add(new FeedHarvester());
                } catch (Exception e) {
                    logger.error(s + " not supported: " + e.getMessage());
                } catch (NoClassDefFoundError e) {
                    logger.error(s + " not supported: " + e.getMessage());
                }
            }
        }

        // Load all the extractors, set up defaults
        entity_extractor_mappings = new HashMap<String, IEntityExtractor>();
        text_extractor_mappings = new HashMap<String, ITextExtractor>();

        // Load custom text/entity extractors
        synchronized (HarvestController.class) {
            if (null == customExtractors) {
                customExtractors = new HashMap<String, Class>();
                customExtractorClassLoader = HarvestController.class.getClassLoader();
            }
            // Text extractors:
            String customTextList = props.getCustomTextExtractors();
            if (null != customTextList) {
                String customTextArray[] = customTextList.split("\\s*,\\s*");
                for (String customText : customTextArray) {
                    if (!customExtractors.containsKey(customText)) {
                        // (else already have this extractor)
                        try {
                            Class customTextExtractor = customExtractorClassLoader.loadClass(customText);
                            ITextExtractor obj = (ITextExtractor) customTextExtractor.newInstance();
                            text_extractor_mappings.put(obj.getName().toLowerCase(), obj);
                            customExtractors.put(customText, customTextExtractor);
                        } catch (Exception e) {
                            logger.error("ITextExtractor: Couldn't load " + customText + ": " + e.getMessage(), e);
                        } catch (NoClassDefFoundError e) {
                            logger.error("ITextExtractor: Couldn't load " + customText + ": " + e.getMessage(), e);
                        }
                    } else { // Already loaded, put in again
                        try {
                            Class customTextExtractor = customExtractors.get(customText);
                            ITextExtractor obj = (ITextExtractor) customTextExtractor.newInstance();
                            text_extractor_mappings.put(obj.getName().toLowerCase(), obj);
                        } catch (Exception e) {
                            logger.error("ITextExtractor: Couldn't use already loaded " + customText + ": "
                                    + e.getMessage(), e);
                        } catch (NoClassDefFoundError e) {
                            logger.error("ITextExtractor: Couldn't use already loaded " + customText + ": "
                                    + e.getMessage(), e);
                        }
                    }
                }
            } //TESTED
              // Entity extractors 
            String customEntityList = props.getCustomEntityExtractors();
            if (null != customEntityList) {
                String customEntityArray[] = customEntityList.split("\\s*,\\s*");
                for (String customEntity : customEntityArray) {
                    if (!customExtractors.containsKey(customEntity)) {
                        // (else already have this extractor - but may have it for text, so some work to do)
                        try {
                            Class customEntityExtractor = customExtractorClassLoader.loadClass(customEntity);
                            IEntityExtractor obj = (IEntityExtractor) customEntityExtractor.newInstance();
                            entity_extractor_mappings.put(obj.getName().toLowerCase(), obj);
                            customExtractors.put(customEntity, customEntityExtractor);
                        } catch (Exception e) {
                            logger.error("IEntityExtractor: Couldn't load " + customEntity + ": " + e.getMessage(),
                                    e);
                        } catch (NoClassDefFoundError e) {
                            logger.error("IEntityExtractor: Couldn't load " + customEntity + ": " + e.getMessage(),
                                    e);
                        }
                    } else { // If this object exists and if it's a text extractor, then see if it's also an entity extractor
                        try {
                            Class customEntityExtractor = customExtractors.get(customEntity);
                            IEntityExtractor obj = (IEntityExtractor) customEntityExtractor.newInstance();
                            entity_extractor_mappings.put(obj.getName().toLowerCase(), obj); // (lower-case key, to match the lookup in extractTextAndEntities)
                        } catch (Exception e) {
                            logger.error("IEntityExtractor: Couldn't use already loaded " + customEntity + ": "
                                    + e.getMessage(), e);
                        } catch (NoClassDefFoundError e) {
                            logger.error("IEntityExtractor: Couldn't use already loaded " + customEntity + ": "
                                    + e.getMessage(), e);
                        }
                    }
                }
            } //TESTED
        }

        try {
            entity_extractor_mappings.put("opencalais", new ExtractorOpenCalais());
        } catch (Exception e) {
            logger.warn("Can't use OpenCalais as entity extractor: " + e.getMessage());
        }
        try {
            entity_extractor_mappings.put("textrank", new TextRankExtractor());
        } catch (Exception e) {
            logger.warn("Can't use textrank as entity extractor: " + e.getMessage());
        }

        try {
            ExtractorAlchemyAPI both = new ExtractorAlchemyAPI();
            entity_extractor_mappings.put("alchemyapi", both);
            text_extractor_mappings.put("alchemyapi", both);
            ExtractorAlchemyAPI_Metadata both_metadata = new ExtractorAlchemyAPI_Metadata();
            entity_extractor_mappings.put("alchemyapi-metadata", both_metadata);
            text_extractor_mappings.put("alchemyapi-metadata", both_metadata);
        } catch (Exception e) {
            logger.warn("Can't use AlchemyAPI as entity/text extractor: " + e.getMessage());
        }
        try {
            text_extractor_mappings.put("externalscript", new TextExtractorExternalScript());
        } catch (Exception e) {
            logger.warn("Can't use ExternalScript as text extractor: " + e.getMessage());
        }
        try {
            text_extractor_mappings.put("boilerpipe", new TextExtractorBoilerpipe());
        } catch (Exception e) {
            logger.warn("Can't use Boilerpipe as text extractor: " + e.getMessage());
        }
        try {
            text_extractor_mappings.put("tika", new TextExtractorTika());
        } catch (Exception e) {
            logger.warn("Can't use Tika as text extractor: " + e.getMessage());
        }

        if (null != pm.getDefaultEntityExtractor()) {
            default_entity_extractor = entity_extractor_mappings.get(pm.getDefaultEntityExtractor().toLowerCase());
        } else {
            default_entity_extractor = null;
        }
        if (null != pm.getDefaultTextExtractor()) {
            default_text_extractor = text_extractor_mappings.get(pm.getDefaultTextExtractor().toLowerCase());
        } else {
            try {
                default_text_extractor = new TextExtractorBoilerpipe();
            } catch (Exception e) {
                logger.warn("Can't use BoilerPlate as default text extractor: " + e.getMessage());
            }
        }
        nBetweenFeedDocs_ms = props.getWebCrawlWaitTime();

        // Set up security manager - basically always needed so might as well create here

        _securityManager = new IkanowSecurityManager();
    }

    /**
     * Handles going through what to do with a source for harvesting.
     * The process currently is:
     * 1. Extract documents from the source
     * 2. Enrich the extracted documents (toAdd) with metadata (entities, full text, events, etc)
     * 
     * @param source The source to harvest
     * @param toAdd Filled with newly extracted documents
     * @param toUpdate Filled with documents to be updated
     * @param toRemove Filled with documents to be removed
     */
    public void harvestSource(SourcePojo source, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate,
            List<DocumentPojo> toRemove) {
        nUrlErrorsThisSource = 0;

        if (HarvestController.isHarvestKilled()) { // Already spent too long - just bail out from here
            source.setReachedMaxDocs();
            return;
        }
        // New Harvest Pipeline logic
        if (null != source.getProcessingPipeline()) {
            if (null == procPipeline) {
                procPipeline = new HarvestControllerPipeline();
            }
            procPipeline.extractSource_preProcessingPipeline(source, this);
            //(just copy the config into the legacy source fields since the 
            // actual processing is the same in both cases)
        } //TESTED

        // Can override the default (feed) wait time from within the source (eg for sites that we know 
        // don't get upset about getting hammered)
        if (null != source.getRssConfig()) {
            if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
                nBetweenFeedDocs_ms = source.getRssConfig().getWaitTimeOverride_ms();
            }
        }
        LinkedList<DocumentPojo> toDuplicate = new LinkedList<DocumentPojo>();

        // Reset any state that might have been generated from the previous source
        getDuplicateManager().resetForNewSource();
        getHarvestStatus().resetForNewSource();

        // (temp location to store timings)
        source.setCreated(new Date());

        //First up, Source Extraction (could spawn off some threads to do source extraction)
        // Updates will be treated as follows:
        // - extract etc etc (since they have changed)
        // [and then in generic processing
        // - remove them (including their child objects, eg events) ...
        //   ... - but retain "created" date (and in the future artefacts like comments)]
        extractSource(source, toAdd, toUpdate, toRemove, toDuplicate);
        // (^^^ this adds toUpdate to toAdd) 

        // (temp location to store timings)
        source.setModified(new Date());

        if (null != source.getProcessingPipeline()) {
            procPipeline.setInterDocDelayTime(nBetweenFeedDocs_ms);
            try {
                procPipeline.enrichSource_processingPipeline(source, toAdd, toUpdate, toRemove);
            } finally { // (ensure can clear memory)
                procPipeline.clearState();
            }
        } else { // Old logic (more complex, less functional)
            enrichSource(source, toAdd, toUpdate, toRemove);
        }
        completeEnrichmentProcess(source, toAdd, toUpdate, toRemove);

        // (Now we've completed enrichment either normally or by cloning, add the dups back to the normal documents for generic processing)
        LinkedList<DocumentPojo> groupedDups = new LinkedList<DocumentPojo>(); // (ie clones)
        DocumentPojo masterDoc = null; // (just looking for simple pointer matching here)

        for (DocumentPojo dupDoc : toDuplicate) {
            if (null == dupDoc.getCloneFrom()) {
                toAdd.add(dupDoc);
            } else if (null != dupDoc.getCloneFrom().getTempSource()) { //(Else doc was removed from toAdd list due to extraction errors) 
                if (null == masterDoc) { // First time through
                    masterDoc = dupDoc.getCloneFrom();
                } else if (!masterDoc.getUrl().equals(dupDoc.getUrl())) { // New group!
                    groupedDups = enrichDocByCloning(groupedDups);
                    if (null != groupedDups) {
                        toAdd.addAll(groupedDups);
                        groupedDups.clear();
                    } else {
                        groupedDups = new LinkedList<DocumentPojo>();
                    }
                    masterDoc = dupDoc.getCloneFrom();
                }
                groupedDups.add(dupDoc);
            }
        } //end loop over duplicates
          //TESTED, included case where the master doc errors during extraction (by good fortune!) 

        if (null != groupedDups) { // (Leftover group)
            groupedDups = enrichDocByCloning(groupedDups);
            if (null != groupedDups) {
                toAdd.addAll(groupedDups);
            }
        } //TESTED (as above)
    }

    /**
     * Figures out which source extractor to use and then fills the toAdd list
     * with DocumentPojo objects from the extractor.
     * 
     * @param source The source to extract documents from
     * @param toAdd Filled with the extracted documents to be enriched
     * @param toUpdate Filled with documents to be updated
     * @param toRemove Filled with documents to be removed
     * @param toDup Filled with duplicate/clone documents that bypass normal enrichment
     */
    @SuppressWarnings("unchecked")
    private void extractSource(SourcePojo source, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate,
            List<DocumentPojo> toRemove, List<DocumentPojo> toDup) {
        // (normal case: a single community; test case: two communities, one of which is the owner's id)
        boolean normalCase = (1 == source.getCommunityIds().size())
                || ((2 == source.getCommunityIds().size()) && source.getCommunityIds().contains(source.getOwnerId()));

        //determine which source extractor to use
        for (HarvesterInterface harvester : harvesters) {
            if (harvester.canHarvestType(InfiniteEnums.castExtractType(source.getExtractType()))) {
                try {
                    List<DocumentPojo> tmpToAdd = new LinkedList<DocumentPojo>();
                    List<DocumentPojo> tmpToUpdate = new LinkedList<DocumentPojo>();
                    List<DocumentPojo> tmpToRemove = new LinkedList<DocumentPojo>();
                    harvester = harvester.getClass().newInstance(); // (create a new harvester for each source, avoids problems with state...)
                    harvester.executeHarvest(this, source, tmpToAdd, tmpToUpdate, tmpToRemove);

                    int nDocs = 0;
                    for (List<DocumentPojo> docList : Arrays.asList(tmpToAdd, tmpToUpdate)) {
                        for (DocumentPojo doc : docList) {
                            if (++nDocs > _nMaxDocs) {
                                break;
                            }
                            // Handle cloning on "duplicate docs" from different sources
                            boolean bDuplicated = false;
                            if (null != doc.getDuplicateFrom() && (null == doc.getUpdateId())) {
                                DocumentPojo newDoc = enrichDocByDuplicating(doc);
                                // (Note this is compatible with the cloning case whose logic is below:
                                //  this document gets fully populated here then added to dup list (with dupFrom==null), with a set of slaves
                                //  with dupFrom==sourceKey. When the dup list is traversed (after bypassing enrichment), the slaves are
                                //   then created from this master)
                                if (null != newDoc) {
                                    doc = newDoc;
                                    bDuplicated = true;
                                }
                            } else { // if the update id is non-null then ignore the above logic
                                doc.setDuplicateFrom(null);
                            }
                            // Copy over material from source pojo:
                            doc.setSource(source.getTitle());
                            doc.setTempSource(source);
                            doc.setMediaType(source.getMediaType());
                            if ((null == source.getAppendTagsToDocs()) || source.getAppendTagsToDocs()) {
                                if (null != source.getTags()) {
                                    doc.setTags(new HashSet<String>(source.getTags()));
                                }
                            }
                            ObjectId sCommunityId = source.getCommunityIds().iterator().next(); // (multiple communities handled below) 
                            String sIndex = new StringBuffer("doc_").append(sCommunityId.toString()).toString();
                            doc.setCommunityId(sCommunityId);
                            doc.setIndex(sIndex);
                            if (normalCase) { // Normal case (or test case)
                                doc.setSourceKey(source.getKey());
                            } else { // Many communities for a single source, not a pleasant case
                                String sMasterDocSourceKey = null;
                                for (ObjectId id : source.getCommunityIds()) {
                                    if (null == sMasterDocSourceKey) {
                                        sMasterDocSourceKey = (source.getKey());
                                        doc.setSourceKey(sMasterDocSourceKey);
                                    } else { // Will defer these until after the master doc has been added to the database
                                        DocumentPojo cloneDoc = new DocumentPojo();

                                        // Will need these fields
                                        cloneDoc.setIndex(new StringBuffer("doc_").append(id).toString());
                                        cloneDoc.setCommunityId(id);
                                        cloneDoc.setSourceKey(source.getKey());
                                        cloneDoc.setSource(source.getTitle());
                                        cloneDoc.setUrl(doc.getUrl());
                                        if ((null == source.getAppendTagsToDocs())
                                                || source.getAppendTagsToDocs()) {
                                            cloneDoc.setTags(new HashSet<String>(source.getTags()));
                                        }

                                        cloneDoc.setCloneFrom(doc);
                                        toDup.add(cloneDoc);
                                    }
                                } //TESTED (both in clone and clone+duplicate)
                            }
                            // Normally add to enrichment list (for duplicates, bypass this)
                            if (bDuplicated) {
                                toDup.add(doc); // (Already enriched by duplication process)
                            } else {
                                toAdd.add(doc);
                            }
                        }
                    } //(end loop over docs to add/update)

                    num_docs_extracted.addAndGet(tmpToAdd.size() > _nMaxDocs ? _nMaxDocs : tmpToAdd.size());
                    toUpdate.addAll(tmpToUpdate);
                    toRemove.addAll(tmpToRemove);
                } catch (Exception e) {

                    //DEBUG
                    //e.printStackTrace();

                    String reason = Globals.populateStackTrace(new StringBuffer(), e).toString();
                    logger.error("Error extracting source=" + source.getKey() + ", type=" + source.getExtractType()
                            + ", reason=" + reason);
                    _harvestStatus.update(source, new Date(), HarvestEnum.error, "Extraction error: " + reason,
                            false, false);
                }
                break; //exit for loop, source is extracted
            }
        }
    }

    // 
    // (LEGACY) Gets metadata using the extractors and appends to documents
    //

    private void enrichSource(SourcePojo source, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate,
            List<DocumentPojo> toRemove) {
        StructuredAnalysisHarvester sah = null;
        UnstructuredAnalysisHarvester usah = null;

        // Create metadata from the text using regex (also calculate header/footer information if desired)
        if (source.getUnstructuredAnalysisConfig() != null) {
            usah = new UnstructuredAnalysisHarvester();

            // If performing structured analysis also then need to mux them
            // since the UAH will run on the body/description potentially created by the SAH
            // and the SAH will take the metadata generated by UAH to create entities and events
            if (source.getStructuredAnalysisConfig() != null) {
                sah = new StructuredAnalysisHarvester();
                sah.addUnstructuredHandler(usah);
            } else {
                toAdd = usah.executeHarvest(this, source, toAdd);
            }
        }

        // For sources that generate structured data, we can turn that into entities and events
        // and fill in document fields from the metadata (that can be used by entity extraction)
        if (source.getStructuredAnalysisConfig() != null) {
            if (null == sah) {
                sah = new StructuredAnalysisHarvester();
            }
            toAdd = sah.executeHarvest(this, source, toAdd);
            // (if usah exists then this runs usah)
        }

        // Perform text and entity extraction
        if (source.getStructuredAnalysisConfig() == null) // (Else is performed during SAH above)
        {
            if (isEntityExtractionRequired(source)) {
                // Text/Entity Extraction
                try {
                    extractTextAndEntities(toAdd, source, false, false);
                } catch (Exception e) {
                    handleExtractError(e, source); //handle extractor error if need be            
                }
            }
        } // (end if no SAH)

        // Finish processing:
        // Complete batches
        if (isEntityExtractionRequired(source)) {
            try {
                extractTextAndEntities(null, source, true, false);
            } catch (Exception e) {
            }
        }
    }

    private void completeEnrichmentProcess(SourcePojo source, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate,
            List<DocumentPojo> toRemove) {
        // Map ontologies:

        completeDocumentBuilding(toAdd, toUpdate);

        int pxErrors = getHarvestStatus().getNumMessages();
        num_error_px.addAndGet(pxErrors);

        // Log the number of feeds extracted for the current source
        if ((toAdd.size() > 0) || (toUpdate.size() > 0) || (toRemove.size() > 0) || (nUrlErrorsThisSource > 0)
                || (pxErrors > 0)) {
            StringBuffer sLog = new StringBuffer("source=")
                    .append((null == source.getUrl() ? source.getKey() : source.getUrl())).append(" ");
            // (only need this for the log, not the source harvest message)

            if ((null != source.getHarvestStatus()) && (null != source.getHarvestStatus().getHarvest_message()
                    && !source.getHarvestStatus().getHarvest_message().isEmpty())) {
                String message = source.getHarvestStatus().getHarvest_message().replace("\n", " ");
                if (message.length() > 512) {
                    sLog.append("extracterr='").append(message.substring(0, 512)).append("...' ");
                } else {
                    sLog.append("extracterr='").append(message).append("' ");
                }
            } //TESTED

            StringBuffer sLog2 = new StringBuffer();

            long extractTime_ms = source.getModified().getTime() - source.getCreated().getTime();
            long enrichTime_ms = new Date().getTime() - source.getModified().getTime();

            // Extraction stats:
            sLog2.append("extracted=").append(toAdd.size()).append(" updated=").append(toUpdate.size())
                    .append(" deleted=").append(toRemove.size()).append(" extract_time_ms=").append(extractTime_ms)
                    .append(" enrich_time_ms=").append(enrichTime_ms).append(" urlerrors=")
                    .append(nUrlErrorsThisSource).append(" pxerrors=").append(pxErrors);

            getHarvestStatus().logMessage(sLog2.toString(), false);
            sLog.append(sLog2);

            // Other error info for the log only: 
            String mostCommonMessage = getHarvestStatus().getMostCommonMessage();
            if (null != mostCommonMessage) {
                if (mostCommonMessage.length() > 256) {
                    mostCommonMessage = mostCommonMessage.substring(0, 253) + "...'";
                }
                sLog.append(mostCommonMessage); // (don't need this in the harvest status since we already have all of them)
            }
            logger.info(sLog.toString());
        } //TESTED

        // May need to update status again (eg any extractor errors or successes - in the harvesters or immediately above):
        if (getHarvestStatus().moreToLog()) {
            getHarvestStatus().update(source, new Date(), source.getHarvestStatus().getHarvest_status(), "", false,
                    false);
        }
        // (note: the harvest status is updated 3 times:
        //  1) inside the source-type harvester (which: 1.1) resets the message 1.2) wipes the messages, but sets prevStatus.getHarvest_message() above)
        //  2) above (the update call, which occurs if logMessage() has been called at any point)
        //  3) after store/index manager, which normally just sets the status unless any errors occurred during indexing

        num_sources_harvested.incrementAndGet();
    }

    // Quick utility to return whether entity (or text) extraction has been specified by the user (or applies by default)

    public boolean isEntityExtractionRequired(SourcePojo source) {
        return (((null == source.useExtractor()) && (null != default_entity_extractor))
                || ((null != source.useExtractor()) && !source.useExtractor().equalsIgnoreCase("none")))
                || (((null == source.useTextExtractor()) && (null != default_text_extractor))
                        || ((null != source.useTextExtractor())
                                && !source.useTextExtractor().equalsIgnoreCase("none")));
    }

    /**
     * Takes a list of documents and extracts each one's full text and entities/events/sentiment (metadata)
     * 
     * @param toAdd The list of documents (without metadata) on which to perform extraction
     * @param source The source being harvested
     * @param bFinalizeBatchOnly If true, just flushes any batched documents through the entity extractor
     * @param calledFromPipeline Whether this was called from the new processing pipeline (affects error handling)
     * @throws ExtractorSourceLevelTransientException 
     */
    public void extractTextAndEntities(List<DocumentPojo> toAdd, SourcePojo source, boolean bFinalizeBatchOnly,
            boolean calledFromPipeline) throws ExtractorDocumentLevelException, ExtractorSourceLevelException,
            ExtractorDailyLimitExceededException, ExtractorSourceLevelMajorException,
            ExtractorSourceLevelTransientException {
        IEntityExtractor currentEntityExtractor = null;
        try {
            int error_on_feed_count = 0, feed_count = 0;

            // EXTRACTOR SELECTION LOGIC

            if (null != source.useExtractor()) {
                currentEntityExtractor = entity_extractor_mappings.get(source.useExtractor().toLowerCase());
                if (null == currentEntityExtractor) { // (second chance)
                    currentEntityExtractor = (IEntityExtractor) lookForDynamicExtractor(source, false);
                }
            }
            if (currentEntityExtractor == null) // none specified or didn't find it (<-latter is error)
            {
                if ((null != source.useExtractor()) && !source.useExtractor().equalsIgnoreCase("none")) {

                    // ie specified one but it doesn't exist....
                    StringBuffer errMsg = new StringBuffer("Skipping source=").append(source.getKey())
                            .append(" no_extractor=").append(source.useExtractor());
                    logger.warn(errMsg.toString());

                    // No point trying this for the rest of the day
                    throw new ExtractorSourceLevelException(errMsg.toString());
                } else if (null == source.useExtractor()) { // Didn't specify one, just use default:
                    currentEntityExtractor = default_entity_extractor;
                }
            } //TESTED               

            if (bFinalizeBatchOnly) {
                try {
                    currentEntityExtractor.extractEntities(null);
                } catch (Exception e) {
                } // do nothing, eg handle entity extractors that don't handle things well
                return;
            }

            // A teeny bit of complex logic:
            // Feeds by default use a text extractor
            // DB/Files by default don't (but can override)

            ITextExtractor currentTextExtractor = null;
            boolean bUseRawContentWhereAvailable = false; // (only applies for feeds)
            if (null != source.useTextExtractor()) {
                currentTextExtractor = text_extractor_mappings.get(source.useTextExtractor().toLowerCase());
                if (null == currentTextExtractor) { // (second chance)
                    currentTextExtractor = (ITextExtractor) lookForDynamicExtractor(source, true);
                }
            }
            if (null == currentTextExtractor) { // none specified or didn't find it (<-latter is error)            
                if (null != source.useTextExtractor()) {

                    if ((null == source.getStructuredAnalysisConfig())
                            && (null == source.getUnstructuredAnalysisConfig())
                            && (null == source.getProcessingPipeline())) {
                        //(UAH and SAH get raw access to the data if they need it, so can carry on - ditto processing pipeline)

                        StringBuffer errMsg = new StringBuffer("Skipping source=").append(source.getKey())
                                .append(" no_txt_extractor=").append(source.useTextExtractor());
                        logger.warn(errMsg.toString());

                        // No point trying this for the rest of the day
                        throw new ExtractorSourceLevelException(errMsg.toString());
                    } else {
                        bUseRawContentWhereAvailable = true; // (only checked for feeds)                  
                    } //TESTED
                } else if (source.getExtractType().equalsIgnoreCase("feed")) // (DB/files just use their existing fullText) 
                {
                    if (null != currentEntityExtractor) {
                        String selfExtraction = currentEntityExtractor
                                .getCapability(EntityExtractorEnum.URLTextExtraction);
                        // Leave as null unless have no built-in capability
                        if ((null == selfExtraction) || !selfExtraction.equals("true")) {
                            currentTextExtractor = default_text_extractor;
                        }
                    } else {
                        currentTextExtractor = default_text_extractor;
                    }
                } //TESTED      
            }

            // EXTRACTION
            Iterator<DocumentPojo> i = toAdd.iterator(); //iterator created so that elements in the toAdd list can be 
            // removed within the loop
            while (i.hasNext()) {
                long nTime_ms = System.currentTimeMillis();
                DocumentPojo doc = i.next();
                boolean bExtractedText = false;

                // If I've been stopped then just remove all remaining documents
                // (pick them up next time through)
                if (bIsKilled) {
                    i.remove();
                    if (!calledFromPipeline) {
                        doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
                    }
                    continue;
                }

                if (calledFromPipeline || !urlsThatError.contains(doc.getUrl())) //only attempt if url is okay
                {
                    feed_count++;

                    try {
                        // (Check for truncation)
                        if ((null != currentEntityExtractor) && (null != doc.getFullText())) {
                            try {
                                String s = currentEntityExtractor.getCapability(EntityExtractorEnum.MaxInputBytes);
                                if (null != s) {
                                    int maxLength = Integer.parseInt(s);
                                    if (doc.getFullText().length() > maxLength) { //just warn, it's up to the extractor to sort it out
                                        getHarvestStatus().logMessage(
                                                "Warning: truncating document to max length: " + s, false);
                                    }
                                }
                            } catch (Exception e) {
                            } // max length not reported just carry on
                        }

                        if (null != currentTextExtractor) {
                            bExtractedText = true;
                            currentTextExtractor.extractText(doc);
                            if (null != currentEntityExtractor) {
                                currentEntityExtractor.extractEntities(doc);
                            }

                        } //TESTED
                        else //db/filesys should already have full text extracted (unless otherwise specified)
                        {
                            if (source.getExtractType().equalsIgnoreCase("feed")) { // Need full text so get from current

                                if ((null == doc.getFullText()) || !bUseRawContentWhereAvailable) {
                                    bExtractedText = true;
                                    if (null != currentEntityExtractor) {
                                        currentEntityExtractor.extractEntitiesAndText(doc);
                                    }
                                } //TESTED (AlchemyAPI case)
                                else { // Feed for which we've already extracted data
                                    if (null != currentEntityExtractor) {
                                        currentEntityExtractor.extractEntities(doc);
                                    }
                                } //TESTED
                            } else { // DB/File => use full text
                                if (null != currentEntityExtractor) {
                                    currentEntityExtractor.extractEntities(doc);
                                }
                            } //TESTED
                        }

                        //statistics counting
                        if (doc.getEntities() != null)
                            num_ent_extracted.addAndGet(doc.getEntities().size());
                        if (doc.getAssociations() != null)
                            num_event_extracted.addAndGet(doc.getAssociations().size());

                    } catch (ExtractorDailyLimitExceededException e) {

                        //extractor can't do anything else today, return
                        i.remove();
                        if (!calledFromPipeline) {
                            doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
                        }

                        // Source error, ignore all other documents
                        while (i.hasNext()) {
                            doc = i.next();
                            if (!calledFromPipeline) {
                                doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
                            }
                            i.remove();
                        }
                        //TESTED

                        throw e; // (ie stop processing this source)
                    } //TESTED
                    catch (Exception e) { // Anything except daily limit exceeded, expect it to be ExtractorDocumentLevelException

                        //TODO (INF-1922): put this in a separate function and call that from pipeline on failure...
                        // (not sure what to do about error_on_feed_count though, need to maintain a separate one of those in pipeline?)

                        // This can come from (sort-of/increasingly) "user" code so provide a bit more information
                        StringBuffer errMessage = HarvestExceptionUtils.createExceptionMessage(e);
                        _harvestStatus.logMessage(errMessage.toString(), true);
                        num_error_url.incrementAndGet();
                        nUrlErrorsThisSource++;

                        if (!calledFromPipeline) {
                            urlsThatError.add(doc.getUrl());
                        }

                        error_on_feed_count++;
                        i.remove();
                        if (!calledFromPipeline) {
                            doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
                        }
                    }
                    //TESTED
                }
                // (note this is only ever called in legacy mode - it's handled in the HarvestControllerPipeline)
                if ((null != source.getExtractType()) && (source.getExtractType().equalsIgnoreCase("feed"))) {
                    if (i.hasNext() && bExtractedText) {
                        nTime_ms = nBetweenFeedDocs_ms - (System.currentTimeMillis() - nTime_ms); // (ie delay time - processing time)
                        if (nTime_ms > 0) {
                            try {
                                Thread.sleep(nTime_ms);
                            } catch (Exception e) {
                            }
                            // (wait between web-site accesses for politeness - default 10s)
                        }
                    }
                } //(TESTED)

            } // end loop over documents   
              //check if all the docs errored (and there were more than 5); escalate to a major error above 20 (arbitrary numbers)
              //NOTE: this is duplicated in HarvestControllerPipeline for non-legacy cases
            if ((error_on_feed_count == feed_count) && (feed_count > 5)) {
                String errorMsg = new StringBuffer().append(feed_count).append(" docs, ")
                        .append(error_on_feed_count).append(" errors").toString();
                if (error_on_feed_count > 20) {
                    throw new ExtractorSourceLevelMajorException(errorMsg);
                } else {
                    throw new ExtractorSourceLevelException(errorMsg);
                } //TESTED
            }
        } catch (ExtractorDailyLimitExceededException e) {
            // Percolate upwards!
            throw e;
        } catch (ExtractorSourceLevelException e) {
            // Percolate upwards!
            throw e;
        } catch (ExtractorSourceLevelMajorException e) {
            // Percolate upwards!
            throw e;
        } catch (Exception e) { // Misc internal error
            StringBuffer errMsg = new StringBuffer("Skipping source=").append(source.getKey()).append(" error=")
                    .append(e.getMessage());
            logger.error(errMsg.toString(), e);
            throw new ExtractorSourceLevelTransientException(errMsg.toString());
        } //TESTED

    }//TESTED

    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    // UTILITY FUNCTIONS

    /**
     * Decides what to do with a source when an error is returned from the
     * extractor process.
     * 
     * @param error The error that was returned from extractor
     * @param source The source that the extractor was working on
     */
    public void handleExtractError(Exception error, SourcePojo source) {
        if (null != error) {
            if (error instanceof ExtractorDocumentLevelException) {
                num_error_url.incrementAndGet();
                nUrlErrorsThisSource++;
            } else if (error instanceof ExtractorSourceLevelException) {
                num_errors_source.incrementAndGet();
                //We flag the source in mongo and temp disable
                _harvestStatus.update(source, new Date(), HarvestEnum.error,
                        "Source Level extraction error: " + error.getMessage(), true, false);
            } //TESTED
            else if (error instanceof ExtractorSourceLevelMajorException) {
                num_errors_source.incrementAndGet();
                //We flag the source in mongo and perma disable
                _harvestStatus.update(source, new Date(), HarvestEnum.error,
                        "Major source level Extraction error: " + error.getMessage(), true, true);
            } //TESTED
            else if (error instanceof ExtractorSourceLevelTransientException) {
                num_errors_source.incrementAndGet();
                //We flag the source in mongo
                _harvestStatus.update(source, new Date(), HarvestEnum.error,
                        "Transient source level extraction error: " + error.getMessage(), false, false);
            } //TESTED
            else if (error instanceof ExtractorDailyLimitExceededException) {
                //We flag the source in mongo and temp disable
                _harvestStatus.update(source, new Date(), HarvestEnum.success, "Extractor daily limit error.", true,
                        false);
            } //TESTED
        }
    }//TESTED (just that the instanceofs work)

    /**
     * Prints out some quick info about how the harvester performed
     */
    public static void logHarvesterStats() {
        StringBuilder sb = new StringBuilder();
        sb.append("num_of_sources_harvested=" + num_sources_harvested.get());
        sb.append(" num_of_docs_extracted=" + num_docs_extracted.get());
        sb.append(" num_of_entities_extracted=" + num_ent_extracted.get());
        sb.append(" num_of_events_extracted=" + num_event_extracted.get());
        sb.append(" num_of_source_errors=" + num_errors_source.get());
        sb.append(" num_of_url_errors=" + num_error_url.get());
        sb.append(" num_of_px_errors=" + num_error_px.get());
        logger.info(sb.toString());
    }

    // Utility to handle the various multiple community problems:
    // - Different sources, same URL ("duplicates") ... get the doc from the DB (it's there by definition)
    // - Same source, multiple communities ("clones") ... get the doc from the first community processed

    private static DocumentPojo enrichDocByDuplicating(DocumentPojo docToReplace) {
        DocumentPojo newDoc = null;
        BasicDBObject dbo = getDocumentMetadataFromWhichToDuplicate(docToReplace);
        if (null != dbo) {
            String sContent = getDocumentContentFromWhichToDuplicate(docToReplace);
            if (null != sContent) {
                newDoc = duplicateDocument(docToReplace, dbo, sContent, false);
                // (Note this erases the "duplicateFrom" field - this is important because it distinguishes "clones" and "duplicates")
            }
        }
        return newDoc;
    }//TESTED

    private static LinkedList<DocumentPojo> enrichDocByCloning(List<DocumentPojo> docsToReplace) {
        DocumentPojo newDoc = null;
        BasicDBObject dbo = null;
        String sContent = null;
        LinkedList<DocumentPojo> newDocs = new LinkedList<DocumentPojo>();
        for (DocumentPojo docToReplace : docsToReplace) {

            if (null == dbo) { // First time through...
                sContent = docToReplace.getCloneFrom().getFullText();
                docToReplace.getCloneFrom().setFullText(null);
                dbo = (BasicDBObject) docToReplace.getCloneFrom().toDb();
                docToReplace.getCloneFrom().setFullText(sContent);
            }
            newDoc = duplicateDocument(docToReplace, dbo, sContent, true);
            newDocs.add(newDoc);
        }
        return newDocs;

    }//TESTED
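
    // Minimal illustrative sketch (not the harvester's real dispatch logic): how a caller that has already
    // flagged incoming documents might use the two helpers above. "docToReplace" and "cloneDocs" are
    // hypothetical local variables, assumed to have been populated upstream.
    //
    //   if (null != docToReplace.getDuplicateFrom()) {        // same URL, different source => "duplicate"
    //       DocumentPojo enriched = enrichDocByDuplicating(docToReplace);
    //       if (null == enriched) { /* content not found, fall back to normal (re-)processing */ }
    //   }
    //   if (!cloneDocs.isEmpty()) {                           // same source, multiple communities => "clones"
    //       LinkedList<DocumentPojo> enrichedClones = enrichDocByCloning(cloneDocs);
    //   }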

    // Sub-utility

    private static BasicDBObject getDocumentMetadataFromWhichToDuplicate(DocumentPojo docToReplace) {
        BasicDBObject query = new BasicDBObject("url", docToReplace.getUrl());
        query.put("sourceKey", docToReplace.getDuplicateFrom());
        BasicDBObject dbo = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(query);

        return dbo;
    }//TESTED

    private static String getDocumentContentFromWhichToDuplicate(DocumentPojo docToReplace) {
        try {
            // Get the full text:
            byte[] storageArray = new byte[200000];
            BasicDBObject contentQ = new BasicDBObject("url", docToReplace.getUrl());
            contentQ.put(CompressedFullTextPojo.sourceKey_,
                    new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, docToReplace.getSourceKey())));
            BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
            BasicDBObject dboContent = (BasicDBObject) DbManager.getDocument().getContent().findOne(contentQ,
                    fields);
            if (null != dboContent) {
                byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_));
                ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
                GZIPInputStream gzip = new GZIPInputStream(in);
                int nRead = 0;
                StringBuffer output = new StringBuffer();
                while (nRead >= 0) {
                    nRead = gzip.read(storageArray, 0, 200000);
                    if (nRead > 0) {
                        String s = new String(storageArray, 0, nRead, "UTF-8");
                        output.append(s);
                    }
                }
                gzip.close(); // (releases the underlying Inflater's resources)
                return output.toString();
            } else { // Will just need to re-process this document
                return null;
            }
        } catch (Exception e) {
            // Do nothing, just carry on
            e.printStackTrace();
        }
        return null;
    }//TESTED
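
    // Illustrative sketch only (it is not called anywhere in the harvester): the same gunzip-to-String step as
    // getDocumentContentFromWhichToDuplicate above, but buffering the raw bytes and decoding them once at the
    // end, so that a multi-byte UTF-8 character cannot be split across buffer boundaries.
    private static String gunzipToString(byte[] compressedData) throws IOException {
        GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(compressedData));
        try {
            java.io.ByteArrayOutputStream decompressed = new java.io.ByteArrayOutputStream();
            byte[] buffer = new byte[200000];
            int nRead;
            while ((nRead = gzip.read(buffer, 0, buffer.length)) > 0) {
                decompressed.write(buffer, 0, nRead); // collect raw bytes; decode in one go below
            }
            return decompressed.toString("UTF-8");
        } finally {
            gzip.close(); // (releases the underlying Inflater's resources)
        }
    }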

    private static DocumentPojo duplicateDocument(DocumentPojo docToReplace, BasicDBObject dbo, String content,
            boolean bClone) {
        DocumentPojo newDoc = DocumentPojo.fromDb(dbo, DocumentPojo.class);
        newDoc.setFullText(content);
        newDoc.setId(null); // (ie ensure it's unique)

        if (bClone) { // Cloned docs have special source key formats (and also need to update their community)
            ObjectId docCommunity = docToReplace.getCommunityId();
            newDoc.setSourceKey(docToReplace.getSourceKey());
            newDoc.setCommunityId(docCommunity);
            newDoc.setIndex(new StringBuffer("doc_").append(docCommunity).toString());
        } else { // (For cloned documents, published/created/modified can be taken from the master document, ie newDoc is already accurate)
            // For duplicates, copy over the timing details from the new document (set by the harvesters)
            newDoc.setPublishedDate(docToReplace.getPublishedDate());
            newDoc.setCreated(docToReplace.getCreated());
            newDoc.setModified(docToReplace.getModified());
        }
        return newDoc;
    }//TESTED

    //
    // Any documents that have got this far are going to get processed
    //

    // Processing:
    // Attempt to map entity types to a set of ontology types;
    // eventually the plan is to allow extractors to set the ontology_type of
    // entities to anything found in the OpenCyc ontology

    static public void completeDocumentBuilding(List<DocumentPojo> docs, List<DocumentPojo> updateDocs) {
        // Handle documents to be added
        // Currently, just set ontology type
        if (docs != null) {
            for (DocumentPojo doc : docs) {
                if (doc.getEntities() != null) {
                    num_ent_extracted.addAndGet(doc.getEntities().size());
                    for (EntityPojo entity : doc.getEntities()) {
                        if (entity.getGeotag() != null) {
                            if (null == entity.getOntology_type()) {
                                entity.setOntology_type(GeoOntologyMapping.mapEntityToOntology(entity.getType()));
                            }
                        }
                    }
                }
                if (doc.getAssociations() != null) {
                    num_event_extracted.addAndGet(doc.getAssociations().size());
                }
            }
        }

        // Remove any docs from update list that didn't get updated
        if (updateDocs != null) {
            Iterator<DocumentPojo> it = updateDocs.iterator();
            while (it.hasNext()) {
                DocumentPojo d = it.next();
                if (null == d.getTempSource()) { // this doc got rejected; normally that means we remove it from the update list so the db version is left alone
                    if (null == d.getExplain()) { // exception: if d.getExplain() != null then leave it in the update list, so the db version is deleted
                        it.remove();
                    } //TODO (INF-2825): TOTEST
                }
            }
        }
    }
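
    // Illustrative note on the ontology mapping above: a geotagged entity whose extractor did not set an
    // ontology type gets one derived from its type string, e.g. (the "City" value below is hypothetical)
    //
    //   entity.setOntology_type(GeoOntologyMapping.mapEntityToOntology("City"));
    //
    // Entities without a geotag, or with an ontology_type already set, are left untouched.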

    ///////////////////////////////////////////////////////////////
    ///////////////////////////////////////////////////////////////

    // Dynamic extraction utilities

    private synchronized Object lookForDynamicExtractor(SourcePojo source, boolean bTextExtractor) {
        String extractorName = bTextExtractor ? source.useTextExtractor() : source.useExtractor();
        if (null == extractorName) {
            return null;
        }
        Object outClassInstance = null;

        if (null != failedDynamicExtractors) { // (cache for failed shares)
            if (failedDynamicExtractors.contains(extractorName)) {
                return null;
            }
        }
        ClassLoader savedClassLoader = null;
        try {
            ObjectId extractorId = null;
            if (extractorName.startsWith("/")) { // allow /<id>/free text..
                extractorName = extractorName.substring(1).replaceFirst("/.*", "");
            } //TESTED
            try {
                extractorId = new ObjectId(extractorName);
            } catch (Exception e) { // not a dynamic share, that's fine; just exit, no harm done
                return null;
            }
            // If we're here then it was a share

            BasicDBObject query = new BasicDBObject("_id", extractorId);
            SharePojo extractorInfo = SharePojo.fromDb(MongoDbManager.getSocial().getShare().findOne(query),
                    SharePojo.class);
            if ((null != extractorInfo) && (null != extractorInfo.getBinaryId())) {
                // Check share owned by an admin:
                if (!AuthUtils.isAdmin(extractorInfo.getOwner().get_id())) {
                    throw new RuntimeException("Extractor share owner must be admin");
                } //TESTED
                  // Check >0 source communities are in the share communities
                int nMatches = 0;
                for (ShareCommunityPojo commObj : extractorInfo.getCommunities()) {
                    if (source.getCommunityIds().contains(commObj.get_id())) {
                        nMatches++;
                        break;
                    }
                }
                if (0 == nMatches) {
                    throw new RuntimeException("Extractor not shared across source communities");
                } //TESTED

                savedClassLoader = Thread.currentThread().getContextClassLoader();

                //HashMap<String, Class<?> > dynamicExtractorClassCache = null;
                if (null == dynamicExtractorClassCache) {
                    dynamicExtractorClassCache = new HashMap<String, Class<?>>();
                }

                URL[] cachedJarFile = { new File(maintainJarFileCache(extractorInfo)).toURI().toURL() };

                Class<?> classToLoad = dynamicExtractorClassCache.get(extractorInfo.getTitle());
                if (null == classToLoad) {
                    URLClassLoader child = new URLClassLoader(cachedJarFile, savedClassLoader);

                    Thread.currentThread().setContextClassLoader(child);
                    classToLoad = Class.forName(extractorInfo.getTitle(), true, child);
                    dynamicExtractorClassCache.put(extractorInfo.getTitle(), classToLoad);
                }

                if (bTextExtractor) {
                    ITextExtractor txtExtractor = (ITextExtractor) classToLoad.newInstance();
                    text_extractor_mappings.put(source.useTextExtractor(), txtExtractor);
                    outClassInstance = txtExtractor;
                } else {
                    IEntityExtractor entExtractor = (IEntityExtractor) classToLoad.newInstance();
                    entity_extractor_mappings.put(source.useExtractor(), entExtractor);
                    outClassInstance = entExtractor;
                }
            }
        } catch (Exception e) {
            StringBuffer sb = Globals.populateStackTrace(new StringBuffer(), e);
            sb.append(
                    " (check the share's title is the fully qualified classname, and its community permissions are correct)");
            getHarvestStatus().logMessage("custom extractor error: " + sb.toString(), false);
            if (null == failedDynamicExtractors) {
                failedDynamicExtractors = new HashSet<String>();
            }
            failedDynamicExtractors.add(extractorName); // (cache the failure so we don't retry this share for every document)
            //e.printStackTrace();
        } // General fail just carry on 
        catch (Error err) {
            StringBuffer sb = Globals.populateStackTrace(new StringBuffer(), err);
            sb.append(
                    " (check the share's title is the fully qualified classname, and its community permissions are correct)");
            getHarvestStatus().logMessage("custom extractor error: " + sb.toString(), false);
            if (null == failedDynamicExtractors) {
                failedDynamicExtractors = new HashSet<String>();
            }
            failedDynamicExtractors.add(extractorName); // (cache the failure so we don't retry this share for every document)
            //err.printStackTrace();

        } // General fail just carry on
        finally {
            if (null != savedClassLoader) {
                Thread.currentThread().setContextClassLoader(savedClassLoader);
            }
        }
        return outClassInstance;
    }//TOTEST
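
    // Minimal standalone sketch of the dynamic-loading pattern used above. "jarPath" and "className" are
    // hypothetical inputs; the real method derives them from the SharePojo (via maintainJarFileCache and the
    // share's title). The essential steps: build a child URLClassLoader over the cached jar, swap it in as the
    // thread's context classloader while resolving the class, and always restore the original loader afterwards.
    //
    //   ClassLoader saved = Thread.currentThread().getContextClassLoader();
    //   try {
    //       URL[] jarUrls = { new File(jarPath).toURI().toURL() };
    //       URLClassLoader child = new URLClassLoader(jarUrls, saved);
    //       Thread.currentThread().setContextClassLoader(child);
    //       Class<?> classToLoad = Class.forName(className, true, child);
    //       Object extractor = classToLoad.newInstance(); // must implement IEntityExtractor or ITextExtractor
    //   } finally {
    //       Thread.currentThread().setContextClassLoader(saved);
    //   }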

    /**
     * Finds the gridfile given by id and returns the bytes
     * 
     * @param id the object id of the gridfile to lookup (stored in sharepojo)
     * @return bytes of file stored in gridfile
     */
    //   private static byte[] getGridFile(ObjectId id)
    //   {
    //      ByteArrayOutputStream out = new ByteArrayOutputStream();
    //      try
    //      {
    //         GridFSDBFile file = DbManager.getSocial().getShareBinary().find(id);                  
    //         file.writeTo(out);
    //         byte[] toReturn = out.toByteArray();
    //         out.close();
    //         return toReturn;
    //      }
    //      catch (Exception ex){}      
    //      return null;
    //   }

    /**
     * Maintains a local file cache of an extractor jar held in a share.  Typically
     * the jar files will be kept in our /share store; the jar is copied into the
     * system temp directory the first time it is used and refreshed whenever the
     * share is modified.
     * 
     * @param share the share containing the jar (as a GridFS binary id or inline binary data)
     * @return the path of the locally cached jar file
     * @throws Exception 
     */
    public static String maintainJarFileCache(SharePojo share) throws Exception {
        String tempFileName = System.getProperty("java.io.tmpdir") + "/" + share.get_id() + ".cache.jar";
        File tempFile = new File(tempFileName);

        // Compare dates (if the cached file exists) to see if we need to update the cache

        if (!tempFile.exists() || (tempFile.lastModified() < share.getModified().getTime())) {
            OutputStream out = new BufferedOutputStream(new FileOutputStream(tempFileName));
            if (share.getBinaryId() != null) {
                GridFSDBFile file = DbManager.getSocial().getShareBinary().find(share.getBinaryId());
                file.writeTo(out);
            } else {
                out.write(share.getBinaryData());
            }
            out.flush();
            out.close();
        } //TESTED

        return tempFileName;
    }
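
    // Illustrative usage (this is what lookForDynamicExtractor above does): the returned path points at
    // <java.io.tmpdir>/<shareId>.cache.jar. "extractorShare" below is a hypothetical SharePojo.
    //
    //   String jarPath = maintainJarFileCache(extractorShare);
    //   URL jarUrl = new File(jarPath).toURI().toURL(); // feed into a URLClassLoader as sketched earlier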
}