com.ikanow.infinit.e.core.utils.SourceUtils.java Source code

Introduction

Here is the source code for com.ikanow.infinit.e.core.utils.SourceUtils.java
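
SourceUtils collects the utilities shared by the Infinit.e harvester and the source synchronization logic: selecting which sources to work on each cycle, sharing them across distributed harvest nodes, updating harvest status and document counts in MongoDB, and pruning documents for sources with maxDocs or TTL settings. (This summary is inferred from the code below.)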

Source

/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project.
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package com.ikanow.infinit.e.core.utils;

import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Logger;
import org.bson.types.ObjectId;

import com.google.gson.reflect.TypeToken;
import com.ikanow.infinit.e.data_model.InfiniteEnums;
import com.ikanow.infinit.e.data_model.InfiniteEnums.HarvestEnum;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SourceHarvestStatusPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojoSubstitutionDbMap;
import com.ikanow.infinit.e.data_model.store.document.DocCountPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCursor;

public class SourceUtils {

    private static Logger logger = Logger.getLogger(SourceUtils.class);

    private static final long _ONEDAY = 24L * 3600L * 1000L;

    /////////////////////////////////////////////////////////////////////////////////////

    // Utilities common to both harvester and synchronization

    /////////////////////////////////////////////////////////////////////////////////////

    public static boolean checkDbSyncLock() {
        DBCursor dbc = DbManager.getFeature().getSyncLock().find();
        if (!dbc.hasNext()) {
            return false; // working fine
        }
        Date now = new Date();
        while (dbc.hasNext()) {
            BasicDBObject sync_lock = (BasicDBObject) dbc.next();
            Object lastSyncObj = sync_lock.get("last_sync");
            if (null != lastSyncObj) {
                try {
                    Date last_sync = (Date) lastSyncObj;
                    if (last_sync.getTime() + _ONEDAY > now.getTime()) {

                        return true; // (ie sync object exists and is < 1 day old)
                    }
                } catch (Exception e) {
                    // class cast, do nothing
                }
            }
        } // (end "loop over" 1 object in sync_lock DB)

        return false;
    }
    //TESTED (active lock, no lock, old lock)
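    // (Hypothetical usage sketch - not from this codebase - a harvest/sync driver would typically
    //  skip or delay its cycle while the lock is held, e.g.:
    //    if (SourceUtils.checkDbSyncLock()) { Thread.sleep(600000); } // wait 10 mins, then retry
    //  the retry interval here is illustrative only)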

    /////////////////////////////////////////////////////////////////////////////////////

    // Get all sources to be harvested (capped per cycle - see nMaxSources below - in order of harvesting so nothing should get lost)

    public static LinkedList<SourcePojo> getSourcesToWorkOn(String sCommunityOverride, String sSourceId,
            boolean bSync, boolean bDistributed) {
        // Build the set of harvest types to be queried on
        com.ikanow.infinit.e.harvest.utils.PropertiesManager props = new com.ikanow.infinit.e.harvest.utils.PropertiesManager();
        int nMaxSources = 1000;

        if (!bSync) {
            nMaxSources = props.getMaxSourcesPerHarvest(); // (normally 0 == no limit)
        }

        String sTypes = props.getHarvesterTypes();
        String[] sType = sTypes.split("\\s*,\\s*");
        String[] sTypeCase = new String[sType.length * 2];
        for (int i = 0; i < sType.length; i++) {
            String s = sType[i];
            sTypeCase[2 * i] = s.substring(0, 1).toUpperCase() + s.substring(1).toLowerCase();
            sTypeCase[2 * i + 1] = s.toLowerCase();
        }
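        // (presumably both the Capitalized and lower-case variants are generated so that either stored
        //  capitalization of extractType will match the $in clause below)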
        BasicDBObject harvestTypes = new BasicDBObject(MongoDbManager.in_, sTypeCase);
        LinkedList<SourcePojo> sources = null;
        try {
            BasicDBObject query = null;
            BasicDBObject adminUpdateQuery = new BasicDBObject();
            if (bDistributed) {
                Date now = new Date();
                query = generateNotInProgressClause(now);
                // (just don't waste time on things currently being harvested)

                // Also need to ignore any sources that have just been synced by a different node... 
                if (bSync) {
                    Date recentlySynced = new Date(now.getTime() - 1800 * 1000); //(ie not synced within 1/2 hour)
                    query.put(SourceHarvestStatusPojo.sourceQuery_synced_,
                            new BasicDBObject(MongoDbManager.lt_, recentlySynced));
                    // (will know synced exists because we set it below - the sort doesn't work without it being set for all records)
                } else if (null == sSourceId) { // for harvest, try to take into account the effect of search cycles
                    // (if manually setting the source then ignore this obviously...)
                    addSearchCycleClause(query, now);
                }
            } else {
                query = new BasicDBObject();
            }
            if (null == sSourceId) {
                query.put(SourcePojo.isApproved_, true);
            }
            if (!bSync && (null == sSourceId)) {
                query.put(SourcePojo.harvestBadSource_, new BasicDBObject(MongoDbManager.ne_, true)); // (ie false or doesn't exist)
                // (still sync bad sources)
            }
            query.put(SourcePojo.extractType_, harvestTypes);
            if (null != sCommunityOverride) {
                query.put(SourcePojo.communityIds_, new ObjectId(sCommunityOverride));
                adminUpdateQuery.put(SourcePojo.communityIds_, new ObjectId(sCommunityOverride));
            } else if (null != sSourceId) {
                try {
                    String[] idList = sSourceId.split(",");
                    List<ObjectId> lid = new ArrayList<ObjectId>(idList.length);
                    for (String s : idList) {
                        lid.add(new ObjectId(s));
                    }
                    query.put(SourcePojo._id_, new BasicDBObject(DbManager.in_, lid));
                    adminUpdateQuery.put(SourcePojo._id_, new BasicDBObject(DbManager.in_, lid));
                } catch (Exception e) { // Allow either _id or key to be used as the id...
                    query.put(SourcePojo.key_,
                            new BasicDBObject(DbManager.in_, Arrays.asList(sSourceId.split(","))));
                    adminUpdateQuery.put(SourcePojo.key_,
                            new BasicDBObject(DbManager.in_, Arrays.asList(sSourceId.split(","))));
                }
            }
            BasicDBObject orderBy = new BasicDBObject();
            if (bSync) {
                orderBy.put(SourceHarvestStatusPojo.sourceQuery_synced_, 1);
            } else {
                orderBy.put(SourceHarvestStatusPojo.sourceQuery_harvested_, 1);
            }
            //(note: although a complex query precedes this, the sort should still be able to use the index
            // on this field, so the ordering comes at little extra cost)

            BasicDBObject fields = new BasicDBObject();
            if (bDistributed) {
                // Mainly just _id and extractType, we'll get these for debugging
                fields.put(SourcePojo._id_, 1);
                fields.put(SourcePojo.extractType_, 1);

                fields.put(SourcePojo.key_, 1);
                fields.put(SourceHarvestStatusPojo.sourceQuery_harvested_, 1);
                fields.put(SourceHarvestStatusPojo.sourceQuery_synced_, 1);
                fields.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, 1);
                if (null != sSourceId) {
                    //put a random field in just so we know it's a source override:
                    fields.put(SourcePojo.ownerId_, 1);
                    //(plus don't add searchCycle, we're just going to ignore it anyway)
                } //TESTED
                else {
                    fields.put(SourcePojo.searchCycle_secs_, 1);
                } //TESTED

                // (need these for distributed logic)
                fields.put(SourcePojo.distributionFactor_, 1);
                fields.put(SourceHarvestStatusPojo.sourceQuery_distributionTokensFree_, 1);
            }
            // (first off, set the harvest/sync date for any sources that don't have it set,
            //  needed because sort doesn't return records without the sorting field) 
            Date yesteryear = new Date(new Date().getTime() - 365L * _ONEDAY);
            // (NOTE this time being >=1 yr is depended upon by applications, so you don't get to change it. Ever)
            if (bSync) {
                adminUpdateQuery.put(SourceHarvestStatusPojo.sourceQuery_synced_,
                        new BasicDBObject(MongoDbManager.exists_, false));
                DbManager.getIngest().getSource().update(adminUpdateQuery,
                        new BasicDBObject(MongoDbManager.set_,
                                new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_synced_, yesteryear)),
                        false, true);
            } else {
                adminUpdateQuery.put(SourceHarvestStatusPojo.sourceQuery_harvested_,
                        new BasicDBObject(MongoDbManager.exists_, false));
                DbManager.getIngest().getSource().update(adminUpdateQuery,
                        new BasicDBObject(MongoDbManager.set_,
                                new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvested_, yesteryear)),
                        false, true);

                // Also ... if I've left any sources in_progress (eg by exiting uncleanly) then sort that out now
                adminUpdateQuery.remove(SourceHarvestStatusPojo.sourceQuery_harvested_);
                adminUpdateQuery.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_,
                        HarvestEnum.in_progress.toString());
                adminUpdateQuery.put(SourceHarvestStatusPojo.sourceQuery_lastHarvestedBy_, getHostname());
                DbManager.getIngest().getSource().update(adminUpdateQuery,
                        new BasicDBObject(MongoDbManager.set_, new BasicDBObject(
                                SourceHarvestStatusPojo.sourceQuery_harvest_status_, HarvestEnum.error.toString())),
                        false, true);
                //TESTED (by hand)
            }
            // (then perform query)
            DBCursor cur = DbManager.getIngest().getSource().find(query, fields).sort(orderBy).limit(nMaxSources);

            sources = SourcePojo.listFromDb(cur, new TypeToken<LinkedList<SourcePojo>>() {
            });
        } catch (Exception e) {
            logger.error("Exception Message getting sources for sync: " + e.getMessage(), e);
        }
        return sources;
    }//TESTED (mostly unchanged from "tested" Beta version - few changes for distribution tested by eye)

    /////////////////////////////////////////////////////////////////////////////////////

    // Share sources to be harvested across all running harvesters

    public static LinkedList<SourcePojo> getDistributedSourceList(LinkedList<SourcePojo> uncheckedSources,
            String sSourceType, boolean bSync) {
        Date now = new Date();
        LinkedList<SourcePojo> nextSetToProcess = new LinkedList<SourcePojo>();

        // Some additional distributed logic
        LinkedList<SourcePojo> putMeBackAtTheStart_distributed = null;

        PropertiesManager pm = new PropertiesManager();
        int nBatchSize = pm.getDistributionBatchSize(bSync);
        Long defaultSearchCycle_ms = pm.getMinimumHarvestTimePerSourceMs();

        // The logic for getting the next set of sources is:
        // 1] Get the oldest <batch size> sources that are:
        // 1.1] In progress and >1 day old (assume the harvester/sync process running them crashed), or
        // 1.2] Not in progress, and either never harvested/synced or harvested/synced longest ago

        for (int nNumSourcesGot = 0; (nNumSourcesGot < nBatchSize) && (!uncheckedSources.isEmpty());) {

            BasicDBObject query = generateNotInProgressClause(now);
            SourcePojo candidate = null;
            synchronized (SourceUtils.class) { // (can be called across multiple threads)
                candidate = uncheckedSources.pop();
            }

            //DEBUG
            //System.out.println(" CANDIDATE=" + candidate.getKey() + " ..." + candidate.getId());         

            if ((null != sSourceType) && !candidate.getExtractType().equalsIgnoreCase(sSourceType)) {
                continue;
            }
            HarvestEnum candidateStatus = null;
            if (null != candidate.getHarvestStatus()) {
                candidateStatus = candidate.getHarvestStatus().getHarvest_status();
            }
            if (bSync && (null == candidateStatus)) { // Don't sync unharvested sources, obviously!
                continue;
            }
            //(DISTRIBUTION LOGIC)

            // Checking whether to respect the searchCycle_secs for distributed sources is a bit more complex
            boolean isDistributed = (null != candidate.getDistributionFactor());
            boolean distributedInProcess = isDistributed && (candidate.reachedMaxDocs() || // (<- only set inside a process)
                    ((null != candidate.getHarvestStatus()) && // (robustness) 
                            (null != candidate.getHarvestStatus().getDistributionTokensFree()) && // (else starting out)
                            !candidate.getDistributionFactor()
                                    .equals(candidate.getHarvestStatus().getDistributionTokensFree())));
            // (use equals() not != here - these are boxed Integers, so != compares references)
            // (else this is the start)
            //(TESTED - local and distributed)
            //(END DISTRIBUTION LOGIC)

            if (((HarvestEnum.success_iteration != candidateStatus) && !distributedInProcess)
                    || ((null != candidate.getSearchCycle_secs()) && (candidate.getSearchCycle_secs() < 0))) {
                // (ie EITHER we're not mid-iteration OR the source has been manually disabled
                //  - don't respect iteration status if the source is disabled)

                if ((null != candidate.getSearchCycle_secs()) || (null != defaultSearchCycle_ms)) {
                    if (null == candidate.getSearchCycle_secs()) {
                        candidate.setSearchCycle_secs((int) (defaultSearchCycle_ms / 1000));
                    }
                    if (candidate.getSearchCycle_secs() < 0) {
                        continue; // negative search cycle => disabled
                    }
                    if ((null != candidate.getHarvestStatus())
                            && (null != candidate.getHarvestStatus().getHarvested())) {
                        //(ie the source has been harvested, and there is a non-default search cycle setting)

                        if ((candidate.getHarvestStatus().getHarvested().getTime()
                                + 1000L * candidate.getSearchCycle_secs()) > now.getTime()) {
                            if ((HarvestEnum.in_progress != candidateStatus) && (null != candidateStatus)
                                    && (null == candidate.getOwnerId())) {
                                //(^^ last test, if it's in_progress then it died recently (or hasn't started) so go ahead and harvest anyway)
                                // (also hacky use of getOwnerId just to see if this is a source override source or not)
                                continue; // (too soon since the last harvest...)
                            } //TESTED (including hacky use of ownerId)
                        }
                    }
                } //TESTED
            }
            //TESTED: manually disabled (ignore), not success_iteration (ignore if outside cycle), success_iteration (always process)

            query.put(SourcePojo._id_, candidate.getId());
            BasicDBObject modifyClause = new BasicDBObject();
            modifyClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_,
                    HarvestEnum.in_progress.toString());
            if (bSync) {
                modifyClause.put(SourceHarvestStatusPojo.sourceQuery_synced_, now);
            } else {
                modifyClause.put(SourceHarvestStatusPojo.sourceQuery_harvested_, now);
            }
            modifyClause.put(SourceHarvestStatusPojo.sourceQuery_lastHarvestedBy_, getHostname());
            BasicDBObject modify = new BasicDBObject(MongoDbManager.set_, modifyClause);

            try {
                BasicDBObject fields = new BasicDBObject(SourcePojo.templateProcessingFlow_, 0);
                BasicDBObject dbo = (BasicDBObject) DbManager.getIngest().getSource().findAndModify(query, fields,
                        null, false, modify, false, false);
                if (null != dbo) {
                    SourcePojo fullSource = SourcePojo.fromDb(dbo, SourcePojo.class,
                            new SourcePojoSubstitutionDbMap());
                    nextSetToProcess.add(fullSource);
                    nNumSourcesGot++;

                    ////////////////////////////////////////////////////////////////////////
                    //
                    // DISTRIBUTION LOGIC:
                    // If distributionFactor set then grab one token and set state back to 
                    // success_iteration, to allow other threads/processes to grab me
                    if ((null != fullSource.getDistributionFactor()) && !bSync) {
                        // Get the current distribution token
                        int distributionToken = 0;
                        boolean bReset = false;
                        if ((null == fullSource.getHarvestStatus())
                                || (null == fullSource.getHarvestStatus().getDistributionTokensFree())) {
                            distributionToken = fullSource.getDistributionFactor();
                            // (also set up some parameters so don't need to worry about null checks later)
                            if (null == fullSource.getHarvestStatus()) {
                                fullSource.setHarvestStatus(new SourceHarvestStatusPojo());
                            }
                            fullSource.getHarvestStatus().setDistributionTokensFree(distributionToken);
                            fullSource.getHarvestStatus().setDistributionTokensComplete(0);
                        } else {
                            distributionToken = fullSource.getHarvestStatus().getDistributionTokensFree();

                            //Check last harvested time to ensure this isn't an old state (reset if so)
                            if ((distributionToken != fullSource.getDistributionFactor())
                                    || (0 != fullSource.getHarvestStatus().getDistributionTokensComplete())) {
                                if (null != fullSource.getHarvestStatus().getRealHarvested()) { // harvested is useless here because it's already been updated
                                    if ((new Date().getTime()
                                            - fullSource.getHarvestStatus().getRealHarvested().getTime()) > _ONEDAY) // (ie older than a day)
                                    {
                                        distributionToken = fullSource.getDistributionFactor(); // ie start again
                                    }
                                }
                            } //TESTED
                        } //(end check for any existing state)               

                        if (distributionToken == fullSource.getDistributionFactor()) {
                            bReset = true; // (first time through, might as well go ahead and reset to ensure all the vars are present)
                        }

                        // If in error then just want to grab all remaining tokens and reset the status
                        if (HarvestEnum.error == fullSource.getHarvestStatus().getHarvest_status()) { // currently an error
                            if (distributionToken != fullSource.getDistributionFactor()) { // In the middle, ie just errored
                                fullSource.setDistributionTokens(new HashSet<Integer>());
                                while (distributionToken > 0) {
                                    distributionToken--;
                                    fullSource.getDistributionTokens().add(distributionToken);
                                }
                                BasicDBObject dummy = new BasicDBObject();
                                bReset = updateHarvestDistributionState_tokenComplete(fullSource, HarvestEnum.error,
                                        dummy, dummy);
                                // (then finish off completion down below)                        
                            }
                        } //TESTED (error mode, 2 cases: complete and incomplete)

                        //DEBUG
                        //System.out.println(" DIST_SOURCE=" + fullSource.getKey() + "/" + fullSource.getDistributionFactor() + ": " + distributionToken + ", " + bReset);

                        //(note we'll see this even if searchCycle is set because the "candidate" var (which still has the old
                        // state) is stuck back at the start of uncheckedSources, so each harvester will see the source >1 time)

                        if (0 != distributionToken) { // (else no available tokens for this cycle)
                            distributionToken--;

                            fullSource.setDistributionTokens(new HashSet<Integer>());
                            fullSource.getDistributionTokens().add(distributionToken);

                            // Remove one of the available tokens (they don't get reset until the source is complete)
                            updateHarvestDistributionState_newToken(fullSource.getId(), distributionToken,
                                    HarvestEnum.success_iteration, bReset);

                            // After this loop is complete, put back at the start of the unchecked list
                            // so another thread can pick up more tokens:
                            if (null == putMeBackAtTheStart_distributed) {
                                putMeBackAtTheStart_distributed = new LinkedList<SourcePojo>();
                            }
                            putMeBackAtTheStart_distributed.add(candidate);

                            // Before adding back to list, set a transient field to ensure it bypasses any search cycle checks
                            // (for in process logic where we won't see the update status from the DB)
                            candidate.setReachedMaxDocs();

                            // Reset full source's status so we know if we started in success/error/success_iteration
                            if (null == candidateStatus) {
                                candidateStatus = HarvestEnum.success;
                            }
                            fullSource.getHarvestStatus().setHarvest_status(candidateStatus);

                        } // (end if available tokens)
                        else { // (don't process, just set back to original status)
                            HarvestEnum harvestStatus = HarvestEnum.success;
                            if (null != fullSource.getHarvestStatus()) {
                                if (null != fullSource.getHarvestStatus().getHarvest_status()) {
                                    harvestStatus = fullSource.getHarvestStatus().getHarvest_status();
                                }
                            }
                            if (bReset) { // resetting back to the full distribution factor
                                distributionToken = fullSource.getDistributionFactor();
                            }
                            updateHarvestDistributionState_newToken(fullSource.getId(), distributionToken,
                                    harvestStatus, bReset);
                            // (bReset can be true in the error case handled above)

                            nextSetToProcess.removeLast();
                            nNumSourcesGot--;
                        } //TESTED                  

                    } //TESTED
                    else if (bSync) {
                        // Not allowed to sync "distributed in progress"
                        if ((null != fullSource.getHarvestStatus())
                                && (null != fullSource.getHarvestStatus().getDistributionTokensFree())) {
                            // (&& not || - with || a null harvest status would NPE on the second check)
                            if (null == fullSource.getHarvestStatus().getHarvest_status()) { // (shouldn't ever happen)
                                fullSource.getHarvestStatus().setHarvest_status(HarvestEnum.success_iteration);
                            }
                            if (!fullSource.getHarvestStatus().getDistributionTokensFree()
                                    .equals(fullSource.getDistributionFactor())) { // (equals() - boxed Integers)
                                updateHarvestDistributionState_newToken(fullSource.getId(),
                                        fullSource.getHarvestStatus().getDistributionTokensFree(),
                                        fullSource.getHarvestStatus().getHarvest_status(), false);
                                nextSetToProcess.removeLast();
                                nNumSourcesGot--;
                            }
                        }
                    } //TESTED
                      //
                      //(end DISTRIBUTION LOGIC)
                      ////////////////////////////////////////////////////////////////////////

                } //(end found source - note could have been gazumped by a different thread in the meantime, and that's fine)
            } catch (Exception e) {

                // Unset the in-progress clause for robustness
                modifyClause = new BasicDBObject();
                modifyClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, HarvestEnum.error.toString());
                modify = new BasicDBObject(MongoDbManager.set_, modifyClause);
                DbManager.getIngest().getSource().update(query, modify);

                // This source has failed somehow, just carry on
                logger.error("Source " + candidate.getKey() + " has errored during distribution " + e.getMessage());
                e.printStackTrace();
            }

        } // (end loop over unchecked sources until we have a full batch)

        // Little bit more distribution logic:
        if (null != putMeBackAtTheStart_distributed) {
            synchronized (SourceUtils.class) { // (can be called across multiple threads)
                for (SourcePojo distSource : putMeBackAtTheStart_distributed) {
                    uncheckedSources.addFirst(distSource);
                }
            }
        } //TESTED

        return nextSetToProcess;
    } //TESTED

    /////////////////////////////////////////////////////////////////////////////////////

    // Sub-utility function used by both the above functions

    private static BasicDBObject generateNotInProgressClause(Date date) {

        //24hrs ago
        Date oldDate = new Date(date.getTime() - _ONEDAY);

        // This query says: if the query isn't in progress [1] (or the harvest object doesn't exist [3,4]) ... or if it is but nothing's happened in 24 hours [2]

        BasicDBObject subclause1 = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvest_status_,
                new BasicDBObject(MongoDbManager.ne_, HarvestEnum.in_progress.toString()));
        BasicDBObject subclause2 = new BasicDBObject();
        subclause2.put(SourceHarvestStatusPojo.sourceQuery_harvested_,
                new BasicDBObject(MongoDbManager.lt_, oldDate));
        // (always check for harvested, don't care if synced isn't happening regularly)
        BasicDBObject subclause3 = new BasicDBObject(SourcePojo.harvest_,
                new BasicDBObject(MongoDbManager.exists_, false));
        BasicDBObject subclause4 = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvest_status_,
                new BasicDBObject(MongoDbManager.exists_, false));

        BasicDBObject clause = new BasicDBObject(MongoDbManager.or_,
                Arrays.asList(subclause1, subclause2, subclause3, subclause4));
        return clause;
    }//TESTED
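    // (For reference, the clause generated above expands to the following - raw field names
    //  per the $where clause in addSearchCycleClause below:
    //    { $or: [ { "harvest.harvest_status": { $ne: "in_progress" } },
    //             { "harvest.harvested":      { $lt: <24hrs ago> } },
    //             { "harvest":                { $exists: false } },
    //             { "harvest.harvest_status": { $exists: false } } ] } )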

    //(NOTE: IF RUN IN CONJUNCTION WITH "ABUSIVE" MAP/REDUCE WILL CAUSE DB HANG)
    private static void addSearchCycleClause(BasicDBObject currQuery, Date now) {
        BasicDBObject subclause1 = new BasicDBObject(SourcePojo.searchCycle_secs_,
                new BasicDBObject(MongoDbManager.exists_, false));
        StringBuffer js = new StringBuffer();
        js.append(
                "(null == this.harvest) || ('success_iteration'== this.harvest.harvest_status) || (null == this.harvest.harvested) || (null == this.searchCycle_secs) || ((this.searchCycle_secs >= 0) && ((this.harvest.harvested.getTime() + 1000*this.searchCycle_secs) <= ");
        js.append(now.getTime());
        js.append("))");
        BasicDBObject subclause2 = new BasicDBObject(MongoDbManager.where_, js.toString());
        currQuery.append(MongoDbManager.or_, Arrays.asList(subclause1, subclause2));
    }//TESTED (by hand/eye)
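    // (The $where subclause above runs server-side JavaScript against every candidate document; in older
    //  MongoDB versions JS execution is serialized on a per-server lock, which is presumably why running
    //  it alongside heavy map/reduce jobs can hang the DB, per the warning above)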

    public static void checkSourcesHaveHashes(String sCommunityOverride, String sSourceDebug) {

        BasicDBObject query = new BasicDBObject(SourcePojo.shah256Hash_,
                new BasicDBObject(MongoDbManager.exists_, false));
        if (null != sCommunityOverride) {
            query.put(SourcePojo.communityIds_, new ObjectId(sCommunityOverride));
        }
        if (null != sSourceDebug) {
            try {
                query.put(SourcePojo._id_, new ObjectId(sSourceDebug));
            } catch (Exception e) { // Allow key also
                query.put(SourcePojo.key_, sSourceDebug);
            }
        }
        DBCursor dbc = DbManager.getIngest().getSource().find(query);

        int nSrcFixCount = 0;
        while (dbc.hasNext()) {
            SourcePojo src = SourcePojo.fromDb(dbc.next(), SourcePojo.class);
            nSrcFixCount++;
            src.generateShah256Hash();
            DbManager.getIngest().getSource().update(new BasicDBObject(SourcePojo._id_, src.getId()),
                    new BasicDBObject(MongoDbManager.set_,
                            new BasicDBObject(SourcePojo.shah256Hash_, src.getShah256Hash())));
        }
        if (nSrcFixCount > 0) {
            logger.info("Core.Server: Fixed " + nSrcFixCount + " missing source hash(es)");
        }
    }//TESTED (by hand/eye)

    ////////////////////////////////////////////////////////////////////////////////////////////

    // Synchronization specific utilities

    // Updates "in_progress" to either "success" or "error"

    public static void updateSyncStatus(SourcePojo source, HarvestEnum harvestStatus) {
        BasicDBObject query = new BasicDBObject(SourcePojo._id_, source.getId());
        BasicDBObject update = new BasicDBObject(MongoDbManager.set_,
                new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvest_status_, harvestStatus.toString()));
        DbManager.getIngest().getSource().update(query, update);
    }

    ////////////////////////////////////////////////////////////////////////////////////////////

    // Harvest specific source utilities

    // Updates "in_progress" to either "success" or "error", increments the doccount (per source and per community)

    public static void updateHarvestStatus(SourcePojo source, HarvestEnum harvestStatus, List<DocumentPojo> added,
            long nDocsDeleted, String extraMessage) {
        // Handle successful harvests that hit the max docs limit: don't respect the searchCycle, keep iterating
        if ((harvestStatus == HarvestEnum.success) && (source.reachedMaxDocs())) {
            harvestStatus = HarvestEnum.success_iteration;
        }
        // Always update status object in order to release the "in_progress" lock
        // (make really really sure we don't exception out before doing this!)

        BasicDBObject query = new BasicDBObject(SourcePojo._id_, source.getId());
        BasicDBObject setClause = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvest_status_,
                harvestStatus.toString());
        if ((null != added) && !added.isEmpty()) {
            setClause.put(SourceHarvestStatusPojo.sourceQuery_extracted_, new Date());
        }
        if (null != extraMessage) {
            if ((null == source.getHarvestStatus()) || (null == source.getHarvestStatus().getHarvest_message())) {
                setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_message_, extraMessage);
            } else {
                source.getHarvestStatus()
                        .setHarvest_message(source.getHarvestStatus().getHarvest_message() + "\n" + extraMessage);
                setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_message_,
                        source.getHarvestStatus().getHarvest_message());
            }
        }
        BasicDBObject update = new BasicDBObject(MongoDbManager.set_, setClause);

        int docsAdded = 0;
        if (null != added) {
            docsAdded = added.size();
        }
        BasicDBObject incClause = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_doccount_,
                docsAdded - nDocsDeleted);
        update.put(MongoDbManager.inc_, incClause);

        // Special case, if searchCycle_secs == 0 and not success_iteration, then suspend:
        if ((harvestStatus != HarvestEnum.success_iteration) && (null != source.getSearchCycle_secs())
                && (0 == source.getSearchCycle_secs())) {
            setClause.put(SourcePojo.searchCycle_secs_, -1);
        }

        if (null != source.getDistributionTokens()) { // Distribution logic (specified and also enabled - eg ignore Feed/DB)
            updateHarvestDistributionState_tokenComplete(source, harvestStatus, incClause, setClause);
        }
        if (setClause.isEmpty()) { // (ie got removed by the distribution logic above)
            update.remove(MongoDbManager.set_);
        } //TESTED

        long nTotalDocsAfterInsert = 0;
        BasicDBObject fieldsToReturn = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
        BasicDBObject updatedSource = (BasicDBObject) DbManager.getIngest().getSource().findAndModify(query,
                fieldsToReturn, null, false, update, true, false);
        BasicDBObject harvestStatusObj = (BasicDBObject) updatedSource.get(SourcePojo.harvest_);
        if (null != harvestStatusObj) {
            Object docCount = harvestStatusObj.get(SourceHarvestStatusPojo.doccount_);
            if (null != docCount) { // (getLong() would NPE if the field were missing, hence the raw get)
                nTotalDocsAfterInsert = ((Number) docCount).longValue();
            }
        }
        //TESTED

        // Prune documents if necessary
        if ((null != source.getMaxDocs()) && (nTotalDocsAfterInsert > source.getMaxDocs())) {
            long nToPrune = (nTotalDocsAfterInsert - source.getMaxDocs());
            SourceUtils.pruneSource(source, (int) nToPrune, -1);
            nDocsDeleted += nToPrune;

            // And update to reflect that it now has max docs...
            BasicDBObject update2_1 = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_doccount_,
                    source.getMaxDocs());
            BasicDBObject update2 = new BasicDBObject(DbManager.set_, update2_1);
            DbManager.getIngest().getSource().update(query, update2);
        }
        //TESTED      

        if ((null != source.getTimeToLive_days())) {
            nDocsDeleted += SourceUtils.pruneSource(source, Integer.MAX_VALUE, source.getTimeToLive_days());
        } //TODO: TOTEST

        // (OK now the only thing we really had to do is complete, add some handy metadata)

        // Also update the document count table in doc_metadata:
        if (docsAdded > 0) {
            if (1 == source.getCommunityIds().size()) { // (simple/usual case, just 1 community)
                query = new BasicDBObject(DocCountPojo._id_, source.getCommunityIds().iterator().next());
                update = new BasicDBObject(MongoDbManager.inc_,
                        new BasicDBObject(DocCountPojo.doccount_, docsAdded - nDocsDeleted));
                if ((docsAdded != 0) || (nDocsDeleted != 0)) {
                    update.put(DbManager.set_, new BasicDBObject(DocCountPojo.extracted_, new Date()));
                }
                DbManager.getDocument().getCounts().update(query, update, true, false);
            } else if (!source.getCommunityIds().isEmpty()) { // Complex case since docs can belong to diff communities (but they're usually somewhat grouped)
                Map<ObjectId, Integer> communityMap = new HashMap<ObjectId, Integer>();
                for (DocumentPojo doc : added) {
                    ObjectId communityId = doc.getCommunityId();
                    Integer count = communityMap.get(communityId);
                    communityMap.put(communityId, (count == null ? 1 : count + 1));
                } //end loop over added documents (updating the separate community counts)
                long nDocsDeleted_byCommunity = nDocsDeleted / source.getCommunityIds().size();
                // (can't do better than assume a uniform distribution - the whole thing gets recalculated weekly anyway...)

                for (Map.Entry<ObjectId, Integer> communityInfo : communityMap.entrySet()) {
                    query = new BasicDBObject(DocCountPojo._id_, communityInfo.getKey());
                    update = new BasicDBObject(MongoDbManager.inc_, new BasicDBObject(DocCountPojo.doccount_,
                            communityInfo.getValue() - nDocsDeleted_byCommunity));
                    if ((communityInfo.getValue() != 0) || (nDocsDeleted_byCommunity != 0)) {
                        update.put(DbManager.set_, new BasicDBObject(DocCountPojo.extracted_, new Date()));
                    }
                    DbManager.getDocument().getCounts().update(query, update, true, false);
                    // (true for upsert, false for multi add)
                }
            } //(never called in practice - tested up until 5/2/2014)
        }
    }//TESTED (actually, except for multi community sources, which can't happen at the moment anyway)
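    // (For reference, the main update applied above has the shape - field names inferred from the pojo
    //  constants used, eg "harvest.doccount" for sourceQuery_doccount_:
    //    { $set: { "harvest.harvest_status": <status>, ... }, $inc: { "harvest.doccount": <added - deleted> } } )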

    ////////////////////////////////////////////////////////////////////////////////////////////

    // Maps string type in source pojo to enum

    public static int getHarvestType(SourcePojo source) {
        if (source.getExtractType().equalsIgnoreCase("database")) {
            return InfiniteEnums.DATABASE;
        } else if (source.getExtractType().equalsIgnoreCase("file")) {
            return InfiniteEnums.FILES;
        } else {
            return InfiniteEnums.FEEDS;
        }
    }//TESTED

    ////////////////////////////////////////////////////////////////////////////////////////////

    /**
     * Changes all sources' harvestBadSource flags to false so they will be attempted again on
     * the next harvest cycle.
     * 
     * NOTE: If multiple harvesters are called with the reset flag they will all
     * reset the flag back to false for all sources (harmless, just redundant)
     * 
     */
    public static void resetBadSources() {
        try {
            BasicDBObject query = new BasicDBObject();
            query.put(MongoDbManager.set_, new BasicDBObject(SourcePojo.harvestBadSource_, false));
            DbManager.getIngest().getSource().update(new BasicDBObject(), query, false, true);
        } catch (Exception e) {
            logger.error("Exception Message reseting feeds badsource flag: " + e.getMessage(), e);
        }
    }//TESTED (unchanged from working Beta version)

    /////////////////////////////////////////////////////////////////////////////////////

    // Prune sources with max doc settings

    private static int pruneSource(SourcePojo source, int nToPrune, int ttl_days) {
        int nTotalDocsDeleted = 0;
        int nDocsDeleted = 0;

        // (code taken mostly from SourceHandler.deleteSource)
        if (null != source.getKey()) { // or may delete everything!
            BasicDBObject docQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                    source.getDistributedKeyQueryTerm());
            if (ttl_days > 0) {
                Date ageOut = new Date(new Date().getTime() - ttl_days * 24L * 3600L * 1000L);
                ObjectId oldestAllowedId = new ObjectId(ageOut);
                docQuery.put(DocumentPojo._id_, new BasicDBObject(DbManager.lt_, oldestAllowedId));
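                // (ObjectIds embed a 4-byte creation timestamp, so comparing _id against an ObjectId
                //  built from the ageOut date selects documents created before that date)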
            } //TODO: TOTEST
            docQuery.put(DocumentPojo.index_, new BasicDBObject(DbManager.ne_, "?DEL?")); // (robustness)
            BasicDBObject sortField = new BasicDBObject(DocumentPojo._id_, 1);
            BasicDBObject docFields = new BasicDBObject();
            docFields.append(DocumentPojo.url_, 1);
            docFields.append(DocumentPojo.sourceUrl_, 1);
            docFields.append(DocumentPojo.index_, 1);
            docFields.append(DocumentPojo.sourceKey_, 1);

            StoreAndIndexManager dataStore = new StoreAndIndexManager();
            ObjectId nextId = null;
            while (nToPrune > 0) {
                int nToDelete = nToPrune;
                if (nToDelete > 10000) {
                    nToDelete = 10000;
                }
                if (null != nextId) {
                    BasicDBObject idClause = (BasicDBObject) docQuery.get(DocumentPojo._id_);
                    if (null != idClause) { // (TTL case: add the paging bound without overwriting the $lt ageOut bound)
                        idClause.put(DbManager.gt_, nextId);
                    } else {
                        docQuery.put(DocumentPojo._id_, new BasicDBObject(DbManager.gt_, nextId));
                    }
                } //TESTED (by hand)

                DBCursor dbc = DbManager.getDocument().getMetadata().find(docQuery, docFields).sort(sortField)
                        .limit(nToDelete);
                // (ie batches of 10K, ascending ordered by _id)

                nToPrune -= nToDelete;
                if (0 == nDocsDeleted) {
                    nDocsDeleted = dbc.count();
                    nTotalDocsDeleted += nDocsDeleted;
                }
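                // (note: dbc.count() ignores the limit, so on this first pass it records the total number
                //  of documents matching the query, not just this batch)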
                if (0 == dbc.size()) {
                    break;
                }
                List<DocumentPojo> docs = DocumentPojo.listFromDb(dbc, DocumentPojo.listType());

                nextId = dataStore.removeFromDatastore_byURL(docs, source);
            }
        }
        // No need to do anything related to soft deletion, this is all handled when the harvest ends 
        return nTotalDocsDeleted;
    }//TESTED

    //////////////////////////////////////////////////////

    // Utility to get harvest name for display purposes

    private static String _harvestHostname = null;

    private static String getHostname() {
        // (just get the hostname once)
        if (null == _harvestHostname) {
            try {
                _harvestHostname = InetAddress.getLocalHost().getHostName();
            } catch (Exception e) {
                _harvestHostname = "UNKNOWN";
            }
        }
        return _harvestHostname;
    }//TESTED

    ////////////////////////////////////////////////////////////////////////////////////////////
    ////////////////////////////////////////////////////////////////////////////////////////////

    // DISTRIBUTION UTILITIES

    //
    // Update the distribution state BEFORE the source is processed
    // (note: can set in here because currently the status is in_progress, so no other threads can touch it)
    //   

    private static void updateHarvestDistributionState_newToken(ObjectId sourceId, int distributionTokensFree,
            HarvestEnum harvestStatus, boolean bResetOldState) {
        BasicDBObject query = new BasicDBObject(SourcePojo._id_, sourceId);
        BasicDBObject setClause = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_distributionTokensFree_,
                distributionTokensFree);
        if (bResetOldState) {
            setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_, 0);
            setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, false);
        } //TESTED
        setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, harvestStatus.toString());
        BasicDBObject update = new BasicDBObject(MongoDbManager.set_, setClause);
        MongoDbManager.getIngest().getSource().update(query, update, false, false);

        //DEBUG
        //System.out.println(" NEW_TOKEN=" + query.toString() + " / " + update.toString());

    }//TESTED

    //
    // Update the distribution state AFTER the source is processed
    // (note: can set here if the source is complete, because that means no other thread can have control)
    // returns true if harvest is complete
    //   
    // NOTE this isn't called if an error occurs during the ingest cycle (which is where almost all the errors are called)
    // as a result, the source will linger with incomplete/unavailable tokens until it's seen by the getDistributedSourceList
    // again - normally this will be quick because the sources keep getting put back on the uncheckedList
    //

    private static boolean updateHarvestDistributionState_tokenComplete(SourcePojo source,
            HarvestEnum harvestStatus, BasicDBObject incClause, BasicDBObject setClause) {

        // Update tokens complete, and retrieve modified version 
        int nTokensToBeCleared = source.getDistributionTokens().size();
        BasicDBObject query = new BasicDBObject(SourcePojo._id_, source.getId());
        BasicDBObject modify = new BasicDBObject(MongoDbManager.inc_, new BasicDBObject(
                SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_, nTokensToBeCleared));
        BasicDBObject fields = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_,
                1);
        fields.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, 1);
        fields.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, 1);
        BasicDBObject partial = (BasicDBObject) MongoDbManager.getIngest().getSource().findAndModify(query, fields,
                null, false, modify, true, false);
        //(return new version - ensures previous increments have been taken into account)

        // Two cases: source complete (all tokens obtained), source incomplete:

        if (null != partial) { // (else yikes!)
            BasicDBObject partialStatus = (BasicDBObject) partial.get(SourcePojo.harvest_);
            if (null != partialStatus) { // (else yikes!)
                int nTokensComplete = partialStatus.getInt(SourceHarvestStatusPojo.distributionTokensComplete_, 0);
                // (note after increment)

                // COMPLETE: reset parameters, status -> error (if anything has errored), success (all done), success_iteration (more to do)

                if (nTokensComplete == source.getDistributionFactor()) {
                    if (!source.reachedMaxDocs()) { // (Can only do this if we've finished the source...
                        //...else the different threads can be at different points, so the most recent doc for one thread might be
                        // before the most recent doc of another)
                        setClause.put(SourceHarvestStatusPojo.sourceQuery_distributedLastCompletedCycle_,
                                new Date());
                    }

                    setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_, 0);
                    setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionTokensFree_,
                            source.getDistributionFactor());
                    setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, false); // (resetting this)
                    // This source is now complete
                    String status = partialStatus.getString(SourceHarvestStatusPojo.harvest_status_, null);
                    Boolean reachedLimit = partialStatus.getBoolean(
                            SourceHarvestStatusPojo.distributionReachedLimit_, false) || source.reachedMaxDocs();

                    if ((null != status) && ((status.equalsIgnoreCase(HarvestEnum.error.toString())
                            || (HarvestEnum.error == harvestStatus)))) {
                        setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_,
                                HarvestEnum.error.toString());
                    } //TESTED (current and previous state == error)
                    else if (reachedLimit || (HarvestEnum.success_iteration == harvestStatus)) {

                        setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_,
                                HarvestEnum.success_iteration.toString());
                    } //TESTED (from previous or current state)

                    // (else leave with default of success)

                    //DEBUG
                    //System.out.println(Thread.currentThread().getName() + " COMPLETE_SRC COMPLETE_TOKEN=" + source.getKey() + " / " + setClause.toString() + " / " + incClause.toString() + " / " + nTokensComplete);

                    return true;

                } //TESTED
                else { // Not complete

                    // If we're here then we're only allowed to update the status to error
                    if (HarvestEnum.error != harvestStatus) {
                        setClause.remove(SourceHarvestStatusPojo.sourceQuery_harvest_status_);
                    } //TESTED
                    if (source.reachedMaxDocs()) {
                        setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, true);
                    } //TESTED

                    //DEBUG
                    //System.out.println(Thread.currentThread().getName() + " COMPLETE_TOKEN=" + source.getKey() + " / " + setClause.toString() + " / " + incClause.toString() + " / " + nTokensComplete);

                    return false;

                } //(end is complete or not)
                  //TESTED (reached max limit)

            } //(end found partial source status, else catastrophic failure)
        } //(end found partial source, else catastrophic failure)

        return false;

    }//TESTED

}
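
Example

Below is a minimal, hypothetical driver loop sketching how these utilities fit together. It is not part of the original project: the class name HarvestDriverExample, the sleep interval, and the elided per-source harvesting step are illustrative assumptions; only the SourceUtils calls and their signatures come from the source above.

import java.util.LinkedList;

import com.ikanow.infinit.e.core.utils.SourceUtils;
import com.ikanow.infinit.e.data_model.InfiniteEnums.HarvestEnum;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;

public class HarvestDriverExample {
    public static void main(String[] args) throws InterruptedException {
        // Don't start a harvest cycle while a DB synchronization lock is active
        while (SourceUtils.checkDbSyncLock()) {
            Thread.sleep(600000); // wait 10 minutes, then re-check (interval is illustrative)
        }
        // Make sure every source has a hash, then fetch the candidate list
        // (no community override, no specific source, harvest mode, distributed)
        SourceUtils.checkSourcesHaveHashes(null, null);
        LinkedList<SourcePojo> candidates = SourceUtils.getSourcesToWorkOn(null, null, false, true);

        // Claim a batch for this node - this marks each claimed source "in_progress" in the DB
        LinkedList<SourcePojo> batch = SourceUtils.getDistributedSourceList(candidates, null, false);
        for (SourcePojo source : batch) {
            HarvestEnum result = HarvestEnum.success;
            try {
                // ... harvest the source here, collecting any added documents ...
            } catch (Exception e) {
                result = HarvestEnum.error;
            }
            // Release the "in_progress" lock and update doc counts
            // (null/0 because this sketch adds and deletes no documents)
            SourceUtils.updateHarvestStatus(source, result, null, 0, null);
        }
    }
}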