com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.java Source code

Introduction

Here is the source code for com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.java

Source

/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project.
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package com.ikanow.infinit.e.api.knowledge.processing;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;

import org.apache.log4j.Logger;
import org.bson.BSONCallback;
import org.bson.types.ObjectId;

import com.ikanow.infinit.e.api.knowledge.QueryHandler;
import com.ikanow.infinit.e.api.knowledge.aliases.AliasLookupTable;
import com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils_Associations.StandaloneEventHashAggregator;
import com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils_MultiCommunity.Community_EntityExtensions;
import com.ikanow.infinit.e.data_model.api.knowledge.AdvancedQueryPojo;
import com.ikanow.infinit.e.data_model.api.knowledge.DocumentPojoApiMap;
import com.ikanow.infinit.e.data_model.api.knowledge.GeoAggregationPojo;
import com.ikanow.infinit.e.data_model.api.knowledge.StatisticsPojo;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.document.AssociationPojo;
import com.ikanow.infinit.e.data_model.store.document.DocCountPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.ikanow.infinit.e.data_model.store.document.GeoPojo;
import com.ikanow.infinit.e.data_model.store.feature.entity.EntityFeaturePojo;
import com.ikanow.infinit.e.data_model.utils.GeoOntologyMapping;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCallback;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBDecoder;
import com.mongodb.DBDecoderFactory;
import com.mongodb.DBObject;
import com.mongodb.DefaultDBCallback;

public class ScoringUtils {
    private static final Logger logger = Logger.getLogger(ScoringUtils.class);

    private AliasLookupTable _s1_aliasLookup = null;

    public void setAliasLookupTable(AliasLookupTable aliasLookup) {
        _s1_aliasLookup = aliasLookup;
    }

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////   
    // 
    // OPTIMIZED FULL TFIDF CALCULATIONS   

    // Classes required by the calculation

    static public class TempDocBucket implements Comparable<TempDocBucket> { // (only needs to be public because of test code)      
        public double docLength = 0; // (number of entities in document, taking frequency into account)
        public long nLeftToProcess = 0; // (state variable used to determine when a feed's score can be calc'd)
        // (after it's been used for that, I steal it to be used as pub-date/10-minutes)
        public BasicDBObject dbo; // (doc object from Mongo)
        public double totalScore = 0.0; // (combined sig/rel)
        public double aggSignificance = 0.0; // (sum of sigs of all entities)
        public double luceneScore = 0.0; // (score from Lucene)
        public double geoTemporalDecay = 1.0; // (decay based on time and location and query params)
        public boolean bPromoted = false;
        public int nLuceneIndex = -1; // (index in the sorted lucene reply)
        public double manualWeighting = 1.0; // (source-specific weighting)

        // Deduplication-specific code ... create a simple linked list 
        public int nTieBreaker; // ensures that elements will tend to get put in at the end of the list, which should improve performance 
        public String url = null;
        public TempDocBucket dupList = null; // (linked list starting at the "master" document)
        public int nEntsInDoc = 0; // (performance shortcut for comparing 2 potentially duplicate docs)

        // Store explain object (rarely needed) so won't incur map cost across all docs
        public Object explain;

        // Deduplication and ordering:
        @Override
        public int compareTo(TempDocBucket rhs) {
            boolean bCloseEnoughToCompare = false;

            double diff = this.totalScore - rhs.totalScore;

            if (-1 != nLuceneIndex) { // ie sorting by date
                if (this.nEntsInDoc == rhs.nEntsInDoc) { // (don't bother comparing unless they have the same number of entities)
                    if (0 == this.nLeftToProcess) {
                        try {
                            this.nLeftToProcess = ((Date) dbo.get(DocumentPojo.publishedDate_)).getTime() / 600000; // (down to 10 minutes==10*60*1000)
                        } catch (Exception e) {
                            this.nLeftToProcess = -1; // no date. don't try again
                        }
                    }
                    if (0 == rhs.nLeftToProcess) {
                        try {
                            rhs.nLeftToProcess = ((Date) rhs.dbo.get(DocumentPojo.publishedDate_)).getTime()
                                    / 600000; // (down to 10 minutes==10*60*1000)
                        } catch (Exception e) {
                            rhs.nLeftToProcess = -1; // no date. don't try again
                        }
                    }
                    if (rhs.nLeftToProcess == this.nLeftToProcess) { // (both now contain the pub date in 10-minute buckets...)
                        bCloseEnoughToCompare = true;
                    }
                }
            } else { // normal score based sorting:
                bCloseEnoughToCompare = (Math.abs(diff) <= 1.0) && (this.nEntsInDoc == rhs.nEntsInDoc);
            }
            //TESTED (both sort types - by date and by score)

            if (bCloseEnoughToCompare) {

                // Get the url (/hash code, since that is what then gets saved) and compare them
                if (null == this.url) {
                    this.url = dbo.getString(DocumentPojo.url_);
                }
                if (null == rhs.url) {
                    rhs.url = rhs.dbo.getString(DocumentPojo.url_);
                }
                if (ScoringUtils_MultiCommunity.community_areDuplicates(this, rhs)) {

                    this.dupList = rhs.dupList;
                    rhs.dupList = this; // (add to very simple linked list)
                    return 0;
                } else if (0.0 == diff) {
                    return this.nTieBreaker - rhs.nTieBreaker;
                } else
                    return Double.compare(this.totalScore, rhs.totalScore);
            } else if (0.0 == diff) {
                return this.nTieBreaker - rhs.nTieBreaker;
            } else
                return Double.compare(this.totalScore, rhs.totalScore);
        }//TESTED (see TestCode#1)
    };
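
    // (Editor's sketch, not part of the original class: because compareTo() returns 0 when two
    //  buckets are judged duplicates - after chaining the loser onto the master's dupList -
    //  inserting TempDocBuckets into a java.util.TreeSet sorts by score and deduplicates in a
    //  single pass. Assuming a hypothetical "candidates" list, usage looks like:
    //
    //      TreeSet<TempDocBucket> pq = new TreeSet<TempDocBucket>();
    //      int tieBreaker = 0;
    //      for (TempDocBucket tdb : candidates) {
    //          tdb.nTieBreaker = ++tieBreaker; // distinct docs with equal scores don't collide
    //          pq.add(tdb); // add() returns false for duplicates, which survive via dupList
    //      }
    //
    //  This is how _s3_pqDocs is used in stage3_calculateTFTerms() below.)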

    static class TempEntityInDocBucket {
        public double freq = 0.0; // (freq of entity in document, double for MongoDB reasons)
        public BasicDBObject dbo; // (entity object from Mongo)
        public TempDocBucket doc; // (parent document)
    };

    static class EntSigHolder implements Comparable<EntSigHolder> {

        EntSigHolder(String index, long nTotalDocCount, ScoringUtils_MultiCommunity multiCommunityHandler) {
            this.index = index; // (used for aliasing only)
            this.nTotalDocCount = nTotalDocCount;
            if (null != multiCommunityHandler) {
                multiCommunityHandler.initializeEntity(this);
            }
        }

        public String index = null; // (only used for aliasing)
        // (ALSO USED FOR ALIASES)

        public long nDocCountInQuerySubset = 0; // total number of matching docs in retrieved data
        public double datasetSignificance = 0.0; // calculated weighted avg of doc significances (ie TF*standalone)      
        public double standaloneSignificance = 0.0; // the IDF term of the significance
        public double queryCoverage = 0.0; // the % of documents in the query subset in which the entity occurs
        public double avgFreqOverQuerySubset = 0.0; // the average freq over all documents (not just those in which the entity occurs)
        // (ALSO USED FOR ALIASES)  (ALL FIVE)

        //Totals - needed since we don't have the "ent" object any more
        public long nTotalDocCount = 0; // document count in population
        // (ALSO USED FOR ALIASES)

        // To approximate avg significance:
        public double decayedDocCountInQuerySubset = 0.0; // sigma(doc-count-in-query-subset * geo-temporal decay)
        // (ALSO USED FOR ALIASES)

        // Some more attempts to avoid going through the DB cursor more than once
        List<TempEntityInDocBucket> entityInstances = new LinkedList<TempEntityInDocBucket>();

        // For entity aggregation:

        public BasicDBObject unusedDbo = null;
        public double maxDocSig = 0.0;
        // (ALSO USED FOR ALIASES) (BOTH)
        public double maxFreq = 0.0;

        public long nTotalSentimentValues = 0;
        public double positiveSentiment = 0.0;
        public double negativeSentiment = 0.0;
        // (ALSO USED FOR ALIASES) (ALL THREE)

        @Override
        public int compareTo(EntSigHolder rhs) {
            return Double.compare(datasetSignificance, rhs.datasetSignificance);
        }

        // New code to handle significance approximation for pan-community queries
        // (see "additional functionality #1")
        Community_EntityExtensions community;

        // For aliasing:
        public EntSigHolder masterAliasSH = null;
        public EntityFeaturePojo aliasInfo = null;
        // (ALSO USED FOR ALIASES) (BOTH)

        // For low accuracy geo
        public BasicDBObject geotaggedEntity = null; // (store the entire ent object so we don't need to pay the deserialization cost unless it's promoted...)
        public BasicDBObject geotag = null; // (need both of these for onto type + geotag)
    };

    // Top level state ("s0" for "stage 0") 

    // (Some processing controls)
    long _s0_nQuerySetDocCount; // (however many were actually found in the Lucene indexes, NOTE not how many are retrieved from DB)
    int _s0_nQuerySubsetDocCount; // (eg 1000 docs, user limit - ie how many are retrieved from the DB)
    boolean _s0_bNeedToCalcSig; // (whether this function needs to calc sig - eg if only being used for standalone events)
    double _s0_globalDocCount;

    double _s0_maxLuceneScoreInv; // (unused)
    double _s0_avgLuceneScoreInv; // (used to adjust aggregates' statistics)
    double _s0_avgLuceneScore;

    // (Some output controls)
    boolean _s0_sortingByDate = false;
    int _s0_nNumEntsReturn;
    boolean _s0_bNonGeoEnts;
    boolean _s0_bGeoEnts;
    boolean _s0_bEvents;
    boolean _s0_bFacts;
    boolean _s0_bSummaries;
    boolean _s0_bMetadata;

    // Type/Verb filtering:
    HashSet<String> _s0_entityTypeFilter = null;
    boolean _s0_bEntityTypeFilterPositive = true;
    HashSet<String> _s0_assocVerbFilter = null;
    boolean _s0_bAssocVerbFilterPositive = true;

    ScoringUtils_MultiCommunity _s0_multiCommunityHandler = null;
    // (handles approximating significance from multiple communities with various overlaps)
    StandaloneEventHashAggregator _s0_standaloneEventAggregator = null;
    // (handles event scoring)
    StandaloneEventHashAggregator _s0_lowAccuracyAssociationAggregator_events = null;
    StandaloneEventHashAggregator _s0_lowAccuracyAssociationAggregator_facts = null;
    // (workarounds for clusters where the Lucene indexes are too large to do faceting)

    // TF-params: original suggested values are (0.5, 1.5)
    private static final double TF_PARAM1 = 0.5;
    // I think this ranks docs with many entities up too high:
    // (FYI with (0.5,1.5): for freq==1, doc length==average, then tf term=0.333 (f==2=>0.5); doc length==av*2 => tf=0.222, (f==2=>0.364))
    //private static final double TF_PARAM2 = 1.5;
    // The following value has the property that there's a break-even point at ~3x the average number of entities
    private static final double TF_PARAM2 = 5.5;
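
    // (Editor's note, reconstructing the formula implied by the numbers above: the TF term is the
    //  BM25-style expression
    //      tf = freq / (freq + TF_PARAM1 + TF_PARAM2*(docLength/avgDocLength))
    //  eg with (0.5, 1.5): freq==1, docLength==avgDocLength gives 1/(1 + 0.5 + 1.5) == 0.333, and
    //  freq==2 gives 2/(2 + 0.5 + 1.5) == 0.5, matching the figures quoted above. A minimal
    //  helper for experimenting with the parameters - not part of the original class:)
    private static double exampleTfTerm(double freq, double relativeDocLength) {
        return freq / (freq + TF_PARAM1 + TF_PARAM2 * relativeDocLength);
    }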

    // Some support for low accuracy geo:
    LinkedList<EntSigHolder>[] _s3_geoBuckets = null;
    boolean _s3_bLowAccuracyGeo = false;
    boolean _s3_bExtraAliasGeo = false;
    private static final int _s3_nGEO_BUCKETS = 100;
    private static final int _s3_nGEO_BUCKETS_1 = 99;
    private static final double _s3_dGEO_BUCKETS = 100.0;
    double _s2_maxGeoQueryCoverage = 0.0;

    public void clearAsMuchMemoryAsPossible() {
        _s0_entityTypeFilter = null;
        _s0_assocVerbFilter = null;
        _s0_multiCommunityHandler = null;
        _s0_standaloneEventAggregator = null;
        _s0_lowAccuracyAssociationAggregator_events = null;
        _s0_lowAccuracyAssociationAggregator_facts = null;
        _s3_geoBuckets = null;
        _s1_dManualGeoDecay_latLonInvdecay = null;
        // (Deliberately not clearing _s1_entitiesInDataset - it's still needed, eg by fillInEntityStatistics)
        _s1_noEntityBuckets = null;
        _s1_aliasSummary = null;
        _s3_pqDocs = null;
        _s3_pqEnt = null;
    }

    // Top level logic

    @SuppressWarnings("unchecked")
    public List<BasicDBObject> calcTFIDFAndFilter(DBCollection docsDb, DBCursor docs,
            AdvancedQueryPojo.QueryScorePojo scoreParams, AdvancedQueryPojo.QueryOutputPojo outParams,
            StatisticsPojo scores, boolean bLowAccuracyDecay, long nStart, long nToClientLimit,
            String[] communityIds, String[] entityTypeFilterStrings, String[] assocVerbFilterStrings,
            LinkedList<BasicDBObject> standaloneEventsReturn, LinkedList<BasicDBObject> lowAccuracyAggregatedEnts,
            AggregationUtils.GeoContainer lowAccuracyAggregatedGeo,
            AggregationUtils.GeoContainer extraAliasAggregatedGeo,
            LinkedList<BasicDBObject> lowAccuracyAggregatedEvents,
            LinkedList<BasicDBObject> lowAccuracyAggregatedFacts) {
        _s0_multiCommunityHandler = new ScoringUtils_MultiCommunity(communityIds);

        _s0_avgLuceneScore = scores.avgScore;
        _s0_avgLuceneScoreInv = 1.0 / (scores.avgScore + 0.01); // (+0.01 for safety in case avgScore is small)
        _s0_maxLuceneScoreInv = 1.0 / (scores.maxScore + 0.01);

        // Utility classes

        // Quick check - do I need to be here at all?      

        LinkedList<BasicDBObject> returnList = new LinkedList<BasicDBObject>();
        _s0_bNeedToCalcSig = (null != lowAccuracyAggregatedEnts) || (null != lowAccuracyAggregatedEvents)
                || (null != lowAccuracyAggregatedFacts) || (null != lowAccuracyAggregatedGeo)
                || ((nToClientLimit > 0) && outParams.docs.enable);

        if (!_s0_bNeedToCalcSig && (null == standaloneEventsReturn)) {
            return returnList;
        } //TESTED
        else if (!_s0_bNeedToCalcSig) { // (ie and want standaloneEventsReturn)
            if (scoreParams.sigWeight > 0.0) { // (reverse the call, we want sig for the standalone events)
                _s0_bNeedToCalcSig = true;
                nToClientLimit = 0; // (ensure no docs get accidentally output)
            }
        } //TESTED

        // Various configuration and state variables      

        // Entity aggregation code:
        _s0_nNumEntsReturn = 0;
        if (null != lowAccuracyAggregatedEnts) {
            _s0_nNumEntsReturn = outParams.aggregation.entsNumReturn;
        }
        _s1_entitiesInDataset = new HashMap<String, EntSigHolder>();
        _s1_noEntityBuckets = new ArrayList<TempDocBucket>();

        // (User output options)
        _s0_bNonGeoEnts = true;
        _s0_bGeoEnts = true;
        _s0_bEvents = true;
        _s0_bFacts = true;
        _s0_bSummaries = true;
        _s0_bMetadata = true;
        if (null != outParams.docs) {
            if ((null != outParams.docs.metadata) && !outParams.docs.metadata) {
                _s0_bMetadata = false;
            }
            if ((null != outParams.docs.ents) && !outParams.docs.ents) {
                _s0_bNonGeoEnts = false;
                _s0_bGeoEnts = false; // (but can be overridden below)
            }
            if ((null != outParams.docs.geo) && !outParams.docs.geo) {
                _s0_bGeoEnts = false;
            } else if ((null != outParams.docs.geo) && outParams.docs.geo) {
                _s0_bGeoEnts = true;
            }
            if ((null != outParams.docs.events) && !outParams.docs.events) {
                _s0_bEvents = false;
            }
            if ((null != outParams.docs.facts) && !outParams.docs.facts) {
                _s0_bFacts = false;
            }
            if ((null != outParams.docs.summaries) && !outParams.docs.summaries) {
                _s0_bSummaries = false;
            }
        } //TESTED      

        if (null != entityTypeFilterStrings) {

            if ('-' == entityTypeFilterStrings[0].charAt(0)) {
                _s0_bEntityTypeFilterPositive = false;
            }
            //TESTED (in both entities and associations)
            _s0_entityTypeFilter = new HashSet<String>();
            for (String entityType : entityTypeFilterStrings) {
                if (!_s0_bEntityTypeFilterPositive && ('-' == entityType.charAt(0))) {
                    entityType = entityType.substring(1);
                }
                _s0_entityTypeFilter.add(entityType.toLowerCase());
            }
        }
        if (_s0_bEvents || _s0_bFacts || _s0_bSummaries || (null != standaloneEventsReturn)) { // (ie most of the time!)
            if (null != assocVerbFilterStrings) {
                if ('-' == assocVerbFilterStrings[0].charAt(0)) {
                    _s0_bAssocVerbFilterPositive = false;
                }
                //TESTED
                _s0_assocVerbFilter = new HashSet<String>();
                for (String assocVerb : assocVerbFilterStrings) {
                    if (!_s0_bAssocVerbFilterPositive && ('-' == assocVerb.charAt(0))) {
                        assocVerb = assocVerb.substring(1);
                    }
                    _s0_assocVerbFilter.add(assocVerb);
                }
            }
        }
        //TESTED

        if ((scoreParams.relWeight == 0.0) && (scoreParams.sigWeight == 0.0)) {
            _s0_sortingByDate = true;
        }

        // First loop: just count and store      

        if ((null != standaloneEventsReturn) && (null != outParams.docs)
                && (null != outParams.docs.numEventsTimelineReturn)
                && (outParams.docs.numEventsTimelineReturn > 0)) {
            _s0_standaloneEventAggregator = new StandaloneEventHashAggregator(standaloneEventsReturn, false,
                    _s1_aliasLookup);
        }
        if ((null != lowAccuracyAggregatedEvents) && (null != outParams.aggregation)
                && (null != outParams.aggregation.eventsNumReturn) && (outParams.aggregation.eventsNumReturn > 0)) {
            _s0_lowAccuracyAssociationAggregator_events = new StandaloneEventHashAggregator(
                    lowAccuracyAggregatedEvents, true, _s1_aliasLookup);
        }
        if ((null != lowAccuracyAggregatedFacts) && (null != outParams.aggregation)
                && (null != outParams.aggregation.factsNumReturn) && (outParams.aggregation.factsNumReturn > 0)) {
            _s0_lowAccuracyAssociationAggregator_facts = new StandaloneEventHashAggregator(
                    lowAccuracyAggregatedFacts, true, _s1_aliasLookup);
        }
        if ((null != lowAccuracyAggregatedGeo) && (null != outParams.aggregation)
                && (null != outParams.aggregation.geoNumReturn) && (outParams.aggregation.geoNumReturn > 0)) {
            // Initialize the buckets
            _s3_geoBuckets = (LinkedList<EntSigHolder>[]) new LinkedList[_s3_nGEO_BUCKETS];
            _s3_bLowAccuracyGeo = true;
        }
        if ((null != extraAliasAggregatedGeo) && (null != outParams.aggregation)
                && (null != outParams.aggregation.geoNumReturn) && (outParams.aggregation.geoNumReturn > 0)) {
            _s3_bExtraAliasGeo = true;
            // (don't initialize _s3_geoBuckets until we have to)
        }
        if (bLowAccuracyDecay) {
            _s1_dManualGeoDecay_latLonInvdecay = QueryHandler.parseGeoDecay(scoreParams);
        } //TESTED

        _s0_nQuerySubsetDocCount = docs.size(); // eg (1000 docs, user limit)
        _s0_nQuerySetDocCount = scores.found; // however many were actually found

        // Look up the total doc count:
        _s0_globalDocCount = 0;
        long nGlobalDocCount = 0;
        try {
            nGlobalDocCount = getDocCount(_s0_multiCommunityHandler.getCommunityIds());
        } catch (Exception e) {
            // If an exception occurs log the error
            logger.error("Exception Message: " + e.getMessage(), e);
        }
        // (End doccount)

        if (_s0_nQuerySetDocCount > nGlobalDocCount) {
            nGlobalDocCount = _s0_nQuerySetDocCount;
            // (This can happen if the source doc counts get out of sync...
            // ... conversely if the index/db get out of sync, the other way round can be correct, but this way is safer)
        }
        _s0_globalDocCount = (double) nGlobalDocCount;
        stage1_initialCountingLoop(docs, scoreParams, (int) nToClientLimit, scores, standaloneEventsReturn,
                communityIds.length);

        //Exit if not generating documents or entity aggregations:
        if (!_s0_bNeedToCalcSig) {
            return returnList;
        } //TESTED

        // Histogram time:

        this.stage2_generateFreqHistogramCalcIDFs();

        // Next stop: loop over the entities and calculate the IDF terms   

        this.stage3_calculateTFTerms(scoreParams, scores, nStart + nToClientLimit);
        // (get extra docs to handle deduplication)

        // Finally, write all the information to the surviving 100 (or whatever) documents        

        // Handle skipping past the end:
        if ((nStart + nToClientLimit) > _s3_pqDocs.size()) {
            nToClientLimit = _s3_pqDocs.size() - nStart;
            if (nToClientLimit < 0) {
                nToClientLimit = 0;
            }
        }
        this.stage4_prepareDocsForOutput(scoreParams, scores, nToClientLimit, returnList);

        // And then same for entities      

        this.stage4_prepareEntsForOutput(lowAccuracyAggregatedEnts);

        //Association is mostly done on the fly, but a final tidy up:
        if (null != standaloneEventsReturn) {
            ScoringUtils_Associations.finalizeStandaloneEvents(standaloneEventsReturn,
                    _s0_standaloneEventAggregator, outParams.docs.numEventsTimelineReturn);
        }
        if (null != _s0_lowAccuracyAssociationAggregator_events) {
            ScoringUtils_Associations.finalizeStandaloneEvents(lowAccuracyAggregatedEvents,
                    _s0_lowAccuracyAssociationAggregator_events, outParams.aggregation.eventsNumReturn);
        }
        if (null != _s0_lowAccuracyAssociationAggregator_facts) {
            ScoringUtils_Associations.finalizeStandaloneEvents(lowAccuracyAggregatedFacts,
                    _s0_lowAccuracyAssociationAggregator_facts, outParams.aggregation.factsNumReturn);
        }
        // Geo is mostly done on the fly, but a final tidy up:
        if (null != lowAccuracyAggregatedGeo) {
            finalizeLowAccuracyGeoAggregation(lowAccuracyAggregatedGeo, outParams.aggregation.geoNumReturn);
            // (outParams.aggregation.geoNumReturn must exist if (null != lowAccuracyAggregatedGeo)) 
        } else if ((null != extraAliasAggregatedGeo) && (null != _s3_geoBuckets)) {
            finalizeLowAccuracyGeoAggregation(extraAliasAggregatedGeo, Long.MAX_VALUE);
            //(at most 1 per alias so size shouldn't be an issue)
        }
        return returnList;
    }
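
    // (Editor's sketch of a typical call, with hypothetical argument names - nulls disable the
    //  corresponding outputs/aggregations:
    //
    //      ScoringUtils scoreStats = new ScoringUtils();
    //      scoreStats.setAliasLookupTable(aliasTable); // (optional)
    //      List<BasicDBObject> docsOut = scoreStats.calcTFIDFAndFilter(
    //              docsDb, dbCursor, query.score, query.output, luceneScores,
    //              false, 0, 100, communityIdStrs,
    //              null, null,       // (no entity-type/assoc-verb filtering)
    //              null,             // (no standalone events)
    //              aggregatedEnts, aggregatedGeo, null, aggregatedEvents, aggregatedFacts);
    //  )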

    /////////////////////////////////////////////////////////////

    // (Top level logic - entities)

    // Interface to allow other internal services to take advantage of all the work that's gone into this:
    public boolean fillInEntityStatistics(EntityPojo ent) {
        EntSigHolder entStat = _s1_entitiesInDataset.get(ent.getIndex());
        if (null == entStat) {
            return false;
        } else {
            ent.setDoccount(entStat.nTotalDocCount);
            ent.setTotalfrequency(entStat.nTotalDocCount); // (we don't seem to have the total frequency, so use the doc count)

            ent.setDatasetSignificance(entStat.datasetSignificance);
            ent.setSignificance(entStat.datasetSignificance); // (doc significance isn't normalized correctly in most cases)
            ent.setQueryCoverage(entStat.queryCoverage);
            return true;
        }
    }//TESTED
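
    // (Editor's usage sketch, with hypothetical variables: once calcTFIDFAndFilter() has run,
    //  other services can back-fill query-subset statistics onto an entity they hold:
    //
    //      EntityPojo ent = ...; // (entity obtained elsewhere, with its index field set)
    //      if (scoreStats.fillInEntityStatistics(ent)) {
    //          double sig = ent.getSignificance(); // (now populated from this query's dataset)
    //      }
    //
    //  If the entity never occurred in the query subset, the call returns false and the entity
    //  is left untouched.)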

    /////////////////////////////////////////////////////////////

    // (Top level logic - associations)

    public boolean calcAssocationSignificance(String ent1_index, String ent2_index, String geo_index,
            BasicDBObject assoc) {

        if ((null == _s1_entitiesInDataset) || _s1_entitiesInDataset.isEmpty()) {
            return false;
        } else {
            ScoringUtils_Associations.calcAssocationSignificance(ent1_index, ent2_index, geo_index, assoc,
                    _s1_entitiesInDataset);
        }
        return true;
    }

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////

    // SUB-FUNCTIONS

    /////////////////////////////////////////////////////////////

    // 1] stage1_initialCountingLoop()
    // Loops over the data a first time and generates the basic statistics required by the more complex
    // functionality that follows

    // Input:

    double _s1_dManualGeoDecay_latLonInvdecay[] = null;
    // (this is needed if internal Lucene geo decay is turned off for performance reasons)

    // Output:

    double _s1_sumFreqInQuerySubset = 0; // (the sum of all the frequencies in the received matching (sub-)dataset)
    HashMap<String, EntSigHolder> _s1_entitiesInDataset; // (map of entities to various stats)
    ArrayList<TempDocBucket> _s1_noEntityBuckets; // (docs with no entities)
    HashMap<String, EntSigHolder> _s1_aliasSummary = null; // (for aggregating entities by their alias)   

    // Logic:

    @SuppressWarnings("unchecked")
    private void stage1_initialCountingLoop(DBCursor docs, AdvancedQueryPojo.QueryScorePojo scoreParams,
            int toReturn, StatisticsPojo scores, LinkedList<BasicDBObject> standaloneEventsReturn,
            int nCommunities) {
        double s0_nQuerySubsetDocCountInv = 1.0 / (double) _s0_nQuerySubsetDocCount;

        // Some memory management:
        DBCollection dbc = MongoDbManager.getDocument().getMetadata();
        DBDecoderFactory defaultDecoder = dbc.getDBDecoderFactory();

        try {
            SizeReportingBasicBSONDecoder sizeReportingDecoder = new SizeReportingBasicBSONDecoder();
            dbc.setDBDecoderFactory(sizeReportingDecoder);

            long currMemUsage = 0;
            int ndocs = 0;
            long lastBatch = 0L;

            long initialUnusedMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory();
            long initialFreeMemory = Runtime.getRuntime().freeMemory();

            for (DBObject f0 : docs) {
                BasicDBObject f = (BasicDBObject) f0;
                long newMemUsage = sizeReportingDecoder.getSize();
                if ((newMemUsage - currMemUsage) > 0) { // check every batch               
                    long now = new Date().getTime();

                    //DEBUG
                    //logger.warn(ndocs + " : " + (now - lastBatch) + " : " + newMemUsage + " VS " + Runtime.getRuntime().maxMemory() + " UNUSED " + (Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory()) + " FREE " + Runtime.getRuntime().freeMemory());

                    // Check vs total memory:
                    long runtimeMem = Runtime.getRuntime().maxMemory();
                    // note newMemUsage is the input (BSON) memory ... it gets expanded ~6x by the BSON-ification, so allow it at most 1/4 of memory (hence the *24)...
                    // Also if we're taking more than 20s for a batch then limp over the limit and exit...
                    if (((newMemUsage * 24) > runtimeMem)
                            || (((now - lastBatch) > 20000L) && (ndocs >= toReturn))) {
                        long finalUnusedMemory = Runtime.getRuntime().maxMemory()
                                - Runtime.getRuntime().totalMemory();
                        long finalFreeMemory = Runtime.getRuntime().freeMemory();

                        logger.error("Query truncated memUsage=" + newMemUsage + ", memory=" + runtimeMem
                                + ", docs=" + ndocs + ", totaldocs=" + scores.found + ", init_free_mem="
                                + initialFreeMemory + ", end_free_mem=" + finalFreeMemory + ", init_unused_mem="
                                + initialUnusedMemory + ", end_unused_mem=" + finalUnusedMemory);
                        break;
                    } //TESTED
                    currMemUsage = newMemUsage;
                    lastBatch = now;
                } //TESTED
                ndocs++;

                // Simple handling for standalone events
                if ((null != _s0_standaloneEventAggregator) && !_s0_bNeedToCalcSig) {
                    //if _s0_bNeedToCalcSig then do this elsewhere
                    ScoringUtils_Associations.addStandaloneEvents(f, 0.0, 0, _s0_standaloneEventAggregator,
                            _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter,
                            _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts);
                } //TESTED

                if (!_s0_bNeedToCalcSig) {
                    continue;
                } //TESTED

                if (nCommunities > 1) { // (could have pan-community entities)
                    ObjectId communityId = (ObjectId) f.get(DocumentPojo.communityId_);
                    if (null != communityId) { // (we'd have big problems if it were null, but check anyway!)
                        int retval = _s0_multiCommunityHandler.community_getIdAndInitialize(communityId,
                                _s1_entitiesInDataset);
                        // (returns an int community id but also sets it into the cache, so just use that below)
                        if (Integer.MIN_VALUE == retval) {
                            //this document cannot be viewed from within this set of communities
                            continue;
                        }
                    }
                } //TESTED      

                TempDocBucket docBucket = new TempDocBucket();
                docBucket.dbo = f;
                ObjectId id = (ObjectId) f.get(DocumentPojo._id_);

                // If we're going to weight relevance in, or we need the geo temporal decay:
                if ((0 != scoreParams.relWeight) || (null != scoreParams.timeProx)
                        || (null != scoreParams.geoProx)) {
                    StatisticsPojo.Score scoreObj = scores.getScore().get(id);
                    if (null != scoreObj) {
                        docBucket.explain = scoreObj.explain; // (will normally be null)
                        docBucket.luceneScore = scoreObj.score;
                        if ((null != scoreParams.timeProx) || (null != scoreParams.geoProx)) {
                            if (scoreObj.decay >= 0.0) {
                                docBucket.geoTemporalDecay = scoreObj.decay;
                            }
                            // (see also below for low accuracy geo scoring)
                        }
                    } else {
                        docBucket.luceneScore = 1.0;
                    }
                } //TESTED
                else if (this._s0_sortingByDate) {
                    StatisticsPojo.Score scoreObj = scores.getScore().get(id);
                    if (null != scoreObj) {
                        docBucket.nLuceneIndex = scoreObj.nIndex;
                    }
                }
                docBucket.manualWeighting = this.getManualScoreWeights(scoreParams, f);

                BasicDBList l = (BasicDBList) (f.get(DocumentPojo.entities_));
                if (null != l) {

                    long nEntsInDoc = l.size();
                    double dBestGeoScore = 0.0; // (for low accuracy geo only)
                    for (Iterator<?> e0 = l.iterator(); e0.hasNext();) {
                        BasicDBObject e = (BasicDBObject) e0.next();
                        BasicDBObject tmpGeotag = null;
                        if (_s3_bLowAccuracyGeo || (null != _s1_dManualGeoDecay_latLonInvdecay)) {
                            // low accuracy geo, need to look for geotag
                            tmpGeotag = (BasicDBObject) e.get(EntityPojo.geotag_);
                        }

                        // Get attributes

                        double freq = -1.0;
                        long ntotaldoccount = -1;
                        String entity_index;
                        Double sentiment = null;
                        try {
                            sentiment = (Double) e.get(EntityPojo.sentiment_);
                            ntotaldoccount = e.getLong(EntityPojo.doccount_);
                            freq = e.getDouble(EntityPojo.frequency_);
                            entity_index = e.getString(EntityPojo.index_);
                            if (null == entity_index) {
                                // Just bypass the entity 
                                e.put(EntityPojo.significance_, 0.0);
                                nEntsInDoc--;
                                continue;
                            }
                        } catch (Exception ex) {
                            try {
                                String sfreq;
                                if (ntotaldoccount < 0) {
                                    sfreq = e.getString(EntityPojo.doccount_);
                                    ntotaldoccount = Long.valueOf(sfreq);
                                }
                                if (freq < -0.5) {
                                    sfreq = e.getString(EntityPojo.frequency_);
                                    freq = Long.valueOf(sfreq).doubleValue();
                                }
                                entity_index = e.getString(EntityPojo.index_);
                                if (null == entity_index) {
                                    // Just bypass the entity 
                                    e.put(EntityPojo.significance_, 0.0);
                                    nEntsInDoc--;
                                    continue;
                                }
                            } catch (Exception e2) {
                                // Just bypass the entity 
                                e.put(EntityPojo.significance_, 0.0);
                                nEntsInDoc--;
                                continue;
                            }
                        } //TESTED

                        // First loop through is just counting

                        // Retrieve entity (create/initialize if necessary)
                        EntSigHolder shp = _s1_entitiesInDataset.get(entity_index);
                        if (null == shp) {
                            if (ntotaldoccount > (long) _s0_globalDocCount) { // obviously can't have more entities-in-docs than docs...
                                ntotaldoccount = (long) _s0_globalDocCount;
                            }
                            shp = new EntSigHolder(entity_index, ntotaldoccount, _s0_multiCommunityHandler);

                            // Stage 1a alias handling: set up infrastructure, calculate doc overlap
                            if (null != _s1_aliasLookup) {
                                stage1_initAlias(shp);
                            }
                            if ((null != shp.aliasInfo) && (null == shp.masterAliasSH)) { // this is the discard alias
                                nEntsInDoc--;
                                continue;
                            } //TESTED

                            // Check if entity is in type filter list
                            if (null != _s0_entityTypeFilter) {
                                String entType = null;
                                if (null != shp.aliasInfo) {
                                    entType = shp.aliasInfo.getType();
                                } else {
                                    entType = e.getString(EntityPojo.type_);
                                }
                                if (_s0_bEntityTypeFilterPositive) {
                                    if ((null != entType)
                                            && !_s0_entityTypeFilter.contains(entType.toLowerCase())) {
                                        nEntsInDoc--;
                                        continue;
                                    }
                                } else if ((null != entType)
                                        && _s0_entityTypeFilter.contains(entType.toLowerCase())) {
                                    //(negative filter)
                                    nEntsInDoc--;
                                    continue;
                                }

                            } //TESTED (end entity filter)

                            // Geo:
                            if (null != shp.aliasInfo) {
                                if (null != shp.aliasInfo.getGeotag()) { //Geo, overwrite/create tmpGeotag
                                    if (_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo
                                            || (null != _s1_dManualGeoDecay_latLonInvdecay)) {
                                        // Always capture alias geo, even if not in low accuracy mode because we add it to the 
                                        // legitimate geo:
                                        if ((_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo)
                                                && (null == _s3_geoBuckets)) {
                                            // Initialize the buckets if this is for aggregation not just decay
                                            _s3_geoBuckets = (LinkedList<EntSigHolder>[]) new LinkedList[_s3_nGEO_BUCKETS];
                                        }

                                        if (null == tmpGeotag) {
                                            tmpGeotag = new BasicDBObject();
                                        }
                                        tmpGeotag.put(GeoPojo.lat_, shp.aliasInfo.getGeotag().lat);
                                        tmpGeotag.put(GeoPojo.lon_, shp.aliasInfo.getGeotag().lon);

                                        if (null != shp.aliasInfo.getOntology_type()) {
                                            e.put(EntityPojo.ontology_type_, shp.aliasInfo.getOntology_type());
                                        }
                                    }
                                }
                            } //TESTED (end geo for aggregation or decay)

                            _s1_entitiesInDataset.put(entity_index, shp);
                            // end Stage 1a alias handling
                        } //(end if is alias)

                        // Stage 1b alias handling: calculate document counts (taking overlaps into account)
                        if (null != shp.masterAliasSH) {
                            // Counts:
                            shp.masterAliasSH.nTotalDocCount++;
                            // docs including overlaps
                            shp.masterAliasSH.avgFreqOverQuerySubset += freq;

                            // Keep track of overlaps:
                            if (f != shp.masterAliasSH.unusedDbo) {
                                shp.masterAliasSH.unusedDbo = f;
                                // (note this is only used in stage 1, alias.unusedDbo is re-used differently in stage 3/4)
                                shp.masterAliasSH.nDocCountInQuerySubset++;
                                // non-overlapping docs ie < shp.nDocCountInQuerySubset
                            }

                            // Sentiment:
                            shp.masterAliasSH.positiveSentiment += shp.positiveSentiment;
                            shp.masterAliasSH.negativeSentiment += shp.negativeSentiment;
                            if (null != sentiment) {
                                shp.masterAliasSH.nTotalSentimentValues++;
                            }

                        } //TESTED (end if is alias)
                          // end Stage 1b

                        // Pan-community logic (this needs to be before the entity object is updated)
                        if (_s0_multiCommunityHandler.isActive()) {
                            _s0_multiCommunityHandler.community_updateCorrelations(shp, ntotaldoccount,
                                    entity_index);
                        } else { // (Once we've started multi-community logic, this is no longer desirable)
                            if ((ntotaldoccount > shp.nTotalDocCount) && (ntotaldoccount <= _s0_globalDocCount)) {
                                shp.nTotalDocCount = ntotaldoccount;
                            }
                            //(note there used to be some cases where we adjusted for dc/tf==0, but the 
                            // underlying issue in the data model that caused this has been fixed, so it's 
                            // now a pathological case that can be ignored)
                        } //(TESTED)

                        // Update counts:
                        _s1_sumFreqInQuerySubset += freq;
                        shp.avgFreqOverQuerySubset += freq;
                        shp.nDocCountInQuerySubset++;
                        shp.decayedDocCountInQuerySubset += docBucket.geoTemporalDecay;
                        // (note this doesn't handle low accuracy geo-decay ... we'll address that via a separate term)

                        TempEntityInDocBucket entBucket = new TempEntityInDocBucket();
                        entBucket.dbo = e;
                        entBucket.freq = freq;
                        entBucket.doc = docBucket;
                        shp.entityInstances.add(entBucket);
                        if (null != tmpGeotag) { // (only needed for low accuracy geo aggregation)

                            if ((_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo) && (null == shp.geotag)) { // (first time for shp only)
                                shp.geotag = tmpGeotag;
                                shp.geotaggedEntity = e; // (ie for onto type, which has been overwritten in the alias case...)
                            }
                            if (null != _s1_dManualGeoDecay_latLonInvdecay) {
                                // Emulate scripted Lucene calculations
                                double minlat = tmpGeotag.getDouble(GeoPojo.lat_);
                                double minlon = tmpGeotag.getDouble(GeoPojo.lon_);
                                double paramlat = _s1_dManualGeoDecay_latLonInvdecay[0];
                                double paramlon = _s1_dManualGeoDecay_latLonInvdecay[1];
                                double gdecay = _s1_dManualGeoDecay_latLonInvdecay[2];
                                char ontCode = GeoOntologyMapping
                                        .encodeOntologyCode(e.getString(EntityPojo.ontology_type_));
                                double dDecay = QueryDecayScript.getGeoDecay(minlat, minlon, paramlat, paramlon,
                                        gdecay, ontCode);
                                if (dDecay > dBestGeoScore) {
                                    dBestGeoScore = dDecay;
                                }
                            } //TESTED
                        } //(end if entity has geo and need to process entity geo)

                        if (freq > shp.maxFreq) {
                            shp.maxFreq = freq;
                        }
                        // Sentiment:
                        if ((null != sentiment) && (Math.abs(sentiment) <= 1.1)) { // (actually 1.0)
                            shp.nTotalSentimentValues++;
                            if (sentiment > 0.0) {
                                shp.positiveSentiment += sentiment;
                            } else {
                                shp.negativeSentiment += sentiment;
                            }
                        } else if (null != sentiment) { // corrupt sentiment for some reason?!
                            e.put(EntityPojo.sentiment_, null);
                        }
                        docBucket.docLength += freq;

                    } //(end loop over entities)

                    docBucket.nLeftToProcess = nEntsInDoc;
                    docBucket.nEntsInDoc = (int) nEntsInDoc;

                    if (null != this._s1_dManualGeoDecay_latLonInvdecay) { // Low accuracy geo-calculations
                        docBucket.geoTemporalDecay *= dBestGeoScore;
                        docBucket.luceneScore *= dBestGeoScore;
                        _s2_dAvgLowAccuracyGeoDecay += dBestGeoScore * s0_nQuerySubsetDocCountInv;
                    } //TESTED            

                } // (end if feed has entities)

                // Handle documents with no entities - can still promote them
                if (0 == docBucket.nLeftToProcess) { // (use this rather than doc length in case all the entities had freq 0)
                    _s1_noEntityBuckets.add(docBucket);
                }

            } // (end loop over feeds)
              //TESTED
        } finally {
            dbc.setDBDecoderFactory(defaultDecoder);
        }
    }
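
    // (Editor's note: the try/finally above is the standard pattern for instrumenting the raw
    //  BSON volume pulled through a cursor - swap a size-counting decoder into the collection,
    //  and always restore the default factory in a finally block since it is collection-wide
    //  state. A stripped-down sketch of the same pattern:
    //
    //      DBDecoderFactory saved = dbc.getDBDecoderFactory();
    //      try {
    //          dbc.setDBDecoderFactory(sizeReportingDecoder);
    //          for (DBObject dbo : docs) { /*...sizes accumulate as docs are decoded...*/ }
    //      } finally {
    //          dbc.setDBDecoderFactory(saved);
    //      }
    //  )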

    /////////////////////////////////////////////////////////////   

    // 2] stage2_generateFreqHistogramCalcIDFs()
    // Generates a histogram of entity frequencies that can be used to suppress the significance
    // of likely false positives
    // Then calculates the IDFs of each entity (including cross-community scoring adjustments if necessary)

    // Inputs

    double _s2_dAvgLowAccuracyGeoDecay = 0.0; // for low accuracy geo a further approximation...

    // Outputs

    double _s2_dApproxAverageDocumentSig; // Approximate calculated here for convenience, used later on
    int _s2_nMush1Index; // 33% significance frequency (very likely to be false positive)
    int _s2_nMush2Index; // 66% significance frequency (quite likely to be false positive)

    // Logic

    private void stage2_generateFreqHistogramCalcIDFs() {

        final int nMaxHistBins = 25;
        long nCountHistogram[] = new long[nMaxHistBins];

        // Prep histogram
        int nHistBins = 1 + (int) (_s0_nQuerySubsetDocCount / 50); // (eg 21 bins for 1000 documents)
        if (nHistBins > nMaxHistBins) {
            nHistBins = nMaxHistBins;
        }
        //TESTED

        // (Sadly requires 1 spurious loop over the entities, shouldn't add too much extra)
        // Will take the opportunity to calculate the standalone entity significances here

        // OK looking at the IDF equations below, the significance's maximum value occurs when the entity appears only in the query set:
        // log10(globalDocCount*nQuerySetDocCount/0.25) ... so we'll scale that to be 100%
        double dScaleFactor = 100.0 / Math.log10((_s0_globalDocCount * _s0_nQuerySetDocCount + 0.5) / 0.25);
        // (note this isn't quite right anymore because of the adjustments performed below, but it does a reasonable
        //  job and the actual value is now very complicated...)
        double dHalfScaleFactor2 = 0.5 * ((0.5 + (double) _s0_nQuerySetDocCount) / (0.5 + _s0_globalDocCount));

        // Pre-calculated scaling factors to use in the query coverage calculation
        double halfQueryDocSubsetInv = 0.5 / (0.5 + _s0_nQuerySubsetDocCount); // (case 2.1 below - needs to be multiplied by the entity's query count)
        double halfGlobalDocCountInv = 0.5 / (0.5 + _s0_globalDocCount); // (case 2.2 below - needs to be multiplied by the entity's total count)

        _s2_dApproxAverageDocumentSig = 0.0; // (used to normalize vs the relevance)

        // Some TF-related numbers
        // (no longer needed since we calculate the average TF based on an average entity count, for performance reasons) 
        //double invAvgLength = ((double)_s0_nQuerySubsetDocCount/(_s1_sumFreqInQuerySubset + 0.01));

        // Pre-calculate a few divisors used in the loop below:
        double s0_nQuerySubsetDocCountInv = 1.0 / (double) _s0_nQuerySubsetDocCount;
        double s0_nQuerySetDocCountInv = 1.0 / (double) _s0_nQuerySetDocCount;

        for (EntSigHolder shp : _s1_entitiesInDataset.values()) {

            double avgFreqPerEntity = shp.avgFreqOverQuerySubset / shp.nDocCountInQuerySubset;
            // (do this here because can overwrite shp.nDocCountInQuerySubset further below, losing direct link with shp.avgFreq)

            if (shp.nDocCountInQuerySubset < nHistBins) {
                nCountHistogram[(int) shp.nDocCountInQuerySubset]++;
            }

            //(Robustness)
            if (shp.nTotalDocCount < shp.nDocCountInQuerySubset) {
                shp.nTotalDocCount = shp.nDocCountInQuerySubset;
            }
            if (_s0_nQuerySubsetDocCount < shp.nDocCountInQuerySubset) {
                shp.nDocCountInQuerySubset = _s0_nQuerySubsetDocCount;
            }

            // Transform from a ratio involving nQuery*Subset*DocCount to a ratio of nQuery*Set*DocCount
            double estEntityDocCountInQuery = (double) shp.nDocCountInQuerySubset; // (case 1 below)
            // Cases:
            // 1] if "shp.nTotalDocCount <= shp.nDocCountInQuerySubset" OR "_s0_nQuerySetDocCount == _s0_nQuerySubsetDocCount"
            //    then we know that all instances were in nQuery*Set*DocCount (else the available entities is the smaller of the 2 diffs, see below)
            // 2] Otherwise we don't know, but maybe we can guess:
            // 2.1] If the subset-ratio is correct then it will be
            // MIN[nQuerySetDocCount*(shp.nDocCountInQuerySubset/nQuerySubsetDocCount),nDocCountDiff] + shp.nDocCountInQuerySubset
            // 2.2] If it's actually randomly distributed then it will be
            // (nQuerySetDocCount/globalDocCount)*nDocCountDiff + shp.nDocCountInQuerySubset
            // So we'll average the 2 and call it a day
            if ((shp.nTotalDocCount > shp.nDocCountInQuerySubset)
                    && (_s0_nQuerySetDocCount > _s0_nQuerySubsetDocCount)) {
                double docCountDiff = (double) (_s0_nQuerySetDocCount - _s0_nQuerySubsetDocCount);
                docCountDiff = Math.min(docCountDiff, (double) (shp.nTotalDocCount - shp.nDocCountInQuerySubset));
                // ie there are 2 differences: the number of available entities in the total doc count,
                //                             and the number of available documents in the un-queried dataset

                estEntityDocCountInQuery += halfQueryDocSubsetInv * shp.nDocCountInQuerySubset * docCountDiff;
                estEntityDocCountInQuery += halfGlobalDocCountInv * shp.nTotalDocCount * docCountDiff;
            } //TESTED

            // IDF component of entity

            double adjustEntTotalDocCount = shp.nTotalDocCount
                    + _s0_multiCommunityHandler.community_estimateAnyMissingDocCounts(shp);

            shp.standaloneSignificance = dScaleFactor * Math.log10(
                    ((estEntityDocCountInQuery + 0.5) / (_s0_nQuerySetDocCount - estEntityDocCountInQuery + 0.5))
                            / ((adjustEntTotalDocCount - estEntityDocCountInQuery + 0.5)
                                    / ((_s0_globalDocCount - _s0_nQuerySetDocCount)
                                            - (adjustEntTotalDocCount - estEntityDocCountInQuery) + 0.5)));
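
            // (Editor's note - the expression above in more conventional notation, writing
            //  n = estEntityDocCountInQuery, Nq = _s0_nQuerySetDocCount,
            //  D = adjustEntTotalDocCount, G = _s0_globalDocCount:
            //
            //      sig = dScaleFactor * log10( ((n + 0.5) / (Nq - n + 0.5))
            //                                / ((D - n + 0.5) / ((G - Nq) - (D - n) + 0.5)) )
            //
            //  ie the smoothed odds of the entity occurring inside the query set divided by the
            //  odds of it occurring outside it - the Robertson/Sparck-Jones form of IDF.)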

            if ((shp.standaloneSignificance <= 0.0) || (Double.isInfinite(shp.standaloneSignificance))
                    || Double.isNaN(shp.standaloneSignificance)) {
                // Probably matches on the entire index or something like that, use a diff equation:
                // (basically ignore the denominator...)

                if ((2.0 * _s0_nQuerySetDocCount) >= (_s0_globalDocCount)) { // (to within 33% ... after that we'll start to trust it)
                    final double dBackupScalingFactor = 200.0 / Math.log10(2);//200 vs 100 to counteract use of dHalfScaleFactor2

                    // Use dHalfScaleFactor2 (see case 2.2)==0.5*((0.5 + (double)_s0_nQuerySetDocCount)/(0.5 + _s0_globalDocCount))
                    // basically to suppress any non-matching records that (almost certainly) don't contain the entity

                    shp.standaloneSignificance = dHalfScaleFactor2 * dBackupScalingFactor * Math.log10(
                            (_s0_globalDocCount + shp.nDocCountInQuerySubset + 0.5) / (_s0_globalDocCount + 0.5));
                    // (note if (shp.nDocCountInQuerySubset==_s0_nQuerySetDocCount) then this==100% because of defn of dBackupScalingFactor)

                    if ((shp.standaloneSignificance < 0.0) || (Double.isInfinite(shp.standaloneSignificance))
                            || Double.isNaN(shp.standaloneSignificance)) // (cleanup)
                    {
                        shp.standaloneSignificance = 0.0;
                    }
                } else {
                    shp.standaloneSignificance = 0.0;
                }

            } //TESTED (vs entire dataset) 

            // Use an "estimated query coverage" (instead of the exact one over the subset)
            shp.queryCoverage = (100.0 * (estEntityDocCountInQuery * s0_nQuerySetDocCountInv));
            shp.avgFreqOverQuerySubset *= s0_nQuerySubsetDocCountInv;

            if (null != shp.geotag) { // (only happens for low accuracy geo aggregation)
                if (shp.queryCoverage > _s2_maxGeoQueryCoverage) {
                    _s2_maxGeoQueryCoverage = shp.queryCoverage;
                }
            }

            double dApproxAvgTfTerm = avgFreqPerEntity / (avgFreqPerEntity + TF_PARAM1 + TF_PARAM2);
            // (An approximation of the TF for this entity - assume on average that the entity occurs in docs
            //  with an average doc length, to avoid an extra loop here or in S1 to calc the "avg doc length for docs containing the entity")
            // (We're summing this across all entities anyway, so it wouldn't be a particularly accurate number regardless...)

            if (_s2_dAvgLowAccuracyGeoDecay > 0.0) { // Take into account average low accuracy geo-decay across the entire dataset
                dApproxAvgTfTerm *= _s2_dAvgLowAccuracyGeoDecay;
            }
            _s2_dApproxAverageDocumentSig += shp.decayedDocCountInQuerySubset * dApproxAvgTfTerm
                    * shp.standaloneSignificance;
            // (ie an approximation to sum(TF-IDF) across docs)

            // Stage 2 alias processing: calc pythag significance, store first/last values ready for S3
            if (null != shp.masterAliasSH) {
                if (null == shp.masterAliasSH.index) {
                    shp.masterAliasSH.index = shp.index; // (used so I know I'm the first alias in the global list)
                    shp.masterAliasSH.avgFreqOverQuerySubset *= s0_nQuerySubsetDocCountInv;
                    // (can't do query coverage yet, we're still summing over the adjusted total doc counts)

                    // pre-calculate and store an overlap scaling factor to apply to the query coverage
                    shp.masterAliasSH.decayedDocCountInQuerySubset = (double) shp.masterAliasSH.nDocCountInQuerySubset
                            / (double) shp.masterAliasSH.nTotalDocCount;

                } //TESTED
                shp.masterAliasSH.queryCoverage += shp.queryCoverage
                        * shp.masterAliasSH.decayedDocCountInQuerySubset;
                // (a rough estimate, partially adjusted for overlap)

                shp.masterAliasSH.standaloneSignificance += shp.standaloneSignificance * shp.standaloneSignificance;
                // (combine using pythag, like I do elsewhere for an easy approximation)
                shp.masterAliasSH.masterAliasSH = shp; // (used so I know I'm the last alias in the global list)

            } //TESTED
              // end stage 2 alias processing

        } //(end stage 2 loop over entities)
          //TESTED (by eye for a 114 document query and a 646 document query) 

        _s2_dApproxAverageDocumentSig *= s0_nQuerySubsetDocCountInv;

        // The intention now is to do some false-positive reduction
        double peak = 0.0;
        _s2_nMush1Index = nHistBins; // 33% significance
        _s2_nMush2Index = nHistBins; // 66% significance

        double lastval = -1.0;
        for (int i = 1; i < nHistBins; ++i) {
            double val = (double) nCountHistogram[i];
            if (val > peak) {
                peak = val;
            } else {
                if (lastval >= 0.0) { // i.e. we've passed the 5% mark, now look for the noise floor 
                    if (val >= (lastval - 1.5)) { // noise floor!
                        _s2_nMush2Index = i;
                        break; // (nothing left to do)
                    }
                    lastval = val;
                } else if (val < 0.05 * peak) { //5%
                    _s2_nMush1Index = i;
                    lastval = val;
                }
            }
        } // (end loop over histogram bins)
          //TESTED            

    }
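
    // A minimal standalone sketch (illustration only, not called by the pipeline) of the
    // histogram noise-floor detection at the end of stage 2 above, assuming a plain int[]
    // histogram: returns { mush1Index, mush2Index }, i.e. the first bin below 5% of the peak
    // and the bin at which the histogram stops falling (the "noise floor").
    static int[] sketchFindMushIndexes(int[] histogram) {
        int nBins = histogram.length;
        int mush1 = nBins, mush2 = nBins;
        double peak = 0.0;
        double lastval = -1.0;
        for (int i = 1; i < nBins; ++i) {
            double val = (double) histogram[i];
            if (val > peak) {
                peak = val;
            } else if (lastval >= 0.0) { // already past the 5% mark, look for the noise floor
                if (val >= (lastval - 1.5)) { // stopped falling (within tolerance): noise floor
                    mush2 = i;
                    break;
                }
                lastval = val;
            } else if (val < 0.05 * peak) { // the 5% mark
                mush1 = i;
                lastval = val;
            }
        }
        return new int[] { mush1, mush2 };
    }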

    /////////////////////////////////////////////////////////////   

    // 3] stage3_calculateTFTerms()
    // Calculate the entities' and documents' TF-IDF scores (already calculated IDF in stage2)

    // Output

    // For these 2: the lower the ordering (i.e. the significance/score), the nearer the front of the queue
    java.util.TreeSet<TempDocBucket> _s3_pqDocs; // (doc queue for output - use a TreeSet + custom comparator to do deduplication at the same time)
    java.util.PriorityQueue<EntSigHolder> _s3_pqEnt; // (entity queue for output, dedup not an issue for entities)

    double _s3_dLuceneScalingFactor; // How to weight relevance (using scoreParams config)
    double _s3_dSigScalingFactor; // How to weight significance (using scoreParams config)
    double _s3_dScoreScalingFactor; // How to weight total score (using scoreParams config)

    // Logic

    private void stage3_calculateTFTerms(AdvancedQueryPojo.QueryScorePojo scoreParams, StatisticsPojo scores,
            long nToClientLimit) {
        // First off: we have an approximate average significance, we're going to create a scaling factor for 
        // relevance to fit in with the input parameters
        // Average doc score will be 100

        _s3_pqDocs = new java.util.TreeSet<TempDocBucket>();
        _s3_pqEnt = null;
        if (_s0_nNumEntsReturn > 0) {
            _s3_pqEnt = new java.util.PriorityQueue<EntSigHolder>();
        }

        // Calculate scaling factors:
        _s3_dSigScalingFactor = 1.0;
        if (scoreParams.sigWeight != 0.0) {
            double d = (scoreParams.relWeight / scoreParams.sigWeight);
            _s3_dLuceneScalingFactor = (d * _s2_dApproxAverageDocumentSig) / (_s0_avgLuceneScore + 0.01); // (eg scale1*avQuery == (r/s)*avAggSig)         
            _s3_dScoreScalingFactor = 100.0 / ((1.0 + d) * _s2_dApproxAverageDocumentSig); // ie scale2*(scale1*avQuery + avAggSig)==100.0
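            // (Sanity check of the algebra, with d == relWeight/sigWeight and avgSig == _s2_dApproxAverageDocumentSig:
            //  scale1*avgLucene ~= d*avgSig, so scale2*(scale1*avgLucene + avgSig)
            //  ~= (100/((1+d)*avgSig)) * (1+d)*avgSig == 100, i.e. the average document does score ~100 as stated above.)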

            // Special case: all significances are 0:
            if (_s2_dApproxAverageDocumentSig == 0.0) { // just ignore significance
                _s3_dScoreScalingFactor = 100.0 / _s0_avgLuceneScore;
                _s3_dLuceneScalingFactor = 1.0;
                _s3_dSigScalingFactor = 0.0;
            }
        } else { // Ignore significance
            _s3_dLuceneScalingFactor = 1.0;
            _s3_dSigScalingFactor = 0.0;
            _s3_dScoreScalingFactor = 100.0 / _s0_avgLuceneScore;
        }
        //TESTED

        // (See wiki thoughts on not basing this on the query sub-set (eg 1000 is totally arbitrary) ... I like this current way)
        // Take set A == 1000 docs (ent hits = dc_in_sset), set B = #hits (ent hits = unknown), set C = total space (ent hits = dc)
        // If dc==dc_in_sset then *know* that ent hits in set B = dc, so you can divide by size of B
        // Where dc>dc_in_sset, you don't know how those remaining hits are partitioned between B and C
        // Use min(|B|*(dc_in_sset/|A|),dc) as one extreme, (dc-dc_in_sset)*|B|/|C| as the other
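        // (Worked example with illustrative numbers: |A| == 1000, |B| == 5000 hits, |C| == 1,000,000 docs,
        //  dc == 2000, dc_in_sset == 100. One extreme: min(5000*(100/1000), 2000) == 500 hits in B;
        //  the other: (2000-100)*5000/1000000 == 9.5 hits in B. The true count lies somewhere in between.)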

        double invAvgLength = ((double) _s0_nQuerySubsetDocCount / (_s1_sumFreqInQuerySubset + 0.01));
        int n1Down = 0; // (ensures that where scores are equal, documents are added last; should make a small difference to performance)

        for (EntSigHolder shp : _s1_entitiesInDataset.values()) {
            //(NOTE: important that we loop over this in the same order as we looped over it in stage 2)

            // Stage 3a alias processing:
            if (null != shp.masterAliasSH) {
                if (shp.index == shp.masterAliasSH.index) { // First instance of this alias set...
                    shp.masterAliasSH.standaloneSignificance = Math.sqrt(shp.masterAliasSH.standaloneSignificance);
                    // OK now all the stats are up-to-date
                }
            } //TESTED
              // end Stage 3a alias processing:

            //(IDF component calculated above)

            // Now calculate the term frequencies

            for (TempEntityInDocBucket entBucket : shp.entityInstances) {

                double tf_term = (entBucket.freq / (entBucket.freq + TF_PARAM1
                        + TF_PARAM2 * ((entBucket.doc.docLength + 0.01) * invAvgLength)));

                if (shp.nDocCountInQuerySubset <= _s2_nMush1Index) {
                    tf_term *= 0.33;
                } else if (shp.nDocCountInQuerySubset <= _s2_nMush2Index) {
                    tf_term *= 0.66;
                }
                double tf_idf_sig = tf_term * shp.standaloneSignificance * entBucket.doc.manualWeighting;
                //TESTED

                // Insert significance; unfortunately we need to do this even for low-priority cases
                // (this could probably be accelerated by recalculating from the IDF and freq only for the top N docs, but empirically doesn't seem worth it)
                if (Double.isNaN(tf_idf_sig)) {
                    entBucket.dbo.put(EntityPojo.significance_, 0.0);
                } else {
                    entBucket.dbo.put(EntityPojo.significance_, tf_idf_sig);
                }
                if (tf_idf_sig > shp.maxDocSig) {
                    shp.maxDocSig = tf_idf_sig;
                }

                entBucket.doc.aggSignificance += tf_idf_sig;

                // Now that we're done incorporating the significance into the document, we're going
                // to adjust the standalone significance for the relevance of the document
                // (if enabled - either manually or if the query contains OR statements)
                if ((null != scoreParams.adjustAggregateSig) && scoreParams.adjustAggregateSig) {
                    tf_idf_sig *= entBucket.doc.luceneScore * _s0_avgLuceneScoreInv;
                }
                //TESTED (doc scores stay the same, entity scores adjust)

                shp.datasetSignificance += tf_idf_sig / (double) shp.nDocCountInQuerySubset;

                // Stage 3b alias processing: update dataset significance
                if (null != shp.masterAliasSH) {

                    double alias_tf_idf_sig = tf_term * shp.masterAliasSH.standaloneSignificance
                            * entBucket.doc.manualWeighting;
                    // (standaloneSig's calculation was finished at the start of this loop)

                    // (adjust for relevance as above)
                    if ((null != scoreParams.adjustAggregateSig) && scoreParams.adjustAggregateSig) {
                        alias_tf_idf_sig *= entBucket.doc.luceneScore * _s0_avgLuceneScoreInv;
                    }
                    //TESTED

                    if (alias_tf_idf_sig > shp.masterAliasSH.maxDocSig) {
                        shp.masterAliasSH.maxDocSig = alias_tf_idf_sig;
                    }
                    shp.masterAliasSH.datasetSignificance += alias_tf_idf_sig
                            / (double) shp.masterAliasSH.nDocCountInQuerySubset;
                    // (don't use the nEntsInContainingDocs because here we do care about the overlap)

                } //TESTED
                  // end Stage 3b alias processing

                entBucket.doc.nLeftToProcess--;
                if (0 == entBucket.doc.nLeftToProcess) {

                    // Final calculation for Infinite significance
                    entBucket.doc.aggSignificance *= entBucket.doc.geoTemporalDecay * _s3_dSigScalingFactor;

                    entBucket.doc.luceneScore *= _s3_dLuceneScalingFactor; // (the lucene score is already geo-temporally scaled)
                    // (don't apply the geo-temporal decay to the lucene score here; that's done inside Lucene)

                    double d = _s3_dScoreScalingFactor
                            * (entBucket.doc.luceneScore + entBucket.doc.aggSignificance);
                    if (Double.isNaN(d)) {
                        d = 0.0;
                    }
                    if (_s0_sortingByDate) {
                        entBucket.doc.totalScore = (double) -entBucket.doc.nLuceneIndex;
                    } else {
                        entBucket.doc.totalScore = d;
                    }
                    entBucket.doc.nTieBreaker = n1Down--;

                    // Completed calculating this feed's score               
                    // Insert into "top 100" list:

                    if (_s3_pqDocs.size() < nToClientLimit) {

                        //DEBUG
                        //System.out.println(_s3_pqDocs.size() + ", ADD URL=" + entBucket.doc.dbo.getString(DocumentPojo.url_));

                        _s3_pqDocs.add(entBucket.doc);
                        entBucket.doc.bPromoted = true;
                    } else if ((_s3_pqDocs.size() >= nToClientLimit) && (nToClientLimit > 0)) {
                        TempDocBucket qsf = _s3_pqDocs.first();
                        if (entBucket.doc.totalScore > qsf.totalScore) {
                            entBucket.doc.bPromoted = true;
                            _s3_pqDocs.add(entBucket.doc);
                            if (_s3_pqDocs.size() > nToClientLimit) { // (size might stay the same if this is a duplicate)                        

                                Iterator<TempDocBucket> it = _s3_pqDocs.iterator(); // (can now remove the displaced object via the iterator...)
                                TempDocBucket tdb = it.next();
                                it.remove(); // (ie remove the first object)

                                tdb.bPromoted = false;

                                // Phase "1": middle ranking (used to be good, not so much any more)
                                if (null != _s0_standaloneEventAggregator) {
                                    ScoringUtils_Associations.addStandaloneEvents(tdb.dbo, tdb.aggSignificance, 1,
                                            _s0_standaloneEventAggregator, _s0_bEntityTypeFilterPositive,
                                            _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter,
                                            _s0_bEvents, _s0_bSummaries, _s0_bFacts);
                                } //TESTED                                 
                                if (null != _s0_lowAccuracyAssociationAggregator_events) {
                                    ScoringUtils_Associations.addStandaloneEvents(tdb.dbo, tdb.aggSignificance, 1,
                                            _s0_lowAccuracyAssociationAggregator_events,
                                            _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive,
                                            _s0_entityTypeFilter, _s0_assocVerbFilter, true, false, false);
                                } //TESTED                        
                                if (null != _s0_lowAccuracyAssociationAggregator_facts) {
                                    ScoringUtils_Associations.addStandaloneEvents(tdb.dbo, tdb.aggSignificance, 1,
                                            _s0_lowAccuracyAssociationAggregator_facts,
                                            _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive,
                                            _s0_entityTypeFilter, _s0_assocVerbFilter, false, false, true);
                                } //TESTED

                            } //TESTED
                        } else { // Not promoting
                            shp.unusedDbo = entBucket.dbo; // (might save me the trouble of cloning a few times...)                                       

                            // Phase "2": never any good!
                            if (null != _s0_standaloneEventAggregator) {
                                ScoringUtils_Associations.addStandaloneEvents(entBucket.doc.dbo,
                                        entBucket.doc.aggSignificance, 2, _s0_standaloneEventAggregator,
                                        _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive,
                                        _s0_entityTypeFilter, _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries,
                                        _s0_bFacts);
                            } //TESTED
                            if (null != _s0_lowAccuracyAssociationAggregator_events) {
                                ScoringUtils_Associations.addStandaloneEvents(entBucket.doc.dbo,
                                        entBucket.doc.aggSignificance, 2,
                                        _s0_lowAccuracyAssociationAggregator_events, _s0_bEntityTypeFilterPositive,
                                        _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter,
                                        true, false, false);
                            } //TESTED                        
                            if (null != _s0_lowAccuracyAssociationAggregator_facts) {
                                ScoringUtils_Associations.addStandaloneEvents(entBucket.doc.dbo,
                                        entBucket.doc.aggSignificance, 2,
                                        _s0_lowAccuracyAssociationAggregator_facts, _s0_bEntityTypeFilterPositive,
                                        _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter,
                                        false, false, true);
                            } //TESTED
                        }
                    } else { // Not promoting any documents...
                        shp.unusedDbo = entBucket.dbo; // (might save me the trouble of cloning a few times...)

                        // Phase "2": never any good!
                        if (null != _s0_standaloneEventAggregator) {
                            ScoringUtils_Associations.addStandaloneEvents(entBucket.doc.dbo,
                                    entBucket.doc.aggSignificance, 2, _s0_standaloneEventAggregator,
                                    _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive,
                                    _s0_entityTypeFilter, _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries,
                                    _s0_bFacts);
                        } //TESTED
                        if (null != _s0_lowAccuracyAssociationAggregator_events) {
                            ScoringUtils_Associations.addStandaloneEvents(entBucket.doc.dbo,
                                    entBucket.doc.aggSignificance, 2, _s0_lowAccuracyAssociationAggregator_events,
                                    _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive,
                                    _s0_entityTypeFilter, _s0_assocVerbFilter, true, false, false);
                        } //TESTED                        
                        if (null != _s0_lowAccuracyAssociationAggregator_facts) {
                            ScoringUtils_Associations.addStandaloneEvents(entBucket.doc.dbo,
                                    entBucket.doc.aggSignificance, 2, _s0_lowAccuracyAssociationAggregator_facts,
                                    _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive,
                                    _s0_entityTypeFilter, _s0_assocVerbFilter, false, false, true);
                        } //TESTED
                    }
                } //TESTED

            } // (end loop over entity occurrences in feeds)          
              //TESTED

            // Handle geo:
            if (null != shp.geotag) {
                loadLowAccuracyGeoBuckets(shp);
            }

            // Insert entities into the output priority queue
            // NOTE LOCAL SHP CANNOT BE USED AFTER THE FOLLOWING CLAUSE
            // ("LOCAL" == the local variable: the object itself isn't changed, so the code above is fine, but the reference below may be repointed)

            if (_s0_nNumEntsReturn > 0) {

                // Stage 3c alias processing:
                if ((null != shp.masterAliasSH) && (shp.masterAliasSH.masterAliasSH != shp)) {
                    continue; // (only promote the last of the aliased entities)
                } //TESTED
                else if (null != shp.masterAliasSH) { // (use aggregated aliased version if present)

                    shp.masterAliasSH.unusedDbo = shp.unusedDbo;
                    // (overwriting this, which is fine since it's not used after stage 1)
                    shp.masterAliasSH.index = shp.index; // (just so I know what the index of this entity is) 
                    // (overwriting this, which is fine since it's not used after the first ent of the alias group in this stage)

                    shp.masterAliasSH.entityInstances = shp.entityInstances;
                    // (the only 2 fields that are needed but weren't present)
                    shp = shp.masterAliasSH;
                } //TESTED
                  // end stage 3c of alias processing

                if (_s3_pqEnt.size() < _s0_nNumEntsReturn) {
                    _s3_pqEnt.add(shp);
                }
                else if (_s0_nNumEntsReturn > 0) { // ("else" guard: a just-added shp must not be compared against the queue and re-added as a duplicate)

                    EntSigHolder qsf = _s3_pqEnt.element();
                    if (shp.datasetSignificance > qsf.datasetSignificance) {
                        EntSigHolder toRemove = _s3_pqEnt.remove();
                        _s3_pqEnt.add(shp);

                        toRemove.entityInstances = null; // (don't need this any more, can be gc'd)
                        if (null != toRemove.masterAliasSH) {
                            toRemove.masterAliasSH.entityInstances = null;
                            // (can only promote one masterAliasSH so no risk this will remove an active entityInstances)
                        }
                    } else {
                        shp.entityInstances = null; // (don't need this any more, can be gc'd)                  
                    } //TESTED
                }
            } //TESTED
            else {
                shp.entityInstances = null; // (don't need this any more, can be gc'd)
            } //TESTED

            // (NOTE LOCAL SHP CANNOT BE USED FROM HERE - IE NO MORE CODE IN THIS LOOP!)   

        } // (end loop over entities)

        // Handle docs with no entities:

        if (nToClientLimit > 0) {
            for (TempDocBucket doc : _s1_noEntityBuckets) {
                doc.luceneScore *= _s3_dLuceneScalingFactor;
                double d = _s3_dScoreScalingFactor * doc.luceneScore;
                if (Double.isNaN(d)) {
                    d = 0.0;
                }
                if (_s0_sortingByDate) {
                    doc.totalScore = (double) -doc.nLuceneIndex;
                } else {
                    doc.totalScore = d;
                }
                doc.nTieBreaker = n1Down--;
                if (_s3_pqDocs.size() < nToClientLimit) {
                    _s3_pqDocs.add(doc);
                }
                if (_s3_pqDocs.size() >= nToClientLimit) {
                    TempDocBucket qsf = _s3_pqDocs.first();
                    if (doc.totalScore > qsf.totalScore) {
                        _s3_pqDocs.add(doc);
                        if (_s3_pqDocs.size() > nToClientLimit) { // (could be a duplicate)
                            Iterator<TempDocBucket> it = _s3_pqDocs.iterator(); // (can now remove the displaced object via the iterator...)
                            it.next();
                            it.remove(); // (ie remove the first object)
                        }
                    } //(TESTED)
                }
            } // (end loop over feeds with no entities)
        } // (obviously don't bother if we're not returning documents anyway...)      
    }
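
    // A minimal sketch (illustration only, not part of the original class) of the bounded
    // "top N" pattern used twice above: insert into a TreeSet whose ordering puts the worst
    // element first, then evict the first element whenever the set grows past the limit.
    static <T extends Comparable<T>> void sketchInsertTopN(java.util.TreeSet<T> topN, T candidate, long limit) {
        if (topN.size() < limit) {
            topN.add(candidate); // still filling up, everything gets in
        } else if ((limit > 0) && (candidate.compareTo(topN.first()) > 0)) { // beats the current worst
            topN.add(candidate);
            if (topN.size() > limit) { // (size can stay the same if candidate was a duplicate)
                topN.pollFirst(); // evict the worst element
            }
        }
    }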

    /////////////////////////////////////////////////////////////   

    // 4a] stage4_prepareDocsForOutput()
    // Using the document priority queue calculated in step [3], generate the list of documents to return 

    private void stage4_prepareDocsForOutput(AdvancedQueryPojo.QueryScorePojo scoreParams, StatisticsPojo scores,
            long nToClientLimit, LinkedList<BasicDBObject> returnList) {
        // Get the documents
        long nDocs = 0;
        double dBestScore = 0.0;
        double dAvgScore = 0.0;

        double dSigFactor = 100.0 / (_s3_dSigScalingFactor * _s2_dApproxAverageDocumentSig);
        double dRelFactor = 100.0 / (_s3_dLuceneScalingFactor * _s0_avgLuceneScore);

        // Start at the bottom of the list, so we don't need to worry about skipping documents, just count out from the bottom
        // The call to stage3_calculateTFTerms with nStart+nToClientLimit handles the rest

        Iterator<TempDocBucket> pqIt = _s3_pqDocs.iterator();
        while (pqIt.hasNext() && (nDocs < nToClientLimit)) {
            TempDocBucket qsf = pqIt.next();
            nDocs++;
            if (!_s0_sortingByDate) {
                dBestScore = qsf.totalScore;
            }
            dAvgScore += dBestScore;

            BasicDBObject f = qsf.dbo;

            // Phase "0" - these are the highest prio events
            boolean bNeedToFilterAndAliasAssoc_event = true;
            boolean bNeedToFilterAndAliasAssoc_fact = true;
            boolean bNeedToFilterAndAliasAssoc_summary = true;
            if (null != _s0_standaloneEventAggregator) {
                ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0,
                        _s0_standaloneEventAggregator, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive,
                        _s0_entityTypeFilter, _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts);
                bNeedToFilterAndAliasAssoc_event = false;
                bNeedToFilterAndAliasAssoc_fact = false;
                bNeedToFilterAndAliasAssoc_summary = false;
            } //TESTED
            if (null != _s0_lowAccuracyAssociationAggregator_events) {
                ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0,
                        _s0_lowAccuracyAssociationAggregator_events, _s0_bEntityTypeFilterPositive,
                        _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, true, false,
                        false);
                bNeedToFilterAndAliasAssoc_event = false;
            } //TESTED                        
            if (null != _s0_lowAccuracyAssociationAggregator_facts) {
                ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0,
                        _s0_lowAccuracyAssociationAggregator_facts, _s0_bEntityTypeFilterPositive,
                        _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, false, false,
                        true);
                bNeedToFilterAndAliasAssoc_fact = false;
            } //TESTED

            try {
                DocumentPojoApiMap.mapToApi(f);
                // Handle deduplication/multi-community code:
                if (null != qsf.dupList) {
                    try {
                        ScoringUtils_MultiCommunity.community_combineDuplicateDocs(qsf);
                    } catch (Exception e) {
                        // Do nothing, just carry on with minimal damage!
                    }
                }

                // Scoring:
                double d = qsf.aggSignificance * dSigFactor;
                if (Double.isNaN(d)) {
                    f.put(DocumentPojo.aggregateSignif_, 0.0);
                } else {
                    f.put(DocumentPojo.aggregateSignif_, d);
                }
                d = qsf.luceneScore * dRelFactor;
                if (Double.isNaN(d)) {
                    f.put(DocumentPojo.queryRelevance_, 0.0);
                } else {
                    f.put(DocumentPojo.queryRelevance_, d);
                }
                if (!_s0_sortingByDate) {
                    f.put(DocumentPojo.score_, qsf.totalScore);
                }

                BasicDBList l = (BasicDBList) (f.get(DocumentPojo.entities_));

                // Handle update ids vs normal ids:
                ObjectId updateId = (ObjectId) f.get(DocumentPojo.updateId_);
                if (null != updateId) { // swap the 2...
                    f.put(DocumentPojo.updateId_, f.get(DocumentPojo._id_));
                    f.put(DocumentPojo._id_, updateId);
                }

                // Check if entities enabled            
                if ((null != l) && (!_s0_bGeoEnts && !_s0_bNonGeoEnts)) {
                    f.removeField(DocumentPojo.entities_);
                    l = null;
                } //TESTED

                // Check if events etc enabled
                if ((!_s0_bEvents && !_s0_bFacts && !_s0_bSummaries)) {
                    f.removeField(DocumentPojo.associations_);
                } //TESTED            
                else if (!_s0_bEvents || !_s0_bFacts || !_s0_bSummaries || (null != _s0_assocVerbFilter)) {

                    // Keep only specified event_types
                    BasicDBList lev = (BasicDBList) (f.get(DocumentPojo.associations_));
                    if (null != lev) {
                        for (Iterator<?> e0 = lev.iterator(); e0.hasNext();) {
                            BasicDBObject e = (BasicDBObject) e0.next();

                            // Type filter
                            boolean bNeedToFilterAndAliasAssoc = true;
                            String sEvType = e.getString(AssociationPojo.assoc_type_);
                            boolean bKeep = true;
                            if (null == sEvType) {
                                bKeep = false;
                            } else if (sEvType.equalsIgnoreCase("event")) {
                                if (!_s0_bEvents)
                                    bKeep = false;
                                bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_event;
                            } else if (sEvType.equalsIgnoreCase("fact")) {
                                if (!_s0_bFacts)
                                    bKeep = false;
                                bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_fact;
                            } else if (sEvType.equalsIgnoreCase("summary")) {
                                if (!_s0_bSummaries)
                                    bKeep = false;
                                bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_summary;
                            }
                            if (!bKeep) {
                                e0.remove();
                            } else { // Type matches, now for some more complex logic....

                                if (bNeedToFilterAndAliasAssoc) { // (otherwise done already)

                                    bKeep = ScoringUtils_Associations.filterAndAliasAssociation(e, _s1_aliasLookup,
                                            true, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive,
                                            _s0_entityTypeFilter, _s0_assocVerbFilter);
                                    if (!bKeep) {
                                        e0.remove();
                                    }

                                } //TESTED

                            } //(end output filter logic)

                        } // (end loop over events)   
                    } // (end if this doc has events)

                } //TESTED            

                // Check if metadata is enabled
                if (!_s0_bMetadata) {
                    f.removeField(DocumentPojo.metadata_);
                } //TESTED

                if (null != l) {

                    for (Iterator<?> e0 = l.iterator(); e0.hasNext();) {
                        BasicDBObject e = (BasicDBObject) e0.next();

                        if (!_s0_bNonGeoEnts) { // then must only be getting geo (else wouldn't be in this loop)
                            if (null == e.get(EntityPojo.geotag_)) {
                                e0.remove();
                                continue;
                            }
                        }

                        String entity_index = e.getString(EntityPojo.index_);
                        if (null == entity_index)
                            continue;

                        EntSigHolder shp = (EntSigHolder) _s1_entitiesInDataset.get(entity_index);

                        if (null != shp) {
                            // Stage 4x: alias processing, just overwrite 
                            // (note: don't delete "duplicate entities"; it's hard to do so in a globally
                            //  consistent way, and it would potentially throw away data, which might be undesirable)
                            if (null != shp.masterAliasSH) {
                                shp = shp.masterAliasSH; // (already has all the aggregated values used below)
                                if (!entity_index.equals(shp.aliasInfo.getIndex())) {
                                    e.put(EntityPojo.index_, shp.aliasInfo.getIndex());
                                    e.put(EntityPojo.disambiguated_name_, shp.aliasInfo.getDisambiguatedName());
                                    e.put(EntityPojo.type_, shp.aliasInfo.getType());
                                    e.put(EntityPojo.dimension_, shp.aliasInfo.getDimension());

                                    if (null != shp.aliasInfo.getGeotag()) {
                                        BasicDBObject aliasedGeoTag = new BasicDBObject();
                                        aliasedGeoTag.put(GeoPojo.lat_, shp.aliasInfo.getGeotag().lat);
                                        aliasedGeoTag.put(GeoPojo.lon_, shp.aliasInfo.getGeotag().lon);
                                        e.put(EntityPojo.geotag_, aliasedGeoTag);
                                        if (null != shp.aliasInfo.getOntology_type()) {
                                            e.put(EntityPojo.ontology_type_, shp.aliasInfo.getOntology_type());
                                        }
                                    } //TESTED
                                }
                            } //TESTED
                              // end Stage 4x of alias processing                  

                            double dataSig = shp.datasetSignificance;
                            if (Double.isNaN(dataSig)) {
                                e.put(EntityPojo.datasetSignificance_, 0.0);
                            } else {
                                e.put(EntityPojo.datasetSignificance_, dataSig);
                            }
                            e.put(EntityPojo.queryCoverage_, shp.queryCoverage);
                            e.put(EntityPojo.averageFreq_, shp.avgFreqOverQuerySubset);
                            if (shp.nTotalSentimentValues > 0) {
                                e.put(EntityPojo.positiveSentiment_, shp.positiveSentiment);
                                e.put(EntityPojo.negativeSentiment_, shp.negativeSentiment);
                                e.put(EntityPojo.sentimentCount_, shp.nTotalSentimentValues);
                            }
                        } else { // (most likely to occur if the entity is discarded (alias/filter) or is corrupt in some way)
                            e0.remove();
                            continue;
                        }

                    } //(end loop over entities)
                } // (end if feed has entities)
                  //TESTED

                // Explain if enabled
                if (null != qsf.explain) {
                    f.put(DocumentPojo.explain_, qsf.explain);
                }

                // We iterate lowest score first and add each document to the front of the list,
                // so the list comes back from the API call in natural order, highest first
                returnList.addFirst(f);
            } catch (Exception e) {
                // Probably a JSON error, just carry on
                String title = f.getString(DocumentPojo.title_);
                logger.error(title + ": " + e.getMessage());
            }

        } // (end loop over feeds)
          //TESTED

        // Update the scores:
        scores.maxScore = (float) dBestScore;
        if (nDocs > 0) {
            scores.avgScore = (float) (dAvgScore / nDocs);
        }
    }
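
    // The NaN guards above (aggregateSignif_, queryRelevance_, and later datasetSignificance_)
    // all share the same shape; a small helper like this could fold them into one call
    // (a sketch only - the original code inlines each check):
    private static void putScoreOrZero(BasicDBObject dbo, String field, double score) {
        dbo.put(field, Double.isNaN(score) ? 0.0 : score);
    }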

    /////////////////////////////////////////////////   

    // 4b] stage4_prepareEntsForOutput()
    // Using the entity priority queue calculated in step [3], generate the list of entities to return 

    private void stage4_prepareEntsForOutput(LinkedList<BasicDBObject> entityReturn) {
        if (_s0_nNumEntsReturn > 0) { // (else entities not enabled)

            for (EntSigHolder qsf = _s3_pqEnt.poll(); null != qsf; qsf = _s3_pqEnt.poll()) // (start with lowest ranking)
            {
                BasicDBObject ent = qsf.unusedDbo;
                if (null == ent) {
                    int nTries = 0;
                    if (null != qsf.entityInstances) { // (should never be null, but just to be on the safe side...)
                        for (TempEntityInDocBucket tefb : qsf.entityInstances) {
                            // (Try to find an entity instance that wasn't promoted, i.e. whose dbo can be re-used;
                            //  if we can't find one quickly then bail out and pay the cost of cloning instead)
                            if (!tefb.doc.bPromoted) {
                                ent = tefb.dbo;
                                break;
                            } else if (++nTries > 10) {
                                break;
                            }
                        }
                        if (null == ent) {
                            ent = qsf.entityInstances.get(0).dbo;
                        }
                    } else { // (no entityInstances, something alias-related has gone wrong, just skip) 
                        continue;
                    }
                } //TESTED
                qsf.entityInstances = null; // (don't need this any more, can be gc'd)

                try {

                    if (null != qsf.aliasInfo) {
                        if (!qsf.index.equals(qsf.aliasInfo.getIndex())) {
                            ent.put(EntityPojo.index_, qsf.aliasInfo.getIndex());
                            ent.put(EntityPojo.disambiguated_name_, qsf.aliasInfo.getDisambiguatedName());
                            ent.put(EntityPojo.type_, qsf.aliasInfo.getType());
                            ent.put(EntityPojo.dimension_, qsf.aliasInfo.getDimension());
                            if (null != qsf.aliasInfo.getGeotag()) {
                                BasicDBObject aliasedGeoTag = new BasicDBObject();
                                aliasedGeoTag.put(GeoPojo.lat_, qsf.aliasInfo.getGeotag().lat);
                                aliasedGeoTag.put(GeoPojo.lon_, qsf.aliasInfo.getGeotag().lon);
                                ent.put(EntityPojo.geotag_, aliasedGeoTag);
                                if (null != qsf.aliasInfo.getOntology_type()) {
                                    ent.put(EntityPojo.ontology_type_, qsf.aliasInfo.getOntology_type());
                                }
                            } //TESTED
                        }
                    } //TESTED

                    if (null == ent.get(EntityPojo.datasetSignificance_)) { // Doc wasn't promoted, so the fields were never added; add them now (the dbo can be re-used directly)...                  
                        if (Double.isNaN(qsf.datasetSignificance)) {
                            ent.put("datasetSignificance", 0.0);
                        } else {
                            ent.put(EntityPojo.datasetSignificance_, qsf.datasetSignificance);
                        }
                        ent.put(EntityPojo.queryCoverage_, qsf.queryCoverage);
                        ent.put(EntityPojo.averageFreq_, qsf.avgFreqOverQuerySubset);
                        if (qsf.nTotalSentimentValues > 0) {
                            ent.put(EntityPojo.positiveSentiment_, qsf.positiveSentiment);
                            ent.put(EntityPojo.negativeSentiment_, qsf.negativeSentiment);
                            ent.put(EntityPojo.sentimentCount_, qsf.nTotalSentimentValues);
                        }
                    } else { // (... the doc was promoted, so clone the dbo before modifying it)
                        BasicDBObject ent2 = new BasicDBObject();
                        for (Map.Entry<String, Object> kv : ent.entrySet()) {
                            ent2.append(kv.getKey(), kv.getValue());
                        }
                        ent = ent2;
                    }
                    ent.removeField(EntityPojo.relevance_);
                    if (Double.isNaN(qsf.maxDocSig)) {
                        ent.put(EntityPojo.significance_, 0.0);
                    } else {
                        ent.put(EntityPojo.significance_, qsf.maxDocSig);
                    }
                    ent.put(EntityPojo.frequency_, (long) qsf.maxFreq);
                    entityReturn.addFirst(ent);
                } catch (Exception e) {
                    // Probably a JSON error, just carry on
                    String title = ent.getString(EntityPojo.index_);
                    logger.error(title + ": " + e.getMessage());
                } //TESTED
            }
        } //TESTED            
    }
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////

    // Utility

    private BasicDBObject _s0_docCountFields = null;
    private BasicDBObject _s0_docCountHint = null;

    private long getDocCount(ObjectId[] communityIds) {
        long nDocCount = 0;
        try {
            BasicDBObject query = new BasicDBObject(DocCountPojo._id_,
                    new BasicDBObject(MongoDbManager.in_, communityIds));
            if (null == _s0_docCountFields) {
                _s0_docCountFields = new BasicDBObject(DocCountPojo._id_, 0);
                _s0_docCountFields.put(DocCountPojo.doccount_, 1);
                _s0_docCountHint = new BasicDBObject(DocCountPojo._id_, 1);
                _s0_docCountHint.put(DocCountPojo.doccount_, 1);
            }
            DBCursor dbc = DbManager.getDocument().getCounts().find(query, _s0_docCountFields)
                    .hint(_s0_docCountHint);
            while (dbc.hasNext()) {
                BasicDBObject dbo = (BasicDBObject) dbc.next();
                Iterator<?> it = dbo.values().iterator();
                if (it.hasNext()) {
                    nDocCount += ((Long) it.next()).longValue(); // (given _s0_docCountFields, doccount is the only field returned)
                }
            }
            if (0 == nDocCount) { // (Probably shouldn't happen if a harvest has occurred; just don't bomb out)
                nDocCount = _s0_nQuerySetDocCount;
            }
        } catch (Exception e) { // Doc count might not be setup correctly?
            nDocCount = _s0_nQuerySetDocCount;
        }
        return nDocCount;
    }//TESTED
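
    // (For reference, the query above is shaped like: find({_id: {$in: [<communityIds>]}},
    //  {_id: 0, doccount: 1}) with a hint on the {_id: 1, doccount: 1} index, presumably so
    //  the count lookup can be covered entirely by that index.)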

    // The overall plan is:
    // S1: identify alias (write helper function based on the code above), calculate overlapping doc count
    // S2: calc pythag significance, store first/last values ready for S3
    // S3: first time through, do sqrt bit of pythag, last time through add to PQ
    // S4: overwrite the entity values with aliased entities where necessary

    private void stage1_initAlias(EntSigHolder shp) {
        EntityFeaturePojo alias = _s1_aliasLookup.getAliasMaster(shp.index);
        if (null != alias) { // overwrite index
            if (alias.getIndex().equalsIgnoreCase("document_discard")) {
                // (document discard... shouldn't have this document at this point, we'll just carry on if we do though)
                return;
            }
            if (alias.getIndex().equalsIgnoreCase("discard")) {
                shp.aliasInfo = alias;
                shp.masterAliasSH = null;
                return;
            }
            EntSigHolder masterAliasSH = null;
            if (null == _s1_aliasSummary) {
                _s1_aliasSummary = new HashMap<String, EntSigHolder>();
            } else {
                masterAliasSH = _s1_aliasSummary.get(alias.getIndex());
            }
            if (null == masterAliasSH) {
                masterAliasSH = new EntSigHolder(null, 0, null); // (use an EntSigHolder as a handy collection of the required variables)
                _s1_aliasSummary.put(alias.getIndex(), masterAliasSH);
            }
            shp.masterAliasSH = masterAliasSH;
            shp.aliasInfo = alias;
            shp.masterAliasSH.aliasInfo = alias; // (no harm storing this in 2 places)
        }
    }//TESTED
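
    // (Hypothetical example of the alias flow above: if entities "president obama/person" and
    //  "barack obama/person" both resolve to the master alias "barack obama/person", the first
    //  one seen creates the shared EntSigHolder in _s1_aliasSummary and both point their
    //  masterAliasSH at it; stages 2-4 then aggregate their statistics through that shared object.)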

    private double getManualScoreWeights(AdvancedQueryPojo.QueryScorePojo scoreParams, BasicDBObject doc) {
        // Highest prio: source key weight
        if (null != scoreParams.sourceWeights) {
            String sourceKey = DocumentPojo.getSourceKey(doc.getString(DocumentPojo.sourceKey_));
            Double dWeight = scoreParams.sourceWeights.get(sourceKey);

            if (null != dWeight) {
                return dWeight;
            }
        }
        // Middle prio: type
        if (null != scoreParams.typeWeights) {
            String mediaType = doc.getString(DocumentPojo.mediaType_);
            Double dWeight = scoreParams.typeWeights.get(mediaType);

            if (null != dWeight) {
                return dWeight;
            }
        }
        // Lowest prio: average of tags
        if (null != scoreParams.tagWeights) {
            double dScore = 0.0;
            int nComps = 0;
            BasicDBList tags = (BasicDBList) doc.get(DocumentPojo.tags_);
            if (null != tags) {
                for (Object tagObj : tags) {
                    String tag = (String) tagObj;
                    Double dWeight = scoreParams.tagWeights.get(tag);
                    if (null != dWeight) {
                        nComps++;
                        dScore += dWeight;
                    }
                }
                if (nComps > 0) {
                    return dScore / nComps;
                }
            }
        }
        return 1.0;
    }//TESTED (all 3 cases)   
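
    // (Worked example, illustrative values only: sourceWeights {"sourceA": 2.0},
    //  typeWeights {"News": 1.5}, tagWeights {"finance": 1.0, "europe": 3.0}.
    //  A doc from sourceA returns 2.0 regardless of its type or tags; a "News" doc from any
    //  other source returns 1.5; a doc matching neither, tagged [finance, europe], returns
    //  (1.0 + 3.0)/2 == 2.0; and a doc matching nothing returns the default weight of 1.0.)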

    ////////////////////////////////////////////////////////////////////////////

    // Low accuracy geo aggregation utils:

    // Code copied from ScoringUtils_Associations

    private void loadLowAccuracyGeoBuckets(EntSigHolder shp) {
        double dBucket = shp.queryCoverage / (this._s2_maxGeoQueryCoverage + 0.01); // (ensure <1)
        if (dBucket > 1.0)
            dBucket = 1.0;

        int nBucket = _s3_nGEO_BUCKETS_1 - ((int) (_s3_dGEO_BUCKETS * dBucket) % _s3_nGEO_BUCKETS);

        LinkedList<EntSigHolder> bucketList = _s3_geoBuckets[nBucket];
        if (null == bucketList) {
            bucketList = new LinkedList<EntSigHolder>();
            _s3_geoBuckets[nBucket] = bucketList;
        }
        bucketList.add(shp);
    }//TESTED
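
    // (Illustrative arithmetic, assuming the constants defined earlier give N buckets indexed
    //  0..N-1 with _s3_nGEO_BUCKETS_1 == N-1: dividing by (_s2_maxGeoQueryCoverage + 0.01)
    //  keeps dBucket strictly below 1, so with N == 10 a coverage near the max gives
    //  (int)(9.x) == 9 and hence nBucket == 0, while a tiny coverage gives nBucket == 9.
    //  Entities therefore land best-coverage-first, which is the order
    //  finalizeLowAccuracyGeoAggregation() below walks the buckets in.)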

    private void finalizeLowAccuracyGeoAggregation(AggregationUtils.GeoContainer geoContainer, long nMaxToReturn) {

        geoContainer.geotags = new TreeSet<GeoAggregationPojo>();

        for (LinkedList<EntSigHolder> bucket : _s3_geoBuckets) {

            if (null != bucket) {
                for (EntSigHolder shp : bucket) {
                    // Estimated count:

                    try {
                        if (null != shp.geotag) { // will always be the case...
                            GeoAggregationPojo geo = new GeoAggregationPojo();

                            geo.lat = shp.geotag.getDouble(GeoPojo.lat_);
                            geo.lon = shp.geotag.getDouble(GeoPojo.lon_);
                            geo.type = shp.geotaggedEntity.getString(EntityPojo.ontology_type_);
                            if (null == geo.type) {
                                geo.type = "point";
                            }
                            geo.count = (int) (0.01 * shp.queryCoverage * _s0_nQuerySetDocCount);
                            // (query coverage is a %)

                            geoContainer.geotags.add(geo);
                            // (can change geo.count, where aggregation has happened)

                            if (geo.count > geoContainer.maxCount) {
                                geoContainer.maxCount = geo.count;
                            }
                            if (geo.count < geoContainer.minCount) {
                                geoContainer.minCount = geo.count;
                            }
                            if (geoContainer.geotags.size() >= nMaxToReturn) {
                                return;
                            }
                        }
                    } catch (Exception e) {
                    } // (geotag invalid, just carry on)
                }
            }
        }
    }//TESTED

    // MEMORY HANDLING UTILITY

    public static class SizeReportingBasicBSONDecoder extends org.bson.BasicBSONDecoder
            implements DBDecoderFactory, DBDecoder {
        @Override
        public int decode(byte[] b, BSONCallback callback) {
            int size = super.decode(b, callback);
            _size += size;
            return size;
        }

        @Override
        public int decode(InputStream in, BSONCallback callback) throws IOException {
            int size = super.decode(in, callback);
            _size += size;
            return size;
        }

        public void resetSize() {
            _size = 0;
        }

        public long getSize() {
            return _size;
        }

        protected long _size = 0;

        @Override
        public DBDecoder create() {
            return this;
        }

        @Override
        public DBObject decode(byte[] b, DBCollection collection) {
            DBCallback cbk = getDBCallback(collection);
            cbk.reset();
            decode(b, cbk);
            return (DBObject) cbk.get();
        }

        @Override
        public DBObject decode(InputStream in, DBCollection collection) throws IOException {
            DBCallback cbk = getDBCallback(collection);
            cbk.reset();
            decode(in, cbk);
            return (DBObject) cbk.get();
        }

        @Override
        public DBCallback getDBCallback(DBCollection collection) {
            return new DefaultDBCallback(collection);
        }
    }
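
    // A minimal usage sketch for the decoder above (illustration only; assumes the 2.x
    // mongo-java-driver, where DBCursor exposes setDecoderFactory): measure how many BSON
    // bytes a query decodes. The decoder doubles as its own factory via create().
    private static long sketchMeasureDecodedBytes(DBCollection coll, DBObject query) {
        SizeReportingBasicBSONDecoder decoder = new SizeReportingBasicBSONDecoder();
        DBCursor dbc = coll.find(query);
        dbc.setDecoderFactory(decoder);
        while (dbc.hasNext()) {
            dbc.next(); // each decode() call accumulates its byte count into the shared _size
        }
        return decoder.getSize();
    }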

}