ScoringUtils.java (com.ikanow.infinit.e.api.knowledge.processing) - TF-IDF scoring, filtering and aggregation of knowledge query results
/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package com.ikanow.infinit.e.api.knowledge.processing;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;

import org.apache.log4j.Logger;
import org.bson.BSONCallback;
import org.bson.types.ObjectId;

import com.ikanow.infinit.e.api.knowledge.QueryHandler;
import com.ikanow.infinit.e.api.knowledge.aliases.AliasLookupTable;
import com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils_Associations.StandaloneEventHashAggregator;
import com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils_MultiCommunity.Community_EntityExtensions;
import com.ikanow.infinit.e.data_model.api.knowledge.AdvancedQueryPojo;
import com.ikanow.infinit.e.data_model.api.knowledge.DocumentPojoApiMap;
import com.ikanow.infinit.e.data_model.api.knowledge.GeoAggregationPojo;
import com.ikanow.infinit.e.data_model.api.knowledge.StatisticsPojo;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.document.AssociationPojo;
import com.ikanow.infinit.e.data_model.store.document.DocCountPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.ikanow.infinit.e.data_model.store.document.GeoPojo;
import com.ikanow.infinit.e.data_model.store.feature.entity.EntityFeaturePojo;
import com.ikanow.infinit.e.data_model.utils.GeoOntologyMapping;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCallback;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBDecoder;
import com.mongodb.DBDecoderFactory;
import com.mongodb.DBObject;
import com.mongodb.DefaultDBCallback;

public class ScoringUtils {
    private static final Logger logger = Logger.getLogger(ScoringUtils.class);

    private AliasLookupTable _s1_aliasLookup = null;

    public void setAliasLookupTable(AliasLookupTable aliasLookup) {
        _s1_aliasLookup = aliasLookup;
    }

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    //
    // OPTIMIZED FULL TFIDF CALCULATIONS

    // Classes required by the calculation

    static public class TempDocBucket implements Comparable<TempDocBucket> {
        // (only needs to be public because of test code)
        public double docLength = 0; // (number of entities in document, taking frequency into account)
        public long nLeftToProcess = 0; // (state variable used to determine when a feed's score can be calc'd) // (after it's been used for
        // ... that, I steal it to be used as pub-date/10-minutes)
        public BasicDBObject dbo; // (doc object from Mongo)
        public double totalScore = 0.0; // (combined sig/rel)
        public double aggSignificance = 0.0; // (sum of sigs of all entities)
        public double luceneScore = 0.0; // (score from Lucene)
        public double geoTemporalDecay = 1.0; // (decay based on time and location and query params)
        public boolean bPromoted = false;
        public int nLuceneIndex = -1; // (index in the sorted lucene reply)
        public double manualWeighting = 1.0; // (source-specific weighting)

        // Deduplication-specific code ... create a simple linked list
        public int nTieBreaker; // ensures that elements will tend to get put in at the end of the list, which should improve performance
        public String url = null;
        public TempDocBucket dupList = null; // (linked list starting at the "master" document)
        public int nEntsInDoc = 0; // (performance shortcut for comparing 2 potentially duplicate docs)

        // Store explain object (rarely needed) so won't incur map cost across all docs
        public Object explain;

        // Deduplication and ordering:
        @Override
        public int compareTo(TempDocBucket rhs) {
            boolean bCloseEnoughToCompare = false;
            double diff = this.totalScore - rhs.totalScore;
            if (-1 != nLuceneIndex) { // ie sorting by date
                if (this.nEntsInDoc == rhs.nEntsInDoc) { // (don't bother comparing unless they have the same number of entities)
                    if (0 == this.nLeftToProcess) {
                        try {
                            this.nLeftToProcess = ((Date) dbo.get(DocumentPojo.publishedDate_)).getTime() / 600000; // (down to 10 minutes==10*60*1000)
                        }
                        catch (Exception e) {
                            this.nLeftToProcess = -1; // no date. don't try again
                        }
                    }
                    if (0 == rhs.nLeftToProcess) {
                        try {
                            rhs.nLeftToProcess = ((Date) rhs.dbo.get(DocumentPojo.publishedDate_)).getTime() / 600000; // (down to 10 minutes==10*60*1000)
                        }
                        catch (Exception e) {
                            rhs.nLeftToProcess = -1; // no date. don't try again
                        }
                    }
                    if (rhs.nLeftToProcess == this.nLeftToProcess) { // (this now contains the pub date in 10-minute units)
                        bCloseEnoughToCompare = true;
                    }
                }
            }
            else { // normal score based sorting:
                bCloseEnoughToCompare = (Math.abs(diff) <= 1.0) && (this.nEntsInDoc == rhs.nEntsInDoc);
            }
            //TESTED (both sort types - by date and by score)

            if (bCloseEnoughToCompare) {
                // Get the url (hash code since that will then get saved) and check that
                if (null == this.url) {
                    this.url = dbo.getString(DocumentPojo.url_);
                }
                if (null == rhs.url) {
                    rhs.url = rhs.dbo.getString(DocumentPojo.url_);
                }
                if (ScoringUtils_MultiCommunity.community_areDuplicates(this, rhs)) {
                    this.dupList = rhs.dupList;
                    rhs.dupList = this; // (add to very simple linked list)
                    return 0;
                }
                else if (0.0 == diff) {
                    return this.nTieBreaker - rhs.nTieBreaker;
                }
                else return Double.compare(this.totalScore, rhs.totalScore);
            }
            else if (0.0 == diff) {
                return this.nTieBreaker - rhs.nTieBreaker;
            }
            else return Double.compare(this.totalScore, rhs.totalScore);
        }//TESTED (see TestCode#1)
    };

    static class TempEntityInDocBucket {
        public double freq = 0.0; // (freq of entity in document, double for MongoDB reasons)
        public BasicDBObject dbo; // (entity object from Mongo)
        public TempDocBucket doc; // (parent document)
    };

    static class EntSigHolder implements Comparable<EntSigHolder> {
        EntSigHolder(String index, long nTotalDocCount, ScoringUtils_MultiCommunity multiCommunityHandler) {
            this.index = index; // (used for aliasing only)
            this.nTotalDocCount = nTotalDocCount;
            if (null != multiCommunityHandler) {
                multiCommunityHandler.initializeEntity(this);
            }
        }
        public String index = null; // (only used for aliasing) // (ALSO USED FOR ALIASES)
        public long nDocCountInQuerySubset = 0; // total number of matching docs in retrieved data
        public double datasetSignificance = 0.0; // calculated weighted avg of doc significances (ie TF*standalone)
        public double standaloneSignificance = 0.0; // the IDF term of the significance
        public double queryCoverage = 0.0; // the % of documents in the query subset in which the entity occurs
        public double avgFreqOverQuerySubset = 0.0; // the average freq over all documents (not just those in which the entity occurs)
        // (ALSO USED FOR ALIASES) (ALL FIVE)

        //Totals - since don't have "ent" any more
        public long nTotalDocCount = 0; // document count in population // (ALSO USED FOR ALIASES)

        // To approximate avg significance:
        public double decayedDocCountInQuerySubset = 0.0; // sigma(doc-count-in-query-subset * geo-temporal decay) // (ALSO USED FOR ALIASES)

        // Some more attempts to avoid going through the DB cursor more than once
        List<TempEntityInDocBucket> entityInstances = new LinkedList<TempEntityInDocBucket>();

        // For entity aggregation:
        public BasicDBObject unusedDbo = null;
        public double maxDocSig = 0.0; // (ALSO USED FOR ALIASES) (BOTH)
        public double maxFreq = 0.0;
        public long nTotalSentimentValues = 0;
        public double positiveSentiment = 0.0;
        public double negativeSentiment = 0.0; // (ALSO USED FOR ALIASES) (ALL THREE)

        @Override
        public int compareTo(EntSigHolder rhs) {
            return Double.compare(datasetSignificance, rhs.datasetSignificance);
        }

        // New code to handle significance approximation for pan-community queries
        // (see "additional functionality #1)
        Community_EntityExtensions community;

        // For aliasing:
        public EntSigHolder masterAliasSH = null;
        public EntityFeaturePojo aliasInfo = null; // (ALSO USED FOR ALIASES) (BOTH)

        // For low accuracy geo
        public BasicDBObject geotaggedEntity = null; // (store the entire ent object so we don't need to pay the deser cost unless it's promoted...)
        public BasicDBObject geotag = null; // (need both of these for onto type + geotag)
    };

    // Top level state ("s0" for "stage 0")
    // (Some processing controls)
    long _s0_nQuerySetDocCount; // (however many were actually found in the Lucene indexes, NOTE not how many are retrieved from DB)
    int _s0_nQuerySubsetDocCount; // (eg 1000 docus, user limit - ie how many are retrieved from DB)
    boolean _s0_bNeedToCalcSig; // (whether this function needs to calc sig - eg if only being used for standalone events)
    double _s0_globalDocCount;
    double _s0_maxLuceneScoreInv; // (unused)
    double _s0_avgLuceneScoreInv; // (used for adjust aggregates' statistics)
    double _s0_avgLuceneScore;

    // (Some output controls)
    boolean _s0_sortingByDate = false;
    int _s0_nNumEntsReturn;
    boolean _s0_bNonGeoEnts;
    boolean _s0_bGeoEnts;
    boolean _s0_bEvents;
    boolean _s0_bFacts;
    boolean _s0_bSummaries;
    boolean _s0_bMetadata;

    // Type/Verb filtering:
    HashSet<String> _s0_entityTypeFilter = null;
    boolean _s0_bEntityTypeFilterPositive = true;
    HashSet<String> _s0_assocVerbFilter = null;
    boolean _s0_bAssocVerbFilterPositive = true;

    ScoringUtils_MultiCommunity _s0_multiCommunityHandler = null; // (handles approximating significance from multiple communities with various overlaps)
    StandaloneEventHashAggregator _s0_standaloneEventAggregator = null; // (handles event scoring)
    StandaloneEventHashAggregator _s0_lowAccuracyAssociationAggregator_events = null;
    StandaloneEventHashAggregator _s0_lowAccuracyAssociationAggregator_facts = null;
    // (workarounds for clusters where the Lucene indexes are too large to do faceting)

    // TF-params: original suggested values are (0.5, 1.5)
    private static final double TF_PARAM1 = 0.5;
    // I think this ranks docs with many entities up too high:
    // (FYI with (0.5,1.5): for freq==1, doc length==average, then tf term=0.333 (f==2=>0.5); doc length==av*2 => tf=0.222, (f==2=>0.364))
    //private static final double TF_PARAM2 = 1.5;
    // The following value has the property that there's a break-even point at ~3x the average number of entities
    private static final double TF_PARAM2 = 5.5;

    // Some support for low accuracy geo:
    LinkedList<EntSigHolder>[] _s3_geoBuckets = null;
    boolean _s3_bLowAccuracyGeo = false;
    boolean _s3_bExtraAliasGeo = false;
    private static final int _s3_nGEO_BUCKETS = 100;
    private static final int _s3_nGEO_BUCKETS_1 = 99;
    private static final double _s3_dGEO_BUCKETS = 100.0;
    double _s2_maxGeoQueryCoverage = 0.0;

    public void clearAsMuchMemoryAsPossible() {
        _s0_entityTypeFilter = null;
        _s0_assocVerbFilter = null;
        _s0_multiCommunityHandler = null;
        _s0_standaloneEventAggregator = null;
        _s0_lowAccuracyAssociationAggregator_events = null;
        _s0_lowAccuracyAssociationAggregator_facts = null;
        _s3_geoBuckets = null;
        _s1_dManualGeoDecay_latLonInvdecay = null;
        // Need this: _s1_entitiesInDataset
        _s1_noEntityBuckets = null;
        _s1_aliasSummary = null;
        _s3_pqDocs = null;
        _s3_pqEnt = null;
    }

    // Top level logic

    @SuppressWarnings("unchecked")
    public List<BasicDBObject> calcTFIDFAndFilter(DBCollection docsDb, DBCursor docs,
            AdvancedQueryPojo.QueryScorePojo scoreParams, AdvancedQueryPojo.QueryOutputPojo outParams,
            StatisticsPojo scores, boolean bLowAccuracyDecay, long nStart, long nToClientLimit,
            String[] communityIds, String[] entityTypeFilterStrings, String[] assocVerbFilterStrings,
            LinkedList<BasicDBObject> standaloneEventsReturn,
            LinkedList<BasicDBObject> lowAccuracyAggregatedEnts,
            AggregationUtils.GeoContainer lowAccuracyAggregatedGeo,
            AggregationUtils.GeoContainer extraAliasAggregatedGeo,
            LinkedList<BasicDBObject>
lowAccuracyAggregatedEvents, LinkedList<BasicDBObject> lowAccuracyAggregatedFacts) { _s0_multiCommunityHandler = new ScoringUtils_MultiCommunity(communityIds); _s0_avgLuceneScore = scores.avgScore; _s0_avgLuceneScoreInv = 1.0 / (scores.avgScore + 0.01); // (+0.01 for safety in case avgScore is small) _s0_maxLuceneScoreInv = 1.0 / (scores.maxScore + 0.01); // Utility classes // Quick check - do I need to be here at all? LinkedList<BasicDBObject> returnList = new LinkedList<BasicDBObject>(); _s0_bNeedToCalcSig = (null != lowAccuracyAggregatedEnts) || (null != lowAccuracyAggregatedEvents) || (null != lowAccuracyAggregatedFacts) || (null != lowAccuracyAggregatedGeo) || ((nToClientLimit > 0) && outParams.docs.enable); if (!_s0_bNeedToCalcSig && (null == standaloneEventsReturn)) { return returnList; } //TESTED else if (!_s0_bNeedToCalcSig) { // (ie and want standaloneEventsReturn) if (scoreParams.sigWeight > 0.0) { // (reverse the call, we want sig for the standalone events) _s0_bNeedToCalcSig = true; nToClientLimit = 0; // (ensure no docs get accidentally output) } } //TESTED // Various configuration and state variables // Entity aggregation code: _s0_nNumEntsReturn = 0; if (null != lowAccuracyAggregatedEnts) { _s0_nNumEntsReturn = outParams.aggregation.entsNumReturn; } _s1_entitiesInDataset = new HashMap<String, EntSigHolder>(); _s1_noEntityBuckets = new ArrayList<TempDocBucket>(); // (User output options) _s0_bNonGeoEnts = true; _s0_bGeoEnts = true; _s0_bEvents = true; _s0_bFacts = true; _s0_bSummaries = true; _s0_bMetadata = true; if (null != outParams.docs) { if ((null != outParams.docs.metadata) && !outParams.docs.metadata) { _s0_bMetadata = false; } if ((null != outParams.docs.ents) && !outParams.docs.ents) { _s0_bNonGeoEnts = false; _s0_bGeoEnts = false; // (but can be overridden below) } if ((null != outParams.docs.geo) && !outParams.docs.geo) { _s0_bGeoEnts = false; } else if ((null != outParams.docs.geo) && outParams.docs.geo) { _s0_bGeoEnts = true; } if ((null != outParams.docs.events) && !outParams.docs.events) { _s0_bEvents = false; } if ((null != outParams.docs.facts) && !outParams.docs.facts) { _s0_bFacts = false; } if ((null != outParams.docs.summaries) && !outParams.docs.summaries) { _s0_bSummaries = false; } } //TESTED if (null != entityTypeFilterStrings) { if ('-' == entityTypeFilterStrings[0].charAt(0)) { _s0_bEntityTypeFilterPositive = false; } //TESTED (in both entities and associations) _s0_entityTypeFilter = new HashSet<String>(); for (String entityType : entityTypeFilterStrings) { if (!_s0_bEntityTypeFilterPositive && ('-' == entityType.charAt(0))) { entityType = entityType.substring(1); } _s0_entityTypeFilter.add(entityType.toLowerCase()); } } if (_s0_bEvents || _s0_bFacts || _s0_bSummaries || (null != standaloneEventsReturn)) { // (ie most of the time!) 
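    // (The verb filter below uses the same convention as the entity type filter above: a leading '-' on the
    //  first element marks the whole list as a negative/exclusion filter, and the '-' prefix is then stripped
    //  from each term before it is added to the set - note entity types are also lower-cased, verbs are not)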
if (null != assocVerbFilterStrings) { if ('-' == assocVerbFilterStrings[0].charAt(0)) { _s0_bAssocVerbFilterPositive = false; } //TESTED _s0_assocVerbFilter = new HashSet<String>(); for (String assocVerb : assocVerbFilterStrings) { if (!_s0_bAssocVerbFilterPositive && ('-' == assocVerb.charAt(0))) { assocVerb = assocVerb.substring(1); } _s0_assocVerbFilter.add(assocVerb); } } } //TESTED if ((scoreParams.relWeight == 0.0) && (scoreParams.sigWeight == 0.0)) { _s0_sortingByDate = true; } // First loop: just count and store if ((null != standaloneEventsReturn) && (null != outParams.docs) && (null != outParams.docs.numEventsTimelineReturn) && (outParams.docs.numEventsTimelineReturn > 0)) { _s0_standaloneEventAggregator = new StandaloneEventHashAggregator(standaloneEventsReturn, false, _s1_aliasLookup); } if ((null != lowAccuracyAggregatedEvents) && (null != outParams.aggregation) && (null != outParams.aggregation.eventsNumReturn) && (outParams.aggregation.eventsNumReturn > 0)) { _s0_lowAccuracyAssociationAggregator_events = new StandaloneEventHashAggregator( lowAccuracyAggregatedEvents, true, _s1_aliasLookup); } if ((null != lowAccuracyAggregatedFacts) && (null != outParams.aggregation) && (null != outParams.aggregation.factsNumReturn) && (outParams.aggregation.factsNumReturn > 0)) { _s0_lowAccuracyAssociationAggregator_facts = new StandaloneEventHashAggregator( lowAccuracyAggregatedFacts, true, _s1_aliasLookup); } if ((null != lowAccuracyAggregatedGeo) && (null != outParams.aggregation) && (null != outParams.aggregation.geoNumReturn) && (outParams.aggregation.geoNumReturn > 0)) { // Initialize the buckets _s3_geoBuckets = (LinkedList<EntSigHolder>[]) new LinkedList[_s3_nGEO_BUCKETS]; _s3_bLowAccuracyGeo = true; } if ((null != extraAliasAggregatedGeo) && (null != outParams.aggregation) && (null != outParams.aggregation.geoNumReturn) && (outParams.aggregation.geoNumReturn > 0)) { _s3_bExtraAliasGeo = true; // (don't initialize _s3_geoBuckets until we have to) } if (bLowAccuracyDecay) { _s1_dManualGeoDecay_latLonInvdecay = QueryHandler.parseGeoDecay(scoreParams); } //TESTED _s0_nQuerySubsetDocCount = docs.size(); // eg (1000 docus, user limit) _s0_nQuerySetDocCount = scores.found; // however many were actually found //lookup the totaldoc count _s0_globalDocCount = 0; long nGlobalDocCount = 0; try { nGlobalDocCount = getDocCount(_s0_multiCommunityHandler.getCommunityIds()); } catch (Exception e) { // If an exception occurs log the error logger.error("Exception Message: " + e.getMessage(), e); } // (End doccount) if (_s0_nQuerySetDocCount > nGlobalDocCount) { nGlobalDocCount = _s0_nQuerySetDocCount; // (This can happen if the source doc counts get out of sync... // ... 
conversely if the index/db get out of sync, the other way round can be correct, but this way is safer) } _s0_globalDocCount = (double) nGlobalDocCount; stage1_initialCountingLoop(docs, scoreParams, (int) nToClientLimit, scores, standaloneEventsReturn, communityIds.length); //Exit if not generating documents or entity aggregations: if (!_s0_bNeedToCalcSig) { return returnList; } //TESTED // Histogram time: this.stage2_generateFreqHistogramCalcIDFs(); // Next stop: loop over the entities and calculate the IDF terms this.stage3_calculateTFTerms(scoreParams, scores, nStart + nToClientLimit); // (get extra docs to handle deduplication) // Finally, write all the information to the surviving 100 (or whatever) documents // Handle skipping past the end: if ((nStart + nToClientLimit) > _s3_pqDocs.size()) { nToClientLimit = _s3_pqDocs.size() - nStart; if (nToClientLimit < 0) { nToClientLimit = 0; } } this.stage4_prepareDocsForOutput(scoreParams, scores, nToClientLimit, returnList); // And then same for entities this.stage4_prepareEntsForOutput(lowAccuracyAggregatedEnts); //Association is mostly done on the fly, but a final tidy up: if (null != standaloneEventsReturn) { ScoringUtils_Associations.finalizeStandaloneEvents(standaloneEventsReturn, _s0_standaloneEventAggregator, outParams.docs.numEventsTimelineReturn); } if (null != _s0_lowAccuracyAssociationAggregator_events) { ScoringUtils_Associations.finalizeStandaloneEvents(lowAccuracyAggregatedEvents, _s0_lowAccuracyAssociationAggregator_events, outParams.aggregation.eventsNumReturn); } if (null != _s0_lowAccuracyAssociationAggregator_facts) { ScoringUtils_Associations.finalizeStandaloneEvents(lowAccuracyAggregatedFacts, _s0_lowAccuracyAssociationAggregator_facts, outParams.aggregation.factsNumReturn); } // Geo is mostly done on the fly, but a final tidy up: if (null != lowAccuracyAggregatedGeo) { finalizeLowAccuracyGeoAggregation(lowAccuracyAggregatedGeo, outParams.aggregation.geoNumReturn); // (outParams.aggregation.geoNumReturn must exist if (null != lowAccuracyAggregatedGeo)) } else if ((null != extraAliasAggregatedGeo) && (null != _s3_geoBuckets)) { finalizeLowAccuracyGeoAggregation(extraAliasAggregatedGeo, Long.MAX_VALUE); //(at most 1 per alias so size shouldn't be an issue) } return returnList; } ///////////////////////////////////////////////////////////// // (Top level logic - entities) // Interface to allow other internal services to take advantage of all the work that's gone into this: public boolean fillInEntityStatistics(EntityPojo ent) { EntSigHolder entStat = _s1_entitiesInDataset.get(ent.getIndex()); if (null == entStat) { return false; } else { ent.setDoccount(entStat.nTotalDocCount); ent.setTotalfrequency(entStat.nTotalDocCount); // (don't seem to have that) ent.setDatasetSignificance(entStat.datasetSignificance); ent.setSignificance(entStat.datasetSignificance); // (doc significance isn't normalized correctly in most cases) ent.setQueryCoverage(entStat.queryCoverage); return true; } }//TESTED ///////////////////////////////////////////////////////////// // (Top level logic - associations) public boolean calcAssocationSignificance(String ent1_index, String ent2_index, String geo_index, BasicDBObject assoc) { if ((null == _s1_entitiesInDataset) || _s1_entitiesInDataset.isEmpty()) { return false; } else { ScoringUtils_Associations.calcAssocationSignificance(ent1_index, ent2_index, geo_index, assoc, _s1_entitiesInDataset); } return true; } 
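    // The stages below implement the following pipeline (driven from calcTFIDFAndFilter above):
    //   stage1_initialCountingLoop           - single pass over the MongoDB cursor: per-entity doc counts,
    //                                          frequencies, sentiment, alias handling, optional low-accuracy geo/decay
    //   stage2_generateFreqHistogramCalcIDFs - histogram of per-entity doc counts (for false-positive suppression)
    //                                          plus the per-entity IDF / "standalone significance" term
    //   stage3_calculateTFTerms              - per-instance TF terms, document and entity priority queues
    //   stage4_prepareDocsForOutput /
    //   stage4_prepareEntsForOutput          - format the surviving documents/entities for the API response
    //
    // Illustrative call sequence (a sketch only - the variable names and parameter values here are examples,
    // not taken from this file):
    //   ScoringUtils su = new ScoringUtils();
    //   su.setAliasLookupTable(aliasTable); // optional
    //   List<BasicDBObject> docsOut = su.calcTFIDFAndFilter(docsDb, docsCursor, scoreParams, outParams, stats,
    //       false /*bLowAccuracyDecay*/, 0 /*nStart*/, 100 /*nToClientLimit*/, communityIdStrs,
    //       null /*entityTypeFilterStrings*/, null /*assocVerbFilterStrings*/,
    //       standaloneEvents, aggEnts, aggGeo, null /*extraAliasAggregatedGeo*/, aggEvents, aggFacts);
    //   su.clearAsMuchMemoryAsPossible();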
///////////////////////////////////////////////////////////////////////////////////////////////////////////// // SUB-FUNCTIONS ///////////////////////////////////////////////////////////// // 1] stage1_initialCountingLoop() // Loops over the data a first time and generates basic statistics required by the more complex // functionality that follow // Input: double _s1_dManualGeoDecay_latLonInvdecay[] = null; // (this is needed if internal Lucene geo decay is turned off for performance reasons) // Output: double _s1_sumFreqInQuerySubset = 0; // (the sum of all the frequencies in the received matching (sub-)dataset) HashMap<String, EntSigHolder> _s1_entitiesInDataset; // (map of entities to various stats) ArrayList<TempDocBucket> _s1_noEntityBuckets; // (docs with no entities) HashMap<String, EntSigHolder> _s1_aliasSummary = null; // (for aggregating entities by their alias) // Logic: @SuppressWarnings("unchecked") private void stage1_initialCountingLoop(DBCursor docs, AdvancedQueryPojo.QueryScorePojo scoreParams, int toReturn, StatisticsPojo scores, LinkedList<BasicDBObject> standaloneEventsReturn, int nCommunities) { double s0_nQuerySubsetDocCountInv = 1.0 / (double) _s0_nQuerySubsetDocCount; // Some memory management: DBCollection dbc = MongoDbManager.getDocument().getMetadata(); DBDecoderFactory defaultDecoder = dbc.getDBDecoderFactory(); try { SizeReportingBasicBSONDecoder sizeReportingDecoder = new SizeReportingBasicBSONDecoder(); dbc.setDBDecoderFactory(sizeReportingDecoder); long currMemUsage = 0; int ndocs = 0; long lastBatch = 0L; long initialUnusedMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory(); long initialFreeMemory = Runtime.getRuntime().freeMemory(); for (DBObject f0 : docs) { BasicDBObject f = (BasicDBObject) f0; long newMemUsage = sizeReportingDecoder.getSize(); if ((newMemUsage - currMemUsage) > 0) { // check every batch long now = new Date().getTime(); //DEBUG //logger.warn(ndocs + " : " + (now - lastBatch) + " : " + newMemUsage + " VS " + Runtime.getRuntime().maxMemory() + " UNUSED " + (Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory()) + " FREE " + Runtime.getRuntime().freeMemory()); // Check vs total memory: long runtimeMem = Runtime.getRuntime().maxMemory(); // note newMemUsage is the input memory ... gets expanded ~6x by the BSON-ification, allowed at most 1/4rd of memory... // Also if we're taking more than 20s for a batch then limp over the limit and exit... 
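    // (i.e. the raw BSON read so far, times the ~6x BSON-ification expansion noted above, must stay under
    //  ~1/4 of the JVM max heap: 6 * 4 gives the factor of 24 in the test below; the second clause gives up
    //  if a batch has taken more than 20s once at least 'toReturn' documents have been read)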
if (((newMemUsage * 24) > runtimeMem) || (((now - lastBatch) > 20000L) && (ndocs >= toReturn))) { long finalUnusedMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory(); long finalFreeMemory = Runtime.getRuntime().freeMemory(); logger.error("Query truncated memUsage=" + newMemUsage + ", memory=" + runtimeMem + ", docs=" + ndocs + ", totaldocs=" + scores.found + ", init_free_mem=" + initialFreeMemory + ", end_free_mem=" + finalFreeMemory + ", init_unused_mem=" + initialUnusedMemory + ", end_unused_mem=" + finalUnusedMemory); break; } //TESTED currMemUsage = newMemUsage; lastBatch = now; } //TESTED ndocs++; // Simple handling for standalone events if ((null != _s0_standaloneEventAggregator) && !_s0_bNeedToCalcSig) { //if _s0_bNeedToCalcSig then do this elsewhere ScoringUtils_Associations.addStandaloneEvents(f, 0.0, 0, _s0_standaloneEventAggregator, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts); } //TESTED if (!_s0_bNeedToCalcSig) { continue; } //TESTED if (nCommunities > 1) { // (could have pan-community entities) ObjectId communityId = (ObjectId) f.get(DocumentPojo.communityId_); if (null != communityId) { // (have big problems if so, but anyway!) int retval = _s0_multiCommunityHandler.community_getIdAndInitialize(communityId, _s1_entitiesInDataset); // (returns an int community id but also sets it into the cache, so just use that below) if (Integer.MIN_VALUE == retval) { //this document cannot be viewed from within this set of communities continue; } } } //TESTED TempDocBucket docBucket = new TempDocBucket(); docBucket.dbo = f; ObjectId id = (ObjectId) f.get(DocumentPojo._id_); // If we're going to weight relevance in, or we need the geo temporal decay: if ((0 != scoreParams.relWeight) || (null != scoreParams.timeProx) || (null != scoreParams.geoProx)) { StatisticsPojo.Score scoreObj = scores.getScore().get(id); if (null != scoreObj) { docBucket.explain = scoreObj.explain; // (will normally be null) docBucket.luceneScore = scoreObj.score; if ((null != scoreParams.timeProx) || (null != scoreParams.geoProx)) { if (scoreObj.decay >= 0.0) { docBucket.geoTemporalDecay = scoreObj.decay; } // (see also below for low accuracy geo scoring) } } else { docBucket.luceneScore = 1.0; } } //TESTED else if (this._s0_sortingByDate) { StatisticsPojo.Score scoreObj = scores.getScore().get(id); if (null != scoreObj) { docBucket.nLuceneIndex = scoreObj.nIndex; } } docBucket.manualWeighting = this.getManualScoreWeights(scoreParams, f); BasicDBList l = (BasicDBList) (f.get(DocumentPojo.entities_)); if (null != l) { long nEntsInDoc = l.size(); double dBestGeoScore = 0.0; // (for low accuracy geo only) for (Iterator<?> e0 = l.iterator(); e0.hasNext();) { BasicDBObject e = (BasicDBObject) e0.next(); BasicDBObject tmpGeotag = null; if (_s3_bLowAccuracyGeo || (null != _s1_dManualGeoDecay_latLonInvdecay)) { // low accuracy geo, need to look for geotag tmpGeotag = (BasicDBObject) e.get(EntityPojo.geotag_); } // Get attributes double freq = -1.0; long ntotaldoccount = -1; String entity_index; Double sentiment = null; try { sentiment = (Double) e.get(EntityPojo.sentiment_); ntotaldoccount = e.getLong(EntityPojo.doccount_); freq = e.getDouble(EntityPojo.frequency_); entity_index = e.getString(EntityPojo.index_); if (null == entity_index) { // Just bypass the entity e.put(EntityPojo.significance_, 0.0); nEntsInDoc--; continue; } } catch (Exception ex) { try { String sfreq; if (ntotaldoccount < 0) { 
sfreq = e.getString(EntityPojo.doccount_); ntotaldoccount = Long.valueOf(sfreq); } if (freq < -0.5) { sfreq = e.getString(EntityPojo.frequency_); freq = Long.valueOf(sfreq).doubleValue(); } entity_index = e.getString(EntityPojo.index_); if (null == entity_index) { // Just bypass the entity e.put(EntityPojo.significance_, 0.0); nEntsInDoc--; continue; } } catch (Exception e2) { // Just bypass the entity e.put(EntityPojo.significance_, 0.0); nEntsInDoc--; continue; } } //TESTED // First loop through is just counting // Retrieve entity (create/initialzie if necessary) EntSigHolder shp = _s1_entitiesInDataset.get(entity_index); if (null == shp) { if (ntotaldoccount > (long) _s0_globalDocCount) { // obviously can't have more entities-in-dos than docs... ntotaldoccount = (long) _s0_globalDocCount; } shp = new EntSigHolder(entity_index, ntotaldoccount, _s0_multiCommunityHandler); // Stage 1a alias handling: set up infrastructure, calculate doc overlap if (null != _s1_aliasLookup) { stage1_initAlias(shp); } if ((null != shp.aliasInfo) && (null == shp.masterAliasSH)) { // this is the discard alias nEntsInDoc--; continue; } //TESTED // Check if entity is in type filter list if (null != _s0_entityTypeFilter) { String entType = null; if (null != shp.aliasInfo) { entType = shp.aliasInfo.getType(); } else { entType = e.getString(EntityPojo.type_); } if (_s0_bEntityTypeFilterPositive) { if ((null != entType) && !_s0_entityTypeFilter.contains(entType.toLowerCase())) { nEntsInDoc--; continue; } } else if ((null != entType) && _s0_entityTypeFilter.contains(entType.toLowerCase())) { //(negative filter) nEntsInDoc--; continue; } } //TESTED (end entity filter) // Geo: if (null != shp.aliasInfo) { if (null != shp.aliasInfo.getGeotag()) { //Geo, overwrite/create tmpGeotag if (_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo || (null != _s1_dManualGeoDecay_latLonInvdecay)) { // Always capture alias geo, even if not in low accuracy mode because we add it to the // legitimate geo: if ((_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo) && (null == _s3_geoBuckets)) { // Initialize the buckets if this is for aggregation not just decay _s3_geoBuckets = (LinkedList<EntSigHolder>[]) new LinkedList[_s3_nGEO_BUCKETS]; } if (null == tmpGeotag) { tmpGeotag = new BasicDBObject(); } tmpGeotag.put(GeoPojo.lat_, shp.aliasInfo.getGeotag().lat); tmpGeotag.put(GeoPojo.lon_, shp.aliasInfo.getGeotag().lon); if (null != shp.aliasInfo.getOntology_type()) { e.put(EntityPojo.ontology_type_, shp.aliasInfo.getOntology_type()); } } } } //TESTED (end geo for aggregation or decay) _s1_entitiesInDataset.put(entity_index, shp); // end Stage 1a alias handling } //(end if is alias) // Stage 1b alias handling: calculate document counts (taking overlaps into account) if (null != shp.masterAliasSH) { // Counts: shp.masterAliasSH.nTotalDocCount++; // docs including overlaps shp.masterAliasSH.avgFreqOverQuerySubset += freq; // Keep track of overlaps: if (f != shp.masterAliasSH.unusedDbo) { shp.masterAliasSH.unusedDbo = f; // (note this is only used in stage 1, alias.unusedDbo is re-used differently in stage 3/4) shp.masterAliasSH.nDocCountInQuerySubset++; // non-overlapping docs ie < shp.nDocCountInQuerySubset } // Sentiment: shp.masterAliasSH.positiveSentiment += shp.positiveSentiment; shp.masterAliasSH.negativeSentiment += shp.negativeSentiment; if (null != sentiment) { shp.masterAliasSH.nTotalSentimentValues++; } } //TESTED (end if is alias) // end Stage 1b // Pan-community logic (this needs to be before the entity object is updated) if 
(_s0_multiCommunityHandler.isActive()) { _s0_multiCommunityHandler.community_updateCorrelations(shp, ntotaldoccount, entity_index); } else { // (Once we've started multi-community logic, this is no longer desirable) if ((ntotaldoccount > shp.nTotalDocCount) && (ntotaldoccount <= _s0_globalDocCount)) { shp.nTotalDocCount = ntotaldoccount; } //(note there used to be some cases where we adjusted for dc/tf==0, but the // underlying issue in the data model that caused this has been fixed, so it's // now a pathological case that can be ignored) } //(TESTED) // Update counts: _s1_sumFreqInQuerySubset += freq; shp.avgFreqOverQuerySubset += freq; shp.nDocCountInQuerySubset++; shp.decayedDocCountInQuerySubset += docBucket.geoTemporalDecay; // (note this doesn't handle low accuracy geo-decay ... we'll address that via a separate term) TempEntityInDocBucket entBucket = new TempEntityInDocBucket(); entBucket.dbo = e; entBucket.freq = freq; entBucket.doc = docBucket; shp.entityInstances.add(entBucket); if (null != tmpGeotag) { // (only needed for low accuracy geo aggregation) if ((_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo) && (null == shp.geotag)) { // (first time for shp only) shp.geotag = tmpGeotag; shp.geotaggedEntity = e; // (ie for onto type, which has been overwritten in the alias case...) } if (null != _s1_dManualGeoDecay_latLonInvdecay) { // Emulate scripted Lucene calculations double minlat = tmpGeotag.getDouble(GeoPojo.lat_); double minlon = tmpGeotag.getDouble(GeoPojo.lon_); double paramlat = _s1_dManualGeoDecay_latLonInvdecay[0]; double paramlon = _s1_dManualGeoDecay_latLonInvdecay[1]; double gdecay = _s1_dManualGeoDecay_latLonInvdecay[2]; char ontCode = GeoOntologyMapping .encodeOntologyCode(e.getString(EntityPojo.ontology_type_)); double dDecay = QueryDecayScript.getGeoDecay(minlat, minlon, paramlat, paramlon, gdecay, ontCode); if (dDecay > dBestGeoScore) { dBestGeoScore = dDecay; } } //TESTED } //(end if entity has geo and need to process entity geo) if (freq > shp.maxFreq) { shp.maxFreq = freq; } // Sentiment: if ((null != sentiment) && (Math.abs(sentiment) <= 1.1)) { // (actually 1.0) shp.nTotalSentimentValues++; if (sentiment > 0.0) { shp.positiveSentiment += sentiment; } else { shp.negativeSentiment += sentiment; } } else if (null != sentiment) { // corrupt sentiment for some reason?! e.put(EntityPojo.sentiment_, null); } docBucket.docLength += freq; } //(end loop over entities) docBucket.nLeftToProcess = nEntsInDoc; docBucket.nEntsInDoc = (int) nEntsInDoc; if (null != this._s1_dManualGeoDecay_latLonInvdecay) { // Low accuracy geo-calculations docBucket.geoTemporalDecay *= dBestGeoScore; docBucket.luceneScore *= dBestGeoScore; _s2_dAvgLowAccuracyGeoDecay += dBestGeoScore * s0_nQuerySubsetDocCountInv; } //TESTED } // (end if feed has entities) // Handle documents with no entities - can still promote them if (0 == docBucket.nLeftToProcess) { // (use this rather than doc length in case all the entities had freq 0) _s1_noEntityBuckets.add(docBucket); } } // (end loop over feeds) //TESTED } finally { dbc.setDBDecoderFactory(defaultDecoder); } } ///////////////////////////////////////////////////////////// // 2] stage2_generateFreqHistogramCalcIDFs() // Generates a histogram of entity frequencies that can be used to suppress the significance // of likely false positives // Then calculates the IDFs of each entity (including cross-community scoring adjustments if necessary) // Inputs double _s2_dAvgLowAccuracyGeoDecay = 0.0; // for low accuracy geo a further approximation... 
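    // (How the histogram is used: entities whose doc count within the query subset is at or below
    //  _s2_nMush1Index have their TF term scaled by 0.33 in stage 3, and those at or below _s2_nMush2Index
    //  by 0.66 - i.e. entities that appear in only a handful of matching docs are treated as likely
    //  extraction false positives and damped rather than discarded)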
// Outputs double _s2_dApproxAverageDocumentSig; // Approximate calculated here for convenience, used later on int _s2_nMush1Index; // 33% significance frequency (very likely to be false positive) int _s2_nMush2Index; // 66% significance frequency (quite likely to be false positive) // Logic private void stage2_generateFreqHistogramCalcIDFs() { final int nMaxHistBins = 25; long nCountHistogram[] = new long[nMaxHistBins]; // Prep histogram int nHistBins = 1 + (int) (_s0_nQuerySubsetDocCount / 50); // (eg 21 bins for 1000 documents) if (nHistBins > nMaxHistBins) { nHistBins = nMaxHistBins; } //TESTED // (Sadly requires 1 spurious loop over the entities, shouldn't add too much extra) // Will take the opportunity to calculate the standalone entity significances here // OK looking at IDF equations below, the significance's maximum value is (entity appears only in query set) // log(doccount*nQuerySubsetDocCount/0.25) ... so we'll scale that to be 100% double dScaleFactor = 100.0 / Math.log10((_s0_globalDocCount * _s0_nQuerySetDocCount + 0.5) / 0.25); // (note this isn't quite right anymore because of the adjustments performed below, but it does a reasonable // job and the actual value is now very complicated...) double dHalfScaleFactor2 = 0.5 * ((0.5 + (double) _s0_nQuerySetDocCount) / (0.5 + _s0_globalDocCount)); // Pre-calculated scalors to use in query coverage double halfQueryDocSubsetInv = 0.5 / (0.5 + _s0_nQuerySubsetDocCount); // (case 2.1 below - needs multipled by the entity's query count) double halfGlobalDocCountInv = 0.5 / (0.5 + _s0_globalDocCount); // (case 2.2 below - needs multipled by the entity's total count) _s2_dApproxAverageDocumentSig = 0.0; // (used to normalize vs the relevance) // Some TF-related numbers // (no longer needed since we calculate the average TF based on an average entity count, for performance reasons) //double invAvgLength = ((double)_s0_nQuerySubsetDocCount/(_s1_sumFreqInQuerySubset + 0.01)); // Pre-calculate a few dividors used in the loop below: double s0_nQuerySubsetDocCountInv = 1.0 / (double) _s0_nQuerySubsetDocCount; double s0_nQuerySetDocCountInv = 1.0 / (double) _s0_nQuerySetDocCount; for (EntSigHolder shp : _s1_entitiesInDataset.values()) { double avgFreqPerEntity = shp.avgFreqOverQuerySubset / shp.nDocCountInQuerySubset; // (do this here because can overwrite shp.nDocCountInQuerySubset further below, losing direct link with shp.avgFreq) if (shp.nDocCountInQuerySubset < nHistBins) { nCountHistogram[(int) shp.nDocCountInQuerySubset]++; } //(Robustness) if (shp.nTotalDocCount < shp.nDocCountInQuerySubset) { shp.nTotalDocCount = shp.nDocCountInQuerySubset; } if (_s0_nQuerySubsetDocCount < shp.nDocCountInQuerySubset) { shp.nDocCountInQuerySubset = _s0_nQuerySubsetDocCount; } // Transform from a ratio involving nQuery*Subset*DocCount to a ratio of nQuery*Set*DocCount double estEntityDocCountInQuery = (double) shp.nDocCountInQuerySubset; // (case 1 below) // Cases // 1] if "shp.nTotalDocCount <= shp.nDocCountInQuerySubset" OR "shp.nTotalDocCount == shp.nDocCountInQuerySubset" // then know that all instances were in nQuery*Set*DocCount (else the available entities is the smaller of the 2 diffs, see below) // 2] Otherwise we don't know, maybe we can guess: // 2.1] If the subset-ratio is correct then it will be // MIN[nQuerySetDocCount*(shp.nDocCountInQuerySubset/nQuerySubsetDocCount),nDocCountDiff] + shp.nDocCountInQuerySubset // 2.2] If it's actually randomly distributed then it will be // (nQuerySetDocCount/globalDocCount)*nDocCountDiff + 
shp.nDocCountInQuerySubset // So we'll average the 2 and call it a day if ((shp.nTotalDocCount > shp.nDocCountInQuerySubset) && (_s0_nQuerySetDocCount > _s0_nQuerySubsetDocCount)) { double docCountDiff = (double) (_s0_nQuerySetDocCount - _s0_nQuerySubsetDocCount); docCountDiff = Math.min(docCountDiff, (double) (shp.nTotalDocCount - shp.nDocCountInQuerySubset)); // ie there are 2 differences: the number of available entities in the total doc count // the number of available documents in the un-queried dataset estEntityDocCountInQuery += halfQueryDocSubsetInv * shp.nDocCountInQuerySubset * docCountDiff; estEntityDocCountInQuery += halfGlobalDocCountInv * shp.nTotalDocCount * docCountDiff; } //TESTED // IDF component of entity double adjustEntTotalDocCount = shp.nTotalDocCount + _s0_multiCommunityHandler.community_estimateAnyMissingDocCounts(shp); shp.standaloneSignificance = dScaleFactor * Math.log10( ((estEntityDocCountInQuery + 0.5) / (_s0_nQuerySetDocCount - estEntityDocCountInQuery + 0.5)) / ((adjustEntTotalDocCount - estEntityDocCountInQuery + 0.5) / ((_s0_globalDocCount - _s0_nQuerySetDocCount) - (adjustEntTotalDocCount - estEntityDocCountInQuery) + 0.5))); if ((shp.standaloneSignificance <= 0.0) || (Double.isInfinite(shp.standaloneSignificance)) || Double.isNaN(shp.standaloneSignificance)) { // Probably matches on the entire index or something like that, use a diff equation: // (basically ignore the denominator...) if ((2.0 * _s0_nQuerySetDocCount) >= (_s0_globalDocCount)) { // (to within 33% ... after that we'll start to trust it) final double dBackupScalingFactor = 200.0 / Math.log10(2);//200 vs 100 to counteract use of dHalfScaleFactor2 // Use dHalfScaleFactor2 (see case 2.2)==0.5*((0.5 + (double)_s0_nQuerySetDocCount)/(0.5 + _s0_globalDocCount)) // basically to suppress any non-matching records that (almost certainly) don't contain the entity shp.standaloneSignificance = dHalfScaleFactor2 * dBackupScalingFactor * Math.log10( (_s0_globalDocCount + shp.nDocCountInQuerySubset + 0.5) / (_s0_globalDocCount + 0.5)); // (note if (shp.nDocCountInQuerySubset==_s0_nQuerySetDocCount) then this==100% because of defn of dBackupScalingFactor) if ((shp.standaloneSignificance < 0.0) || (Double.isInfinite(shp.standaloneSignificance)) || Double.isNaN(shp.standaloneSignificance)) // (cleanup) { shp.standaloneSignificance = 0.0; } } else { shp.standaloneSignificance = 0.0; } } //TESTED (vs entire dataset) // Use an "estimated query coverage" (instead of the exact one over the subset) shp.queryCoverage = (100.0 * (estEntityDocCountInQuery * s0_nQuerySetDocCountInv)); shp.avgFreqOverQuerySubset *= s0_nQuerySubsetDocCountInv; if (null != shp.geotag) { // (only happens for low accuracy geo aggregation) if (shp.queryCoverage > _s2_maxGeoQueryCoverage) { _s2_maxGeoQueryCoverage = shp.queryCoverage; } } double dApproxAvgTfTerm = avgFreqPerEntity / (avgFreqPerEntity + TF_PARAM1 + TF_PARAM2); // (An approximation for the TF for this entity - assume on average that the entity occurs in docs // with an average doc length, to avoid an extra loop here or in S1 to calc "avg doc length for docs containing entity) // (We're summing this across all entities anyway, so it's not like it would be a particularly accurate number anyway...) 
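    // (for reference: with the average-doc-length assumption the term above reduces to
    //      tf ~= avgFreq / (avgFreq + TF_PARAM1 + TF_PARAM2)
    //  whereas stage 3 computes the full per-instance form,
    //      tf = freq / (freq + TF_PARAM1 + TF_PARAM2 * docLength / avgDocLength) )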
if (_s2_dAvgLowAccuracyGeoDecay > 0.0) { // Take into account average low accuracy geo-decay across the entire dataset dApproxAvgTfTerm *= _s2_dAvgLowAccuracyGeoDecay; } _s2_dApproxAverageDocumentSig += shp.decayedDocCountInQuerySubset * dApproxAvgTfTerm * shp.standaloneSignificance; // (ie an approximation to sum(TF-IDF) across docs // Stage 2 alias processing: calc pythag significance, store first/last values ready for S3 if (null != shp.masterAliasSH) { if (null == shp.masterAliasSH.index) { shp.masterAliasSH.index = shp.index; // (used so I know I'm the first alias in the global list) shp.masterAliasSH.avgFreqOverQuerySubset *= s0_nQuerySubsetDocCountInv; // (can't do query coverage yet, we're still summing over the adjusted total doc counts) // pre-calculate and store an overlap scalor to apply to query coverage shp.masterAliasSH.decayedDocCountInQuerySubset = (double) shp.masterAliasSH.nDocCountInQuerySubset / (double) shp.masterAliasSH.nTotalDocCount; } //TESTED shp.masterAliasSH.queryCoverage += shp.queryCoverage * shp.masterAliasSH.decayedDocCountInQuerySubset; // (my not-very-good estimate sort-of-adjusted for overlap) shp.masterAliasSH.standaloneSignificance += shp.standaloneSignificance * shp.standaloneSignificance; // (combine using pythag, like I do elsewhere for an easy approximation) shp.masterAliasSH.masterAliasSH = shp; // (used so I know I'm the last alias in the global list) } //TESTED // end stage 2 alias processing } //(end stage 2 loop over entities) //TESTED (by eye for a 114 document query and a 646 document query) _s2_dApproxAverageDocumentSig *= s0_nQuerySubsetDocCountInv; // Intention is now to do some false positive reduction double peak = 0.0; _s2_nMush1Index = nHistBins; // 33% significance _s2_nMush2Index = nHistBins; // 66% significance double lastval = -1.0; for (int i = 1; i < nHistBins; ++i) { double val = (double) nCountHistogram[i]; if (val > peak) { peak = val; } else { if (lastval >= 0.0) { // ie have got the 5% mark, now look for noise floor if (val >= (lastval - 1.5)) { // noise floor! 
_s2_nMush2Index = i; break; // (nothing left to do) } lastval = val; } else if (val < 0.05 * peak) { //5% _s2_nMush1Index = i; lastval = val; } } } // (end loop over histobins) //TESTED } ///////////////////////////////////////////////////////////// // 3] stage3_calculateTFTerms() // Calculate the entities' and documents' TF-IDF scores (already calculated IDF in stage2) // Output // For these 2: lower order (ie significance) puts you at the front of the Q java.util.TreeSet<TempDocBucket> _s3_pqDocs; // (doc queue for output - use a TreeSet + custom separator to do deduplication at the same time) java.util.PriorityQueue<EntSigHolder> _s3_pqEnt; // (entity queue for output, dedup not an issue for entities) double _s3_dLuceneScalingFactor; // How to weight relevance (using scoreParams config) double _s3_dSigScalingFactor; // How to weight significance (using scoreParams config) double _s3_dScoreScalingFactor; // How to weight total score (using scoreParams config) // Logic private void stage3_calculateTFTerms(AdvancedQueryPojo.QueryScorePojo scoreParams, StatisticsPojo scores, long nToClientLimit) { // First off: we have an approximate average significance, we're going to create a scaling factor for // relevance to fit in with the input parameters // Average doc score will be 100 _s3_pqDocs = new java.util.TreeSet<TempDocBucket>(); _s3_pqEnt = null; if (_s0_nNumEntsReturn > 0) { _s3_pqEnt = new java.util.PriorityQueue<EntSigHolder>(); } // Calculate scaling factors: _s3_dSigScalingFactor = 1.0; if (scoreParams.sigWeight != 0.0) { double d = (scoreParams.relWeight / scoreParams.sigWeight); _s3_dLuceneScalingFactor = (d * _s2_dApproxAverageDocumentSig) / (_s0_avgLuceneScore + 0.01); // (eg scale1*avQuery == (r/s)*avAggSig) _s3_dScoreScalingFactor = 100.0 / ((1.0 + d) * _s2_dApproxAverageDocumentSig); // ie scale2*(scale1*avQuery + avAggSig)==100.0 // Special case: all significances are 0: if (_s2_dApproxAverageDocumentSig == 0.0) { // just ignore significance _s3_dScoreScalingFactor = 100.0 / _s0_avgLuceneScore; _s3_dLuceneScalingFactor = 1.0; _s3_dSigScalingFactor = 0.0; } } else { // Ignore significance _s3_dLuceneScalingFactor = 1.0; _s3_dSigScalingFactor = 0.0; _s3_dScoreScalingFactor = 100.0 / _s0_avgLuceneScore; } //TESTED // (See wiki thoughts on not basing this on the query sub-set (eg 1000 is totally arbitrary) ... I like this current way) // Take set A == 1000 docs (ent hits = dc_in_sset), set B = #hits (ent hits = unknown), set C = total space (ent hits = dc) // If dc==dc_in_sset then *know* that ent hits in set B = dc, so you can divide by size of B // Where dc>dc_in_sset, you don't know how those remaining hits are partitioned between B and C // Use min(|B|*(dc_in_sset/|A|),dc) as one extreme, (dc-dc_in_sset)*|B|/|C| as other double invAvgLength = ((double) _s0_nQuerySubsetDocCount / (_s1_sumFreqInQuerySubset + 0.01)); int n1Down = 0; // (ensures where scores are equal documents are added last, should make a small difference to performance) for (EntSigHolder shp : _s1_entitiesInDataset.values()) { //(NOTE: important that we loop over this in the same order as we looped over it in stage 2) // Stage 3a alias processing: if (null != shp.masterAliasSH) { if (shp.index == shp.masterAliasSH.index) { // First instance of this alias set... 
shp.masterAliasSH.standaloneSignificance = Math.sqrt(shp.masterAliasSH.standaloneSignificance); // OK now all the stats are up-to-date } } //TESTED // end Stage 3a alias processing: //(IDF component calculated above) // Now calculate the term frequencies for (TempEntityInDocBucket entBucket : shp.entityInstances) { double tf_term = (entBucket.freq / (entBucket.freq + TF_PARAM1 + TF_PARAM2 * ((entBucket.doc.docLength + 0.01) * invAvgLength))); if (shp.nDocCountInQuerySubset <= _s2_nMush1Index) { tf_term *= 0.33; } else if (shp.nDocCountInQuerySubset <= _s2_nMush2Index) { tf_term *= 0.66; } double tf_idf_sig = tf_term * shp.standaloneSignificance * entBucket.doc.manualWeighting; //TESTED // Insert significance, unfortunately need to do this spuriously for low prio cases // (this could probably be accelerated by recalculating from the IDF and freq only for the top N docs, but empirically doesn't seem worth it) if (Double.isNaN(tf_idf_sig)) { entBucket.dbo.put(EntityPojo.significance_, 0.0); } else { entBucket.dbo.put(EntityPojo.significance_, tf_idf_sig); } if (tf_idf_sig > shp.maxDocSig) { shp.maxDocSig = tf_idf_sig; } entBucket.doc.aggSignificance += tf_idf_sig; // Now we're done incorporating the significance into the document, we're going // to adjust the standalone significance for the relevance of the document // (if enabled - either manually or if the query contains OR statements) if ((null != scoreParams.adjustAggregateSig) && scoreParams.adjustAggregateSig) { tf_idf_sig *= entBucket.doc.luceneScore * _s0_avgLuceneScoreInv; } //TESTED (doc scores stay the same, entity scores adjust) shp.datasetSignificance += tf_idf_sig / (double) shp.nDocCountInQuerySubset; // Stage 3b alias processing: update dataset significance if (null != shp.masterAliasSH) { double alias_tf_idf_sig = tf_term * shp.masterAliasSH.standaloneSignificance * entBucket.doc.manualWeighting; // (standaloneSig's calculation was finished at the start of this loop) // (adjust for relevance as above) if ((null != scoreParams.adjustAggregateSig) && scoreParams.adjustAggregateSig) { alias_tf_idf_sig *= entBucket.doc.luceneScore * _s0_avgLuceneScoreInv; } //TESTED if (alias_tf_idf_sig > shp.masterAliasSH.maxDocSig) { shp.masterAliasSH.maxDocSig = alias_tf_idf_sig; } shp.masterAliasSH.datasetSignificance += alias_tf_idf_sig / (double) shp.masterAliasSH.nDocCountInQuerySubset; // (don't use the nEntsInContainingDocs because here we do care about the overlap) } //TESTED // end Stage 3b alias processing entBucket.doc.nLeftToProcess--; if (0 == entBucket.doc.nLeftToProcess) { // Final calculation for Infinite significance entBucket.doc.aggSignificance *= entBucket.doc.geoTemporalDecay * _s3_dSigScalingFactor; entBucket.doc.luceneScore *= _s3_dLuceneScalingFactor; // (lucene already geo-temporally) scaled // (don't up lucene score this is done inside Lucene) double d = _s3_dScoreScalingFactor * (entBucket.doc.luceneScore + entBucket.doc.aggSignificance); if (Double.isNaN(d)) { d = 0.0; } if (_s0_sortingByDate) { entBucket.doc.totalScore = (double) -entBucket.doc.nLuceneIndex; } else { entBucket.doc.totalScore = d; } entBucket.doc.nTieBreaker = n1Down--; // Completed calculating this feed's score // Insert into "top 100" list: if (_s3_pqDocs.size() < nToClientLimit) { //DEBUG //System.out.println(_s3_pqDocs.size() + ", ADD URL=" + entBucket.doc.dbo.getString(DocumentPojo.url_)); _s3_pqDocs.add(entBucket.doc); entBucket.doc.bPromoted = true; } else if ((_s3_pqDocs.size() >= nToClientLimit) && (nToClientLimit > 0)) { TempDocBucket qsf = 
_s3_pqDocs.first(); if (entBucket.doc.totalScore > qsf.totalScore) { entBucket.doc.bPromoted = true; _s3_pqDocs.add(entBucket.doc); if (_s3_pqDocs.size() > nToClientLimit) { // (size might stay the same if this is a duplicate) Iterator<TempDocBucket> it = _s3_pqDocs.iterator(); // (now can remove this the object via...) TempDocBucket tdb = it.next(); it.remove(); // (ie remove the first object) tdb.bPromoted = false; // Phase "1": middle ranking (used to be good, not so much any more) if (null != _s0_standaloneEventAggregator) { ScoringUtils_Associations.addStandaloneEvents(tdb.dbo, tdb.aggSignificance, 1, _s0_standaloneEventAggregator, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts); } //TESTED if (null != _s0_lowAccuracyAssociationAggregator_events) { ScoringUtils_Associations.addStandaloneEvents(tdb.dbo, tdb.aggSignificance, 1, _s0_lowAccuracyAssociationAggregator_events, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, true, false, false); } //TESTED if (null != _s0_lowAccuracyAssociationAggregator_facts) { ScoringUtils_Associations.addStandaloneEvents(tdb.dbo, tdb.aggSignificance, 1, _s0_lowAccuracyAssociationAggregator_facts, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, false, false, true); } //TESTED } //TESTED } else { // Not promoting shp.unusedDbo = entBucket.dbo; // (might save me the trouble of cloning a few times...) // Phase "2": never any good! if (null != _s0_standaloneEventAggregator) { ScoringUtils_Associations.addStandaloneEvents(entBucket.doc.dbo, entBucket.doc.aggSignificance, 2, _s0_standaloneEventAggregator, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts); } //TESTED if (null != _s0_lowAccuracyAssociationAggregator_events) { ScoringUtils_Associations.addStandaloneEvents(entBucket.doc.dbo, entBucket.doc.aggSignificance, 2, _s0_lowAccuracyAssociationAggregator_events, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, true, false, false); } //TESTED if (null != _s0_lowAccuracyAssociationAggregator_facts) { ScoringUtils_Associations.addStandaloneEvents(entBucket.doc.dbo, entBucket.doc.aggSignificance, 2, _s0_lowAccuracyAssociationAggregator_facts, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, false, false, true); } //TESTED } } else { // Not promoting any documents... shp.unusedDbo = entBucket.dbo; // (might save me the trouble of cloning a few times...) // Phase "2": never any good! 
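    // (the int passed to addStandaloneEvents is a "phase": 0 == highest priority, i.e. docs being returned to
    //  the client or the simple path where no significance is calculated; 1 == docs that made the top-N at some
    //  point but were later displaced; 2 == docs that were never promoted - presumably letting the aggregator
    //  favour associations from better-scoring documents)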
if (null != _s0_standaloneEventAggregator) { ScoringUtils_Associations.addStandaloneEvents(entBucket.doc.dbo, entBucket.doc.aggSignificance, 2, _s0_standaloneEventAggregator, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts); } //TESTED if (null != _s0_lowAccuracyAssociationAggregator_events) { ScoringUtils_Associations.addStandaloneEvents(entBucket.doc.dbo, entBucket.doc.aggSignificance, 2, _s0_lowAccuracyAssociationAggregator_events, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, true, false, false); } //TESTED if (null != _s0_lowAccuracyAssociationAggregator_facts) { ScoringUtils_Associations.addStandaloneEvents(entBucket.doc.dbo, entBucket.doc.aggSignificance, 2, _s0_lowAccuracyAssociationAggregator_facts, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, false, false, true); } //TESTED } } //TESTED } // (end loop over entity occurrences in feeds) //TESTED // Handle geo: if (null != shp.geotag) { loadLowAccuracyGeoBuckets(shp); } // Insert entities into the output priority queue // NOTE LOCAL SHP CANNOT BE USED AFTER THE FOLLOWING CLAUSE // (LOCAL==the object itself isn't changed, so the code above is fine, but the pointer is modified) if (_s0_nNumEntsReturn > 0) { // Stage 3c alias processing: if ((null != shp.masterAliasSH) && (shp.masterAliasSH.masterAliasSH != shp)) { continue; // (only promote the last of the aliased entities) } //TESTED else if (null != shp.masterAliasSH) { // (use aggregated aliased version if present) shp.masterAliasSH.unusedDbo = shp.unusedDbo; // (overwriting this, which is fine since it's not used after stage 1) shp.masterAliasSH.index = shp.index; // (just so I know what the index of this entity is) // (overwriting this, which is fine since it's not used after the first ent of the alias group in this stage) shp.masterAliasSH.entityInstances = shp.entityInstances; // (the only 2 fields that are needed but weren't present) shp = shp.masterAliasSH; } //TESTED // end stage 3c of alias processing if (_s3_pqEnt.size() < _s0_nNumEntsReturn) { _s3_pqEnt.add(shp); } if ((_s3_pqEnt.size() >= _s0_nNumEntsReturn) && (_s0_nNumEntsReturn > 0)) { EntSigHolder qsf = _s3_pqEnt.element(); if (shp.datasetSignificance > qsf.datasetSignificance) { EntSigHolder toRemove = _s3_pqEnt.remove(); _s3_pqEnt.add(shp); toRemove.entityInstances = null; // (don't need this any more can be gc'd) if (null != toRemove.masterAliasSH) { toRemove.masterAliasSH.entityInstances = null; // (can only promote one masterAliasSH so no risk this will remove an active entityInstances) } } else { shp.entityInstances = null; // (don't need this any more can be gc'd) } //TESTED } } //TESTED else { shp.entityInstances = null; // (don't need this any more, can be gc'd) } //TESTED // (NOTE LOCAL SHP CANNOT BE USED FROM HERE - IE NO MORE CODE IN THIS LOOP!) 
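    // (note on the entity priority queue above: EntSigHolder's compareTo orders by ascending datasetSignificance,
    //  so _s3_pqEnt.element() is always the weakest entity currently kept - a new entity only displaces it when
    //  its own datasetSignificance is strictly higher, keeping the top _s0_nNumEntsReturn entities overall)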
} // (end loop over entities) // Handle docus with no entities: if (nToClientLimit > 0) { for (TempDocBucket doc : _s1_noEntityBuckets) { doc.luceneScore *= _s3_dLuceneScalingFactor; double d = _s3_dScoreScalingFactor * doc.luceneScore; if (Double.isNaN(d)) { d = 0.0; } if (_s0_sortingByDate) { doc.totalScore = (double) -doc.nLuceneIndex; } else { doc.totalScore = d; } doc.nTieBreaker = n1Down--; if (_s3_pqDocs.size() < nToClientLimit) { _s3_pqDocs.add(doc); } if (_s3_pqDocs.size() >= nToClientLimit) { TempDocBucket qsf = _s3_pqDocs.first(); if (doc.totalScore > qsf.totalScore) { _s3_pqDocs.add(doc); if (_s3_pqDocs.size() > nToClientLimit) { // (could be a duplicate) Iterator<TempDocBucket> it = _s3_pqDocs.iterator(); // (now can remove this the object via...) it.next(); it.remove(); // (ie remove the first object) } } //(TESTED) } } // (end loop over feeds with no entities) } // (obv don't bother if we're not returning documents anyway...) } ///////////////////////////////////////////////////////////// // 4a] stage4_prepareDocsForOutput() // Using the priority queues calculated in step [3] generate the lists of documents and entities to return private void stage4_prepareDocsForOutput(AdvancedQueryPojo.QueryScorePojo scoreParams, StatisticsPojo scores, long nToClientLimit, LinkedList<BasicDBObject> returnList) { // Get the documents long nDocs = 0; double dBestScore = 0.0; double dAvgScore = 0.0; double dSigFactor = 100.0 / (_s3_dSigScalingFactor * _s2_dApproxAverageDocumentSig); double dRelFactor = 100.0 / (_s3_dLuceneScalingFactor * _s0_avgLuceneScore); // Start at the bottom of the list, so don't need to worry about skipping documents, just count out from the bottom // The call to stage3_calculateTFTerms with nStart+nToClientLimit handles the rest Iterator<TempDocBucket> pqIt = _s3_pqDocs.iterator(); while (pqIt.hasNext() && (nDocs < nToClientLimit)) { TempDocBucket qsf = pqIt.next(); nDocs++; if (!_s0_sortingByDate) { dBestScore = qsf.totalScore; } dAvgScore += dBestScore; BasicDBObject f = qsf.dbo; // Phase "0" - these are the highest prio events boolean bNeedToFilterAndAliasAssoc_event = true; boolean bNeedToFilterAndAliasAssoc_fact = true; boolean bNeedToFilterAndAliasAssoc_summary = true; if (null != _s0_standaloneEventAggregator) { ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0, _s0_standaloneEventAggregator, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts); bNeedToFilterAndAliasAssoc_event = false; bNeedToFilterAndAliasAssoc_fact = false; bNeedToFilterAndAliasAssoc_summary = false; } //TESTED if (null != _s0_lowAccuracyAssociationAggregator_events) { ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0, _s0_lowAccuracyAssociationAggregator_events, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, true, false, false); bNeedToFilterAndAliasAssoc_event = false; } //TESTED if (null != _s0_lowAccuracyAssociationAggregator_facts) { ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0, _s0_lowAccuracyAssociationAggregator_facts, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, false, false, true); bNeedToFilterAndAliasAssoc_fact = false; } //TESTED try { DocumentPojoApiMap.mapToApi(f); // Handle deduplication/multi-community code: if (null != qsf.dupList) { try { 
						ScoringUtils_MultiCommunity.community_combineDuplicateDocs(qsf);
					}
					catch (Exception e) {
						// Do nothing, just carry on with minimal damage!
					}
				}

				// Scoring:
				double d = qsf.aggSignificance * dSigFactor;
				if (Double.isNaN(d)) {
					f.put(DocumentPojo.aggregateSignif_, 0.0);
				}
				else {
					f.put(DocumentPojo.aggregateSignif_, d);
				}
				d = qsf.luceneScore * dRelFactor;
				if (Double.isNaN(d)) {
					f.put(DocumentPojo.queryRelevance_, 0.0);
				}
				else {
					f.put(DocumentPojo.queryRelevance_, d);
				}
				if (!_s0_sortingByDate) {
					f.put(DocumentPojo.score_, qsf.totalScore);
				}
				BasicDBList l = (BasicDBList) (f.get(DocumentPojo.entities_));

				// Handle update ids vs normal ids:
				ObjectId updateId = (ObjectId) f.get(DocumentPojo.updateId_);
				if (null != updateId) { // swap the 2...
					f.put(DocumentPojo.updateId_, f.get(DocumentPojo._id_));
					f.put(DocumentPojo._id_, updateId);
				}

				// Check if entities enabled
				if ((null != l) && (!_s0_bGeoEnts && !_s0_bNonGeoEnts)) {
					f.removeField(DocumentPojo.entities_);
					l = null;
				} //TESTED

				// Check if events etc enabled
				if ((!_s0_bEvents && !_s0_bFacts && !_s0_bSummaries)) {
					f.removeField(DocumentPojo.associations_);
				} //TESTED
				else if (!_s0_bEvents || !_s0_bFacts || !_s0_bSummaries || (null != _s0_assocVerbFilter)) {
					// Keep only specified event_types
					BasicDBList lev = (BasicDBList) (f.get(DocumentPojo.associations_));
					if (null != lev) {
						for (Iterator<?> e0 = lev.iterator(); e0.hasNext();) {
							BasicDBObject e = (BasicDBObject) e0.next();

							// Type filter
							boolean bNeedToFilterAndAliasAssoc = true;
							String sEvType = e.getString(AssociationPojo.assoc_type_);
							boolean bKeep = true;
							if (null == sEvType) {
								bKeep = false;
							}
							else if (sEvType.equalsIgnoreCase("event")) {
								if (!_s0_bEvents) bKeep = false;
								bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_event;
							}
							else if (sEvType.equalsIgnoreCase("fact")) {
								if (!_s0_bFacts) bKeep = false;
								bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_fact;
							}
							else if (sEvType.equalsIgnoreCase("summary")) {
								if (!_s0_bSummaries) bKeep = false;
								bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_summary;
							}
							if (!bKeep) {
								e0.remove();
							}
							else { // Type matches, now for some more complex logic....
								if (bNeedToFilterAndAliasAssoc) { // (otherwise done already)
									bKeep = ScoringUtils_Associations.filterAndAliasAssociation(e, _s1_aliasLookup, true,
											_s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive,
											_s0_entityTypeFilter, _s0_assocVerbFilter);
									if (!bKeep) {
										e0.remove();
									}
								} //TESTED
							} //(end output filter logic)
						} // (end loop over events)
					} // (end if this doc has events)
				} //TESTED

				// Check if metadata is enabled
				if (!_s0_bMetadata) {
					f.removeField(DocumentPojo.metadata_);
				} //TESTED

				if (null != l) {
					for (Iterator<?> e0 = l.iterator(); e0.hasNext();) {
						BasicDBObject e = (BasicDBObject) e0.next();
						if (!_s0_bNonGeoEnts) { // then must only be getting geo (else wouldn't be in this loop)
							if (null == e.get(EntityPojo.geotag_)) {
								e0.remove();
								continue;
							}
						}
						String entity_index = e.getString(EntityPojo.index_);
						if (null == entity_index) continue;

						EntSigHolder shp = (EntSigHolder) _s1_entitiesInDataset.get(entity_index);
						if (null != shp) {

							// Stage 4x: alias processing, just overwrite
							// (note don't delete "duplicate entities", hard-to-be-globally-consistent
							// and will potentially throw data away which might be undesirable)
							if (null != shp.masterAliasSH) {
								shp = shp.masterAliasSH; // (already has all the aggregated values used below)
								if (!entity_index.equals(shp.aliasInfo.getIndex())) {
									e.put(EntityPojo.index_, shp.aliasInfo.getIndex());
									e.put(EntityPojo.disambiguated_name_, shp.aliasInfo.getDisambiguatedName());
									e.put(EntityPojo.type_, shp.aliasInfo.getType());
									e.put(EntityPojo.dimension_, shp.aliasInfo.getDimension());
									if (null != shp.aliasInfo.getGeotag()) {
										BasicDBObject aliasedGeoTag = new BasicDBObject();
										aliasedGeoTag.put(GeoPojo.lat_, shp.aliasInfo.getGeotag().lat);
										aliasedGeoTag.put(GeoPojo.lon_, shp.aliasInfo.getGeotag().lon);
										e.put(EntityPojo.geotag_, aliasedGeoTag);
										if (null != shp.aliasInfo.getOntology_type()) {
											e.put(EntityPojo.ontology_type_, shp.aliasInfo.getOntology_type());
										}
									} //TESTED
								}
							} //TESTED
							// end Stage 4x of alias processing

							double dataSig = shp.datasetSignificance;
							if (Double.isNaN(dataSig)) {
								e.put(EntityPojo.datasetSignificance_, 0.0);
							}
							else {
								e.put(EntityPojo.datasetSignificance_, dataSig);
							}
							e.put(EntityPojo.queryCoverage_, shp.queryCoverage);
							e.put(EntityPojo.averageFreq_, shp.avgFreqOverQuerySubset);
							if (shp.nTotalSentimentValues > 0) {
								e.put(EntityPojo.positiveSentiment_, shp.positiveSentiment);
								e.put(EntityPojo.negativeSentiment_, shp.negativeSentiment);
								e.put(EntityPojo.sentimentCount_, shp.nTotalSentimentValues);
							}
						}
						else { // (most likely to occur if the entity is discarded (alias/filter) or is corrupt in some way)
							e0.remove();
							continue;
						}
					} //(end loop over entities)
				} // (end if feed has entities) //TESTED

				// Explain if enabled
				if (null != qsf.explain) {
					f.put(DocumentPojo.explain_, qsf.explain);
				}

				// Add to the front of the list (so will come back from API call in natural order, highest first)
				returnList.addFirst(f); // (add elements to the front of the list so that the top of the list is ordered by priority)
			}
			catch (Exception e) {
				// Probably a JSON error, just carry on
				String title = f.getString(DocumentPojo.title_);
				logger.error(title + ": " + e.getMessage());
			}
		} // (end loop over feeds) //TESTED

		// Update the scores:
		scores.maxScore = (float) dBestScore;
		if (nDocs > 0) {
			scores.avgScore = (float) dAvgScore / nDocs;
		}
	}
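	// The selection logic in stages 3 and 4 repeatedly uses the same bounded "top N" idiom: keep an ordered
	// set of at most N items, compare each new candidate against the current minimum, evict the minimum when
	// a better candidate arrives, then drain lowest-to-highest and addFirst() so the caller receives a
	// highest-first list. The nested class below is a minimal, standalone sketch of that idiom only
	// (ScoredItem and topN are hypothetical and not part of the original ScoringUtils; the real code also
	// handles date sorting, duplicates and alias promotion). The explicit tie-breaker keeps compareTo
	// consistent with equals, so the TreeSet never silently drops two items that happen to share a score.
	static class TopNSelectionExample {
		static class ScoredItem implements Comparable<ScoredItem> {
			final double score;
			final int tieBreaker;
			ScoredItem(double score, int tieBreaker) {
				this.score = score;
				this.tieBreaker = tieBreaker;
			}
			@Override
			public int compareTo(ScoredItem other) {
				int cmp = Double.compare(this.score, other.score);
				if (cmp != 0) return cmp;
				return (this.tieBreaker < other.tieBreaker) ? -1 : ((this.tieBreaker == other.tieBreaker) ? 0 : 1);
			}
		}
		static LinkedList<ScoredItem> topN(double[] scores, int n) {
			TreeSet<ScoredItem> pq = new TreeSet<ScoredItem>(); // smallest element = pq.first()
			int tie = 0;
			for (double score : scores) {
				ScoredItem item = new ScoredItem(score, tie++);
				if (pq.size() < n) {
					pq.add(item);
				}
				else if (item.compareTo(pq.first()) > 0) {
					pq.pollFirst(); // evict the current minimum
					pq.add(item);
				}
			}
			// Drain lowest-to-highest, adding to the front so the result comes back highest-first:
			LinkedList<ScoredItem> out = new LinkedList<ScoredItem>();
			for (ScoredItem item : pq) {
				out.addFirst(item);
			}
			return out;
		}
	}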
	/////////////////////////////////////////////////

	// 4b] stage4_prepareEntsForOutput()
	// Using the priority queues calculated in step [3] generate the lists of documents and entities to return

	private void stage4_prepareEntsForOutput(LinkedList<BasicDBObject> entityReturn)
	{
		if (_s0_nNumEntsReturn > 0) { // (else entities not enabled)
			for (EntSigHolder qsf = _s3_pqEnt.poll(); null != qsf; qsf = _s3_pqEnt.poll()) // (start with lowest ranking)
			{
				BasicDBObject ent = qsf.unusedDbo;
				if (null == ent) {
					int nTries = 0;
					if (null != qsf.entityInstances) { // (should never be null but just to be on the safe side...)
						for (TempEntityInDocBucket tefb : qsf.entityInstances) {
							// (Try to find an entity that wasn't promoted ie can now be re-used
							//  if we can't find one quite quickly then bail out and we'll pay the cost of cloning it)
							if (!tefb.doc.bPromoted) {
								ent = tefb.dbo;
								break;
							}
							else if (++nTries > 10) {
								break;
							}
						}
						if (null == ent) {
							ent = qsf.entityInstances.get(0).dbo;
						}
					}
					else { // (no entityInstances, something alias-related has gone wrong, just skip)
						continue;
					}
				} //TESTED
				qsf.entityInstances = null; // (don't need this any more, can be gc'd)
				try {
					if (null != qsf.aliasInfo) {
						if (!qsf.index.equals(qsf.aliasInfo.getIndex())) {
							ent.put(EntityPojo.index_, qsf.aliasInfo.getIndex());
							ent.put(EntityPojo.disambiguated_name_, qsf.aliasInfo.getDisambiguatedName());
							ent.put(EntityPojo.type_, qsf.aliasInfo.getType());
							ent.put(EntityPojo.dimension_, qsf.aliasInfo.getDimension());
							if (null != qsf.aliasInfo.getGeotag()) {
								BasicDBObject aliasedGeoTag = new BasicDBObject();
								aliasedGeoTag.put(GeoPojo.lat_, qsf.aliasInfo.getGeotag().lat);
								aliasedGeoTag.put(GeoPojo.lon_, qsf.aliasInfo.getGeotag().lon);
								ent.put(EntityPojo.geotag_, aliasedGeoTag);
								if (null != qsf.aliasInfo.getOntology_type()) {
									ent.put(EntityPojo.ontology_type_, qsf.aliasInfo.getOntology_type());
								}
							} //TESTED
						}
					} //TESTED
					if (null == ent.get(EntityPojo.datasetSignificance_)) { // Not getting promoted so need to add fields...
						if (Double.isNaN(qsf.datasetSignificance)) {
							ent.put(EntityPojo.datasetSignificance_, 0.0);
						}
						else {
							ent.put(EntityPojo.datasetSignificance_, qsf.datasetSignificance);
						}
						ent.put(EntityPojo.queryCoverage_, qsf.queryCoverage);
						ent.put(EntityPojo.averageFreq_, qsf.avgFreqOverQuerySubset);
						if (qsf.nTotalSentimentValues > 0) {
							ent.put(EntityPojo.positiveSentiment_, qsf.positiveSentiment);
							ent.put(EntityPojo.negativeSentiment_, qsf.negativeSentiment);
							ent.put(EntityPojo.sentimentCount_, qsf.nTotalSentimentValues);
						}
					}
					else {
						// (... but can't just use it without cloning)
						BasicDBObject ent2 = new BasicDBObject();
						for (Map.Entry<String, Object> kv : ent.entrySet()) {
							ent2.append(kv.getKey(), kv.getValue());
						}
						ent = ent2;
					}
					ent.removeField(EntityPojo.relevance_);
					if (Double.isNaN(qsf.maxDocSig)) {
						ent.put(EntityPojo.significance_, 0.0);
					}
					else {
						ent.put(EntityPojo.significance_, qsf.maxDocSig);
					}
					ent.put(EntityPojo.frequency_, (long) qsf.maxFreq);
					entityReturn.addFirst(ent);
				}
				catch (Exception e) {
					// Probably a JSON error, just carry on
					String title = ent.getString(EntityPojo.index_);
					logger.error(title + ": " + e.getMessage());
				} //TESTED
			}
		} //TESTED
	}

	/////////////////////////////////////////////////////////////////////////////////////////////////////////////

	// Utility

	private BasicDBObject _s0_docCountFields = null;
	private BasicDBObject _s0_docCountHint = null;

	private long getDocCount(ObjectId[] communityIds) {
		long nDocCount = 0;
		try {
			BasicDBObject query = new BasicDBObject(DocCountPojo._id_, new BasicDBObject(MongoDbManager.in_, communityIds));
			if (null == _s0_docCountFields) {
				_s0_docCountFields = new BasicDBObject(DocCountPojo._id_, 0);
				_s0_docCountFields.put(DocCountPojo.doccount_, 1);
				_s0_docCountHint = new BasicDBObject(DocCountPojo._id_, 1);
				_s0_docCountHint.put(DocCountPojo.doccount_, 1);
			}
			DBCursor dbc = DbManager.getDocument().getCounts().find(query, _s0_docCountFields)
					.hint(_s0_docCountHint);
			while (dbc.hasNext()) {
				BasicDBObject dbo = (BasicDBObject) dbc.next();
				Iterator<?> it = dbo.values().iterator();
				if (it.hasNext()) {
					nDocCount += ((Long) it.next()).longValue(); // (from _s0_docCountFields, doccount is the only returned field)
				}
			}
			if (0 == nDocCount) { // (Probably shouldn't happen if a harvest has occurred, just don't bomb out)
				nDocCount = _s0_nQuerySetDocCount;
			}
		}
		catch (Exception e) {
			// Doc count might not be set up correctly?
			nDocCount = _s0_nQuerySetDocCount;
		}
		return nDocCount;
	}//TESTED

	// The overall plan is:
	// S1: identify alias (write helper function based on the code above), calculate overlapping doc count
	// S2: calc pythag significance, store first/last values ready for S3
	// S3: first time through, do sqrt bit of pythag, last time through add to PQ
	// S4: overwrite the entity values with aliased entities where necessary
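	// The S1/S4 steps above boil down to a grouping pattern: resolve each raw entity index to a shared
	// "master" record via the alias lookup, aggregate statistics under that master, and report the master's
	// canonical form instead of the raw entities. The nested class below is a much-simplified, standalone
	// sketch of that pattern (MasterRecord and aggregateByAlias are hypothetical and not part of the original
	// ScoringUtils; the real code also aggregates significance, sentiment and geo, and handles the
	// "discard"/"document_discard" aliases).
	static class AliasGroupingExample {
		static class MasterRecord {
			final String canonicalIndex;
			long docCount = 0;
			MasterRecord(String canonicalIndex) {
				this.canonicalIndex = canonicalIndex;
			}
		}
		static Map<String, MasterRecord> aggregateByAlias(
				Map<String, String> aliasLookup,       // raw index -> canonical ("alias master") index
				Map<String, Long> docCountsByRawIndex) // raw index -> doc count from the query
		{
			Map<String, MasterRecord> masters = new HashMap<String, MasterRecord>();
			for (Map.Entry<String, Long> kv : docCountsByRawIndex.entrySet()) {
				// S1: identify the alias master (fall back to the raw index if un-aliased)
				String canonical = aliasLookup.get(kv.getKey());
				if (null == canonical) {
					canonical = kv.getKey();
				}
				MasterRecord master = masters.get(canonical);
				if (null == master) {
					master = new MasterRecord(canonical);
					masters.put(canonical, master);
				}
				// S2/S3 would accumulate significance here; this sketch just sums doc counts
				master.docCount += kv.getValue();
			}
			// S4: callers then report masters.values(), ie canonical names with aggregated stats
			return masters;
		}
	}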
	private void stage1_initAlias(EntSigHolder shp) {
		EntityFeaturePojo alias = _s1_aliasLookup.getAliasMaster(shp.index);
		if (null != alias) { // overwrite index
			if (alias.getIndex().equalsIgnoreCase("document_discard")) {
				// (document discard... shouldn't have this document at this point, we'll just carry on if we do though)
				return;
			}
			if (alias.getIndex().equalsIgnoreCase("discard")) {
				shp.aliasInfo = alias;
				shp.masterAliasSH = null;
				return;
			}
			EntSigHolder masterAliasSH = null;
			if (null == _s1_aliasSummary) {
				_s1_aliasSummary = new HashMap<String, EntSigHolder>();
			}
			else {
				masterAliasSH = _s1_aliasSummary.get(alias.getIndex());
			}
			if (null == masterAliasSH) {
				masterAliasSH = new EntSigHolder(null, 0, null); // (use ESH as handy collection of req'd vars)
				_s1_aliasSummary.put(alias.getIndex(), masterAliasSH);
			}
			shp.masterAliasSH = masterAliasSH;
			shp.aliasInfo = alias;
			shp.masterAliasSH.aliasInfo = alias; // (no harm storing this in 2 places)
		}
	}//TESTED

	private double getManualScoreWeights(AdvancedQueryPojo.QueryScorePojo scoreParams, BasicDBObject doc) {
		// Highest prio: source key weight
		if (null != scoreParams.sourceWeights) {
			String sourceKey = DocumentPojo.getSourceKey(doc.getString(DocumentPojo.sourceKey_));
			Double dWeight = scoreParams.sourceWeights.get(sourceKey);
			if (null != dWeight) {
				return dWeight;
			}
		}
		// Middle prio: type
		if (null != scoreParams.typeWeights) {
			String mediaType = doc.getString(DocumentPojo.mediaType_);
			Double dWeight = scoreParams.typeWeights.get(mediaType);
			if (null != dWeight) {
				return dWeight;
			}
		}
		// Lowest prio: average of tags
		if (null != scoreParams.tagWeights) {
			double dScore = 0.0;
			int nComps = 0;
			BasicDBList tags = (BasicDBList) doc.get(DocumentPojo.tags_);
			if (null != tags) {
				for (Object tagObj : tags) {
					String tag = (String) tagObj;
					Double dWeight = scoreParams.tagWeights.get(tag);
					if (null != dWeight) {
						nComps++;
						dScore += dWeight;
					}
				}
				if (nComps > 0) {
					return dScore / nComps;
				}
			}
		}
		return 1.0;
	}//TESTED (all 3 cases)

	////////////////////////////////////////////////////////////////////////////

	// Low accuracy geo aggregation utils:
	// Code copied from ScoringUtils_Associations

	private void loadLowAccuracyGeoBuckets(EntSigHolder shp) {
		double dBucket = shp.queryCoverage / (this._s2_maxGeoQueryCoverage + 0.01); // (ensure <1)
		if (dBucket > 1.0) dBucket = 1.0;
		int nBucket = _s3_nGEO_BUCKETS_1 - ((int) (_s3_dGEO_BUCKETS * dBucket) % _s3_nGEO_BUCKETS);
		LinkedList<EntSigHolder> bucketList = _s3_geoBuckets[nBucket];
		if (null == bucketList) {
			bucketList = new LinkedList<EntSigHolder>();
			_s3_geoBuckets[nBucket] = bucketList;
		}
		bucketList.add(shp);
	}//TESTED
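	// The bucketing arithmetic above maps an entity's queryCoverage onto one of a fixed number of priority
	// buckets, normalised against the largest coverage seen so far, with the highest-coverage entities landing
	// in bucket 0 so they are emitted first by the finalize step below. The nested class that follows is a
	// standalone sketch of just that arithmetic with hypothetical constants (NUM_BUCKETS etc. stand in for
	// _s3_nGEO_BUCKETS, _s3_dGEO_BUCKETS and _s3_nGEO_BUCKETS_1, whose real values are defined elsewhere in
	// this class); it is illustrative only, not used by the surrounding code.
	static class GeoBucketExample {
		private static final int NUM_BUCKETS = 10;                      // hypothetical; cf. _s3_nGEO_BUCKETS
		private static final double NUM_BUCKETS_D = NUM_BUCKETS;        // cf. _s3_dGEO_BUCKETS
		private static final int NUM_BUCKETS_MINUS_1 = NUM_BUCKETS - 1; // cf. _s3_nGEO_BUCKETS_1

		static int bucketFor(double queryCoverage, double maxQueryCoverage) {
			double dBucket = queryCoverage / (maxQueryCoverage + 0.01); // (+0.01 keeps the ratio < 1)
			if (dBucket > 1.0) dBucket = 1.0;
			// ratio near 0 -> bucket NUM_BUCKETS_MINUS_1 (lowest priority), ratio near 1 -> bucket 0 (highest priority)
			return NUM_BUCKETS_MINUS_1 - ((int) (NUM_BUCKETS_D * dBucket) % NUM_BUCKETS);
		}

		public static void main(String[] args) {
			System.out.println(bucketFor(5.0, 50.0));  // low coverage -> high bucket index (9)
			System.out.println(bucketFor(49.0, 50.0)); // near-max coverage -> bucket 0
		}
	}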
	private void finalizeLowAccuracyGeoAggregation(AggregationUtils.GeoContainer geoContainer, long nMaxToReturn) {
		geoContainer.geotags = new TreeSet<GeoAggregationPojo>();
		for (LinkedList<EntSigHolder> bucket : _s3_geoBuckets) {
			if (null != bucket) {
				for (EntSigHolder shp : bucket) {
					// Estimated count:
					try {
						if (null != shp.geotag) { // will always be the case...
							GeoAggregationPojo geo = new GeoAggregationPojo();
							geo.lat = shp.geotag.getDouble(GeoPojo.lat_);
							geo.lon = shp.geotag.getDouble(GeoPojo.lon_);
							geo.type = shp.geotaggedEntity.getString(EntityPojo.ontology_type_);
							if (null == geo.type) {
								geo.type = "point";
							}
							geo.count = (int) (0.01 * shp.queryCoverage * _s0_nQuerySetDocCount); // (query coverage is a %)
							geoContainer.geotags.add(geo); // (can change geo.count, where aggregation has happened)
							if (geo.count > geoContainer.maxCount) {
								geoContainer.maxCount = geo.count;
							}
							if (geo.count < geoContainer.minCount) {
								geoContainer.minCount = geo.count;
							}
							if (geoContainer.geotags.size() >= nMaxToReturn) {
								return;
							}
						}
					}
					catch (Exception e) {} // geotag invalid just carry on
				}
			}
		}
	}//TESTED

	// MEMORY HANDLING UTILITY

	public static class SizeReportingBasicBSONDecoder extends org.bson.BasicBSONDecoder implements DBDecoderFactory, DBDecoder {
		@Override
		public int decode(byte[] b, BSONCallback callback) {
			int size = super.decode(b, callback);
			_size += size;
			return size;
		}
		@Override
		public int decode(InputStream in, BSONCallback callback) throws IOException {
			int size = super.decode(in, callback);
			_size += size;
			return size;
		}
		public void resetSize() {
			_size = 0;
		}
		public long getSize() {
			return _size;
		}
		protected long _size = 0;

		@Override
		public DBDecoder create() {
			return this;
		}
		@Override
		public DBObject decode(byte[] b, DBCollection collection) {
			DBCallback cbk = getDBCallback(collection);
			cbk.reset();
			decode(b, cbk);
			return (DBObject) cbk.get();
		}
		@Override
		public DBObject decode(InputStream in, DBCollection collection) throws IOException {
			DBCallback cbk = getDBCallback(collection);
			cbk.reset();
			decode(in, cbk);
			return (DBObject) cbk.get();
		}
		@Override
		public DBCallback getDBCallback(DBCollection collection) {
			return new DefaultDBCallback(collection);
		}
	}
}
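// The SizeReportingBasicBSONDecoder above can be handed to a cursor as its decoder factory so that the caller
// can watch how many raw BSON bytes a query has pulled back. The class below is a standalone usage sketch,
// not part of the original file: the collection/query arguments and the 64MB budget are hypothetical, it
// relies on the com.mongodb imports already present at the top of this file, and it assumes the 2.x
// mongo-java-driver's DBCursor.setDecoderFactory(...) hook.
class SizeReportingDecoderUsageExample {
	private static final long MAX_BYTES = 64L * 1024L * 1024L; // hypothetical 64MB budget

	static long countBytesDecoded(DBCollection docs, BasicDBObject query) {
		ScoringUtils.SizeReportingBasicBSONDecoder decoder = new ScoringUtils.SizeReportingBasicBSONDecoder();
		DBCursor dbc = docs.find(query);
		dbc.setDecoderFactory(decoder); // every document decoded by this cursor now adds to decoder.getSize()
		while (dbc.hasNext()) {
			dbc.next();
			if (decoder.getSize() > MAX_BYTES) {
				break; // bail out rather than keep decoding an over-sized result set
			}
		}
		return decoder.getSize();
	}
}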