com.ikanow.infinit.e.processing.generic.aggregation.EntityAggregationUtils.java Source code

Introduction

Here is the source code for com.ikanow.infinit.e.processing.generic.aggregation.EntityAggregationUtils.java, a set of static helpers that aggregate freshly extracted entities into the Infinit.e entity feature collection (MongoDB) and synchronize the results with the entity feature search index.

Source

/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project.
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package com.ikanow.infinit.e.processing.generic.aggregation;

import java.util.Date;
import java.util.Iterator;
import java.util.Map;

import org.apache.log4j.Logger;
import org.bson.types.ObjectId;

import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
import com.ikanow.infinit.e.data_model.index.IndexManager;
import com.ikanow.infinit.e.data_model.index.feature.entity.EntityFeaturePojoIndexMap;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.ikanow.infinit.e.data_model.store.document.GeoPojo;
import com.ikanow.infinit.e.data_model.store.feature.entity.EntityFeaturePojo;
import com.ikanow.infinit.e.processing.generic.GenericProcessingController;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;

//TODO (INF-1660): here and for assocs, I think I should delete entities that have doccount==0

public class EntityAggregationUtils {

    private static final Logger logger = Logger.getLogger(EntityAggregationUtils.class);

    private static boolean _diagnosticMode = false;
    private static boolean _logInDiagnosticMode = true;

    public static void setDiagnosticMode(boolean bMode) {
        _diagnosticMode = bMode;
    }

    public static void setLogInDiagnosticMode(boolean bLog) {
        _logInDiagnosticMode = bLog;
    }

    private static boolean _incrementalMode = false;

    public static void setIncrementalMode(boolean mode) {
        _incrementalMode = mode;
    }

    ///////////////////////////////////////////////////////////////////////////////////////   
    ///////////////////////////////////////////////////////////////////////////////////////   

    // CREATING/UPDATE FEATURES

    ///////////////////////////////////////////////////////////////////////////////////////   
    ///////////////////////////////////////////////////////////////////////////////////////   

    /**
     * Updates the feature entries for the list of entities
     * that was just extracted, including changing frequencies,
     * adding aliases etc.
     * 
     * This method has the following steps:
     * 1. Try to update the existing feature's aliases (and link data)
     *    1.a If that fails, create a new gazetteer ("gaz") entry
     * 2. Update totalfreq and doccount
     * 
     * @param entFeatures Map of entity index -> (community id -> entity feature) to update in the entity feature DB
     */
    public static void updateEntityFeatures(Map<String, Map<ObjectId, EntityFeaturePojo>> entFeatures) {
        // Some diagnostic counters:
        int numCacheMisses = 0;
        int numCacheHits = 0;
        int numNewEntities = 0;
        long entityAggregationTime = new Date().getTime();

        DBCollection col = DbManager.getFeature().getEntity();

        // (This fn is normally run for a single community id)
        CommunityFeatureCaches.CommunityFeatureCache currCache = null;

        String savedSyncTime = null;
        for (Map<ObjectId, EntityFeaturePojo> entCommunity : entFeatures.values()) {

            Iterator<Map.Entry<ObjectId, EntityFeaturePojo>> it = entCommunity.entrySet().iterator();
            while (it.hasNext()) {
                Map.Entry<ObjectId, EntityFeaturePojo> entFeatureKV = it.next();
                try {
                    EntityFeaturePojo entFeature = entFeatureKV.getValue();

                    long nSavedDocCount = entFeature.getDoccount();
                    long nSavedFreqCount = entFeature.getTotalfreq();
                    // (these should be constant across all communities, but keep the assignment here
                    //  so it can use entFeature; it's very cheap, so no need to fetch it once like the sync vars)

                    // For each community, see if the entity feature already exists *for that community*               
                    ObjectId communityID = entFeature.getCommunityId();
                    if (null != communityID) {
                        if ((null == currCache) || !currCache.getCommunityId().equals(entFeatureKV.getKey())) {
                            currCache = CommunityFeatureCaches.getCommunityFeatureCache(entFeatureKV.getKey());
                            if (_diagnosticMode) {
                                if (_logInDiagnosticMode)
                                    System.out.println(
                                            "EntityAggregationUtils.updateEntityFeatures, Opened cache for community: "
                                                    + entFeatureKV.getKey());
                            }
                        } //TESTED (by hand)

                        // Is this in our cache? If so can short cut a bunch of the DB interaction:
                        EntityFeaturePojo cachedEnt = currCache.getCachedEntityFeature(entFeature);
                        if (null != cachedEnt) {
                            if (_incrementalMode) {
                                if (_diagnosticMode) {
                                    if (_logInDiagnosticMode)
                                        System.out.println(
                                                "EntityAggregationUtils.updateEntityFeatures, skip cached: "
                                                        + cachedEnt.toDb());
                                    //TODO (INF-2825): should be continue-ing here (after implementing incremental caching fully) so can use delta more efficiently...
                                }
                            } else if (_diagnosticMode) {
                                if (_logInDiagnosticMode)
                                    System.out
                                            .println("EntityAggregationUtils.updateEntityFeatures, grabbed cached: "
                                                    + cachedEnt.toDb());
                            }
                            numCacheHits++;

                        } //TESTED (by hand)                  
                        else {
                            numCacheMisses++;
                        }

                        BasicDBObject query = new BasicDBObject(EntityFeaturePojo.index_, entFeature.getIndex());
                        query.put(EntityFeaturePojo.communityId_, communityID);
                        BasicDBObject updateOp = new BasicDBObject();
                        // Add aliases:
                        BasicDBObject updateOpA = new BasicDBObject();
                        if (null != entFeature.getAlias()) {
                            if ((null == cachedEnt) || (null == cachedEnt.getAlias())
                                    || !cachedEnt.getAlias().containsAll(entFeature.getAlias())) {
                                //(if the data we have is already cached, don't bother adding it again)
                                BasicDBObject multiopE = new BasicDBObject(MongoDbManager.each_,
                                        entFeature.getAlias());
                                updateOpA.put(EntityFeaturePojo.alias_, multiopE);
                            } //TESTED (by hand)
                        }
                        // Add link data, if there is any:
                        if ((null != entFeature.getSemanticLinks()) && !entFeature.getSemanticLinks().isEmpty()) {
                            if ((null == cachedEnt) || (null == cachedEnt.getSemanticLinks())
                                    || !cachedEnt.getSemanticLinks().containsAll(entFeature.getSemanticLinks())) {
                                //(if the data we have is already cached, don't bother adding it again)
                                BasicDBObject multiopF = new BasicDBObject(MongoDbManager.each_,
                                        entFeature.getSemanticLinks());
                                updateOpA.put(EntityFeaturePojo.linkdata_, multiopF);
                            } //TESTED (by hand)
                        }
                        // OK - now we can copy across the fields into the cache:
                        if (null != cachedEnt) {
                            currCache.updateCachedEntityFeatureStatistics(cachedEnt, entFeature); //(entFeature is now fully up to date)
                        } //TESTED (by hand)

                        if (!updateOpA.isEmpty()) {
                            updateOp.put(MongoDbManager.addToSet_, updateOpA);
                        }
                        // Update frequency:
                        BasicDBObject updateOpB = new BasicDBObject();
                        updateOpB.put(EntityFeaturePojo.totalfreq_, nSavedFreqCount);
                        updateOpB.put(EntityFeaturePojo.doccount_, nSavedDocCount);
                        updateOp.put(MongoDbManager.inc_, updateOpB);
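                        // At this point updateOp looks roughly like:
                        //   { "$addToSet": { "alias":    { "$each": [ <new aliases> ] },
                        //                    "linkdata": { "$each": [ <new link URLs> ] } },
                        //     "$inc":      { "totalfreq": <nSavedFreqCount>, "doccount": <nSavedDocCount> } }
                        // (operator/field names come from the MongoDbManager/EntityFeaturePojo constants;
                        //  the $addToSet block is only present when updateOpA is non-empty - illustration only)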

                        //try to use find/modify to see if something comes back and set doc freq/totalfreq
                        BasicDBObject fields = new BasicDBObject(EntityFeaturePojo.totalfreq_, 1);
                        fields.put(EntityFeaturePojo.doccount_, 1);
                        fields.put(EntityFeaturePojo.alias_, 1);
                        fields.put(EntityFeaturePojo.linkdata_, 1);
                        //(slightly annoying, since we only want these 2 largish fields when updating the frequency,
                        // but we won't know that until after we've fetched this object)                  
                        fields.put(EntityFeaturePojo.db_sync_time_, 1);
                        fields.put(EntityFeaturePojo.db_sync_doccount_, 1);

                        DBObject dboUpdate = null;
                        if (_diagnosticMode) {
                            if (null == cachedEnt) {
                                dboUpdate = col.findOne(query, fields);
                            }
                        } else {
                            if (null != cachedEnt) {
                                col.update(query, updateOp, false, false);
                            } else { // Not cached - so have to grab the feature we're either getting or creating
                                dboUpdate = col.findAndModify(query, fields, new BasicDBObject(), false, updateOp,
                                        false, true);
                                // (can use findAndModify because specify index, ie the shard key)
                                // (returns entity before the changes above, update the feature object below)
                                // (also atomically creates the object if it doesn't exist so is "distributed-safe")
                            }
                        }
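                        // (when the findAndModify upsert has just created the feature, the returned pre-image is an
                        //  empty (or null) object, so the keySet().isEmpty() check below is what distinguishes
                        //  "feature already existed" from "created for the first time")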
                        if ((null != cachedEnt) || ((dboUpdate != null) && !dboUpdate.keySet().isEmpty())) // (feature already exists)
                        {
                            EntityFeaturePojo gp = cachedEnt;

                            // (Update the entity feature to be correct so that it can be accurately synchronized with the index)
                            if (null == gp) {
                                gp = EntityFeaturePojo.fromDb(dboUpdate, EntityFeaturePojo.class);
                                entFeature.setTotalfreq(gp.getTotalfreq() + nSavedFreqCount);
                                entFeature.setDoccount(gp.getDoccount() + nSavedDocCount);
                                entFeature.setDbSyncDoccount(gp.getDbSyncDoccount());
                                entFeature.setDbSyncTime(gp.getDbSyncTime());
                                if (null != gp.getAlias()) {
                                    entFeature.addAllAlias(gp.getAlias());
                                }
                                if (null != gp.getSemanticLinks()) {
                                    entFeature.addToSemanticLinks(gp.getSemanticLinks());
                                }
                            } //TESTED (cached case and non-cached case)
                              // (in the cached case, entFeature has already been updated by updateCachedEntityFeatureStatistics)

                            if (_diagnosticMode) {
                                if (_logInDiagnosticMode)
                                    System.out.println("EntityAggregationUtils.updateEntityFeatures, found: "
                                            + ((BasicDBObject) gp.toDb()).toString());
                                if (_logInDiagnosticMode)
                                    System.out.println(
                                            "EntityAggregationUtils.updateEntityFeatures, ^^^ found from query: "
                                                    + query.toString() + " / " + updateOp.toString());
                            }
                            // (In background aggregation mode we update db_sync_prio when checking the doc update schedule) 
                        } else // (the object in memory is now an accurate representation of the database, minus some fields we'll now add)
                        {
                            numNewEntities++;

                            // Synchronization settings for the newly created object
                            if (null == savedSyncTime) {
                                savedSyncTime = Long.toString(System.currentTimeMillis());
                            }
                            entFeature.setDbSyncDoccount(nSavedDocCount);
                            entFeature.setDbSyncTime(savedSyncTime);

                            // This is all "distributed safe" (apart from the db_sync_xxx fields, and it doesn't matter if those are 
                            // out of date - the update will just be slightly out-of-date at worst) since (otherwise) these fields are 
                            // only set here, and the findAndModify is atomic

                            // (Do in raw MongoDB for performance)
                            BasicDBObject baseFields = new BasicDBObject();
                            baseFields.put(EntityFeaturePojo.dimension_, entFeature.getDimension().toString());
                            baseFields.put(EntityFeaturePojo.type_, entFeature.getType());
                            baseFields.put(EntityFeaturePojo.disambiguated_name_,
                                    entFeature.getDisambiguatedName());
                            baseFields.put(EntityFeaturePojo.db_sync_doccount_, entFeature.getDbSyncDoccount());
                            baseFields.put(EntityFeaturePojo.db_sync_prio_, 1000.0);
                            baseFields.put(EntityFeaturePojo.db_sync_time_, entFeature.getDbSyncTime());
                            if ((null != entFeature.getSemanticLinks())
                                    && !entFeature.getSemanticLinks().isEmpty()) {
                                baseFields.put(EntityFeaturePojo.linkdata_, entFeature.getSemanticLinks());
                            }

                            //attempt to add geotag (makes necessary checks on util side)
                            //also add ontology type if geotag is found
                            EntityGeotagAggregationUtils.addEntityGeo(entFeature);
                            if (entFeature.getGeotag() != null) {
                                BasicDBObject geo = new BasicDBObject(GeoPojo.lat_, entFeature.getGeotag().lat);
                                geo.put(GeoPojo.lon_, entFeature.getGeotag().lon);
                                baseFields.put(EntityFeaturePojo.geotag_, geo);

                                if (entFeature.getOntology_type() != null) {
                                    baseFields.put(EntityFeaturePojo.ontology_type_, entFeature.getOntology_type());
                                }
                            }
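                            // (the geotag is stored as a { "lat": ..., "lon": ... } subdocument, assuming the
                            //  GeoPojo.lat_/lon_ constants resolve to those field names - illustration only)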

                            if (!_diagnosticMode) {
                                // Store the object
                                col.update(query, new BasicDBObject(MongoDbManager.set_, baseFields));
                            } else {
                                if (_logInDiagnosticMode)
                                    System.out.println("EntityAggregationUtils.updateEntityFeatures, not found: "
                                            + query.toString() + ": " + baseFields.toString());
                            }

                        } //(end first time this feature seen - globally)

                        if (null == cachedEnt) { // First time we've seen this locally, so add to cache
                            currCache.addCachedEntityFeature(entFeature);
                            if (_diagnosticMode) {
                                if (_logInDiagnosticMode)
                                    System.out
                                            .println("EntityAggregationUtils.updateEntityFeatures, added to cache: "
                                                    + entFeature.toDb());
                            }
                        } //TESTED (by hand)                     

                    } //(end if community id assigned)
                } catch (Exception e) {
                    // Exception, remove from feature list
                    it.remove();

                    // If an exception occurs log the error
                    logger.error("Exception Message: " + e.getMessage(), e);
                }

            } // (end loop over communities)
        } // (end loop over indexes)

        if ((numCacheHits > 0) || (numCacheMisses > 0)) { // ie some ents were grabbed
            int cacheSize = 0;
            if (null != currCache) {
                cacheSize = currCache.getEntityCacheSize();
            }
            StringBuffer logMsg = new StringBuffer() // (should append key, but don't have that...)
                    .append(" ent_agg_time_ms=").append(new Date().getTime() - entityAggregationTime)
                    .append(" total_ents=").append(entFeatures.size()).append(" new_ents=").append(numNewEntities)
                    .append(" cache_misses=").append(numCacheMisses).append(" cache_hits=").append(numCacheHits)
                    .append(" cache_size=").append(cacheSize);

            logger.info(logMsg.toString());
        }

    }//TESTED (just by eye - made a few changes during refactoring)

    ///////////////////////////////////////////////////////////////////////////////////////   
    ///////////////////////////////////////////////////////////////////////////////////////   

    // DOCUMENT UPDATE

    ///////////////////////////////////////////////////////////////////////////////////////   
    ///////////////////////////////////////////////////////////////////////////////////////   

    // NOTE: IMPORTANT that only entFeature attributes used are index/totalfreq/doccount
    // (this is relied upon by BackgroundAggregationThread)

    public static void updateMatchingEntities(EntityFeaturePojo entFeature, ObjectId communityId) {
        String index = entFeature.getIndex();
        long totalFreq = entFeature.getTotalfreq();
        long docCount = entFeature.getDoccount();

        try {
            DBCollection docDb = DbManager.getDocument().getMetadata();

            BasicDBObject query1 = new BasicDBObject();
            query1.put(EntityPojo.docQuery_index_, index);
            query1.put(DocumentPojo.communityId_, communityId);

            BasicDBObject multiopB = new BasicDBObject();
            multiopB.put(EntityPojo.docUpdate_totalfrequency_, totalFreq);
            multiopB.put(EntityPojo.docUpdate_doccount_, docCount);
            BasicDBObject multiopA = new BasicDBObject(MongoDbManager.set_, multiopB);
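            // query1 matches every document in this community that contains an entity with this index;
            // multiopA then $set-s the new community-wide totalfrequency/doccount onto that entity
            // (the exact field paths come from the EntityPojo docQuery_/docUpdate_ constants - presumably
            //  the embedded-entity paths with the positional operator, but that is an assumption here)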

            if (_diagnosticMode) {
                if (_logInDiagnosticMode)
                    System.out.println("EntityAggregationUtils.updateMatchingEntities: " + query1.toString() + " / "
                            + multiopA.toString());
            } else {
                synchronized (GenericProcessingController.class) {
                    // Because this op can be slow, and traverse a lot of disk, need to ensure that
                    // we don't allow all the threads to hammer it at once (the updates all yield to each other
                    // enough that the disk goes totally crazy)

                    docDb.update(query1, multiopA, false, true);
                    DbManager.getDocument().getLastError(DbManager.getDocument().getMetadata().getName());
                    // (enforce consecutive accesses for this potentially very slow operation)
                }

                // We originally checked updatedExisting, but for INF-1406 it sometimes seemed to be 
                // checking the wrong command. I suspect the reason we had this code in here has gone away,
                // and it doesn't matter if this update occasionally fails anyway (it will just be slightly out of date),
                // so the check/retry has been removed.
            }
        } catch (Exception ex) {
            logger.error(ex.getMessage(), ex);
        }
    }//TESTED (by eye, mostly cut-and-paste from test Beta)

    ///////////////////////////////////////////////////////////////////////////////////////   
    ///////////////////////////////////////////////////////////////////////////////////////   

    // INDEX SYNCHRONIZATION

    ///////////////////////////////////////////////////////////////////////////////////////   
    ///////////////////////////////////////////////////////////////////////////////////////   

    // Synchronization: sync vs index, update sync counts for each community

    public static void synchronizeEntityFeature(EntityFeaturePojo entityFeature, ObjectId communityId) {
        DBCollection entityFeatureDb = DbManager.getFeature().getEntity();

        // NOTE: Important that feeds update occurs before synchronization, since the sync "corrupts" the entity

        if (_diagnosticMode || (null != entityFeature.getDbSyncTime())
                || (null != entityFeature.getDb_sync_prio())) {
            // Else this is a new feature, so we don't need to update the feature DB, only the index
            // (if db_sync_prio is non-null then we have to update anyway, to avoid a b/g aggregation loop)
            // (note that db_sync_prio will in practice not be set when this is a new feature, because it will have the same sync_doccount as doc_count)

            long nCurrTime = System.currentTimeMillis();
            //(query from top of the function, basically lookup on gaz_index)
            BasicDBObject update2 = new BasicDBObject();
            update2.put(EntityFeaturePojo.db_sync_time_, Long.toString(nCurrTime));
            update2.put(EntityFeaturePojo.db_sync_doccount_, entityFeature.getDoccount());
            BasicDBObject update = new BasicDBObject(MongoDbManager.set_, update2);
            BasicDBObject update3 = new BasicDBObject(EntityFeaturePojo.db_sync_prio_, 1);
            update.put(MongoDbManager.unset_, update3);
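            // i.e. roughly { "$set":   { "db_sync_time": "<now ms>", "db_sync_doccount": <doccount> },
            //               "$unset": { "db_sync_prio": 1 } }
            // (operator/field names taken from the MongoDbManager/EntityFeaturePojo constants; shown for illustration)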
            BasicDBObject query = new BasicDBObject(EntityFeaturePojo.index_, entityFeature.getIndex());
            query.put(EntityFeaturePojo.communityId_, communityId);

            if (_diagnosticMode) {
                if ((null != entityFeature.getDbSyncTime()) || (null != entityFeature.getDb_sync_prio())) {
                    if (_logInDiagnosticMode)
                        System.out.println("EntityAggregationUtils.synchronizeEntityFeature, featureDB: "
                                + query.toString() + " / " + update.toString());
                } else {
                    if (_logInDiagnosticMode)
                        System.out.println(
                                "(WOULD NOT RUN) EntityAggregationUtils.synchronizeEntityFeature, featureDB: "
                                        + query.toString() + " / " + update.toString());
                }
            } else {
                entityFeatureDb.update(query, update, false, true);
            }
        }

        if (_diagnosticMode) {
            if (_logInDiagnosticMode)
                System.out.println("EntityAggregationUtils.synchronizeEntityFeature, synchronize: "
                        + new StringBuffer(entityFeature.getIndex()).append(':').append(communityId).toString()
                        + " = " + IndexManager.mapToIndex(entityFeature, new EntityFeaturePojoIndexMap()));
        } else {
            ElasticSearchManager esm = IndexManager.getIndex(EntityFeaturePojoIndexMap.indexName_);
            esm.addDocument(entityFeature, new EntityFeaturePojoIndexMap(), null, true);
            //(_id is set by the index map to index:communityId)
        }
    }//TESTED (by eye, mostly cut-and-paste from test Beta)

    ///////////////////////////////////////////////////////////////////////////////////////

    // Set flag to synchronize entity features

    public static void markEntityFeatureForSync(EntityFeaturePojo entityFeature, ObjectId communityId) {
        DBCollection entityFeatureDb = DbManager.getFeature().getEntity();
        double dPrio = 100.0 * (double) entityFeature.getDoccount()
                / (0.01 + (double) entityFeature.getDbSyncDoccount());
        entityFeature.setDb_sync_prio(dPrio);
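        // e.g. doccount=200 against dbSyncDoccount=100 gives dPrio = 100*200/(0.01+100) ~= 199.98,
        // so the entities whose counts have drifted furthest since their last sync get the highest priority
        // (the 0.01 just guards against division by zero when dbSyncDoccount is 0)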
        BasicDBObject query = new BasicDBObject(EntityFeaturePojo.index_, entityFeature.getIndex());
        query.put(EntityFeaturePojo.communityId_, communityId);
        BasicDBObject update = new BasicDBObject(MongoDbManager.set_,
                new BasicDBObject(EntityFeaturePojo.db_sync_prio_, dPrio));
        if (_diagnosticMode) {
            if (_logInDiagnosticMode)
                System.out.println("EntityAggregationUtils.markEntityFeatureForSynchronization, featureDB: "
                        + query.toString() + " / " + update.toString());
        } else {
            entityFeatureDb.update(query, update, false, true);
        }
    }//TESTED
}
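
Usage example

The class only exposes static helpers, so a harvest or aggregation driver builds the index -> (community id -> feature) map itself, calls updateEntityFeatures, and then calls synchronizeEntityFeature once it is ready to push the results to the index. The sketch below is not part of the original file: it assumes the Infinit.e MongoDB/Elasticsearch configuration has already been initialised elsewhere, that EntityFeaturePojo has a default constructor plus setIndex/setCommunityId setters matching the getters used above, and the entity values are purely illustrative.

// Illustrative driver only - placed in the same package so it can call EntityAggregationUtils directly
package com.ikanow.infinit.e.processing.generic.aggregation;

import java.util.HashMap;
import java.util.Map;

import org.bson.types.ObjectId;

import com.ikanow.infinit.e.data_model.store.feature.entity.EntityFeaturePojo;

public class EntityAggregationExample {
    public static void main(String[] args) {

        ObjectId communityId = new ObjectId();

        // Build a feature for one extracted entity (values are made up for illustration;
        // setIndex/setCommunityId are assumed to exist as counterparts of the getters used above,
        // and a real caller would also populate the dimension/type/disambiguated name fields
        // that updateEntityFeatures copies into newly created features)
        EntityFeaturePojo feature = new EntityFeaturePojo();
        feature.setIndex("barack obama/person");
        feature.setCommunityId(communityId);
        feature.setDoccount(3);
        feature.setTotalfreq(7);

        // Group by entity index, then by community id, as updateEntityFeatures expects
        Map<ObjectId, EntityFeaturePojo> byCommunity = new HashMap<ObjectId, EntityFeaturePojo>();
        byCommunity.put(communityId, feature);

        Map<String, Map<ObjectId, EntityFeaturePojo>> entFeatures = new HashMap<String, Map<ObjectId, EntityFeaturePojo>>();
        entFeatures.put(feature.getIndex(), byCommunity);

        // Diagnostic mode logs the MongoDB/index operations instead of executing them,
        // which is handy for dry-running the aggregation logic
        EntityAggregationUtils.setDiagnosticMode(true);
        EntityAggregationUtils.updateEntityFeatures(entFeatures);

        // Push the (now aggregated) feature's sync fields / index entry
        EntityAggregationUtils.synchronizeEntityFeature(feature, communityId);
    }
}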