com.ikanow.infinit.e.processing.generic.GenericProcessingController.java Source code

Introduction

Here is the source code for com.ikanow.infinit.e.processing.generic.GenericProcessingController.java
Source

/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project.
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package com.ikanow.infinit.e.processing.generic;

import java.util.HashMap;
import java.util.List;

//import org.apache.log4j.Logger;
import org.bson.types.ObjectId;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.ImmutableSettings.Builder;

import com.google.gson.Gson;
import com.ikanow.infinit.e.data_model.InfiniteEnums;
import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
import com.ikanow.infinit.e.data_model.index.IndexManager;
import com.ikanow.infinit.e.data_model.index.document.DocumentPojoIndexMap;
import com.ikanow.infinit.e.data_model.index.feature.entity.EntityFeaturePojoIndexMap;
import com.ikanow.infinit.e.data_model.index.feature.event.AssociationFeaturePojoIndexMap;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SourceHarvestStatusPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.custom.mapreduce.CustomMapReduceJobPojo;
import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
import com.ikanow.infinit.e.data_model.store.document.DocCountPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.ikanow.infinit.e.data_model.store.feature.association.AssociationFeaturePojo;
import com.ikanow.infinit.e.data_model.store.feature.entity.EntityFeaturePojo;
import com.ikanow.infinit.e.data_model.store.social.community.CommunityPojo;
import com.ikanow.infinit.e.processing.generic.aggregation.AggregationManager;
import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager;
import com.ikanow.infinit.e.processing.generic.utils.PropertiesManager;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;

import org.elasticsearch.action.admin.cluster.state.ClusterStateRequest;
import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
import org.elasticsearch.cluster.metadata.AliasMetaData;
import org.elasticsearch.common.collect.CrossVersionImmutableMapOfImmutableMaps;

//DEBUG (alias corruption)
//import org.elasticsearch.action.admin.indices.status.IndexStatus;
//import org.elasticsearch.action.admin.indices.status.IndicesStatusRequest;
//import org.elasticsearch.action.admin.indices.status.IndicesStatusResponse;

public class GenericProcessingController {

    //NOTE THIS FUNCTION SHOULD CONTAIN NO STATE SINCE IT CAN BE RUN ACROSS MULTIPLE THREADS

    //(Nothing currently to log)
    //private static final Logger logger = Logger.getLogger(GenericProcessingController.class);

    ///////////////////////////////////////////////////////////////////////////////////////
    //
    // Set up the databases and indexes

    /**
     * Runs the standard start-up sequence: first the database index set-up
     * ({@link #InitializeDatabase()}), then the search index set-up with every
     * "delete" flag switched off so that no existing data is removed.
     */
    public void Initialize() {
        // Database indexes first, search indexes second
        InitializeDatabase();

        // A plain start-up must never wipe documents or features
        final boolean deleteNothing = false;
        InitializeIndex(deleteNothing, deleteNothing, deleteNothing);
    }

    /**
     * Brings the database collections' indexes into the state the platform expects.
     * <p>
     * The work is done in three stages, in this order:
     * <ol>
     * <li>drop legacy indexes that are no longer used;</li>
     * <li>make sure the compound indexes used as shard keys exist;</li>
     * <li>make sure every other index (duplicate checking, document updates, entity and
     *     association features, geo lookup, sources, communities, shares, cookies and
     *     custom jobs) exists.</li>
     * </ol>
     *
     * @throws RuntimeException if any step fails; the message is the message of the
     *         underlying exception and the underlying exception is attached as the cause
     */
    public void InitializeDatabase() {
        // Add indices:
        try {
            PropertiesManager pm = new PropertiesManager();

            ////////////////////////
            //
            // Remove old indexes, mostly just old code that is no longer needed
            //
            dropIndexIfItExists(DbManager.getDocument().getContent(), CompressedFullTextPojo.url_, 1);
            dropIndexIfItExists(DbManager.getDocument().getContent(), CompressedFullTextPojo.sourceKey_, 2);
            dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.sourceUrl_, 1);
            dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.sourceKey_, 1);
            dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.title_, 1);
            // (Title simply not needed, that was a mistake from an early iteration)
            dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.updateId_, 1);
            dropIndexIfItExists(DbManager.getSocial().getShare(), "type", 1);
            dropIndexIfItExists(DbManager.getSocial().getCookies(), "apiKey", 1);
            dropIndexIfItExists(DbManager.getCustom().getLookup(), CustomMapReduceJobPojo.jobidS_, 2);
            dropIndexIfItExists(DbManager.getCustom().getLookup(), CustomMapReduceJobPojo.waitingOn_, 2);
            // (see shard keys below, these legacy ones can appear if the DB is restored from a different machine's backup)
            // The legacy index is only dropped when its replacement is already present
            dropIndexIfNotNeeded(DbManager.getDocument().getContent(), "sourceKey_1_url_1", 0, "sourceKey_2_url_2",
                    0);
            dropIndexIfNotNeeded(DbManager.getDocument().getMetadata(), "sourceKey_1__id_1", 0,
                    "sourceKey_1__id_-1", 0);

            ////////////////////////
            //
            // Indexes needed for sharding:
            //
            // ** Content (has changed a bit)
            BasicDBObject compIndex = new BasicDBObject(CompressedFullTextPojo.sourceKey_, 1);
            compIndex.put(CompressedFullTextPojo.url_, 1);
            // (the new index is only created when the legacy one named here is absent)
            addIndexIfNeeded(DbManager.getDocument().getContent(), "sourceKey_2_url_2", 0, compIndex); // (remove legacy 2_2 and replace with 1_1, which supports shards)
            // ** Metadata
            // Add {_id:1} to "standalone" sourceKey, sort docs matching source key by "time" (sort of!)
            compIndex = new BasicDBObject(DocumentPojo.sourceKey_, 1);
            compIndex.put(DocumentPojo._id_, 1);
            addIndexIfNeeded(DbManager.getDocument().getMetadata(), "sourceKey_1__id_-1", 0, compIndex); // (remove legacy 1_-1 and replace with 1_1, which supports shards)
            // ** Entities and associations
            DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.index_, 1));
            DbManager.getFeature().getAssociation()
                    .ensureIndex(new BasicDBObject(AssociationFeaturePojo.index_, 1));

            ////////////////////////
            //
            // Other indexes
            //
            // Needed to handle updates of large files containing many URLs:
            DbManager.getDocument().getMetadata().ensureIndex(new BasicDBObject(DocumentPojo.sourceUrl_, 2),
                    new BasicDBObject(MongoDbManager.sparse_, true));
            //^NOTE: if this index changes, also need to change DuplicateManager_Integrated - search for "sourceUrl_" to see where
            //TODO (INF-1922): at some point should look into making (sparse) sourceUrl be compounded with sourceKey - this is a bit risky

            // Needed for duplicate checking
            // (Compound index lets me access {url, sourceKey}, {url} efficiently ... but need sourceKey separately to do {sourceKey})
            compIndex = new BasicDBObject(DocumentPojo.url_, 1);
            compIndex.put(DocumentPojo.sourceKey_, 1);
            DbManager.getDocument().getMetadata().ensureIndex(compIndex);
            // Needed to handle document updates
            DbManager.getDocument().getMetadata().ensureIndex(new BasicDBObject(DocumentPojo.updateId_, 2),
                    new BasicDBObject(MongoDbManager.sparse_, true));
            // Needed to update documents' entities' doc counts
            // (skipped entirely when aggregation is disabled in the properties)
            if (!pm.getAggregationDisabled()) {
                compIndex = new BasicDBObject(EntityPojo.docQuery_index_, 1);
                compIndex.put(DocumentPojo.communityId_, 1);
                DbManager.getDocument().getMetadata().ensureIndex(compIndex);
            }
            // Needed for keeping source/community doc counts
            compIndex = new BasicDBObject(DocCountPojo._id_, 1);
            compIndex.put(DocCountPojo.doccount_, 1);
            DbManager.getDocument().getCounts().ensureIndex(compIndex);
            // Needed for keeping track of entities
            DbManager.getFeature().getEntity()
                    .ensureIndex(new BasicDBObject(EntityFeaturePojo.disambiguated_name_, 1));
            DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.alias_, 1));
            // Needed for background re-calculation
            DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.db_sync_prio_, 2),
                    new BasicDBObject(MongoDbManager.sparse_, true));
            DbManager.getFeature().getAssociation().ensureIndex(
                    new BasicDBObject(AssociationFeaturePojo.db_sync_prio_, 2),
                    new BasicDBObject(MongoDbManager.sparse_, true));
            // Needed for geo-location in the entity pipeline
            DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("country", 1));
            DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("search_field", 1));
            DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("geoindex", "2d"));
            // Needed for source management
            DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourcePojo.key_, 1));
            DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourcePojo.communityIds_, 1));
            DbManager.getIngest().getSource()
                    .ensureIndex(new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvested_, 1));
            DbManager.getIngest().getSource()
                    .ensureIndex(new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_synced_, 1));
            DbManager.getIngest().getSource()
                    .ensureIndex(new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvest_status_, 1));
            // Federated query engine
            DbManager.getIngest().getSource().ensureIndex(
                    new BasicDBObject(SourcePojo.federatedQueryCommunityIds_, 1),
                    new BasicDBObject(MongoDbManager.sparse_, true));
            // Communities:
            DbManager.getSocial().getCommunity().ensureIndex(new BasicDBObject("members._id", 1));

            // Searching shares
            // Compound index lets me access {type, communities._id}, {type} efficiently
            compIndex = new BasicDBObject("type", 1);
            compIndex.put("communities._id", 1);
            DbManager.getSocial().getShare().ensureIndex(compIndex);
            // User logins
            DbManager.getSocial().getCookies().ensureIndex(new BasicDBObject("apiKey", 2),
                    new BasicDBObject(MongoDbManager.sparse_, true));
            // Custom job scheduling
            DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobtitle_, 1));
            //TODO (): MOVE THESE TO SPARSE INDEXES AFTER YOU'VE UPDATED THE LOGIC (SWAP THE 1 AND 2)
            DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobidS_, 1),
                    new BasicDBObject(MongoDbManager.sparse_, false));
            //         DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobidS_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
            //         dropIndexIfItExists(DbManager.getCustom().getLookup(),CustomMapReduceJobPojo.jobidS_, 1);
            DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.waitingOn_, 1),
                    new BasicDBObject(MongoDbManager.sparse_, false));
            //         DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.waitingOn_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
            //         dropIndexIfItExists(DbManager.getCustom().getLookup(),CustomMapReduceJobPojo.waitingOn_, 1);
        } catch (Exception e) {
            e.printStackTrace();
            // Keep the original exception as the cause so its type and stack trace are not lost
            throw new RuntimeException(e.getMessage(), e);
        }
    }//TESTED (not changed since by-eye test in Beta)

    // Some *DB* index utilities (note: not Lucene index)

    /**
     * Creates {@code newIndex} on {@code coll} unless the collection already reports an
     * index whose name matches the one being checked.
     * <p>
     * The name looked for is {@code indexToCheck}, with {@code "_" + nIndexIndex} appended
     * when {@code nIndexIndex} is non-zero; names are compared ignoring case. Any exception
     * raised while creating the index is deliberately ignored (best effort).
     *
     * @param coll         collection whose indexes are inspected and, if needed, extended
     * @param indexToCheck base name of the index whose presence suppresses the creation
     * @param nIndexIndex  numeric suffix for the name, or 0 for no suffix
     * @param newIndex     key specification of the index to create
     */
    private static void addIndexIfNeeded(DBCollection coll, String indexToCheck, int nIndexIndex,
            BasicDBObject newIndex) {
        final StringBuilder nameBuilder = new StringBuilder(indexToCheck);
        if (nIndexIndex != 0) {
            nameBuilder.append('_').append(nIndexIndex);
        }
        final String expectedName = nameBuilder.toString();

        for (DBObject indexInfo : coll.getIndexInfo()) {
            final String existingName = (String) indexInfo.get("name");
            if (expectedName.equalsIgnoreCase(existingName)) {
                // Already there: nothing to create
                return;
            }
        }

        // Not found among the existing indexes, so create it
        try {
            coll.ensureIndex(newIndex);
        } catch (Exception ignored) {
            // best effort: a failure to create the index is not propagated
        }
    }//TESTED

    /**
     * Drops one index from {@code coll}, but only when another index is present as well.
     * <p>
     * Both names are built the same way: the base name, with {@code "_" + n} appended when
     * the matching numeric argument is non-zero. Names are compared ignoring case. Any
     * exception raised while dropping the index is deliberately ignored (best effort).
     *
     * @param coll                collection whose indexes are inspected
     * @param indexToCheck        base name of the index that must exist for the drop to happen
     * @param nIndexToCheckIndex  numeric suffix for {@code indexToCheck}, or 0 for none
     * @param indexToDelete       base name of the index to drop
     * @param nIndexToDeleteIndex numeric suffix for {@code indexToDelete}, or 0 for none
     */
    private static void dropIndexIfNotNeeded(DBCollection coll, String indexToCheck, int nIndexToCheckIndex,
            String indexToDelete, int nIndexToDeleteIndex) {
        final StringBuilder requiredBuilder = new StringBuilder(indexToCheck);
        if (nIndexToCheckIndex != 0) {
            requiredBuilder.append('_').append(nIndexToCheckIndex);
        }
        final String requiredName = requiredBuilder.toString();

        final StringBuilder obsoleteBuilder = new StringBuilder().append(indexToDelete);
        if (nIndexToDeleteIndex != 0) {
            obsoleteBuilder.append('_').append(nIndexToDeleteIndex);
        }
        final String obsoleteName = obsoleteBuilder.toString();

        boolean requiredPresent = false;
        boolean obsoletePresent = false;
        for (DBObject indexInfo : coll.getIndexInfo()) {
            final String existingName = (String) indexInfo.get("name");
            if (requiredName.equalsIgnoreCase(existingName)) {
                requiredPresent = true;
            } else if (obsoleteName.equalsIgnoreCase(existingName)) {
                obsoletePresent = true;
            }
        }

        if (!requiredPresent || !obsoletePresent) {
            return; // either nothing to drop, or the other index isn't in place
        }
        try {
            coll.dropIndex(obsoleteName);
        } catch (Exception ignored) {
            // best effort: a failure to drop the index is not propagated
        }
    }//TESTED

    /**
     * Drops every index of {@code coll} whose name matches the requested one.
     * <p>
     * The name looked for is {@code indexName}, with {@code "_" + nIndexIndex} appended when
     * {@code nIndexIndex} is non-zero; names are compared ignoring case and the drop uses the
     * name exactly as the collection reports it. Any exception raised while dropping is
     * deliberately ignored (best effort).
     *
     * @param coll        collection whose indexes are inspected
     * @param indexName   base name of the index to drop
     * @param nIndexIndex numeric suffix for the name, or 0 for no suffix
     */
    private void dropIndexIfItExists(DBCollection coll, String indexName, int nIndexIndex) {
        final StringBuilder nameBuilder = new StringBuilder(indexName);
        if (nIndexIndex != 0) {
            nameBuilder.append('_').append(nIndexIndex);
        }
        final String targetName = nameBuilder.toString();

        for (DBObject indexInfo : coll.getIndexInfo()) {
            final String existingName = (String) indexInfo.get("name");
            if (!targetName.equalsIgnoreCase(existingName)) {
                continue;
            }
            try {
                coll.dropIndex(existingName);
            } catch (Exception ignored) {
                // best effort: a failure to drop the index is not propagated
            }
        }
    }//TESTED

    /////////////////////////////////////////////////////////

    // Lucene index initialization

    // (Note some of the code below is duplicated in MongoDocumentTxfer, so make sure you sync changes)
    /**
     * Convenience overload of the four-argument {@code InitializeIndex} that does not
     * explicitly request a rebuild of the document indexes.
     *
     * @param bDeleteDocs          passed through unchanged
     * @param bDeleteEntityFeature passed through unchanged
     * @param bDeleteEventFeature  passed through unchanged
     */
    public void InitializeIndex(boolean bDeleteDocs, boolean bDeleteEntityFeature, boolean bDeleteEventFeature) {
        final boolean rebuildDocsIndex = false;
        InitializeIndex(bDeleteDocs, bDeleteEntityFeature, bDeleteEventFeature, rebuildDocsIndex);
    }

    /**
     * Creates the search indexes and aliases used by the platform.
     * <p>
     * Steps, in order:
     * <ol>
     * <li>Fail immediately if {@code ElasticSearchManager.pingIndex(null, null)} reports false.</li>
     * <li>Unless aggregation is disabled in the properties, create the association ("event")
     *     and entity feature indexes and their aliases, deleting and re-creating each one
     *     when the corresponding flag is set.</li>
     * <li>Ping the global document index; if the ping fails, wait one minute and ping again.
     *     A second failure forces a rebuild of the per-community indexes.</li>
     * <li>If the ping succeeded and no alias information has been cached yet, read the aliases
     *     from the cluster state and force a rebuild when the global document alias is absent.</li>
     * <li>Create the global, "many geo" and two hardwired community document indexes.</li>
     * <li>When a rebuild or a document delete is required, visit every community twice:
     *     the first pass handles only communities without a parent, the second handles the rest.</li>
     * </ol>
     *
     * @param bDeleteDocs          delete and re-create the document indexes
     * @param bDeleteEntityFeature delete and re-create the entity feature index
     * @param bDeleteEventFeature  delete and re-create the association feature index
     * @param bRebuildDocsIndex    force the pass over all communities even if nothing is deleted
     * @throws RuntimeException if the index is unavailable or any step fails; the message is
     *         that of the underlying exception, which is attached as the cause
     */
    public void InitializeIndex(boolean bDeleteDocs, boolean bDeleteEntityFeature, boolean bDeleteEventFeature,
            boolean bRebuildDocsIndex) {

        try { //create elasticsearch indexes

            if (!ElasticSearchManager.pingIndex(null, null)) {
                throw new RuntimeException("Index is red, disable indexing operations");
            } //TESTED

            PropertiesManager pm = new PropertiesManager();

            if (!pm.getAggregationDisabled()) {

                boolean languageNormalization = pm.getNormalizeEncoding();

                // Settings for the association ("event") feature index
                Builder localSettingsEvent = ImmutableSettings.settingsBuilder();
                localSettingsEvent.put("number_of_shards", 10).put("number_of_replicas", 2);
                localSettingsEvent.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
                if (languageNormalization) {
                    localSettingsEvent.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "icu_normalizer",
                            "icu_folding", "standard", "lowercase");
                } else {
                    localSettingsEvent.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard",
                            "lowercase");
                }

                // Settings for the entity feature index (same values as above)
                Builder localSettingsGaz = ImmutableSettings.settingsBuilder();
                localSettingsGaz.put("number_of_shards", 10).put("number_of_replicas", 2);
                localSettingsGaz.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
                if (languageNormalization) {
                    localSettingsGaz.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "icu_normalizer",
                            "icu_folding", "standard", "lowercase");
                } else {
                    localSettingsGaz.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard",
                            "lowercase");
                }

                //event feature
                String eventGazMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(),
                        AssociationFeaturePojoIndexMap.Mapping.class);
                ElasticSearchManager eventIndex = IndexManager.createIndex(
                        AssociationFeaturePojoIndexMap.indexName_, null, false, null, eventGazMapping,
                        localSettingsEvent);
                if (null == eventIndex) { // (if has been previously referenced in this process space)
                    eventIndex = IndexManager.getIndex(AssociationFeaturePojoIndexMap.indexName_);
                }
                eventIndex.createAlias(AssociationFeaturePojoIndexMap.indexCollectionName_);
                if (bDeleteEventFeature) {
                    // NOTE(review): the alias is created before the delete and not re-created
                    // afterwards - confirm that createIndex/deleteMe handle this as intended
                    eventIndex.deleteMe();
                    eventIndex = IndexManager.createIndex(AssociationFeaturePojoIndexMap.indexName_, null, false,
                            null, eventGazMapping, localSettingsEvent);
                }
                //entity feature
                String gazMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(),
                        EntityFeaturePojoIndexMap.Mapping.class);
                ElasticSearchManager entityIndex = IndexManager.createIndex(EntityFeaturePojoIndexMap.indexName_,
                        null, false, null, gazMapping, localSettingsGaz);
                if (null == entityIndex) { // (if has been previously referenced in this process space)
                    entityIndex = IndexManager.getIndex(EntityFeaturePojoIndexMap.indexName_);
                }
                entityIndex.createAlias(EntityFeaturePojoIndexMap.indexCollectionName_);
                if (bDeleteEntityFeature) {
                    entityIndex.deleteMe();
                    entityIndex = IndexManager.createIndex(EntityFeaturePojoIndexMap.indexName_, null, false, null,
                            gazMapping, localSettingsGaz);
                }
            }

            //DOCS - much more complicated than anything else

            boolean bPingMainIndexFailed = !ElasticSearchManager
                    .pingIndex(DocumentPojoIndexMap.globalDocumentIndex_);
            // (ie if main doc index doesn't exist then always rebuild all indexes)

            if (bPingMainIndexFailed) { // extra level of robustness... sleep for a minute then double check the index is really missing...
                try {
                    Thread.sleep(60000);
                } catch (Exception e) {
                    // (deliberately ignored: whatever happens, the index is simply re-checked below)
                }
                bPingMainIndexFailed = !ElasticSearchManager.pingIndex(DocumentPojoIndexMap.globalDocumentIndex_);
            }
            bRebuildDocsIndex |= bPingMainIndexFailed;

            // check the main index has the "collection" alias - if not then rebuild everything
            // (_aliasInfo is declared elsewhere in this class; it is only populated here when still null)

            if (!bPingMainIndexFailed && (null == _aliasInfo)) {
                ElasticSearchManager docIndex = ElasticSearchManager
                        .getIndex(DocumentPojoIndexMap.globalDocumentIndex_);
                ClusterStateResponse clusterState = docIndex.getRawClient().admin().cluster()
                        .state(new ClusterStateRequest()).actionGet();
                _aliasInfo = CrossVersionImmutableMapOfImmutableMaps
                        .getAliases(clusterState.getState().getMetaData());
                if (!_aliasInfo.containsKey(DocumentPojoIndexMap.globalDocumentIndexCollection_)) {
                    bRebuildDocsIndex = true;
                }
            } //TESTED

            createCommunityDocIndex(DocumentPojoIndexMap.globalDocumentIndex_, null, false, true, bDeleteDocs);
            createCommunityDocIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_, null, false, false, bDeleteDocs);

            // Some hardwired dummy communities
            createCommunityDocIndex("4e3706c48d26852237078005", null, true, false, bDeleteDocs); // (admin)
            createCommunityDocIndex("4e3706c48d26852237079004", null, true, false, bDeleteDocs); // (test user)
            // (create dummy index used to keep personal group aliases)

            if (bRebuildDocsIndex || bDeleteDocs) {

                // OK, going to have different shards for different communities:
                // Get a list of all the communities:

                BasicDBObject query = new BasicDBObject();
                BasicDBObject fieldsToDrop = new BasicDBObject("members", 0);
                fieldsToDrop.put("communityAttributes", 0);
                fieldsToDrop.put("userAttributes", 0);
                DBCursor dbc = DbManager.getSocial().getCommunity().find(query, fieldsToDrop);

                List<DBObject> tmparray = dbc.toArray(); // (brings the entire thing into memory so don't get cursor timeouts)
                int i = 0; // progress counter, shared by both passes below
                System.out.println("Initializing " + dbc.size() + " indexes:");
                // Two passes: j == 0 handles only communities without a parent, j == 1 handles the rest
                for (int j = 0; j < 2; ++j) {
                    for (DBObject dbotmp : tmparray) {
                        if ((++i % 100) == 0) {
                            System.out.println("Initialized " + i + " indexes.");
                        }
                        BasicDBObject dbo = (BasicDBObject) dbotmp;

                        // OK, going to see if there are any sources with this group id, create a new index if so:
                        // (Don't use CommunityPojo data model here for performance reasons....
                        //  (Also, haven't gotten round to porting CommunityPojo field access to using static fields))
                        ObjectId communityId = (ObjectId) dbo.get("_id");
                        boolean bPersonalGroup = dbo.getBoolean("isPersonalCommunity", false);
                        boolean bSystemGroup = dbo.getBoolean("isSystemCommunity", false);
                        ObjectId parentCommunityId = (ObjectId) dbo.get("parentId");

                        createCommunityDocIndex(communityId.toString(), parentCommunityId, bPersonalGroup,
                                bSystemGroup, bDeleteDocs, j == 0);

                    } //end loop over communities
                } // end loop over communities - first time parents only
            } // (end if need to do big loop over all sources)
        } catch (Exception e) {
            //DEBUG
            //e.printStackTrace();

            // Keep the original exception as the cause so its type and stack trace are not lost
            throw new RuntimeException(e.getMessage(), e);
        }
    }//TESTED (not changed since by-eye test in Beta - retested after moving code into createCommunityDocIndex below)

    ///////////////////////////////////////////////////////////////////////////////////////

    // Utility code for creating community indexes

    /**
     * Public entry point for creating the document index (or aliases) of a single community.
     * Checks that the search cluster responds, then delegates to the six-argument overload
     * with the "parents only" restriction switched off.
     *
     * @param nameOrCommunityIdStr community id as a string, or an explicit index name
     * @param parentCommunityId    id of the parent community, or null if there is none
     * @param bPersonalGroup       passed through unchanged
     * @param bSystemGroup         passed through unchanged
     * @param bClearIndex          passed through unchanged
     * @throws RuntimeException with message "Index not running" if the cluster ping fails
     */
    public static void createCommunityDocIndex(String nameOrCommunityIdStr, ObjectId parentCommunityId,
            boolean bPersonalGroup, boolean bSystemGroup, boolean bClearIndex) {
        final boolean clusterIsUp = ElasticSearchManager.pingCluster();
        if (!clusterIsUp) {
            throw new RuntimeException("Index not running");
        } //TESTED (by hand)

        final boolean parentsOnly = false;
        createCommunityDocIndex(nameOrCommunityIdStr, parentCommunityId, bPersonalGroup, bSystemGroup, bClearIndex,
                parentsOnly);
    }

    /**
     * Creates the document index and/or aliases for one community.
     * <p>
     * Two names are derived from {@code nameOrCommunityIdStr}: when it parses as an
     * {@code ObjectId} they are {@code doc_<id>} (used for indexing) and {@code docs_<id>}
     * (used for querying); otherwise the string itself is the index name and the query alias
     * is looked up from the well-known global / "many geo" names, falling back to replacing
     * the {@code doc_} / {@code document_} prefix with {@code docs_}.
     * <p>
     * What happens next depends on the kind of community:
     * <ul>
     * <li><b>personal</b>: both names become aliases of the shared dummy index;</li>
     * <li><b>no parent</b>: a real index is created (10 shards for a system group, otherwise 5),
     *     optionally deleted and re-created, and the query alias is attached to it;</li>
     * <li><b>has a parent</b> (skipped when {@code bParentsOnly} is set): both names become
     *     aliases of the root community's index.</li>
     * </ul>
     *
     * @param nameOrCommunityIdStr community id as a string, or an explicit index name
     * @param parentCommunityId    id of the parent community, or null if there is none
     * @param bPersonalGroup       true for a personal community
     * @param bSystemGroup         true for the system community (only affects the shard count)
     * @param bClearIndex          delete and re-create the index (top-level communities only)
     * @param bParentsOnly         when true, communities that have a parent are left untouched
     */
    protected static void createCommunityDocIndex(String nameOrCommunityIdStr, ObjectId parentCommunityId,
            boolean bPersonalGroup, boolean bSystemGroup, boolean bClearIndex, boolean bParentsOnly) {
        // Configuration and the (shared) document mapping
        final PropertiesManager props = new PropertiesManager();
        final boolean normalizeLanguage = props.getNormalizeEncoding();
        final int replicaCount = props.getMaxIndexReplicas();

        final String rawMapping = new Gson().toJson(new DocumentPojoIndexMap.Mapping(),
                DocumentPojoIndexMap.Mapping.class);
        final String docMapping = rawMapping.replace("__AMP__", "@");

        // indexingName: always a single index; queryAlias: will point to doc_commid, doc_commid_1, etc
        String indexingName = null;
        String queryAlias = null;
        try {
            final String canonicalId = new ObjectId(nameOrCommunityIdStr).toString();
            indexingName = "doc_" + canonicalId;
            queryAlias = "docs_" + canonicalId;
        } catch (Exception e) {
            // Not a community id, so treat the argument as an explicit index name
            indexingName = nameOrCommunityIdStr;
            if (DocumentPojoIndexMap.globalDocumentIndex_.equals(nameOrCommunityIdStr)) {
                queryAlias = DocumentPojoIndexMap.globalDocumentIndexCollection_;
            } else if (DocumentPojoIndexMap.manyGeoDocumentIndex_.equals(nameOrCommunityIdStr)) {
                queryAlias = DocumentPojoIndexMap.manyGeoDocumentIndexCollection_;
            } else { // fallback
                queryAlias = nameOrCommunityIdStr.replaceAll("doc(?:ument)?_", "docs_");
            }
            //TESTED
        }

        // --- Personal group: alias onto the dummy index and stop ---
        if (bPersonalGroup) {
            // Just create the dummy index, no different to getting it in practice
            Builder dummySettings = ImmutableSettings.settingsBuilder();
            dummySettings.put("number_of_shards", 1).put("number_of_replicas", 0); // (ie guaranteed to be local to each ES node)
            ElasticSearchManager dummyIndex = IndexManager.createIndex(DocumentPojoIndexMap.dummyDocumentIndex_,
                    DocumentPojoIndexMap.documentType_, false, null, docMapping, dummySettings);
            if (null == dummyIndex) {
                dummyIndex = IndexManager.getIndex(DocumentPojoIndexMap.dummyDocumentIndex_);
            }

            // Just create an alias, so that queries work arbitrarily:
            dummyIndex.createAlias(indexingName); // (at some point we should delete this alias, but leave it in for bw compatibility for now)
            dummyIndex.createAlias(queryAlias); // (never index dummy indices so only need query index)
            // (do nothing on delete since don't have any docs in here anyway)
            return;
        }

        // --- Top-level community: owns a real index ---
        if (null == parentCommunityId) {
            final int shardCount = bSystemGroup ? 10 : 5; // (system group is largest)

            Builder groupSettings = ImmutableSettings.settingsBuilder();
            groupSettings.put("number_of_shards", shardCount).put("number_of_replicas", replicaCount);
            if (normalizeLanguage) {
                groupSettings.put("index.analysis.analyzer.default.tokenizer", "standard");
                groupSettings.putArray("index.analysis.analyzer.default.filter", "icu_normalizer", "icu_folding",
                        "standard", "lowercase", "stop");
            } //TESTED

            ElasticSearchManager groupIndex = IndexManager.createIndex(indexingName,
                    DocumentPojoIndexMap.documentType_, false, null, docMapping, groupSettings);
            if (null == groupIndex) { // index has already been referenced, hence createIndex returns null
                groupIndex = IndexManager.getIndex(indexingName);
            }
            if (bClearIndex) {
                groupIndex.deleteMe();
                groupIndex = IndexManager.createIndex(indexingName, DocumentPojoIndexMap.documentType_, false,
                        null, docMapping, groupSettings);
            }
            if (null == groupIndex) {
                groupIndex = IndexManager.getIndex(indexingName);
            } else {
                try {
                    groupIndex.pingIndex(); // (wait until it's created itself)
                } catch (Exception ignored) {
                    // (just make sure this doesn't die horribly)
                }
            }
            if (null != groupIndex) { // should always be true
                groupIndex.createAlias(queryAlias);
                groupIndex.closeIndex();
            }
            return;
        }

        // --- Child community: aliases onto the root community's index ---
        if (bParentsOnly) {
            return;
        }
        final ObjectId rootCommunityId = getRootCommunity(parentCommunityId);
        if (null == rootCommunityId) {
            return;
        }
        final String rootIndexName = "doc_" + new ObjectId(rootCommunityId.toString()).toString();
        ElasticSearchManager rootIndex = IndexManager.getIndex(rootIndexName);

        //DEBUG (alias corruption)
        //               if (null == _aliasInfo) {
        //                  ClusterStateResponse clusterState = docIndex.getRawClient().admin().cluster().state(new ClusterStateRequest()).actionGet();
        //                  _aliasInfo = CrossVersionImmutableMapOfImmutableMaps.getAliases(clusterState.getState().getMetaData());
        //               }
        //               else {
        //                  if (_aliasInfo.containsKey(sGroupIndex)) { // has no aliases, we're not good
        //                     return;
        //                  }
        //                  else {
        //                     //DEBUG
        //                     System.out.println("Alias " + sGroupIndex + " has no aliases (but should)");
        //                     ElasticSearchManager docIndex2 = IndexManager.getIndex(sGroupIndex);
        //                     docIndex2.deleteMe();
        //                  }
        //               }

        rootIndex.createAlias(indexingName); // for indexing
        // (this is going to be tricky when the functionality is fully implemented
        //  because it will need to handle the parent index splitting)
        rootIndex.createAlias(queryAlias); // for queries
        rootIndex.closeIndex();
        // (do nothing on delete - that will be handled at the parent index level)
        //TESTED (parents, children, and personal + docs_ aliases)
    }
    //TESTED (including new docs_ alias)

    ///////////////////////////

    // (this utility function is needed for the legacy case where empty communities were
    //  treated as aliases of the dummy community ... first time I encounter a community, I need
    //  to recreate it...)

    /**
     * Rebuilds the document index of a community whose personal/system/parent settings are not
     * known by the caller: the community record is loaded from the social DB and its fields are
     * used to drive the delete/create calls.
     * Does nothing if no community with the given id is found.
     *
     * @param communityId the _id of the community to look up
     * @param bDeleteFirst if true, the community's index is also deleted using its real settings
     *                     before being created again
     * @throws RuntimeException if the index cluster does not respond to a ping
     */
    public static void recreateCommunityDocIndex_unknownFields(ObjectId communityId, boolean bDeleteFirst) {
        if (!ElasticSearchManager.pingCluster()) {
            throw new RuntimeException("Index not running");
        } //TESTED (by c/p from createCommunityDocIndex)

        final BasicDBObject byId = new BasicDBObject("_id", communityId);
        final CommunityPojo community = CommunityPojo.fromDb(
                MongoDbManager.getSocial().getCommunity().findOne(byId), CommunityPojo.class);
        if (null == community) {
            return; // (unknown community, nothing to recreate)
        }
        final String communityIdStr = communityId.toString();

        // Legacy clean-up: empty communities used to be handled like "personal" ones, ie as aliases
        // of the dummy index, so always remove those aliases first
        // (this does nothing if it's already a real community)
        deleteCommunityDocIndex(communityIdStr, community.getParentId(), true);

        if (bDeleteFirst) {
            deleteCommunityDocIndex(communityIdStr, community.getParentId(), community.getIsPersonalCommunity());
        }
        createCommunityDocIndex(communityIdStr, community.getParentId(), community.getIsPersonalCommunity(),
                community.getIsSystemCommunity(), false);
    }
    //TESTED

    ///////////////////////////

    /**
     * Removes the document index, or the aliases standing in for it, of a community, then removes
     * any open records indexes belonging to that community.
     * <ul>
     * <li>Personal group: both aliases are removed from the dummy document index.</li>
     * <li>Community with a parent: both aliases are removed from the index of the root of the
     *     parent hierarchy.</li>
     * <li>Otherwise: the community's own index is deleted.</li>
     * </ul>
     *
     * @param nameOrCommunityIdStr either a community id (parseable as an ObjectId) or an index name
     * @param parentCommunityId the community's parent, or null if it has none
     * @param bPersonalGroup true if the community is a personal group
     * @throws RuntimeException if the index cluster does not respond to a ping
     */
    public static void deleteCommunityDocIndex(String nameOrCommunityIdStr, ObjectId parentCommunityId,
            boolean bPersonalGroup) {
        if (!ElasticSearchManager.pingCluster()) {
            throw new RuntimeException("Index not running");
        } //TESTED (by c/p from createCommunityDocIndex)

        // Resolve the two names involved:
        // - indexingName: used for indexing, ie always a single index
        // - queryName: used for querying, ie will point to doc_commid, doc_commid_1, etc
        String indexingName = null;
        String queryName = null;
        ObjectId parsedId = null;
        try {
            parsedId = new ObjectId(nameOrCommunityIdStr);
            final String idStr = parsedId.toString();
            indexingName = "doc_" + idStr;
            queryName = "docs_" + idStr;
        } catch (Exception notAnObjectId) {
            // Not a community id, so treat the string as an index name
            indexingName = nameOrCommunityIdStr;
            if (DocumentPojoIndexMap.globalDocumentIndex_.equals(nameOrCommunityIdStr)) {
                queryName = DocumentPojoIndexMap.globalDocumentIndexCollection_;
            } else if (DocumentPojoIndexMap.manyGeoDocumentIndex_.equals(nameOrCommunityIdStr)) {
                queryName = DocumentPojoIndexMap.manyGeoDocumentIndexCollection_;
            } else { // fallback
                queryName = nameOrCommunityIdStr.replaceAll("doc(?:ument)?_", "docs_");
            }
            //TESTED
        }

        if (bPersonalGroup) {
            // Personal groups only exist as aliases on the dummy index
            final ElasticSearchManager dummyIndex = IndexManager.getIndex(DocumentPojoIndexMap.dummyDocumentIndex_);
            dummyIndex.removeAlias(queryName);
            dummyIndex.removeAlias(indexingName);
        } else if (null == parentCommunityId) {
            // Stand-alone community: it owns a real index
            IndexManager.getIndex(indexingName).deleteMe();
        } else {
            // Child community: its names are aliases on the root community's index
            final ObjectId rootId = getRootCommunity(parentCommunityId);
            if (null != rootId) {
                final ElasticSearchManager rootIndex = IndexManager.getIndex("doc_" + rootId.toString());
                rootIndex.removeAlias(indexingName);
                rootIndex.removeAlias(queryName);
                rootIndex.closeIndex();
            }
        }
        //TESTED (parent, children, and personal)

        // Also need to delete any records indexes, which is only possible when we have a community id:
        if (null == parsedId) {
            return;
        }

        // It's a bit more complex because we're not exactly sure which indexes exist, so list the
        // open indexes and match on prefix.
        // (On ES-1.0+ the cluster state request could instead be restricted to "recs_<id>" and
        //  "recs_t_<id>*" via prepareState().setIndices(..), which is more efficient, but that
        //  code ONLY WORKS ON ES-1.0+)
        final ElasticSearchManager anyIndex = ElasticSearchManager
                .getIndex(DocumentPojoIndexMap.globalDocumentIndex_);
        // (just something that's guaranteed to exist)

        final String communityIdStr = parsedId.toString();
        final String stashedPrefix = "recs_" + communityIdStr;
        final String timeSlicedPrefix = "recs_t_" + communityIdStr;

        final ClusterStateResponse state = anyIndex.getRawClient().admin().cluster()
                .state(new ClusterStateRequest()).actionGet();
        final String[] openIndices = state.getState().getMetaData().getConcreteAllOpenIndices();
        for (int i = 0; i < openIndices.length; ++i) {
            final String candidate = openIndices[i];
            final boolean isRecordsIndex = candidate.startsWith(stashedPrefix)
                    || candidate.startsWith(timeSlicedPrefix);
            if (!isRecordsIndex) {
                continue;
            }
            ElasticSearchManager.getIndex(candidate).deleteMe();
        } //TESTED
    }
    //TESTED (personal and system)

    ///////////////////////////

    // Utility function to get the root community of a community hierarchy, since you can't add aliases to aliases

    /**
     * Walks up the "parentId" chain of the community collection, starting from the given
     * community, and returns the top-most community reached.
     * The walk stops, returning the community reached so far, when:
     * <ul>
     * <li>the current community cannot be found in the DB (shouldn't ever happen),</li>
     * <li>the current community has no "parentId", or</li>
     * <li>the next parent has already been visited, ie the stored hierarchy contains a cycle
     *     of any length (shouldn't ever happen, but prevents an infinite loop).</li>
     * </ul>
     *
     * @param parentCommunityId the community to start from
     * @return the id of the last community reached before one of the stop conditions above
     */
    static ObjectId getRootCommunity(ObjectId parentCommunityId) {

        // Every community id seen so far; used to detect cycles, not just a community that is its own parent
        // (fully qualified so that no additional import is required)
        java.util.Set<ObjectId> visited = new java.util.HashSet<ObjectId>();

        for (;;) {
            visited.add(parentCommunityId);

            BasicDBObject query = new BasicDBObject("_id", parentCommunityId);
            BasicDBObject field = new BasicDBObject("parentId", 1); // (only need the parent id back)
            BasicDBObject retVal = (BasicDBObject) MongoDbManager.getSocial().getCommunity().findOne(query, field);
            if (null == retVal) { // (shouldn't ever happen)
                return parentCommunityId;
            }
            ObjectId tmp = retVal.getObjectId("parentId", null);
            if (null == tmp) { // (no more parents)
                return parentCommunityId;
            }
            if (visited.contains(tmp)) { // (shouldn't ever happen but will prevent infinite loop)
                return parentCommunityId;
            }
            parentCommunityId = tmp;
        }
    }//TESTED (cases where have and don't have parent id; multi-community cycle guard not yet tested)

    ///////////////////////////////////////////////////////////////////////////////////////
    //
    // Interface to handle scaleable indexes
    // Currently this is a dummy interface, but it will make it easy to split the indexes in the future

    // Cache of resolved document index names, keyed on the string that getIndex() looks up
    // (created lazily by getIndex() on the first document lookup)
    private static HashMap<String, String> _docIndexMap = null;
    // Name of the association feature index (filled in lazily by getIndex())
    private static String _assocIndex = null;
    // Name of the entity feature index (filled in lazily by getIndex())
    private static String _entityIndex = null;
    // NOTE(review): not referenced by any live code visible in this part of the file, only by
    //  commented-out alias-checking code - confirm whether it is still needed
    private static CrossVersionImmutableMapOfImmutableMaps<AliasMetaData> _aliasInfo = null;

    //TODO (INF-1136): Test and integrate this (phase 1), then implement the index splitting code (phase 2)

    /**
     * Maps a community id or index name onto the name of the index to use.
     * <ul>
     * <li>The entity and association feature index name constants (matched by reference, not by
     *     value) map onto themselves.</li>
     * <li>A string parseable as an ObjectId maps onto "doc_" + id.</li>
     * <li>The global and "many geo" document index names map onto their "collection" names.</li>
     * <li>Anything else has its "doc_" / "document_" parts stripped.</li>
     * </ul>
     * Document results are cached, so repeated lookups return the first value stored.
     *
     * @param communityIdOrIndexStr a community id, or an index name
     * @return the index name to use
     */
    public static synchronized String getIndex(String communityIdOrIndexStr) {
        // Entities
        if (communityIdOrIndexStr == EntityFeaturePojoIndexMap.indexName_) { // pointer == intended
            if (null == _entityIndex) {
                _entityIndex = EntityFeaturePojoIndexMap.indexName_;
            }
            return _entityIndex;
        }

        // Associations
        if (communityIdOrIndexStr == AssociationFeaturePojoIndexMap.indexName_) { // pointer == intended
            if (null == _assocIndex) {
                _assocIndex = AssociationFeaturePojoIndexMap.indexName_;
            }
            return _assocIndex;
        }

        // Documents
        if (null == _docIndexMap) {
            _docIndexMap = new HashMap<String, String>();
        }

        String cacheKey = communityIdOrIndexStr;
        String resolvedName;
        try {
            resolvedName = "doc_" + new ObjectId(communityIdOrIndexStr).toString();
        } catch (Exception notAnObjectId) {
            if (DocumentPojoIndexMap.globalDocumentIndex_.equals(communityIdOrIndexStr)) {
                resolvedName = DocumentPojoIndexMap.globalDocumentIndexCollection_;
            } else if (DocumentPojoIndexMap.manyGeoDocumentIndex_.equals(communityIdOrIndexStr)) {
                resolvedName = DocumentPojoIndexMap.manyGeoDocumentIndexCollection_;
            } else { // fallback
                resolvedName = communityIdOrIndexStr.replaceAll("doc(?:ument)?_", "");
            }
            // (for index names, as opposed to community ids, the cache is keyed on the resolved name)
            cacheKey = resolvedName;
        }

        final String cachedName = _docIndexMap.get(cacheKey);
        if (null != cachedName) {
            return cachedName;
        }
        _docIndexMap.put(cacheKey, resolvedName);
        return resolvedName;
    }
    //TOTEST (lots of cases)

    ///////////////////////////////////////////////////////////////////////////////////////
    //
    // Enrich and store documents (source is optional - can choose not to index if set)
    // (and remove any documents)

    /**
     * Convenience overload of the source-aware processDocuments, called with no source.
     *
     * @param harvestType the harvest type (compared against InfiniteEnums.DATABASE by the full version)
     * @param toAdd documents to add (includes the updated ones)
     * @param toUpdate_subsetOfAdd the subset of toAdd that are updates of existing documents
     * @param toDelete documents to delete
     */
    public void processDocuments(int harvestType, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate_subsetOfAdd,
            List<DocumentPojo> toDelete) {
        final SourcePojo noSource = null;
        processDocuments(harvestType, toAdd, toUpdate_subsetOfAdd, toDelete, noSource);
    }

    /**
     * Removes deleted/updated documents, then aggregates (unless disabled in the properties) and
     * stores the new ones.
     * Note that toDelete is modified: the updated documents are appended to it.
     *
     * @param harvestType the harvest type; content is saved unless this is InfiniteEnums.DATABASE
     * @param toAdd documents to add, ie toAdd(old) + toUpdate
     * @param toUpdate_subsetOfAdd the subset of toAdd that are updates of existing documents
     * @param toDelete documents to delete
     * @param source the source the documents belong to (null is passed by the 4-argument overload)
     */
    public void processDocuments(int harvestType, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate_subsetOfAdd,
            List<DocumentPojo> toDelete, SourcePojo source) {
        final PropertiesManager props = new PropertiesManager();

        // Updates are handled as "delete then add":
        // - Delete (inc children, eg events) but get fields to keep (currently _id, created; in the future comments etc)
        // so the updated docs join the delete list (also overwriting "created" for updated docs, well all actually...)
        toDelete.addAll(toUpdate_subsetOfAdd);
        new StoreAndIndexManager().removeFromDatastore_byURL(toDelete, source);
        // (note: expands toDelete if any sourceUrl "docs" are present, see FileHarvester)

        // (Storing docs messes up the doc/event/entity objects, so that is left until after aggregation)

        final AggregationManager aggregator = props.getAggregationDisabled() ? null : new AggregationManager();
        final boolean saveContent = (harvestType != InfiniteEnums.DATABASE);

        if (null == aggregator) {
            // Aggregation disabled: just save the docs to MongoDb
            storeFeeds(toAdd, saveContent, source);
            return;
        }

        // Aggregation steps:
        // 1+2. Create aggregate entities/events ("features") and write them to the DB
        // (then can store the docs - doesn't matter that the event/entities have been modified by the aggregation)
        aggregator.doAggregation(toAdd, toDelete);
        aggregator.createOrUpdateFeatureEntries();

        // Save the aggregated statistics back to the docs' entity/event instances, then save the docs
        // (second argument of storeFeeds determines if content gets saved)
        aggregator.applyAggregationToDocs(toAdd);
        storeFeeds(toAdd, saveContent, source);

        // 3. (Scheduled for efficiency) Update all documents' frequencies based on new entities and events
        aggregator.runScheduledDocumentUpdates();

        // 4. (Scheduled for efficiency) Synchronize with index [after this, queries can find them - so (2) must have happened]
        // This needs to happen last because it "corrupts" the entities and events
        aggregator.runScheduledSynchronization();

    }//TESTED (by eye - logic is v simple)

    ///////////////////////////////////////////////////////////////////////////////////////
    //
    // STORAGE AND INDEXING
    //
    //////////////////////////////////////////////////////////////////////////////////////

    /**
     * Hands the documents to StoreAndIndexManager.addToDatastore; does nothing if the list is
     * null or empty.
     * 
     * @param docs list of documents to be added
     * @param bSaveContent passed straight through to addToDatastore (per the caller, determines
     *                     if content gets saved)
     * @param source passed straight through to addToDatastore
     */
    private void storeFeeds(List<DocumentPojo> docs, boolean bSaveContent, SourcePojo source) {
        if (null == docs || docs.isEmpty()) {
            return;
        }
        new StoreAndIndexManager().addToDatastore(docs, bSaveContent, source);
    }//TESTED (by eye)

    // See StoreAndIndexManager

    ///////////////////////////////////////////////////////////////////////////////////////
    //
    // AGGREGATION
    //
    //////////////////////////////////////////////////////////////////////////////////////

    // See AggregationManager

}