// Java tutorial
/******************************************************************************* * Copyright 2012, The Infinit.e Open Source Project. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. ******************************************************************************/ package com.ikanow.infinit.e.processing.generic; import java.util.HashMap; import java.util.List; //import org.apache.log4j.Logger; import org.bson.types.ObjectId; import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.ImmutableSettings.Builder; import com.google.gson.Gson; import com.ikanow.infinit.e.data_model.InfiniteEnums; import com.ikanow.infinit.e.data_model.index.ElasticSearchManager; import com.ikanow.infinit.e.data_model.index.IndexManager; import com.ikanow.infinit.e.data_model.index.document.DocumentPojoIndexMap; import com.ikanow.infinit.e.data_model.index.feature.entity.EntityFeaturePojoIndexMap; import com.ikanow.infinit.e.data_model.index.feature.event.AssociationFeaturePojoIndexMap; import com.ikanow.infinit.e.data_model.store.DbManager; import com.ikanow.infinit.e.data_model.store.MongoDbManager; import com.ikanow.infinit.e.data_model.store.config.source.SourceHarvestStatusPojo; import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo; import com.ikanow.infinit.e.data_model.store.custom.mapreduce.CustomMapReduceJobPojo; import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo; 
import com.ikanow.infinit.e.data_model.store.document.DocCountPojo; import com.ikanow.infinit.e.data_model.store.document.DocumentPojo; import com.ikanow.infinit.e.data_model.store.document.EntityPojo; import com.ikanow.infinit.e.data_model.store.feature.association.AssociationFeaturePojo; import com.ikanow.infinit.e.data_model.store.feature.entity.EntityFeaturePojo; import com.ikanow.infinit.e.data_model.store.social.community.CommunityPojo; import com.ikanow.infinit.e.processing.generic.aggregation.AggregationManager; import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager; import com.ikanow.infinit.e.processing.generic.utils.PropertiesManager; import com.mongodb.BasicDBObject; import com.mongodb.DBCollection; import com.mongodb.DBCursor; import com.mongodb.DBObject; import org.elasticsearch.action.admin.cluster.state.ClusterStateRequest; import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse; import org.elasticsearch.cluster.metadata.AliasMetaData; import org.elasticsearch.common.collect.CrossVersionImmutableMapOfImmutableMaps; //DEBUG (alias corruption) //import org.elasticsearch.action.admin.indices.status.IndexStatus; //import org.elasticsearch.action.admin.indices.status.IndicesStatusRequest; //import org.elasticsearch.action.admin.indices.status.IndicesStatusResponse; public class GenericProcessingController { //NOTE THIS FUNCTION SHOULD CONTAIN NO STATE SINCE IT CAN BE RUN ACROSS MULTIPLE THREADS //(Nothing currently to log) //private static final Logger logger = Logger.getLogger(GenericProcessingController.class); /////////////////////////////////////////////////////////////////////////////////////// // // Set up the databases and indexes public void Initialize() { InitializeDatabase(); InitializeIndex(false, false, false); // (Don't delete anything, obviously) } public void InitializeDatabase() { // Add indices: try { PropertiesManager pm = new PropertiesManager(); //////////////////////// // // Remove old 
indexes, mostly just old code that is no longer needed // dropIndexIfItExists(DbManager.getDocument().getContent(), CompressedFullTextPojo.url_, 1); dropIndexIfItExists(DbManager.getDocument().getContent(), CompressedFullTextPojo.sourceKey_, 2); dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.sourceUrl_, 1); dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.sourceKey_, 1); dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.title_, 1); // (Title simply not needed, that was a mistake from an early iteration) dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.updateId_, 1); dropIndexIfItExists(DbManager.getSocial().getShare(), "type", 1); dropIndexIfItExists(DbManager.getSocial().getCookies(), "apiKey", 1); dropIndexIfItExists(DbManager.getCustom().getLookup(), CustomMapReduceJobPojo.jobidS_, 2); dropIndexIfItExists(DbManager.getCustom().getLookup(), CustomMapReduceJobPojo.waitingOn_, 2); // (see shard keys below, these legacy ones can appear if the DB is restored from a different machine's backup) dropIndexIfNotNeeded(DbManager.getDocument().getContent(), "sourceKey_1_url_1", 0, "sourceKey_2_url_2", 0); dropIndexIfNotNeeded(DbManager.getDocument().getMetadata(), "sourceKey_1__id_1", 0, "sourceKey_1__id_-1", 0); //////////////////////// // // Indexes needed for sharding: // // ** Content (has changed a bit) BasicDBObject compIndex = new BasicDBObject(CompressedFullTextPojo.sourceKey_, 1); compIndex.put(CompressedFullTextPojo.url_, 1); addIndexIfNeeded(DbManager.getDocument().getContent(), "sourceKey_2_url_2", 0, compIndex); // (remove legacy 2_2 and replace with 1_1, which supports shards) // ** Metadata // Add {_id:1} to "standalone" sourceKey, sort docs matching source key by "time" (sort of!) 
compIndex = new BasicDBObject(DocumentPojo.sourceKey_, 1); compIndex.put(DocumentPojo._id_, 1); addIndexIfNeeded(DbManager.getDocument().getMetadata(), "sourceKey_1__id_-1", 0, compIndex); // (remove legacy 1_-1 and replace with 1_1, which supports shards) // ** Entities and associations DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.index_, 1)); DbManager.getFeature().getAssociation() .ensureIndex(new BasicDBObject(AssociationFeaturePojo.index_, 1)); //////////////////////// // // Other indexes // // Needed to handle updates of large files containing many URLs: DbManager.getDocument().getMetadata().ensureIndex(new BasicDBObject(DocumentPojo.sourceUrl_, 2), new BasicDBObject(MongoDbManager.sparse_, true)); //^NOTE: if this index changes, also need to change DuplicateManager_Integrated - search for "sourceUrl_" to see where //TODO (INF-1922): at some point should look into making (sparse) sourceUrl be compounded with sourceKey - this is a bit risky // Needed for duplicate checking // (Compound index lets me access {url, sourceKey}, {url} efficiently ... 
but need sourceKey separately to do {sourceKey}) compIndex = new BasicDBObject(DocumentPojo.url_, 1); compIndex.put(DocumentPojo.sourceKey_, 1); DbManager.getDocument().getMetadata().ensureIndex(compIndex); // Needed to handle document updates DbManager.getDocument().getMetadata().ensureIndex(new BasicDBObject(DocumentPojo.updateId_, 2), new BasicDBObject(MongoDbManager.sparse_, true)); // Needed to update documents' entities' doc counts if (!pm.getAggregationDisabled()) { compIndex = new BasicDBObject(EntityPojo.docQuery_index_, 1); compIndex.put(DocumentPojo.communityId_, 1); DbManager.getDocument().getMetadata().ensureIndex(compIndex); } // Needed for keeping source/community doc counts compIndex = new BasicDBObject(DocCountPojo._id_, 1); compIndex.put(DocCountPojo.doccount_, 1); DbManager.getDocument().getCounts().ensureIndex(compIndex); // Needed for keep tracking of entities DbManager.getFeature().getEntity() .ensureIndex(new BasicDBObject(EntityFeaturePojo.disambiguated_name_, 1)); DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.alias_, 1)); // Needed for background re-calculation DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.db_sync_prio_, 2), new BasicDBObject(MongoDbManager.sparse_, true)); DbManager.getFeature().getAssociation().ensureIndex( new BasicDBObject(AssociationFeaturePojo.db_sync_prio_, 2), new BasicDBObject(MongoDbManager.sparse_, true)); // Needed for geo-location in the entity pipeline DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("country", 1)); DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("search_field", 1)); DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("geoindex", "2d")); // Needed for source management DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourcePojo.key_, 1)); DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourcePojo.communityIds_, 1)); DbManager.getIngest().getSource() 
.ensureIndex(new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvested_, 1)); DbManager.getIngest().getSource() .ensureIndex(new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_synced_, 1)); DbManager.getIngest().getSource() .ensureIndex(new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvest_status_, 1)); // Federated query engine DbManager.getIngest().getSource().ensureIndex( new BasicDBObject(SourcePojo.federatedQueryCommunityIds_, 1), new BasicDBObject(MongoDbManager.sparse_, true)); // Communities: DbManager.getSocial().getCommunity().ensureIndex(new BasicDBObject("members._id", 1)); // Searching shares // Compound index lets me access {type, communities._id}, {type} efficiently compIndex = new BasicDBObject("type", 1); compIndex.put("communities._id", 1); DbManager.getSocial().getShare().ensureIndex(compIndex); // User logins DbManager.getSocial().getCookies().ensureIndex(new BasicDBObject("apiKey", 2), new BasicDBObject(MongoDbManager.sparse_, true)); // Custom job scheduling DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobtitle_, 1)); //TODO (): MOVE THESE TO SPARSE INDEXES AFTER YOU'VE UPDATED THE LOGIC (SWAP THE 1 AND 2) DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobidS_, 1), new BasicDBObject(MongoDbManager.sparse_, false)); // DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobidS_, 2), new BasicDBObject(MongoDbManager.sparse_, true)); // dropIndexIfItExists(DbManager.getCustom().getLookup(),CustomMapReduceJobPojo.jobidS_, 1); DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.waitingOn_, 1), new BasicDBObject(MongoDbManager.sparse_, false)); // DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.waitingOn_, 2), new BasicDBObject(MongoDbManager.sparse_, true)); // 
dropIndexIfItExists(DbManager.getCustom().getLookup(),CustomMapReduceJobPojo.waitingOn_, 1); } catch (Exception e) { e.printStackTrace(); throw new RuntimeException(e.getMessage()); } }//TESTED (not changed since by-eye test in Beta) // Some *DB* index utilities (note note Lucene index) private static void addIndexIfNeeded(DBCollection coll, String indexToCheck, int nIndexIndex, BasicDBObject newIndex) { StringBuffer indexNameStrBuff = new StringBuffer(indexToCheck); if (0 != nIndexIndex) { indexNameStrBuff.append("_").append(nIndexIndex); } String indexName2 = indexNameStrBuff.toString(); List<DBObject> list = coll.getIndexInfo(); for (DBObject dbo : list) { String name = (String) dbo.get("name"); if (indexName2.equalsIgnoreCase(name)) { return; // no need to create a new index } } // If we're here then we didn't find the index so create a new index try { coll.ensureIndex(newIndex); } catch (Exception e) { } }//TESTED private static void dropIndexIfNotNeeded(DBCollection coll, String indexToCheck, int nIndexToCheckIndex, String indexToDelete, int nIndexToDeleteIndex) { StringBuffer indexNameStrBuff = new StringBuffer(indexToCheck); if (0 != nIndexToCheckIndex) { indexNameStrBuff.append("_").append(nIndexToCheckIndex); } String indexToCheck2 = indexNameStrBuff.toString(); indexNameStrBuff.setLength(0); indexNameStrBuff.append(indexToDelete); if (0 != nIndexToDeleteIndex) { indexNameStrBuff.append("_").append(nIndexToDeleteIndex); } boolean foundIndexToDelete = false; boolean foundIndexToCheck = false; String indexToDelete2 = indexNameStrBuff.toString(); List<DBObject> list = coll.getIndexInfo(); for (DBObject dbo : list) { String name = (String) dbo.get("name"); if (indexToCheck2.equalsIgnoreCase(name)) { foundIndexToCheck = true; } else if (indexToDelete2.equalsIgnoreCase(name)) { foundIndexToDelete = true; } } if (foundIndexToCheck && foundIndexToDelete) { try { coll.dropIndex(indexToDelete2); } catch (Exception e) { } } }//TESTED private void 
dropIndexIfItExists(DBCollection coll, String indexName, int nIndexIndex) { StringBuffer indexNameStrBuff = new StringBuffer(indexName); if (0 != nIndexIndex) { indexNameStrBuff.append("_").append(nIndexIndex); } String indexName2 = indexNameStrBuff.toString(); List<DBObject> list = coll.getIndexInfo(); for (DBObject dbo : list) { String name = (String) dbo.get("name"); if (indexName2.equalsIgnoreCase(name)) { try { coll.dropIndex(name); } catch (Exception e) { } } } }//TESTED ///////////////////////////////////////////////////////// // Lucene index initialization // (Note some of the code below is duplicated in MongoDocumentTxfer, so make sure you sync changes) public void InitializeIndex(boolean bDeleteDocs, boolean bDeleteEntityFeature, boolean bDeleteEventFeature) { InitializeIndex(bDeleteDocs, bDeleteEntityFeature, bDeleteEventFeature, false); } public void InitializeIndex(boolean bDeleteDocs, boolean bDeleteEntityFeature, boolean bDeleteEventFeature, boolean bRebuildDocsIndex) { try { //create elasticsearch indexes if (!ElasticSearchManager.pingIndex(null, null)) { throw new RuntimeException("Index is red, disable indexing operations"); } //TESTED PropertiesManager pm = new PropertiesManager(); if (!pm.getAggregationDisabled()) { boolean languageNormalization = pm.getNormalizeEncoding(); Builder localSettingsEvent = ImmutableSettings.settingsBuilder(); localSettingsEvent.put("number_of_shards", 10).put("number_of_replicas", 2); localSettingsEvent.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard"); if (languageNormalization) { localSettingsEvent.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "icu_normalizer", "icu_folding", "standard", "lowercase"); } else { localSettingsEvent.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase"); } Builder localSettingsGaz = ImmutableSettings.settingsBuilder(); localSettingsGaz.put("number_of_shards", 10).put("number_of_replicas", 2); 
localSettingsGaz.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard"); if (languageNormalization) { localSettingsGaz.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "icu_normalizer", "icu_folding", "standard", "lowercase"); } else { localSettingsGaz.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase"); } //event feature String eventGazMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(), AssociationFeaturePojoIndexMap.Mapping.class); ElasticSearchManager eventIndex = IndexManager.createIndex( AssociationFeaturePojoIndexMap.indexName_, null, false, null, eventGazMapping, localSettingsEvent); if (null == eventIndex) { // (if has been previously referenced in this process space) eventIndex = IndexManager.getIndex(AssociationFeaturePojoIndexMap.indexName_); } eventIndex.createAlias(AssociationFeaturePojoIndexMap.indexCollectionName_); if (bDeleteEventFeature) { eventIndex.deleteMe(); eventIndex = IndexManager.createIndex(AssociationFeaturePojoIndexMap.indexName_, null, false, null, eventGazMapping, localSettingsEvent); } //entity feature String gazMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(), EntityFeaturePojoIndexMap.Mapping.class); ElasticSearchManager entityIndex = IndexManager.createIndex(EntityFeaturePojoIndexMap.indexName_, null, false, null, gazMapping, localSettingsGaz); if (null == entityIndex) { // (if has been previously referenced in this process space) entityIndex = IndexManager.getIndex(EntityFeaturePojoIndexMap.indexName_); } entityIndex.createAlias(EntityFeaturePojoIndexMap.indexCollectionName_); if (bDeleteEntityFeature) { entityIndex.deleteMe(); entityIndex = IndexManager.createIndex(EntityFeaturePojoIndexMap.indexName_, null, false, null, gazMapping, localSettingsGaz); } } //DOCS - much more complicated than anything else boolean bPingMainIndexFailed = !ElasticSearchManager .pingIndex(DocumentPojoIndexMap.globalDocumentIndex_); // (ie if main doc 
index doesn't exist then always rebuild all indexes) if (bPingMainIndexFailed) { // extra level of robustness... sleep for a minute then double check the index is really missing... try { Thread.sleep(60000); } catch (Exception e) { } bPingMainIndexFailed = !ElasticSearchManager.pingIndex(DocumentPojoIndexMap.globalDocumentIndex_); } bRebuildDocsIndex |= bPingMainIndexFailed; // check the main index has the "collection" alias - if not then rebuild everything if (!bPingMainIndexFailed && (null == _aliasInfo)) { ElasticSearchManager docIndex = ElasticSearchManager .getIndex(DocumentPojoIndexMap.globalDocumentIndex_); ClusterStateResponse clusterState = docIndex.getRawClient().admin().cluster() .state(new ClusterStateRequest()).actionGet(); _aliasInfo = CrossVersionImmutableMapOfImmutableMaps .getAliases(clusterState.getState().getMetaData()); if (!_aliasInfo.containsKey(DocumentPojoIndexMap.globalDocumentIndexCollection_)) { bRebuildDocsIndex = true; } } //TESTED createCommunityDocIndex(DocumentPojoIndexMap.globalDocumentIndex_, null, false, true, bDeleteDocs); createCommunityDocIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_, null, false, false, bDeleteDocs); // Some hardwired dummy communities createCommunityDocIndex("4e3706c48d26852237078005", null, true, false, bDeleteDocs); // (admin) createCommunityDocIndex("4e3706c48d26852237079004", null, true, false, bDeleteDocs); // (test user) // (create dummy index used to keep personal group aliases) if (bRebuildDocsIndex || bDeleteDocs) { // OK, going to have different shards for different communities: // Get a list of all the communities: BasicDBObject query = new BasicDBObject(); BasicDBObject fieldsToDrop = new BasicDBObject("members", 0); fieldsToDrop.put("communityAttributes", 0); fieldsToDrop.put("userAttributes", 0); DBCursor dbc = DbManager.getSocial().getCommunity().find(query, fieldsToDrop); List<DBObject> tmparray = dbc.toArray(); // (brings the entire thing into memory so don't get cursor timeouts) int i = 
0; System.out.println("Initializing " + dbc.size() + " indexes:"); for (int j = 0; j < 2; ++j) { for (DBObject dbotmp : tmparray) { if ((++i % 100) == 0) { System.out.println("Initialized " + i + " indexes."); } BasicDBObject dbo = (BasicDBObject) dbotmp; // OK, going to see if there are any sources with this group id, create a new index if so: // (Don't use CommunityPojo data model here for performance reasons.... // (Also, haven't gotten round to porting CommunityPojo field access to using static fields)) ObjectId communityId = (ObjectId) dbo.get("_id"); boolean bPersonalGroup = dbo.getBoolean("isPersonalCommunity", false); boolean bSystemGroup = dbo.getBoolean("isSystemCommunity", false); ObjectId parentCommunityId = (ObjectId) dbo.get("parentId"); createCommunityDocIndex(communityId.toString(), parentCommunityId, bPersonalGroup, bSystemGroup, bDeleteDocs, j == 0); } //end loop over communities } // end loop over communities - first time parents only } // (end if need to do big loop over all sources) } catch (Exception e) { //DEBUG //e.printStackTrace(); throw new RuntimeException(e.getMessage()); } }//TESTED (not changed since by-eye test in Beta - retested after moving code into createCommunityDocIndex below) /////////////////////////////////////////////////////////////////////////////////////// // Utility code for creating community indexes public static void createCommunityDocIndex(String nameOrCommunityIdStr, ObjectId parentCommunityId, boolean bPersonalGroup, boolean bSystemGroup, boolean bClearIndex) { if (!ElasticSearchManager.pingCluster()) { throw new RuntimeException("Index not running"); } //TESTED (by hand) createCommunityDocIndex(nameOrCommunityIdStr, parentCommunityId, bPersonalGroup, bSystemGroup, bClearIndex, false); } protected static void createCommunityDocIndex(String nameOrCommunityIdStr, ObjectId parentCommunityId, boolean bPersonalGroup, boolean bSystemGroup, boolean bClearIndex, boolean bParentsOnly) { //create elasticsearch indexes 
PropertiesManager pm = new PropertiesManager(); boolean languageNormalization = pm.getNormalizeEncoding(); int nPreferredReplicas = pm.getMaxIndexReplicas(); String docMapping = new Gson() .toJson(new DocumentPojoIndexMap.Mapping(), DocumentPojoIndexMap.Mapping.class) .replace("__AMP__", "@"); String sGroupIndex = null; // for indexing, ie always a single index String sAliasIndex = null; // for querying, ie will point to doc_commid, doc_commid_1, etc try { sGroupIndex = new StringBuffer("doc_").append(new ObjectId(nameOrCommunityIdStr).toString()).toString(); sAliasIndex = new StringBuffer("docs_").append(new ObjectId(nameOrCommunityIdStr).toString()) .toString(); } catch (Exception e) { sGroupIndex = nameOrCommunityIdStr; if (DocumentPojoIndexMap.globalDocumentIndex_.equals(nameOrCommunityIdStr)) { sAliasIndex = DocumentPojoIndexMap.globalDocumentIndexCollection_; } else if (DocumentPojoIndexMap.manyGeoDocumentIndex_.equals(nameOrCommunityIdStr)) { sAliasIndex = DocumentPojoIndexMap.manyGeoDocumentIndexCollection_; } else { // fallback sAliasIndex = nameOrCommunityIdStr.replaceAll("doc(?:ument)?_", "docs_"); } //TESTED } if (!bPersonalGroup) { if (null == parentCommunityId) { int nShards = bSystemGroup ? 
10 : 5; // (system group is largest) // Remove the alias, in case it exists: // Then create an index with this name: Builder localSettingsGroupIndex = ImmutableSettings.settingsBuilder(); localSettingsGroupIndex.put("number_of_shards", nShards).put("number_of_replicas", nPreferredReplicas); if (languageNormalization) { localSettingsGroupIndex.put("index.analysis.analyzer.default.tokenizer", "standard"); localSettingsGroupIndex.putArray("index.analysis.analyzer.default.filter", "icu_normalizer", "icu_folding", "standard", "lowercase", "stop"); } //TESTED ElasticSearchManager docIndex = IndexManager.createIndex(sGroupIndex, DocumentPojoIndexMap.documentType_, false, null, docMapping, localSettingsGroupIndex); if (null == docIndex) { // index has already been referenced, hence createIndex returns null docIndex = IndexManager.getIndex(sGroupIndex); } if (bClearIndex) { docIndex.deleteMe(); docIndex = IndexManager.createIndex(sGroupIndex, DocumentPojoIndexMap.documentType_, false, null, docMapping, localSettingsGroupIndex); } if (null != docIndex) { try { docIndex.pingIndex(); // (wait until it's created itself) } catch (Exception e) { } // (just make sure this doesn't die horribly) } else { docIndex = IndexManager.getIndex(sGroupIndex); } if (null != docIndex) { // should always be true docIndex.createAlias(sAliasIndex); docIndex.closeIndex(); } } else if (!bParentsOnly) { // A sub-index of a parent parentCommunityId = getRootCommunity(parentCommunityId); if (null != parentCommunityId) { String parentCommunityIdStr = parentCommunityId.toString(); String sParentGroupIndex = new StringBuffer("doc_") .append(new ObjectId(parentCommunityIdStr).toString()).toString(); ElasticSearchManager docIndex = IndexManager.getIndex(sParentGroupIndex); //DEBUG (alias corruption) // if (null == _aliasInfo) { // ClusterStateResponse clusterState = docIndex.getRawClient().admin().cluster().state(new ClusterStateRequest()).actionGet(); // _aliasInfo = 
CrossVersionImmutableMapOfImmutableMaps.getAliases(clusterState.getState().getMetaData()); // } // else { // if (_aliasInfo.containsKey(sGroupIndex)) { // has no aliases, we're not good // return; // } // else { // //DEBUG // System.out.println("Alias " + sGroupIndex + " has no aliases (but should)"); // ElasticSearchManager docIndex2 = IndexManager.getIndex(sGroupIndex); // docIndex2.deleteMe(); // } // } docIndex.createAlias(sGroupIndex); // for indexing // (this is going to be tricky when the functionality is fully implemented // because it will need to handle the parent index splitting) docIndex.createAlias(sAliasIndex); // for queries docIndex.closeIndex(); // (do nothing on delete - that will be handled at the parent index level) } } //TESTED (parents, children, and personal + docs_ aliases) } else { // (Personal group) // Just create the dummy index, no different to getting it in practice Builder localSettingsGroupIndex = ImmutableSettings.settingsBuilder(); localSettingsGroupIndex.put("number_of_shards", 1).put("number_of_replicas", 0); // (ie guaranteed to be local to each ES node) ElasticSearchManager dummyGroupIndex = IndexManager.createIndex( DocumentPojoIndexMap.dummyDocumentIndex_, DocumentPojoIndexMap.documentType_, false, null, docMapping, localSettingsGroupIndex); if (null == dummyGroupIndex) { dummyGroupIndex = IndexManager.getIndex(DocumentPojoIndexMap.dummyDocumentIndex_); } // Just create an alias, so that queries work arbitrarily: dummyGroupIndex.createAlias(sGroupIndex); // (at some point we should delete the sGroupIndex alias, but leave it in for bw compatibility for now) dummyGroupIndex.createAlias(sAliasIndex); // (never index dummy indices so only need query index) // (do nothing on delete since don't have any docs in here anyway) } } //TESTED (including new docs_ alias) /////////////////////////// // (this utility function is needed for the legacy case where empty communities were // treated as aliases of the dummy community ... 
first time I encounter a community, I need // to recreate it...) public static void recreateCommunityDocIndex_unknownFields(ObjectId communityId, boolean bDeleteFirst) { if (!ElasticSearchManager.pingCluster()) { throw new RuntimeException("Index not running"); } //TESTED (by c/p from createCommunityDocIndex) CommunityPojo cp = CommunityPojo.fromDb( MongoDbManager.getSocial().getCommunity().findOne(new BasicDBObject("_id", communityId)), CommunityPojo.class); if (null != cp) { deleteCommunityDocIndex(communityId.toString(), cp.getParentId(), true); // (in the legacy world this would have been treated as a "personal" ie equivalently to a dummy community ... // this does nothing if it's already a real community) if (bDeleteFirst) { deleteCommunityDocIndex(communityId.toString(), cp.getParentId(), cp.getIsPersonalCommunity()); } createCommunityDocIndex(communityId.toString(), cp.getParentId(), cp.getIsPersonalCommunity(), cp.getIsSystemCommunity(), false); } } //TESTED /////////////////////////// public static void deleteCommunityDocIndex(String nameOrCommunityIdStr, ObjectId parentCommunityId, boolean bPersonalGroup) { if (!ElasticSearchManager.pingCluster()) { throw new RuntimeException("Index not running"); } //TESTED (by c/p from createCommunityDocIndex) String sGroupIndex = null; // for indexing, ie always a single index String sAliasIndex = null; // for querying, ie will point to doc_commid, doc_commid_1, etc ObjectId communityId = null; try { communityId = new ObjectId(nameOrCommunityIdStr); sGroupIndex = new StringBuffer("doc_").append(communityId.toString()).toString(); sAliasIndex = new StringBuffer("docs_").append(communityId.toString()).toString(); } catch (Exception e) { sGroupIndex = nameOrCommunityIdStr; if (DocumentPojoIndexMap.globalDocumentIndex_.equals(nameOrCommunityIdStr)) { sAliasIndex = DocumentPojoIndexMap.globalDocumentIndexCollection_; } else if (DocumentPojoIndexMap.manyGeoDocumentIndex_.equals(nameOrCommunityIdStr)) { sAliasIndex = 
DocumentPojoIndexMap.manyGeoDocumentIndexCollection_; } else { // fallback sAliasIndex = nameOrCommunityIdStr.replaceAll("doc(?:ument)?_", "docs_"); } //TESTED } if (bPersonalGroup) { ElasticSearchManager dummyGroupIndex = IndexManager.getIndex(DocumentPojoIndexMap.dummyDocumentIndex_); dummyGroupIndex.removeAlias(sAliasIndex); dummyGroupIndex.removeAlias(sGroupIndex); } else if (null != parentCommunityId) { parentCommunityId = getRootCommunity(parentCommunityId); if (null != parentCommunityId) { String sParentGroupIndex = new StringBuffer("doc_").append(parentCommunityId.toString()).toString(); ElasticSearchManager docIndex = IndexManager.getIndex(sParentGroupIndex); docIndex.removeAlias(sGroupIndex); docIndex.removeAlias(sAliasIndex); docIndex.closeIndex(); } } else { ElasticSearchManager docIndex = IndexManager.getIndex(sGroupIndex); docIndex.deleteMe(); } //TESTED (parent, children, and personal) // Also need to delete any records indexes: // It's a bit more complex because we're not exactly sure which indexes exist: if (null != communityId) { ElasticSearchManager indexMgr = ElasticSearchManager .getIndex(DocumentPojoIndexMap.globalDocumentIndex_); // (just something that's guaranteed to exist) String stashedIndex = "recs_" + communityId.toString(); String liveIndicesPrefix = "recs_t_" + communityId.toString(); ClusterStateResponse clusterState = indexMgr.getRawClient().admin().cluster() .state(new ClusterStateRequest()).actionGet(); String indices[] = clusterState.getState().getMetaData().getConcreteAllOpenIndices(); for (String index : indices) { if (index.startsWith(stashedIndex) || index.startsWith(liveIndicesPrefix)) { ElasticSearchManager.getIndex(index).deleteMe(); } } //TESTED // THIS CODE ONLY WORKS ON ES-1.0+ ... 
so have replaced with the less efficient code above // First off: stashed interface: // String stashedIndex = "recs_" + communityId.toString(); // ClusterStateResponse retVal = indexMgr.getRawClient().admin().cluster().prepareState() // .setIndices(stashedIndex) // .setRoutingTable(false).setNodes(false).setListenerThreaded(false).get(); // // if (!retVal.getState().getMetaData().getIndices().isEmpty()) { // ElasticSearchManager.getIndex(stashedIndex).deleteMe(); // }//TESTED // // (else doesn't exist...) // // // Second: all the time-indexed versions // // String indexPattern = new StringBuffer("recs_t_").append(communityId.toString()).append("*").toString(); // retVal = indexMgr.getRawClient().admin().cluster().prepareState() // .setIndices(indexPattern) // .setRoutingTable(false).setNodes(false).setListenerThreaded(false).get(); // // for (IndexMetaData indexMetadata: retVal.getState().getMetaData()) { // ElasticSearchManager.getIndex(indexMetadata.index()).deleteMe(); // }//TESTED } //TESTED } //TESTED (personal and system) /////////////////////////// // Utility function to get the root community of a community hierarchy, since you can't add aliases to aliases static ObjectId getRootCommunity(ObjectId parentCommunityId) { for (;;) { BasicDBObject query = new BasicDBObject("_id", parentCommunityId); BasicDBObject field = new BasicDBObject("parentId", 1); BasicDBObject retVal = (BasicDBObject) MongoDbManager.getSocial().getCommunity().findOne(query, field); if (null == retVal) { // (shouldn't ever happen) return parentCommunityId; } ObjectId tmp = retVal.getObjectId("parentId", null); if (null == tmp) { // (no more parents) return parentCommunityId; } if (tmp.equals(parentCommunityId)) { // (shouldn't ever happen but will prevent infinite loop) return parentCommunityId; } parentCommunityId = tmp; } }//TESTED (cases where have and don't have parent id) /////////////////////////////////////////////////////////////////////////////////////// // // Interface to handle 
scaleable indexes // Currently this is a dummy interface, but it will make it easy to split the indexes in the future private static HashMap<String, String> _docIndexMap = null; private static String _assocIndex = null; private static String _entityIndex = null; private static CrossVersionImmutableMapOfImmutableMaps<AliasMetaData> _aliasInfo = null; //TODO (INF-1136): Test and integrate this (phase 1), then implement the index splitting code (phase 2) public static synchronized String getIndex(String communityIdOrIndexStr) { if (communityIdOrIndexStr == EntityFeaturePojoIndexMap.indexName_) { // pointer == intended if (null == _entityIndex) { _entityIndex = EntityFeaturePojoIndexMap.indexName_; } return _entityIndex; } else if (communityIdOrIndexStr == AssociationFeaturePojoIndexMap.indexName_) { // pointer == intended if (null == _assocIndex) { _assocIndex = AssociationFeaturePojoIndexMap.indexName_; } return _assocIndex; } else { // Documents if (null == _docIndexMap) { _docIndexMap = new HashMap<String, String>(); } String sAliasIndex; try { sAliasIndex = new StringBuffer("doc_").append(new ObjectId(communityIdOrIndexStr).toString()) .toString(); } catch (Exception e) { if (DocumentPojoIndexMap.globalDocumentIndex_.equals(communityIdOrIndexStr)) { communityIdOrIndexStr = sAliasIndex = DocumentPojoIndexMap.globalDocumentIndexCollection_; } else if (DocumentPojoIndexMap.manyGeoDocumentIndex_.equals(communityIdOrIndexStr)) { communityIdOrIndexStr = sAliasIndex = DocumentPojoIndexMap.manyGeoDocumentIndexCollection_; } else { // fallback communityIdOrIndexStr = sAliasIndex = communityIdOrIndexStr.replaceAll("doc(?:ument)?_", ""); } } String sDocIndex = _docIndexMap.get(communityIdOrIndexStr); if (null == sDocIndex) { sDocIndex = sAliasIndex; _docIndexMap.put(communityIdOrIndexStr, sAliasIndex); } return sDocIndex; } } //TOTEST (lots of cases) /////////////////////////////////////////////////////////////////////////////////////// // // Enrich and store documents 
(source is optional - can choose not to index if set) // (and remove any documents) public void processDocuments(int harvestType, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate_subsetOfAdd, List<DocumentPojo> toDelete) { processDocuments(harvestType, toAdd, toUpdate_subsetOfAdd, toDelete, null); } public void processDocuments(int harvestType, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate_subsetOfAdd, List<DocumentPojo> toDelete, SourcePojo source) { PropertiesManager props = new PropertiesManager(); // Note: toAdd = toAdd(old) + toUpdate // Need to treat updates as follows: // - Delete (inc children, eg events) but get fields to keep (currently _id, created; in the future comments etc) // Delete toUpdate and toAdd (also overwriting "created" for updated docs, well all actually...) toDelete.addAll(toUpdate_subsetOfAdd); StoreAndIndexManager storageManager = new StoreAndIndexManager(); storageManager.removeFromDatastore_byURL(toDelete, source); // (note: expands toDelete if any sourceUrl "docs" are present, see FileHarvester) // (Storing docs messes up the doc/event/entity objects, so don't do that just yet...) // Aggregation: // 1+2. Create aggregate entities/events ("features") and write them to the DB // (then can store feeds - doesn't matter that the event/entities have been modified by the aggregation) // 3. (Scheduled for efficiency) Update all documents' frequencies based on new entities and events // 4. 
(Scheduled for efficiency) Synchronize with index [after this, queries can find them - so (2) must have happened] // (Syncronization currently "corrupts" the entities so needs to be run last) AggregationManager perSourceAggregation = null; if (!props.getAggregationDisabled()) { perSourceAggregation = new AggregationManager(); } // 1+2] if (null != perSourceAggregation) { perSourceAggregation.doAggregation(toAdd, toDelete); perSourceAggregation.createOrUpdateFeatureEntries(); } // Save feeds to feeds collection in MongoDb // (second field determines if content gets saved) if (null != perSourceAggregation) { perSourceAggregation.applyAggregationToDocs(toAdd); // (First save aggregated statistics back to the docs' entity/event instances) } storeFeeds(toAdd, (harvestType != InfiniteEnums.DATABASE), source); // Then finish aggregation: if (null != perSourceAggregation) { // 3] perSourceAggregation.runScheduledDocumentUpdates(); // 4] This needs to happen last because it "corrupts" the entities and events perSourceAggregation.runScheduledSynchronization(); } }//TESTED (by eye - logic is v simple) /////////////////////////////////////////////////////////////////////////////////////// // // STORAGE AND INDEXING // ////////////////////////////////////////////////////////////////////////////////////// /** * Writes the feeds to the DB and index * * @param feeds list of feeds to be added to db */ private void storeFeeds(List<DocumentPojo> docs, boolean bSaveContent, SourcePojo source) { if (null != docs && docs.size() > 0) { StoreAndIndexManager store = new StoreAndIndexManager(); store.addToDatastore(docs, bSaveContent, source); } }//TESTED (by eye) // See StoreAndIndexManager /////////////////////////////////////////////////////////////////////////////////////// // // AGGREGATION // ////////////////////////////////////////////////////////////////////////////////////// // See AggregationManager }