Java tutorial
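This tutorial walks through MongoEntityFeatureTxfer, a utility class from the Infinit.e Open Source Project that synchronizes "entity feature" records between MongoDB (the feature.entity collection) and an Elasticsearch index (entity_index). It supports three modes: a straight transfer, a chunked transfer that processes the collection one MongoDB chunk at a time, and a delete mode that removes matching records from both stores. The full source is reproduced below, followed by a sketch of how it might be invoked; the inline test code (doUnitTestCode) is retained at the bottom for reference.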
/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.ikanow.infinit.e.utility;

import java.io.IOException;
import java.lang.reflect.Type;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.bson.BSONObject;
import org.bson.types.ObjectId;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.ImmutableSettings.Builder;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
import com.ikanow.infinit.e.data_model.index.IndexManager;
import com.ikanow.infinit.e.data_model.index.feature.entity.EntityFeaturePojoIndexMap;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.feature.entity.EntityFeaturePojo;
import com.ikanow.infinit.e.processing.generic.GenericProcessingController;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.Mongo;
import com.mongodb.MongoException;

public class MongoEntityFeatureTxfer
{
    //___________________________________________________________________________________________________

    // MAIN

    /**
     * Programmatic entry point (note: takes individual parameters rather than a String[] args array).
     * Three ways to run:
     *   Transfer: config + query (optional)
     *   Delete:   config + query + delete flag
     *   Split:    config + query + chunks description
     *
     * @param sConfigPath the config location
     * @param sQuery the MongoDB query selecting entity features (JSON string; null matches everything)
     * @param bDelete if set, delete the matching records instead of transferring them
     * @param bRebuildIndex if set, re-initialize the index before transferring
     * @param nSkip number of records to skip before processing
     * @param nLimit maximum number of records to process (0 = no limit)
     * @param chunksDescription if non-null, transfer chunk-by-chunk (the "split" mode)
     *
     * @throws MongoException
     * @throws NumberFormatException
     * @throws IOException
     */
    public static void main(String sConfigPath, String sQuery, boolean bDelete, boolean bRebuildIndex,
            int nSkip, int nLimit, String chunksDescription)
            throws NumberFormatException, MongoException, IOException
    {
        MongoEntityFeatureTxfer txferManager = new MongoEntityFeatureTxfer();

        // Command line processing
        com.ikanow.infinit.e.data_model.Globals
                .setIdentity(com.ikanow.infinit.e.data_model.Globals.Identity.IDENTITY_SERVICE);
        if (null != sConfigPath) {
            com.ikanow.infinit.e.data_model.Globals.overrideConfigLocation(sConfigPath);
        }
        if (bRebuildIndex) {
            new GenericProcessingController().InitializeIndex(false, true, false);
        }
        BasicDBObject query = null;
        if (null == sQuery) {
            query = new BasicDBObject();
        }
        else {
            query = (BasicDBObject) com.mongodb.util.JSON.parse(sQuery);
        }
        if (bDelete) {
            MongoEntityFeatureTxfer.doDelete(query, nLimit);
        }
        else {
            if (null == chunksDescription) {
                txferManager.doTransfer(query, nSkip, nLimit, null);
            }
            else {
                txferManager.doChunkedTransfer(query, nSkip, nLimit, chunksDescription);
            }
        }
    }
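
    // (The chunked-transfer path below processes one MongoDB chunk at a time: each chunk document
    //  carries $min/$max bounds that doTransfer applies to its cursor, and a failure in one chunk
    //  is just logged, so the remaining chunks still get processed.)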
    //___________________________________________________________________________________________________

    // Wrapper for doing transfer in chunks:

    private void doChunkedTransfer(BasicDBObject query, int nSkip, int nLimit, String chunksDescription)
            throws IOException
    {
        List<BasicDBObject> chunkList = MongoIndexerUtils.getChunks("feature.entity", chunksDescription);
        System.out.println("CHUNKS: Found " + chunkList.size() + " chunks");
        //DEBUG
        //System.out.println("Chunklist= " + chunkList);
        for (BasicDBObject chunk : chunkList) {
            BasicDBObject cleanQuery = new BasicDBObject();
            cleanQuery.putAll((BSONObject) query);
            String id = null;
            try {
                id = (String) chunk.remove("$id");
                System.out.println("CHUNK: " + id);
                doTransfer(cleanQuery, 0, 0, chunk);
            }
            catch (Exception e) {
                System.out.println("FAILED CHUNK: " + id + " ... " + e.getMessage());
            }
        }
    }//TESTED

    //___________________________________________________________________________________________________

    // PROCESSING LOOP (new interface)

    private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk)
    {
        ElasticSearchManager elasticManager = null;

        // Initialize the DB:
        DBCollection entityFeatureDB = DbManager.getFeature().getEntity();

        // Initialize the ES (create the index if it doesn't already exist):
        // 1. Set up the entity feature index
        String indexName = "entity_index";
        ElasticSearchManager.setDefaultClusterName("infinite-aws");

        // (delete the index)
        //elasticManager = ElasticSearchManager.getIndex(indexName);
        //elasticManager.deleteMe();

        // Create the index if necessary
        String sMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(),
                EntityFeaturePojoIndexMap.Mapping.class);
        Builder localSettings = ImmutableSettings.settingsBuilder();
        localSettings.put("number_of_shards", 1).put("number_of_replicas", 0);
        localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
        localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");
        elasticManager = ElasticSearchManager.createIndex(indexName, null, false, null, sMapping, localSettings);

        // Get the index (necessary if already created)
        if (null == elasticManager) {
            elasticManager = ElasticSearchManager.getIndex(indexName);
        }

        // Now query the DB:
        DBCursor dbc = null;
        dbc = entityFeatureDB.find(query);
        if (null != chunk) {
            if (chunk.containsField(DbManager.min_)) {
                dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
            }
            if (chunk.containsField(DbManager.max_)) {
                dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
            }
        }
        dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
        if (null == chunk) {
            int nCount = dbc.count() - nSkip;
            if (nCount < 0) nCount = 0;
            System.out.println("Found " + nCount + " records to sync, process first "
                    + (0 == nLimit ? nCount : nLimit));
            if (0 == nCount) { // Nothing to do...
                return;
            }
        }

        int nSynced = 0;
        List<EntityFeaturePojo> entities = new ArrayList<EntityFeaturePojo>();
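
        // (Stream the cursor and bulk-index as we go: the cursor fetches from MongoDB in batches
        //  of 1000, and every time the POJO list exceeds 1000 entries it is flushed to
        //  Elasticsearch via bulkAddDocuments, so memory stays bounded on large collections.)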
        while (dbc.hasNext()) {
            EntityFeaturePojo feature = EntityFeaturePojo.fromDb(dbc.next(), EntityFeaturePojo.class);

            if (null != feature.getAlias()) { // (corrupt gazetteer entries have no alias)
                // Handle groups (system group is: "4c927585d591d31d7b37097a")
                // If there is no community id, add the system group (something is wrong if this happens?)
                if (null == feature.getCommunityId()) {
                    feature.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));
                }
            }
            entities.add(feature);
            nSynced++;

            // Add the entities
            if (entities.size() > 1000) {
                elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities,
                        EntityFeaturePojo.listType(), new EntityFeaturePojoIndexMap()), "_id", null, true);
                // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community)
                entities = new ArrayList<EntityFeaturePojo>();
            }
        }
        // Write whatever is left
        elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities,
                EntityFeaturePojo.listType(), new EntityFeaturePojoIndexMap()), "_id", null, true);
        // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community)

        if (null != chunk) {
            System.out.println("Found " + nSynced + " records to sync in chunk");
        }
    }

    //___________________________________________________________________________________________________

    // DELETE DOCUMENTS FROM A QUERY

    static void doDelete(BasicDBObject query, int nLimit)
    {
        doDelete(query, nLimit, false);
    }

    static void doDelete(BasicDBObject query, int nLimit, boolean automatedRequest)
    {
        try {
            // Initialize the DB:
            DBCollection entityFeatureDB = DbManager.getFeature().getEntity();
            ElasticSearchManager elasticManager = ElasticSearchManager.getIndex("entity_index");

            BasicDBObject fields = new BasicDBObject();
            fields.put(EntityFeaturePojo.index_, 1);
            fields.put(EntityFeaturePojo.communityId_, 1);
            DBCursor cur = entityFeatureDB.find(query, fields).limit(nLimit);
            // (this internally works in batches of 1000)
            if (automatedRequest) {
                System.out.println("Found " + cur.count() + " records to delete from _id list");
            }
            else {
                System.out.println("Found " + cur.count() + " records to delete from " + query.toString());
            }
            if (nLimit > 0) {
                System.out.println("(limited to " + nLimit + " records)");
            }
            int nArraySize = (cur.count() > 1000) ? 1000 : cur.count();
            ArrayList<EntityFeaturePojo> batchList = new ArrayList<EntityFeaturePojo>(nArraySize);

            while (cur.hasNext()) {
                EntityFeaturePojo gp = EntityFeaturePojo.fromDb(cur.next(), EntityFeaturePojo.class);
                batchList.add(gp);
                if (batchList.size() >= nArraySize) {
                    internalDelete(batchList, elasticManager);
                    batchList.clear();
                }
            }
            if (!batchList.isEmpty()) {
                internalDelete(batchList, elasticManager);
            }
            entityFeatureDB.remove(query);
        }
        catch (NumberFormatException e) {
            e.printStackTrace();
        }
        catch (MongoException e) {
            e.printStackTrace();
        }
    }//TESTED

    // Batch delete

    private static void internalDelete(List<EntityFeaturePojo> entitiesToDelete, ElasticSearchManager esMgr)
    {
        List<String> esids = new ArrayList<String>(entitiesToDelete.size());
        for (EntityFeaturePojo gp : entitiesToDelete) {
            esids.add(new StringBuffer(gp.getIndex()).append(':').append(gp.getCommunityId().toString()).toString());
        }
        esMgr.bulkDeleteDocuments(esids);
    }//TESTED
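
    // (The composite "index:communityId" ids built in internalDelete match the "_id" values that
    //  EntityFeaturePojoIndexMap generates when documents are added, so the Elasticsearch bulk
    //  deletes target exactly the documents that doTransfer created; the matching MongoDB records
    //  are then removed with a single remove(query) call.)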
    //___________________________________________________________________________________________________

    // TEST CODE

    @SuppressWarnings("unused")
    private void doUnitTestCode(String sMongoDbHost, String sMongoDbPort, String sElasticHost, String sElasticPort,
            BasicDBObject query, int nLimit)
    {
        Mongo mongoDB = null;
        ElasticSearchManager elasticManager = null;
        try {
            // Initialize the DB:
            mongoDB = new Mongo(sMongoDbHost, Integer.parseInt(sMongoDbPort));
            DBCollection gazDB = mongoDB.getDB("feature").getCollection("entity");

            // Initialize the ES (create the index if it doesn't already exist):
            // 1. Set up the entity feature index
            String indexName = "entity_index";

            //TEST: delete the index:
            //  elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort);
            //  elasticManager.deleteMe();

            //TEST: create the index
            //  String sMapping = new Gson().toJson(new GazateerPojo.Mapping(), GazateerPojo.Mapping.class);
            //  Builder localSettings = ImmutableSettings.settingsBuilder();
            //  localSettings.put("number_of_shards", 1).put("number_of_replicas", 0);
            //  elasticManager = ElasticSearchManager.createIndex
            //                      (indexName, false,
            //                          sElasticHost + ":" + sElasticPort,
            //                          sMapping, localSettings);

            //TEST: delete the index:
            //  elasticManager.deleteMe();

            //TEST: get the index:
            //  elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort);

            // Now query the DB:
            DBCursor dbc = null;
            if (nLimit > 0) {
                dbc = gazDB.find(query).limit(nLimit);
            }
            else { // Everything!
                dbc = gazDB.find(query);
            }
            Type listType = new TypeToken<ArrayList<EntityFeaturePojo>>(){}.getType();
            List<EntityFeaturePojo> entities = new Gson().fromJson(dbc.toArray().toString(), listType);

            //Debug:
            List<String> entIds = new LinkedList<String>();

            // Loop over array and invoke the cleansing function for each one
            for (EntityFeaturePojo ent : entities) {
                if (null != ent.getAlias()) { // (corrupt gazetteer entries have no alias)

                    //Debug:
                    //System.out.println("entity=" + ent.getGazateerIndex());
                    //System.out.println("aliases=" + Arrays.toString(ent.getAlias().toArray()));

                    // Insert into the elasticsearch index
                    //Debug:
                    //System.out.println(new Gson().toJson(ent, GazateerPojo.class));

                    // Handle groups (system group is: "4c927585d591d31d7b37097a")
                    if (null == ent.getCommunityId()) {
                        ent.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));
                    }

                    //TEST: index document
                    //  ent.synchronizeWithIndex();
                    //  boolean b = elasticManager.addDocument(ent, ent.getGazateerIndex(), true);

                    //TEST: remove document
                    //  b = elasticManager.removeDocument(ent.getGazateerIndex());

                    //TEST: (part of get, bulk add/delete)
                    entIds.add(ent.getIndex());

                    // Debug:
                    //  if (!b) {
                    //      System.out.println("Didn't add " + ent.getGazateerIndex());
                    //  }
                }
            } // End loop over entities

            //TEST: bulk delete
            //elasticManager.bulkAddDocuments(entities, "index", null);
            //elasticManager.bulkDeleteDocuments(entIds);

            //TEST: get document
            //  elasticManager.getRawClient().admin().indices().refresh(Requests.refreshRequest(indexName)).actionGet();
            //  for (String id: entIds) {
            //      Map<String, GetField> results = elasticManager.getDocument(id, "doccount", "disambiguated_name");
            //      System.out.println(id + ": " + results.get("doccount").values().get(0) + " , " + results.get("disambiguated_name").values().get(0));
            //  }

            //TEST: search
            //  elasticManager.getRawClient().admin().indices().refresh(Requests.refreshRequest(indexName)).actionGet();
            //  SearchRequestBuilder searchOptions = elasticManager.getSearchOptions();
            //  XContentQueryBuilder queryObj = QueryBuilders.matchAllQuery();
            //  searchOptions.addSort("doccount", SortOrder.DESC);
            //  searchOptions.addFields("doccount", "type");
            //  SearchResponse rsp = elasticManager.doQuery(queryObj, searchOptions);
            //  SearchHit[] docs = rsp.getHits().getHits();
            //  for (SearchHit hit: docs) {
            //      String id = hit.getId();
            //      Long doccount = (Long) hit.field("doccount").value();
            //      String type = (String) hit.field("type").value();
            //      System.out.println(id + ": " + doccount + ", " + type);
            //  }
        }
        catch (NumberFormatException e) {
            e.printStackTrace();
        }
        catch (UnknownHostException e) {
            e.printStackTrace();
        }
        catch (MongoException e) {
            e.printStackTrace();
        }
        finally {
            if (null != mongoDB) {
                mongoDB.close();
            }
            if (null != elasticManager) {
                //NB not sure when exactly to call this - probably can just not bother?
                //elasticManager.getRawClient().close();
            }
        }
    }
}
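To round the tutorial off, here is a minimal sketch of how the utility might be driven. Note that despite its name, main() takes individual parameters rather than a String[] array, so it is called programmatically. Everything in this harness other than the MongoEntityFeatureTxfer.main signature is an assumption: the Runner class is not part of the Infinit.e codebase, and the config path, query, and chunks description are illustrative placeholders.

package com.ikanow.infinit.e.utility;

import java.io.IOException;

import com.mongodb.MongoException;

// Hypothetical driver class - all concrete values below are illustrative placeholders
public class MongoEntityFeatureTxferRunner
{
    public static void main(String[] args) throws NumberFormatException, MongoException, IOException
    {
        String sConfigPath = "/opt/infinite-home/config"; // placeholder config location
        // Placeholder query: select features belonging to the system community
        String sQuery = "{\"communityId\":{\"$oid\":\"4c927585d591d31d7b37097a\"}}";

        // Transfer mode: sync matching entity features from MongoDB into the "entity_index" index
        MongoEntityFeatureTxfer.main(sConfigPath, sQuery, false, false, 0, 0, null);

        // Delete mode: remove matching entity features from both Elasticsearch and MongoDB
        //MongoEntityFeatureTxfer.main(sConfigPath, sQuery, true, false, 0, 0, null);

        // Split (chunked) mode: the last argument is whatever chunk description
        // MongoIndexerUtils.getChunks expects - placeholder here
        //MongoEntityFeatureTxfer.main(sConfigPath, sQuery, false, false, 0, 0, "chunksDescription");
    }
}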