Java tutorial: MongoDocumentTxfer.java, an Infinit.e utility class that transfers documents from MongoDB (doc_metadata.metadata) into the Elasticsearch document index, optionally re-running entity/association aggregation or deleting the matching documents.
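The full utility class is listed below. Note that its main() takes individual typed arguments rather than a String[], so it is presumably invoked from a separate command-line wrapper. As a quick orientation, a caller might drive it roughly as follows (a minimal sketch: the wrapper class and the argument values are hypothetical, only the MongoDocumentTxfer.main(...) signature and parameter meanings come from the listing):

    public class MongoDocumentTxferExample {
        public static void main(String[] args) throws Exception {
            // Re-index the documents of one (hypothetical) source on the fly and re-run aggregation,
            // using the default configuration location:
            MongoDocumentTxfer.main(
                    null,                                                // sConfigPath (null == default config location)
                    "{ \"sourceKey\": \"feeds.example.com.rss.1.1.\" }", // sQuery (hypothetical source key)
                    false,                                               // bDelete
                    true,                                                // bRebuildIndex (on the fly, since the query is non-empty)
                    false,                                               // bVerifyIndex
                    true,                                                // bUpdateFeatures (re-run aggregation)
                    0, 0,                                                // nSkip, nLimit (0 == no limit)
                    null);                                               // chunksDescription (no chunking)
        }
    }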
/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.ikanow.infinit.e.utility;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.bson.BSONObject;
import org.bson.types.ObjectId;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.ImmutableSettings.Builder;

import com.google.gson.Gson;
import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
import com.ikanow.infinit.e.data_model.index.document.DocumentPojoIndexMap;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.social.community.CommunityPojo;
import com.ikanow.infinit.e.harvest.HarvestController;
import com.ikanow.infinit.e.harvest.HarvestControllerPipeline;
import com.ikanow.infinit.e.processing.generic.GenericProcessingController;
import com.ikanow.infinit.e.processing.generic.aggregation.AggregationManager;
import com.ikanow.infinit.e.processing.generic.aggregation.AssociationBackgroundAggregationManager;
import com.ikanow.infinit.e.processing.generic.aggregation.EntityBackgroundAggregationManager;
import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager;
import com.ikanow.infinit.e.processing.generic.utils.PropertiesManager;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.MongoException;

public class MongoDocumentTxfer {

    //___________________________________________________________________________________________________

    // MAIN

    /**
     * @param sConfigPath optional override for the Infinit.e configuration location
     * @param sQuery JSON query (over doc_metadata.metadata) selecting the documents to transfer
     * @param bDelete if true, delete the matching documents instead of transferring them
     * @param bRebuildIndex if true, rebuild the document index (in bulk if the query is empty, otherwise on the fly)
     * @param bVerifyIndex if true, verify/recalculate the index mappings before transferring
     * @param bUpdateFeatures if true, re-run entity/association aggregation on the transferred documents
     * @param nSkip number of matching documents to skip
     * @param nLimit maximum number of documents to process (0 == no limit)
     * @param chunksDescription if non-null, process the collection chunk-by-chunk (see doChunkedTransfer)
     * @throws MongoException
     * @throws NumberFormatException
     * @throws IOException
     */
    public static void main(String sConfigPath, String sQuery, boolean bDelete, boolean bRebuildIndex,
            boolean bVerifyIndex, boolean bUpdateFeatures, int nSkip, int nLimit, String chunksDescription)
            throws NumberFormatException, MongoException, IOException {
        // Command line processing
        com.ikanow.infinit.e.data_model.Globals
                .setIdentity(com.ikanow.infinit.e.data_model.Globals.Identity.IDENTITY_SERVICE);
        if (null != sConfigPath) {
            com.ikanow.infinit.e.data_model.Globals.overrideConfigLocation(sConfigPath);
        }
        boolean bRebuildIndexOnFly = false;
        if (bRebuildIndex && ((null == sQuery) || sQuery.equals("{}"))) { // (else will do them 1-by-1)
            new GenericProcessingController().InitializeIndex(true, false, false);
        }
        else {
            // Have seen odd transport timeouts on occasion: this should ensure they never happen
            new GenericProcessingController().InitializeIndex(false, false, false, bVerifyIndex);
                // (don't delete anything, but do recalc)
            if (bRebuildIndex) {
                bRebuildIndexOnFly = true;
            }
        }
        if (bVerifyIndex && (0 == nLimit) && (null == sQuery)) {
            // Index verification with nothing else to do
            return;
        }

        MongoDocumentTxfer txferManager = new MongoDocumentTxfer(bRebuildIndexOnFly);

        BasicDBObject query = null;
        if (null == sQuery) {
            query = new BasicDBObject();
        }
        else {
            query = (BasicDBObject) com.mongodb.util.JSON.parse(sQuery);
        }

        if (!bDelete) {
            if (null != chunksDescription) {
                txferManager.doChunkedTransfer(query, nSkip, nLimit, bUpdateFeatures, chunksDescription);
            }
            else {
                txferManager.doTransfer(query, nSkip, nLimit, bUpdateFeatures, null);
            }
        }
        else {
            txferManager.doDelete(query, nLimit);
        }
    }

    public MongoDocumentTxfer(boolean bRebuildIndexOnFly) {
        if (bRebuildIndexOnFly) {
            _deletedIndex = new TreeSet<String>();
            _deletedIndex.add(DocumentPojoIndexMap.manyGeoDocumentIndex_);
                // (don't ever delete this on the fly, it contains docs matching other queries)
        }
    }

    //___________________________________________________________________________________________________

    // Wrapper for doing transfer in chunks:

    private void doChunkedTransfer(BasicDBObject query, int nSkip, int nLimit, boolean bAggregate,
            String chunksDescription) throws IOException {
        List<BasicDBObject> chunkList = MongoIndexerUtils.getChunks("doc_metadata.metadata", chunksDescription);
        System.out.println("CHUNKS: Found " + chunkList.size() + " chunks");
        //DEBUG
        //System.out.println("Chunklist= " + chunkList);
        for (BasicDBObject chunk : chunkList) {
            BasicDBObject cleanQuery = new BasicDBObject();
            cleanQuery.putAll((BSONObject) query);
            String id = null;
            try {
                id = (String) chunk.remove("$id");
                System.out.println("CHUNK: " + id);
                doTransfer(cleanQuery, 0, 0, bAggregate, chunk);
            }
            catch (Exception e) {
                System.out.println("FAILED CHUNK: " + id + " ... " + e.getMessage());
            }
        }
    }//TESTED
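    // (Editor's note) Chunked mode: MongoIndexerUtils.getChunks() returns one object per MongoDB chunk of
    // doc_metadata.metadata, each of which can carry $min/$max shard-key bounds plus an $id. doTransfer()
    // applies those bounds as cursor hints (see the addSpecial calls below), so a very large collection can
    // be migrated one chunk at a time, and a failure only loses the chunk whose id is logged. The format of
    // "chunksDescription" is defined by MongoIndexerUtils and is not shown in this listing.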
    //___________________________________________________________________________________________________

    // PROCESSING LOOP (new interface)

    private Map<String, SourcePojo> _sourceCache = new HashMap<String, SourcePojo>();
    private TreeSet<String> _deletedIndex = null;

    private void doTransfer(BasicDBObject query, int nSkip, int nLimit, boolean bAggregate, BasicDBObject chunk)
            throws IOException {
        PropertiesManager pm = new PropertiesManager();
        int nMaxContentSize_bytes = pm.getMaxContentSize();

        // Initialize the DB:
        DBCollection docsDB = DbManager.getDocument().getMetadata();
        DBCollection contentDB = DbManager.getDocument().getContent();
        DBCollection sourcesDB = DbManager.getIngest().getSource();

        ElasticSearchManager.setDefaultClusterName("infinite-aws");

        // 1. Get the documents from the DB (combining data + metadata and refreshing source meta)
        // (Ignore soft-deleted records:)
        if (null == query) {
            query = new BasicDBObject();
        }
        Object sourceKeyQueryTerm = query.remove(DocumentPojo.sourceKey_);
        if (null != sourceKeyQueryTerm) {
            if (query.toString().contains("\"" + DocumentPojo.sourceKey_ + "\"")) {
                // (ie the quoted field name appears elsewhere in the query)
                throw new RuntimeException(
                        "Can't specify sourceKey as part of complex query term: " + query.toString());
            } //TESTED (by hand, "{ \"sourceKey\": \"x\", \"$or\": [ { \"sourceKey\": \"x\" } ] }")

            if (sourceKeyQueryTerm instanceof String) {
                query.put(DocumentPojo.sourceKey_,
                        SourcePojo.getDistributedKeyQueryTerm((String) sourceKeyQueryTerm));
            } //TESTED (by hand, "{\"sourceKey\": \"feeds.arstechnica.com.arstechnica.index.11.2.\" }")
            else if (sourceKeyQueryTerm instanceof DBObject) {
                // find all the _sources_ matching this term, and convert to a big list including distribution
                BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1);
                fields.put(SourcePojo.highestDistributionFactorStored_, 1);
                DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.key_, sourceKeyQueryTerm), fields);
                LinkedList<String> sourceKeys = new LinkedList<String>();
                for (DBObject dbo : dbc) {
                    String key = (String) dbo.get(SourcePojo.key_);
                    Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
                    Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key, distributionFactor);
                    sourceKeys.addAll(sourceKeysForSource);
                }
                query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));
            } //TESTED (by hand, "{\"sourceKey\": { \"$gt\": \"dev.ikanow\" } }")
            else {
                throw new RuntimeException("Can't specify sourceKey as part of complex query term");
            } //(actually not possible, just included here for completeness...)
        }
        else {
            if (query.toString().contains("\"" + DocumentPojo.sourceKey_ + "\"")) {
                throw new RuntimeException("Can't specify sourceKey as part of complex query term");
            } //TESTED (by hand, "{ \"$or\": [ { \"sourceKey\": \"x\" } ] }")

            // Optimize communityId into sourceKeys...
            if (null != query.get(DocumentPojo.communityId_)) {
                try {
                    ObjectId commId = query.getObjectId(DocumentPojo.communityId_);
                    BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1);
                    fields.put(SourcePojo.highestDistributionFactorStored_, 1);
                    DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.communityIds_, commId), fields);
                    LinkedList<String> sourceKeys = new LinkedList<String>();
                    int added = 0;
                    for (DBObject dbo : dbc) {
                        String key = (String) dbo.get(SourcePojo.key_);
                        Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
                        Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key, distributionFactor);
                        sourceKeys.addAll(sourceKeysForSource);
                        added += sourceKeysForSource.size();
                    }
                    query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));

                    System.out.println("(Optimized simple community query to " + added + " source key(s))");
                }
                catch (Exception e) {
                    //DEBUG
                    //e.printStackTrace();
                    System.out.println("(Can't optimize complex community query: " + e.getMessage() + ")");
                }
            } //TESTED (by hand - including distributed source version)
        }
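        // (Editor's note) Distributed source keys: a source can be stored under multiple distributed key
        // variants (up to SourcePojo.highestDistributionFactorStored_ of them), so the simple "sourceKey"
        // and "communityId" terms handled above are expanded via SourcePojo.getDistributedKeys() into an
        // $in list covering every variant. The exact key format is defined inside SourcePojo.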
        // Ignore delete objects
        Object urlQuery = query.get(DocumentPojo.url_);
        if (null == urlQuery) {
            query.put(DocumentPojo.url_, Pattern.compile("^[^?]")); // (ie nothing starting with ?)
        } //TESTED
        else if (urlQuery instanceof BasicDBObject) {
            ((BasicDBObject) urlQuery).append("$regex", "^[^?]");
        } //TESTED
        //DEBUG
        //System.out.println("COMBINED QUERY= " + query.toString());

        // If aggregating, kick off the background aggregation thread
        if (bAggregate) {
            EntityBackgroundAggregationManager.startThread();
            AssociationBackgroundAggregationManager.startThread();
        }

        //Debug:
        DBCursor dbc = null;
        dbc = docsDB.find(query);
        if (null != chunk) {
            if (chunk.containsField(DbManager.min_)) {
                dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
            }
            if (chunk.containsField(DbManager.max_)) {
                dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
            }
        }
        dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
        if (null == chunk) {
            int nCount = dbc.count() - nSkip;
            if (nCount < 0)
                nCount = 0;
            System.out.println(
                    "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
            if (0 == nCount) { // Nothing to do...
                return;
            }
        }

        byte[] storageArray = new byte[200000];

        int nSynced = 0;
        LinkedList<DocumentPojo> docsToTransfer = new LinkedList<DocumentPojo>();
        Map<ObjectId, LinkedList<DocumentPojo>> communityList = null;
        ObjectId currCommunityId = null;
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);
            String sDocIndex = doc.getIndex();
            if (null == sDocIndex) {
                sDocIndex = "document_index";
            }
            if ((null != _deletedIndex) && !_deletedIndex.contains(sDocIndex)) {
                _deletedIndex.add(sDocIndex);
                rebuildIndex(sDocIndex);
                try { // (Just in case the index requires some time to sort itself out)
                    Thread.sleep(1000);
                }
                catch (InterruptedException e) {
                }
            }

            //Debug:
            //System.out.println("Getting content..." + feed.getTitle() + " / " + feed.getUrl());

            // Get the content:
            if ((0 != nMaxContentSize_bytes)
                    && StoreAndIndexManager.docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
                BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
                contentQ.put(CompressedFullTextPojo.sourceKey_,
                        new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
                BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
                fields.put(CompressedFullTextPojo.sourceKey_, 1);

                DBCursor dbcGzip = contentDB.find(contentQ, fields);
                while (dbcGzip.hasNext()) {
                    BasicDBObject dboContent = (BasicDBObject) dbcGzip.next();
                    if (!dboContent.containsField(CompressedFullTextPojo.sourceKey_)) {
                        // If this has another version then ignore this one...
                        // (ie prefer a later content record that does specify the source key)
                        if (dbcGzip.hasNext()) {
                            continue;
                        } //TESTED (by hand)
                    }
                    byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_));
                    ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
                    GZIPInputStream gzip = new GZIPInputStream(in);
                    int nRead = 0;
                    StringBuffer output = new StringBuffer();
                    while (nRead >= 0) {
                        nRead = gzip.read(storageArray, 0, 200000);
                        if (nRead > 0) {
                            String s = new String(storageArray, 0, nRead, "UTF-8");
                            output.append(s);
                        }
                    }
                    doc.setFullText(output.toString());
                }
            }
            // (else document has full text already)
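            // (Editor's note) Full text is stored gzipped in the content collection
            // (DbManager.getDocument().getContent()), keyed by url and optionally by sourceKey. The block
            // above decompresses it through the 200,000-byte read buffer and rebuilds the string, preferring
            // a content record that carries a sourceKey over a legacy record that does not.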
            // Get tags, if necessary:
            // Always overwrite tags - one of the reasons we might choose to migrate
            // Also may need source in order to support source index filtering
            SourcePojo src = _sourceCache.get(doc.getSourceKey());
            if (null == src) {
                //TODO (INF-2265): handle search index settings in pipeline mode... (also didn't seem to work?)
                BasicDBObject srcDbo = (BasicDBObject) sourcesDB
                        .findOne(new BasicDBObject(SourcePojo.key_, doc.getSourceKey()));
                if (null != srcDbo) {
                    src = SourcePojo.fromDb(srcDbo, SourcePojo.class);

                    if (null != src.getProcessingPipeline()) {
                        try {
                            // Set the index settings
                            HarvestController hc = new HarvestController();
                            HarvestControllerPipeline hcPipe = new HarvestControllerPipeline();
                            hcPipe.extractSource_preProcessingPipeline(src, hc);
                        }
                        catch (Exception e) {
                            //DEBUG
                            e.printStackTrace();
                        }
                    } //TESTED (by hand)

                    _sourceCache.put(doc.getSourceKey(), src);
                }
            }
            doc.setTempSource(src); // (needed for source index filtering)
            if (null != src) {
                if (null != src.getTags()) {
                    Set<String> tagsTidied = new TreeSet<String>();
                    for (String s : src.getTags()) {
                        String ss = s.trim().toLowerCase();
                        tagsTidied.add(ss);
                    }

                    // May also want to write this back to the DB:
                    //TODO (INF-2223): Handle append tags or not in the pipeline...
                    if ((null == src.getAppendTagsToDocs()) || src.getAppendTagsToDocs()) {
                        if ((null == doc.getTags()) || (doc.getTags().size() < tagsTidied.size())) {
                            BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                                    doc.getRawSourceKey()); // (ie including the # if there is one)
                            updateQuery.put(DocumentPojo._id_, doc.getId());
                            docsDB.update(updateQuery, new BasicDBObject(DbManager.addToSet_, new BasicDBObject(
                                    DocumentPojo.tags_, new BasicDBObject(DbManager.each_, tagsTidied))));
                        }
                        doc.setTags(tagsTidied); // (just copy ptr across)
                    }
                }
            }

            // 2. Update the index with the new document
            // (Optionally also update entity and assoc features)

            if (bAggregate) {
                if (null == currCommunityId) {
                    currCommunityId = doc.getCommunityId();
                }
                else if (!currCommunityId.equals(doc.getCommunityId())) {
                    LinkedList<DocumentPojo> perCommunityDocList = null;
                    if (null == communityList) { // (very first time we see > 1 community)
                        communityList = new TreeMap<ObjectId, LinkedList<DocumentPojo>>();
                        perCommunityDocList = new LinkedList<DocumentPojo>();
                        perCommunityDocList.addAll(docsToTransfer);
                            //(NOT including doc, this hasn't been added to docsToTransfer yet)
                        communityList.put(currCommunityId, perCommunityDocList);
                    }
                    currCommunityId = doc.getCommunityId();
                    perCommunityDocList = communityList.get(currCommunityId);
                    if (null == perCommunityDocList) {
                        perCommunityDocList = new LinkedList<DocumentPojo>();
                        communityList.put(currCommunityId, perCommunityDocList);
                    }
                    perCommunityDocList.add(doc);
                }
            } //TESTED

            nSynced++;
            docsToTransfer.add(doc);
            if (0 == (nSynced % 10000)) {
                StoreAndIndexManager manager = new StoreAndIndexManager();

                if (bAggregate) {
                    // Loop over communities and aggregate each one then store the modified entities/assocs
                    doAggregation(communityList, docsToTransfer);
                    communityList = null; // (in case the next 10,000 docs are all in the same community!)
                    currCommunityId = null;
                } //TOTEST

                manager.addToSearch(docsToTransfer);
                docsToTransfer.clear();
                System.out.println("(Synced " + nSynced + " records)");
            }

        } // (End loop over docs)

        // Sync remaining docs

        if (!docsToTransfer.isEmpty()) {
            if (bAggregate) {
                // Loop over communities and aggregate each one then store the modified entities/assocs
                doAggregation(communityList, docsToTransfer);
            }
            StoreAndIndexManager manager = new StoreAndIndexManager();
            manager.addToSearch(docsToTransfer);
        }

        if (null != chunk) {
            System.out.println("Found " + nSynced + " records to sync in chunk");
        }
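        // (Editor's note) The Entity/AssociationBackgroundAggregationManager threads started at the top of
        // this method run asynchronously; the block below simply gives them five minutes to finish updating
        // the documents' entities before shutting them down cleanly.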
        if (bAggregate) {
            System.out.println("Completed. You can hit CTRL+C at any time.");
            System.out.println(
                    "By default it will keep running for 5 minutes while the background aggregation runs to update the documents' entities.");
            try {
                Thread.sleep(300000);
            }
            catch (InterruptedException e) {
            }

            // Turn off so we can exit
            EntityBackgroundAggregationManager.stopThreadAndWait();
            AssociationBackgroundAggregationManager.stopThreadAndWait();
        }
    }

    //___________________________________________________________________________________________________

    private void doAggregation(Map<ObjectId, LinkedList<DocumentPojo>> communityList,
            LinkedList<DocumentPojo> singleList) {
        if (null == communityList) { // just one community this one is easy
            AggregationManager aggManager = new AggregationManager();
            aggManager.doAggregation(singleList, new LinkedList<DocumentPojo>());
            aggManager.createOrUpdateFeatureEntries();
            aggManager.applyAggregationToDocs(singleList);
            aggManager.runScheduledDocumentUpdates();
            aggManager.runScheduledSynchronization();
        }
        else {
            for (Map.Entry<ObjectId, LinkedList<DocumentPojo>> entry : communityList.entrySet()) {
                AggregationManager aggManager = new AggregationManager();
                aggManager.doAggregation(entry.getValue(), new LinkedList<DocumentPojo>());
                aggManager.createOrUpdateFeatureEntries();
                aggManager.applyAggregationToDocs(entry.getValue());
                aggManager.runScheduledDocumentUpdates();
                aggManager.runScheduledSynchronization();
            }
        } //TESTED

        // Finally, need to update all the docs (ick)
        DocumentPojo dummy = new DocumentPojo();
        for (DocumentPojo doc : singleList) {
            boolean bEnts = (null != doc.getEntities()) && !doc.getEntities().isEmpty();
            boolean bAssocs = (null != doc.getAssociations()) && !doc.getAssociations().isEmpty();
            if (bEnts || bAssocs) {
                dummy.setEntities(doc.getEntities());
                dummy.setAssociations(doc.getAssociations());
                DBObject toWrite = dummy.toDb();
                BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_, doc.getRawSourceKey());
                updateQuery.put(DocumentPojo._id_, doc.getId());
                MongoDbManager.getDocument().getMetadata().update(updateQuery,
                        new BasicDBObject(MongoDbManager.set_, toWrite));
            } //TESTED
        } // (end loop over docs)
    }//TESTED
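    // (Editor's note) doAggregation() recomputes entity/association features for each batch (per community
    // when more than one community is present) and then writes the updated entities/associations back onto
    // each document in doc_metadata.metadata via the $set at the end of the method.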
    //___________________________________________________________________________________________________

    // Utility function for the above, rebuilds an index

    private void rebuildIndex(String indexName) {
        if (indexName.startsWith("doc_")) { // Else not eligible...
            try {
                ObjectId communityId = new ObjectId(indexName.substring(4));

                // OK ... issue here with child communities: you can't just rebuild the index
                // because it will delete the parent index also
                BasicDBObject query = new BasicDBObject("_id", communityId);
                BasicDBObject fields = new BasicDBObject("parentId", 1);
                fields.put("name", 1);
                CommunityPojo community = CommunityPojo
                        .fromDb(DbManager.getSocial().getCommunity().findOne(query, fields), CommunityPojo.class);
                if (null == community) {
                    System.out.println("WARNING_COMM_EXIST: community " + communityId
                            + " does not exist, this will likely cause problems");
                    return;
                }
                if (null != community.getParentId()) {
                    if (null == community.getParentName()) {
                        CommunityPojo parentComm = CommunityPojo.fromDb(DbManager.getSocial().getCommunity()
                                .findOne(new BasicDBObject("_id", community.getParentId())), CommunityPojo.class);
                        if (null == parentComm) {
                            System.out.println("WARNING_COMM_EXIST: community " + community.getParentId()
                                    + " does not exist, this will likely cause problems");
                        }
                        else {
                            community.setParentName(parentComm.getName());
                        }
                    }
                    System.out.println("WARNING_CHILD_COMM: " + "commid=" + communityId + ", community="
                            + community.getName() + " has a parent, parent_id=" + community.getParentId()
                            + " (name " + community.getParentName() + "). "
                            + "This community will not be rebuilt, and you should ensure that it is re-indexed if the parent community is subsequently rebuilt.");
                    return;
                } //TESTED (by hand - works normally on non-child communities, refuses to delete child communities)

                GenericProcessingController.recreateCommunityDocIndex_unknownFields(communityId, true);
            }
            catch (Exception e) {
                // I guess this wasn't a valid community?!
                e.printStackTrace();
            }
        }
    } //TESTED (by hand, it's a straight call of tested GPC code anyway)
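    // (Editor's note) Index names follow the "doc_<communityId>" convention, which is why rebuildIndex()
    // strips the first four characters to recover the community id. Rebuilding a child community's index
    // would also delete the parent's index (per the comment above), so child communities are deliberately
    // skipped and only logged with WARNING_CHILD_COMM.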
    //___________________________________________________________________________________________________

    // DELETE DOCUMENTS FROM A QUERY

    private void doDelete(BasicDBObject query, int nLimit) {
        try {
            // Get the documents to delete
            BasicDBObject queryFields = new BasicDBObject(DocumentPojo.sourceKey_, 1);
            queryFields.put(DocumentPojo.sourceUrl_, 1);
            queryFields.put(DocumentPojo.url_, 1);
            queryFields.put(DocumentPojo.communityId_, 1);
            queryFields.put(DocumentPojo.index_, 1);

            DBCursor cur = DbManager.getDocument().getMetadata().find(query, queryFields).limit(nLimit);
                // (this internally works in batches of 1000)
            System.out.println("Found " + cur.count() + " records to delete");
            if (nLimit > 0) {
                System.out.println("(limited to " + nLimit + " records)");
            }
            List<DocumentPojo> docs = DocumentPojo.listFromDb(cur, DocumentPojo.listType());

            // Keep track of number of docs per community getting deleted
            Map<ObjectId, Integer> communityMap = new HashMap<ObjectId, Integer>();
            Map<String, Integer> sourceKeyMap = new HashMap<String, Integer>();
            for (DocumentPojo doc : docs) {
                if (null != doc.getSourceKey()) { // (a null key can only happen by error, but check anyway)
                    ObjectId community = doc.getCommunityId();
                    Integer count = communityMap.get(community);
                    communityMap.put(community, (count == null ? 1 : count + 1));
                    String sourceKey = doc.getSourceKey();
                    Integer count2 = sourceKeyMap.get(sourceKey);
                    sourceKeyMap.put(sourceKey, (count2 == null ? 1 : count2 + 1));
                }
            }
            StoreAndIndexManager dataStore = new StoreAndIndexManager();
            dataStore.removeFromDatastore_byURL(docs, null);
            AggregationManager.updateEntitiesFromDeletedDocuments(dataStore.getUUID());
            dataStore.removeSoftDeletedDocuments();
            AggregationManager.updateDocEntitiesFromDeletedDocuments(dataStore.getUUID());

            // Actually update the DB counts:
            for (Map.Entry<ObjectId, Integer> communityInfo : communityMap.entrySet()) {
                System.out.println("Removed " + communityInfo.getValue() + " records from community "
                        + communityInfo.getKey());
                DbManager.getDocument().getCounts().update(new BasicDBObject("_id", communityInfo.getKey()),
                        new BasicDBObject("$inc", new BasicDBObject("doccount", -communityInfo.getValue())));
            }
            for (Map.Entry<String, Integer> sourceInfo : sourceKeyMap.entrySet()) {
                System.out.println(
                        "Removed " + sourceInfo.getValue() + " records from source " + sourceInfo.getKey());
                DbManager.getIngest().getSource().update(new BasicDBObject("key", sourceInfo.getKey()),
                        new BasicDBObject("$inc", new BasicDBObject("harvest.doccount", -sourceInfo.getValue())));
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
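    // (Editor's note) Deletion happens in three steps: the matching documents are removed via
    // StoreAndIndexManager (removeFromDatastore_byURL, then removeSoftDeletedDocuments), the
    // entity/association aggregates are updated for the deleted documents, and the per-community doccount
    // and per-source harvest.doccount statistics are decremented to match.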
    //___________________________________________________________________________________________________
    //___________________________________________________________________________________________________
    //___________________________________________________________________________________________________
    //___________________________________________________________________________________________________

    // UNIT/FUNCTIONAL/COVERAGE TEST CODE

    @SuppressWarnings("unused")
    private void doUnitTest(String sMongoDbHost, String sMongoDbPort, String sElasticHost, String sElasticPort,
            BasicDBObject query, int nLimit) {
        ElasticSearchManager elasticManager = null;

        try {
            // Initialize the DB:
            DBCollection feedsDB = DbManager.getDocument().getMetadata();
            DBCollection contentDB = DbManager.getDocument().getContent();
            DBCollection sourcesDB = DbManager.getIngest().getSource();

            String indexName = "document_index";

            // Test/debug recreate the index
            if (true) {
                // (delete the index)
                System.out.println("Deleting index...");
                elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort);
                elasticManager.deleteMe();
                    //(also deletes the child index - same index, different type)

                // Create the index if necessary
                String sMapping = new Gson().toJson(new DocumentPojoIndexMap.Mapping(),
                        DocumentPojoIndexMap.Mapping.class);
                Builder localSettings = ImmutableSettings.settingsBuilder();
                localSettings.put("number_of_shards", 10).put("number_of_replicas", 2);
                System.out.println("Creating index..." + sMapping);
                elasticManager = ElasticSearchManager.createIndex(indexName, null, false,
                        sElasticHost + ":" + sElasticPort, sMapping, localSettings);
            }
            // Get the index (necessary if already created)
            if (null == elasticManager) {
                elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort);
            }

            // Get the feeds from the DB:
            //Debug:
            //System.out.println("Querying DB...");
            DBCursor dbc = feedsDB.find(query).limit(nLimit);

            byte[] storageArray = new byte[200000];

            while (dbc.hasNext()) {
                BasicDBObject dbo = (BasicDBObject) dbc.next();
                DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);

                //Debug:
                System.out.println("Getting content..." + doc.getTitle() + " / " + doc.getUrl());

                // Get the content:
                BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
                contentQ.put(CompressedFullTextPojo.sourceKey_,
                        new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
                BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ);
                if (null != dboContent) {
                    byte[] compressedData = ((byte[]) dboContent.get("gzip_content"));
                    ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
                    GZIPInputStream gzip = new GZIPInputStream(in);
                    int nRead = gzip.read(storageArray, 0, 200000);
                    String s = new String(storageArray, 0, nRead, "UTF-8");
                    doc.setFullText(s);
                }

                // Get tag:
                SourcePojo src = _sourceCache.get(doc.getSourceKey());
                if (null == src) {
                    BasicDBObject srcDbo = (BasicDBObject) sourcesDB
                            .findOne(new BasicDBObject("key", doc.getSourceKey()));
                    if (null != srcDbo) {
                        src = new Gson().fromJson(srcDbo.toString(), SourcePojo.class);
                        _sourceCache.put(doc.getSourceKey(), src);
                    }
                }
                if (null != src) {
                    Set<String> tagsTidied = new TreeSet<String>();
                    for (String s : src.getTags()) {
                        String ss = s.trim().toLowerCase();
                        tagsTidied.add(ss);
                    }
                    doc.setTags(tagsTidied);
                }

                //TEST: set dynamic field
                // Lots of testing of dynamic dates:
                // feed.addToMetadata("my_dateISO", Date.parse(feed.getCreated().toGMTString()));
                // String s1 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(feed.getCreated());
                // feed.addToMetadata("another_dateISO", s1);
                // String s1_5 = new SimpleDateFormat().format(feed.getCreated());
                // feed.addToMetadata("another_dateTimeJava", s1_5);
                // String s2 = new SimpleDateFormat("yyyyMMdd").format(feed.getCreated());
                // feed.addToMetadata("another_dateYYYYMMDD", s2);
                // String s3 = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z").format(feed.getCreated());
                // feed.addToMetadata("another_dateRFC822", s3);
                // feed.addToMetadata("another_dateGMT", feed.getCreated().toGMTString());
                // // Testing of the string field types
                // feed.addToMetadata("my_comment", "Testing this ABCDEFG");
                // feed.addToMetadata("my_term", "Testing this UVWXYZ");
                // feed.addToMetadata("my_text", "Testing this 123456");
                // // Test an array of longs:
                // Long tl[] = new Long[4]; tl[0] = 0L; tl[1] = 1L; tl[2] = 2L; tl[3] = 3L;
                // feed.addToMetadata("md_long", tl);

                //TEST: some dummy event timestamp adding code (not seeing much/any in the data)
                // if (null != feed.getEvents()) {
                //     int i = 0;
                //     for (EventPojo evt: feed.getEvents()) {
                //         //1: Add single date
                //         if (0 == i) {
                //             evt.time_start = "2011-01-01";
                //         }
                //         //2: Add short span
                //         if (1 == i) {
                //             evt.time_start = "2010-04-06";
                //             evt.time_end = "2010-08-09";
                //         }
                //         //3: Add cross-yr span
                //         if (2 == i) {
                //             evt.time_start = "2012-06-05";
                //             evt.time_end = "2013-09-05";
                //         }
                //         //4: Add too long span
                //         if (3 == i) {
                //             evt.time_start = "2012-04-06";
                //             evt.time_end = "2014-04-09";
                //         }
                //         i++;
                //     }
                // }
                // For event adding, see data_model.test.TestCode
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        finally {
            // nothing to do
        }
    }
}
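For reference, the gzip handling inside doTransfer() reduces to the following standalone pattern (a minimal sketch using only JDK classes; the compressed sample stands in for the gzip_content field read from the content collection):

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.util.zip.GZIPInputStream;
    import java.util.zip.GZIPOutputStream;

    public class GzipFullTextExample {
        public static void main(String[] args) throws IOException {
            // Stand-in compressed content (in the utility this comes from the content collection)
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            GZIPOutputStream gzipOut = new GZIPOutputStream(bytes);
            gzipOut.write("Some full text that was stored compressed".getBytes("UTF-8"));
            gzipOut.close();
            byte[] compressedData = bytes.toByteArray();

            // Decompress it the same way doTransfer() does: read into a fixed buffer until EOF
            byte[] storageArray = new byte[200000];
            GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(compressedData));
            StringBuffer output = new StringBuffer();
            int nRead = 0;
            while (nRead >= 0) {
                nRead = gzip.read(storageArray, 0, storageArray.length);
                if (nRead > 0) {
                    output.append(new String(storageArray, 0, nRead, "UTF-8"));
                }
            }
            gzip.close();
            System.out.println(output.toString());
        }
    }

One limitation worth noting: decoding each buffer separately can split a multi-byte UTF-8 character across two reads. The loop in the listing above shares that limitation, which only matters for non-ASCII full text larger than the 200,000-byte buffer.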