List of usage examples for com.mongodb DBCollection findOne
@Nullable public DBObject findOne(@Nullable final DBObject query, final DBCollectionFindOptions findOptions)
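None of the excerpts below actually use the DBCollectionFindOptions overload shown above; they all pass a second DBObject as a field projection. A minimal, self-contained sketch of the options-based variant is shown here for comparison; the host, database, collection, and field names are placeholders, not taken from any of the projects below.

import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;
import com.mongodb.client.model.DBCollectionFindOptions;

public class FindOneWithOptionsExample {
    public static void main(String[] args) {
        // Placeholder host/port/database/collection names - adjust for your deployment
        MongoClient mongoClient = new MongoClient("localhost", 27017);
        try {
            DB db = mongoClient.getDB("testdb");
            DBCollection coll = db.getCollection("pages");

            // Query on a single field (placeholder field name)
            DBObject query = new BasicDBObject("docid", "example-doc");

            // DBCollectionFindOptions bundles the projection/sort that the older
            // findOne(query, fields) overload passes as separate DBObjects
            DBCollectionFindOptions options = new DBCollectionFindOptions()
                    .projection(new BasicDBObject("content", 1))
                    .sort(new BasicDBObject("page_sequence", 1));

            DBObject result = coll.findOne(query, options); // null if nothing matches
            if (result != null) {
                System.out.println(result.get("content"));
            }
        } finally {
            mongoClient.close();
        }
    }
}

findOne returns null when no document matches, so callers should always null-check the result, as the examples below do.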
From source file:bhl.pages.database.MongoConnection.java
License:Open Source License
/**
 * Get the content of the given page and document
 * @param docid the document identifier (an integer in BHL)
 * @param pageid the page identifier (NOT page_sequence) also an int
 * @return the textual content as a String
 * @throws DbException
 */
public String getPageContent(String docid, String pageid) throws DbException {
    try {
        if (docid != null && pageid != null) {
            connect();
            DBCollection coll = db.getCollection(Database.PAGES);
            BasicDBObject ref = new BasicDBObject();
            ref.put(JSONKeys.IA_IDENTIFIER, docid);
            ref.put(JSONKeys.BHL_PAGE_ID, Integer.parseInt(pageid));
            BasicDBObject key = new BasicDBObject();
            key.put(JSONKeys.PAGE_SEQUENCE, 1);
            DBObject obj = coll.findOne(ref, key);
            if (obj != null) {
                Object pobj = obj.get(JSONKeys.PAGE_SEQUENCE);
                int pageNo = ((Number) pobj).intValue();
                DBCollection coll2 = db.getCollection(Database.DOCUMENTS);
                BasicDBObject ref2 = new BasicDBObject();
                ref2.put(JSONKeys.IA_IDENTIFIER, docid);
                ref2.put(JSONKeys.PAGE_SEQUENCE, pageNo);
                BasicDBObject key2 = new BasicDBObject();
                key2.put(JSONKeys.CONTENT, 1);
                Object obj2 = coll2.findOne(ref2, key2);
                if (obj2 != null)
                    return (String) ((DBObject) obj2).get(JSONKeys.CONTENT);
                else
                    throw new Exception("could not find content for docid=" + docid + ", pageid=" + pageid);
            } else
                throw new Exception("could not find docid=" + docid + ", pageid=" + pageid);
        } else
            throw new Exception("Missing docid or pageid");
    } catch (Exception e) {
        throw new DbException(e);
    }
}
From source file:com.ebay.cloud.cms.dal.persistence.MongoExecutor.java
License:Apache License
public static DBObject findOne(PersistenceContext context, MetaClass metadata, DBObject queryObject,
        DBObject fieldObject) {
    long start = System.currentTimeMillis();
    DBObject findResult = null;
    String msg = "success";
    DBCollection dbCollection = context.getDBCollection(metadata);
    try {
        findResult = dbCollection.findOne(queryObject, fieldObject);
    } catch (Throwable t) {
        msg = t.getMessage();
        handleMongoException(t);
    } finally {
        logMongoAction(context, "find", start, dbCollection, queryObject, fieldObject, null, null, msg);
    }
    return findResult;
}
From source file:com.ikanow.infinit.e.api.knowledge.DocumentHandler.java
License:Open Source License
/**
 * Get information function that returns the user information in the form of a JSON String.
 * @param isAdmin
 * @param key the key definition of the user ( example email@email.com )
 * @return a JSON string representation of the person information on success
 */
public ResponsePojo getInfo(String userIdStr, String sourceKey, String idStrOrUrl, boolean bReturnFullText,
        boolean returnRawData, boolean isAdmin) {
    ResponsePojo rp = new ResponsePojo();
    try {
        // Set up the query
        BasicDBObject query = new BasicDBObject();
        ObjectId id = null;
        if (null == sourceKey) {
            id = new ObjectId(idStrOrUrl);
            query.put(DocumentPojo._id_, id);
        } else {
            query.put(DocumentPojo.sourceKey_, sourceKey);
            query.put(DocumentPojo.url_, idStrOrUrl);
        }
        if (!isAdmin)
            query.put(DocumentPojo.communityId_,
                    new BasicDBObject(MongoDbManager.in_, SocialUtils.getUserCommunities(userIdStr)));
        // (use DBObject here because DocumentPojo is pretty big and this call could conceivably have perf implications)
        BasicDBObject fieldsQ = new BasicDBObject();
        if (!bReturnFullText) {
            fieldsQ.put(DocumentPojo.fullText_, 0); // (XML/JSON have fullText as part of pojo)
        }
        BasicDBObject dbo = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(query, fieldsQ);
        if ((null == dbo)
                || ((null != dbo.get(DocumentPojo.url_)) && dbo.getString(DocumentPojo.url_).startsWith("?DEL?"))) {
            if (null != id) { // this might be the update id...
                query = new BasicDBObject(DocumentPojo.updateId_, id);
                dbo = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(query, fieldsQ);
            }
        } //TESTED (update case, normal case, and intermediate case where both update and original still exist)
        if (null == dbo) {
            rp.setResponse(new ResponseObject("Doc Info", true, "Document not found"));
            return rp;
        }
        DocumentPojo dp = DocumentPojo.fromDb(dbo, DocumentPojo.class);
        if (bReturnFullText) {
            if (null == dp.getFullText()) { // (Some things like database records might have this stored already)
                byte[] storageArray = new byte[200000];
                DBCollection contentDB = DbManager.getDocument().getContent();
                BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, dp.getUrl());
                contentQ.put(CompressedFullTextPojo.sourceKey_,
                        new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, dp.getSourceKey())));
                BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
                BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ, fields);
                if (null != dboContent) {
                    byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_));
                    ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
                    GZIPInputStream gzip = new GZIPInputStream(in);
                    int nRead = 0;
                    StringBuffer output = new StringBuffer();
                    while (nRead >= 0) {
                        nRead = gzip.read(storageArray, 0, 200000);
                        if (nRead > 0) {
                            String s = new String(storageArray, 0, nRead, "UTF-8");
                            output.append(s);
                        }
                    }
                    dp.setFullText(output.toString());
                    dp.makeFullTextNonTransient();
                }
            }
        } else if (!returnRawData) {
            dp.setFullText(null); // (obviously will normally contain full text anyway)
        } else // if ( returnRawData )
        {
            //check if the harvest type is file, return the file instead
            //if file is db return the json
            //get source
            SourcePojo source = getSourceFromKey(dp.getSourceKey());
            if (source.getExtractType().equals("File")) {
                //get file from harvester
                String fileURL = dp.getUrl();
                if (dp.getSourceUrl() != null)
                    fileURL = dp.getSourceUrl();
                byte[] bytes = FileHarvester.getFile(fileURL, source);
                if (bytes == null) {
                    // Try returning JSON instead
                    String json = ApiManager.mapToApi(dp, new DocumentPojoApiMap());
                    DocumentFileInterface dfp = new DocumentFileInterface();
                    dfp.bytes = json.getBytes();
                    dfp.mediaType = "application/json";
                    rp.setResponse(new ResponseObject("Doc Info", true, "Document bytes returned successfully"));
                    rp.setData(dfp, null);
                    return rp;
                } else {
                    DocumentFileInterface dfp = new DocumentFileInterface();
                    dfp.bytes = bytes;
                    dfp.mediaType = getMediaType(fileURL);
                    rp.setResponse(new ResponseObject("Doc Info", true, "Document bytes returned successfully"));
                    rp.setData(dfp, null);
                    return rp;
                }
            } else {
                String json = ApiManager.mapToApi(dp, new DocumentPojoApiMap());
                DocumentFileInterface dfp = new DocumentFileInterface();
                dfp.bytes = json.getBytes();
                dfp.mediaType = "application/json";
                rp.setResponse(new ResponseObject("Doc Info", true, "Document bytes returned successfully"));
                rp.setData(dfp, null);
                return rp;
            }
        }
        rp.setData(dp, new DocumentPojoApiMap());
        rp.setResponse(new ResponseObject("Doc Info", true, "Feed info returned successfully"));
    } //(end full text vs raw data)
    catch (Exception e) {
        // If an exception occurs log the error
        logger.error("Exception Message: " + e.getMessage(), e);
        rp.setResponse(new ResponseObject("Doc Info", false, "error returning feed: " + e.getMessage()));
    }
    // Return Json String representing the user
    return rp;
}
From source file:com.ikanow.infinit.e.processing.generic.aggregation.AssociationAggregationUtils.java
License:Open Source License
/**
 * Add events to the elastic search index for events
 * and the mongodb collection
 * so they are searchable for searchsuggest
 *
 * Step 1.a, try to just update alias's
 * Step 1.b, if fail, create new entry
 *
 * Step 2, Update totalfreq and doccount
 *
 * Step 3, After updating totalfreq and doccount, write to ES for every group
 *
 * @param events
 */
public static void updateEventFeatures(Map<String, Map<ObjectId, AssociationFeaturePojo>> eventFeatures) {
    // Some diagnostic counters:
    int numCacheMisses = 0;
    int numCacheHits = 0;
    int numNewAssocs = 0;
    long entityAggregationTime = new Date().getTime();
    DBCollection col = DbManager.getFeature().getAssociation();
    // (This fn is normally run for a single community id)
    CommunityFeatureCaches.CommunityFeatureCache currCache = null;
    String savedSyncTime = null;
    for (Map<ObjectId, AssociationFeaturePojo> evtCommunity : eventFeatures.values()) {
        Iterator<Map.Entry<ObjectId, AssociationFeaturePojo>> it = evtCommunity.entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry<ObjectId, AssociationFeaturePojo> evtFeatureKV = it.next();
            try {
                AssociationFeaturePojo evtFeature = evtFeatureKV.getValue();
                long nSavedDocCount = evtFeature.getDoccount();
                ObjectId communityID = evtFeature.getCommunityId();
                if ((null == currCache) || !currCache.getCommunityId().equals(evtFeatureKV.getKey())) {
                    currCache = CommunityFeatureCaches.getCommunityFeatureCache(evtFeatureKV.getKey());
                    if (_diagnosticMode) {
                        if (_logInDiagnosticMode)
                            System.out.println(
                                    "AssociationAggregationUtils.updateEventFeatures, Opened cache for community: "
                                            + evtFeatureKV.getKey());
                    }
                } //TESTED (by hand)
                // Is this in our cache? If so can short cut a bunch of the DB interaction:
                AssociationFeaturePojo cachedAssoc = currCache.getCachedAssocFeature(evtFeature);
                if (null != cachedAssoc) {
                    if (_incrementalMode) {
                        if (_diagnosticMode) {
                            if (_logInDiagnosticMode)
                                System.out.println("AssociationAggregationUtils.updateEventFeatures, skip cached: "
                                        + cachedAssoc.toDb());
                            //TODO (INF-2825): should be continue-ing here so can use delta more efficiently...
                        }
                    } else if (_diagnosticMode) {
                        if (_logInDiagnosticMode)
                            System.out.println("AssociationAggregationUtils.updateEventFeatures, grabbed cached: "
                                    + cachedAssoc.toDb());
                    }
                    numCacheHits++;
                } //TESTED (by hand)
                else {
                    numCacheMisses++;
                }
                //try to update
                BasicDBObject query = new BasicDBObject(AssociationFeaturePojo.index_, evtFeature.getIndex());
                query.put(AssociationFeaturePojo.communityId_, communityID);
                //Step1 try to update alias
                //update arrays
                BasicDBObject multiopAliasArrays = new BasicDBObject();
                // Entity1 Alias:
                if (null != evtFeature.getEntity1_index()) {
                    evtFeature.addEntity1(evtFeature.getEntity1_index());
                }
                if (null != evtFeature.getEntity1()) {
                    if ((null == cachedAssoc) || (null == cachedAssoc.getEntity1())
                            || !cachedAssoc.getEntity1().containsAll(evtFeature.getEntity1())) {
                        BasicDBObject multiopE = new BasicDBObject(MongoDbManager.each_, evtFeature.getEntity1());
                        multiopAliasArrays.put(AssociationFeaturePojo.entity1_, multiopE);
                    }
                } //TESTED (by hand)
                // Entity2 Alias:
                if (null != evtFeature.getEntity2_index()) {
                    evtFeature.addEntity2(evtFeature.getEntity2_index());
                }
                if (null != evtFeature.getEntity2()) {
                    if ((null == cachedAssoc) || (null == cachedAssoc.getEntity2())
                            || !cachedAssoc.getEntity2().containsAll(evtFeature.getEntity2())) {
                        BasicDBObject multiopE = new BasicDBObject(MongoDbManager.each_, evtFeature.getEntity2());
                        multiopAliasArrays.put(AssociationFeaturePojo.entity2_, multiopE);
                    }
                } //TESTED (by hand)
                // verb/verb cat alias:
                if (null != evtFeature.getVerb_category()) {
                    evtFeature.addVerb(evtFeature.getVerb_category());
                }
                if (null != evtFeature.getVerb()) {
                    if ((null == cachedAssoc) || (null == cachedAssoc.getVerb())
                            || !cachedAssoc.getVerb().containsAll(evtFeature.getVerb())) {
                        BasicDBObject multiopE = new BasicDBObject(MongoDbManager.each_, evtFeature.getVerb());
                        multiopAliasArrays.put(AssociationFeaturePojo.verb_, multiopE);
                    }
                } //TESTED (by hand)
                // OK - now we can copy across the fields into the cache:
                if (null != cachedAssoc) {
                    currCache.updateCachedAssocFeatureStatistics(cachedAssoc, evtFeature);
                    //(evtFeature is now fully up to date)
                } //TESTED (by hand)
                BasicDBObject updateOp = new BasicDBObject();
                if (!multiopAliasArrays.isEmpty()) {
                    updateOp.put(MongoDbManager.addToSet_, multiopAliasArrays);
                }
                // Document count for this event
                BasicDBObject updateFreqDocCount = new BasicDBObject(AssociationFeaturePojo.doccount_, nSavedDocCount);
                updateOp.put(MongoDbManager.inc_, updateFreqDocCount);
                BasicDBObject fields = new BasicDBObject(AssociationFeaturePojo.doccount_, 1);
                fields.put(AssociationFeaturePojo.entity1_, 1);
                fields.put(AssociationFeaturePojo.entity2_, 1);
                fields.put(AssociationFeaturePojo.verb_, 1);
                //(slightly annoying, since only want these if updating dc but won't know
                // until after i've got this object)
                fields.put(AssociationFeaturePojo.db_sync_time_, 1);
                fields.put(AssociationFeaturePojo.db_sync_doccount_, 1);
                DBObject dboUpdate = null;
                if (_diagnosticMode) {
                    if (null == cachedAssoc) {
                        dboUpdate = col.findOne(query, fields);
                    }
                } else {
                    if (null != cachedAssoc) {
                        col.update(query, updateOp, false, false);
                    } else { // Not cached - so have to grab the feature we're either getting or creating
                        dboUpdate = col.findAndModify(query, fields, new BasicDBObject(), false, updateOp, false, true);
                        // (can use findAndModify because specify index, ie the shard key)
                        // (returns event before the changes above, update the feature object below)
                        // (also atomically creates the object if it doesn't exist so is "distributed-safe")
                    }
                }
                if ((null != cachedAssoc)
                        || ((dboUpdate != null) && !dboUpdate.keySet().isEmpty())) // (feature already exists)
                {
                    AssociationFeaturePojo egp = cachedAssoc;
                    if (null == egp) {
                        egp = AssociationFeaturePojo.fromDb(dboUpdate, AssociationFeaturePojo.class);
                        evtFeature.setDoccount(egp.getDoccount() + nSavedDocCount);
                        evtFeature.setDb_sync_doccount(egp.getDb_sync_doccount());
                        evtFeature.setDb_sync_time(egp.getDb_sync_time());
                        if (null != egp.getEntity1()) {
                            for (String ent : egp.getEntity1())
                                evtFeature.addEntity1(ent);
                        }
                        if (null != egp.getEntity2()) {
                            for (String ent : egp.getEntity2())
                                evtFeature.addEntity2(ent);
                        }
                        if (null != egp.getVerb()) {
                            for (String verb : egp.getVerb())
                                evtFeature.addVerb(verb);
                        }
                    } //TESTED (cached and non-cached cases)
                    // (in the cached case, evtFeature has already been updated by updateCachedAssocFeatureStatistics)
                    if (_diagnosticMode) {
                        if (_logInDiagnosticMode)
                            System.out.println("AssociationAggregationUtils.updateEventFeatures, found: "
                                    + ((BasicDBObject) egp.toDb()).toString());
                        if (_logInDiagnosticMode)
                            System.out.println("AssociationAggregationUtils.updateEventFeatures, ^^^ found from query: "
                                    + query.toString() + " / " + updateOp.toString());
                    }
                    // (In background aggregation mode we update db_sync_prio when checking the -otherwise unused, unlike entities- document update schedule)
                } else // (the object in memory is now an accurate representation of the database, minus some fields we'll now add)
                {
                    numNewAssocs++;
                    // Synchronization settings for the newly created object
                    evtFeature.setDb_sync_doccount(nSavedDocCount);
                    if (null == savedSyncTime) {
                        savedSyncTime = Long.toString(System.currentTimeMillis());
                    }
                    evtFeature.setDb_sync_time(savedSyncTime);
                    // This is all "distributed safe" (apart from the db_syc_xxx and it doesn't matter if that is
                    // out of date, the update will just be slightly out-of-date at worst) since (otherwise) these fields are
                    // only set here, and the findAndModify is atomic
                    BasicDBObject baseFields = new BasicDBObject();
                    if (null != evtFeature.getEntity1_index()) {
                        baseFields.put(AssociationFeaturePojo.entity1_index_, evtFeature.getEntity1_index());
                    }
                    if (null != evtFeature.getEntity2_index()) {
                        baseFields.put(AssociationFeaturePojo.entity2_index_, evtFeature.getEntity2_index());
                    }
                    if (null != evtFeature.getVerb_category()) {
                        baseFields.put(AssociationFeaturePojo.verb_category_, evtFeature.getVerb_category());
                    }
                    baseFields.put(AssociationFeaturePojo.assoc_type_, evtFeature.getAssociation_type());
                    baseFields.put(AssociationFeaturePojo.db_sync_doccount_, evtFeature.getDb_sync_doccount());
                    baseFields.put(AssociationFeaturePojo.db_sync_time_, evtFeature.getDb_sync_time());
                    baseFields.put(AssociationFeaturePojo.db_sync_prio_, 1000.0);
                    // (ensures new objects are quickly index-synchronized)
                    if (!_diagnosticMode) {
                        // Store the object
                        col.update(query, new BasicDBObject(MongoDbManager.set_, baseFields));
                    } else {
                        if (_logInDiagnosticMode)
                            System.out.println("AssociationAggregationUtils.updateEventFeatures, not found: "
                                    + query.toString() + " / " + baseFields.toString() + "/ orig_update= "
                                    + updateOp.toString());
                    }
                    // (Note even in background aggregation mode we still perform the feature synchronization
                    // for new entities - and it has to be right at the end because it "corrupts" the objects)
                } //(end if first time seen)
                if (null == cachedAssoc) {
                    // First time we've seen this locally, so add to cache
                    currCache.addCachedAssocFeature(evtFeature);
                    if (_diagnosticMode) {
                        if (_logInDiagnosticMode)
                            System.out.println("AssociationAggregationUtils.updateEventFeatures, added to cache: "
                                    + evtFeature.toDb());
                    }
                } //TESTED (by hand)
            } catch (Exception e) {
                // Exception, remove from feature list
                it.remove();
                // If an exception occurs log the error
                logger.error("Exception Message: " + e.getMessage(), e);
            }
        } // (end loop over all communities for the set of features sharing and index)
    } // (end loop over indexes)
    if ((numCacheHits > 0) || (numCacheMisses > 0)) { // ie some assocs were grabbed
        int cacheSize = 0;
        if (null != currCache) {
            cacheSize = currCache.getAssocCacheSize();
        }
        StringBuffer logMsg = new StringBuffer() // (should append key, but don't have that...)
                .append(" assoc_agg_time_ms=").append(new Date().getTime() - entityAggregationTime)
                .append(" total_assocs=").append(eventFeatures.size()).append(" new_assocs=")
                .append(numNewAssocs).append(" cache_misses=").append(numCacheMisses).append(" cache_hits=")
                .append(numCacheHits).append(" cache_size=").append(cacheSize);
        logger.info(logMsg.toString());
    }
}
From source file:com.ikanow.infinit.e.processing.generic.aggregation.EntityAggregationUtils.java
License:Open Source License
/**
 * Updates the feature entries for the list of entities
 * that was just extracted including changing frequency,
 * adding aliases etc
 *
 * This method now has 3 steps:
 * 1. Try to update alias
 * 1.a If fail, create new gaz
 * 2. Update totalfreq and doccount
 *
 * @param ents List of entities to update in the entity feature
 */
public static void updateEntityFeatures(Map<String, Map<ObjectId, EntityFeaturePojo>> entFeatures) {
    // Some diagnostic counters:
    int numCacheMisses = 0;
    int numCacheHits = 0;
    int numNewEntities = 0;
    long entityAggregationTime = new Date().getTime();
    DBCollection col = DbManager.getFeature().getEntity();
    // (This fn is normally run for a single community id)
    CommunityFeatureCaches.CommunityFeatureCache currCache = null;
    String savedSyncTime = null;
    for (Map<ObjectId, EntityFeaturePojo> entCommunity : entFeatures.values()) {
        Iterator<Map.Entry<ObjectId, EntityFeaturePojo>> it = entCommunity.entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry<ObjectId, EntityFeaturePojo> entFeatureKV = it.next();
            try {
                EntityFeaturePojo entFeature = entFeatureKV.getValue();
                long nSavedDocCount = entFeature.getDoccount();
                long nSavedFreqCount = entFeature.getTotalfreq();
                // (these should be constant across all communities but keep it here
                // so can assign it using entFeature, it's v cheap so no need to get once like for sync vars)
                // For each community, see if the entity feature already exists *for that community*
                ObjectId communityID = entFeature.getCommunityId();
                if (null != communityID) {
                    if ((null == currCache) || !currCache.getCommunityId().equals(entFeatureKV.getKey())) {
                        currCache = CommunityFeatureCaches.getCommunityFeatureCache(entFeatureKV.getKey());
                        if (_diagnosticMode) {
                            if (_logInDiagnosticMode)
                                System.out.println(
                                        "EntityAggregationUtils.updateEntityFeatures, Opened cache for community: "
                                                + entFeatureKV.getKey());
                        }
                    } //TESTED (by hand)
                    // Is this in our cache? If so can short cut a bunch of the DB interaction:
                    EntityFeaturePojo cachedEnt = currCache.getCachedEntityFeature(entFeature);
                    if (null != cachedEnt) {
                        if (_incrementalMode) {
                            if (_diagnosticMode) {
                                if (_logInDiagnosticMode)
                                    System.out.println("EntityAggregationUtils.updateEntityFeatures, skip cached: "
                                            + cachedEnt.toDb());
                                //TODO (INF-2825): should be continue-ing here (after implementing incremental caching fully) so can use delta more efficiently...
                            }
                        } else if (_diagnosticMode) {
                            if (_logInDiagnosticMode)
                                System.out.println("EntityAggregationUtils.updateEntityFeatures, grabbed cached: "
                                        + cachedEnt.toDb());
                        }
                        numCacheHits++;
                    } //TESTED (by hand)
                    else {
                        numCacheMisses++;
                    }
                    BasicDBObject query = new BasicDBObject(EntityFeaturePojo.index_, entFeature.getIndex());
                    query.put(EntityFeaturePojo.communityId_, communityID);
                    BasicDBObject updateOp = new BasicDBObject();
                    // Add aliases:
                    BasicDBObject updateOpA = new BasicDBObject();
                    if (null != entFeature.getAlias()) {
                        if ((null == cachedEnt) || (null == cachedEnt.getAlias())
                                || !cachedEnt.getAlias().containsAll(entFeature.getAlias())) {
                            //(if the data we have is already cached, don't bother adding it again)
                            BasicDBObject multiopE = new BasicDBObject(MongoDbManager.each_, entFeature.getAlias());
                            updateOpA.put(EntityFeaturePojo.alias_, multiopE);
                        } //TESTED (by hand)
                    }
                    // Add link data, if there is any:
                    if ((null != entFeature.getSemanticLinks()) && !entFeature.getSemanticLinks().isEmpty()) {
                        if ((null == cachedEnt) || (null == cachedEnt.getSemanticLinks())
                                || !cachedEnt.getSemanticLinks().containsAll(entFeature.getSemanticLinks())) {
                            //(if the data we have is already cached, don't bother adding it again)
                            BasicDBObject multiopF = new BasicDBObject(MongoDbManager.each_, entFeature.getSemanticLinks());
                            updateOpA.put(EntityFeaturePojo.linkdata_, multiopF);
                        } //TESTED (by hand)
                    }
                    // OK - now we can copy across the fields into the cache:
                    if (null != cachedEnt) {
                        currCache.updateCachedEntityFeatureStatistics(cachedEnt, entFeature);
                        //(entFeature is now fully up to date)
                    } //TESTED (by hand)
                    if (!updateOpA.isEmpty()) {
                        updateOp.put(MongoDbManager.addToSet_, updateOpA);
                    }
                    // Update frequency:
                    BasicDBObject updateOpB = new BasicDBObject();
                    updateOpB.put(EntityFeaturePojo.totalfreq_, nSavedFreqCount);
                    updateOpB.put(EntityFeaturePojo.doccount_, nSavedDocCount);
                    updateOp.put(MongoDbManager.inc_, updateOpB);
                    //try to use find/modify to see if something comes back and set doc freq/totalfreq
                    BasicDBObject fields = new BasicDBObject(EntityFeaturePojo.totalfreq_, 1);
                    fields.put(EntityFeaturePojo.doccount_, 1);
                    fields.put(EntityFeaturePojo.alias_, 1);
                    fields.put(EntityFeaturePojo.linkdata_, 1);
                    //(slightly annoying, since only want these 2 largish fields if updating freq but won't know
                    // until after i've got this object)
                    fields.put(EntityFeaturePojo.db_sync_time_, 1);
                    fields.put(EntityFeaturePojo.db_sync_doccount_, 1);
                    DBObject dboUpdate = null;
                    if (_diagnosticMode) {
                        if (null == cachedEnt) {
                            dboUpdate = col.findOne(query, fields);
                        }
                    } else {
                        if (null != cachedEnt) {
                            col.update(query, updateOp, false, false);
                        } else { // Not cached - so have to grab the feature we're either getting or creating
                            dboUpdate = col.findAndModify(query, fields, new BasicDBObject(), false, updateOp, false, true);
                            // (can use findAndModify because specify index, ie the shard key)
                            // (returns entity before the changes above, update the feature object below)
                            // (also atomically creates the object if it doesn't exist so is "distributed-safe")
                        }
                    }
                    if ((null != cachedEnt)
                            || ((dboUpdate != null) && !dboUpdate.keySet().isEmpty())) // (feature already exists)
                    {
                        EntityFeaturePojo gp = cachedEnt;
                        // (Update the entity feature to be correct so that it can be accurately synchronized with the index)
                        if (null == gp) {
                            gp = EntityFeaturePojo.fromDb(dboUpdate, EntityFeaturePojo.class);
                            entFeature.setTotalfreq(gp.getTotalfreq() + nSavedFreqCount);
                            entFeature.setDoccount(gp.getDoccount() + nSavedDocCount);
                            entFeature.setDbSyncDoccount(gp.getDbSyncDoccount());
                            entFeature.setDbSyncTime(gp.getDbSyncTime());
                            if (null != gp.getAlias()) {
                                entFeature.addAllAlias(gp.getAlias());
                            }
                            if (null != gp.getSemanticLinks()) {
                                entFeature.addToSemanticLinks(gp.getSemanticLinks());
                            }
                        } //TESTED (cached case and non-cached case)
                        // (in the cached case, entFeature has already been updated by updateCachedEntityFeatureStatistics)
                        if (_diagnosticMode) {
                            if (_logInDiagnosticMode)
                                System.out.println("EntityAggregationUtils.updateEntityFeatures, found: "
                                        + ((BasicDBObject) gp.toDb()).toString());
                            if (_logInDiagnosticMode)
                                System.out.println("EntityAggregationUtils.updateEntityFeatures, ^^^ found from query: "
                                        + query.toString() + " / " + updateOp.toString());
                        }
                        // (In background aggregation mode we update db_sync_prio when checking the doc update schedule)
                    } else // (the object in memory is now an accurate representation of the database, minus some fields we'll now add)
                    {
                        numNewEntities++;
                        // Synchronization settings for the newly created object
                        if (null == savedSyncTime) {
                            savedSyncTime = Long.toString(System.currentTimeMillis());
                        }
                        entFeature.setDbSyncDoccount(nSavedDocCount);
                        entFeature.setDbSyncTime(savedSyncTime);
                        // This is all "distributed safe" (apart from the db_syc_xxx and it doesn't matter if that is
                        // out of date, the update will just be slightly out-of-date at worst) since (otherwise) these fields are
                        // only set here, and the findAndModify is atomic
                        // (Do in raw MongoDB for performance)
                        BasicDBObject baseFields = new BasicDBObject();
                        baseFields.put(EntityFeaturePojo.dimension_, entFeature.getDimension().toString());
                        baseFields.put(EntityFeaturePojo.type_, entFeature.getType());
                        baseFields.put(EntityFeaturePojo.disambiguated_name_, entFeature.getDisambiguatedName());
                        baseFields.put(EntityFeaturePojo.db_sync_doccount_, entFeature.getDbSyncDoccount());
                        baseFields.put(EntityFeaturePojo.db_sync_prio_, 1000.0);
                        baseFields.put(EntityFeaturePojo.db_sync_time_, entFeature.getDbSyncTime());
                        if ((null != entFeature.getSemanticLinks()) && !entFeature.getSemanticLinks().isEmpty()) {
                            baseFields.put(EntityFeaturePojo.linkdata_, entFeature.getSemanticLinks());
                        }
                        //attempt to add geotag (makes necessary checks on util side)
                        //also add ontology type if geotag is found
                        EntityGeotagAggregationUtils.addEntityGeo(entFeature);
                        if (entFeature.getGeotag() != null) {
                            BasicDBObject geo = new BasicDBObject(GeoPojo.lat_, entFeature.getGeotag().lat);
                            geo.put(GeoPojo.lon_, entFeature.getGeotag().lon);
                            baseFields.put(EntityFeaturePojo.geotag_, geo);
                            if (entFeature.getOntology_type() != null) {
                                baseFields.put(EntityFeaturePojo.ontology_type_, entFeature.getOntology_type());
                            }
                        }
                        if (!_diagnosticMode) {
                            // Store the object
                            col.update(query, new BasicDBObject(MongoDbManager.set_, baseFields));
                        } else {
                            if (_logInDiagnosticMode)
                                System.out.println("EntityAggregationUtils.updateEntityFeatures, not found: "
                                        + query.toString() + ": " + baseFields.toString());
                        }
                    } //(end first time this feature seen - globally)
                    if (null == cachedEnt) {
                        // First time we've seen this locally, so add to cache
                        currCache.addCachedEntityFeature(entFeature);
                        if (_diagnosticMode) {
                            if (_logInDiagnosticMode)
                                System.out.println("EntityAggregationUtils.updateEntityFeatures, added to cache: "
                                        + entFeature.toDb());
                        }
                    } //TESTED (by hand)
                } //(end if community id assigned)
            } catch (Exception e) {
                // Exception, remove from feature list
                it.remove();
                // If an exception occurs log the error
                logger.error("Exception Message: " + e.getMessage(), e);
            }
        } // (end loop over communities)
    } // (end loop over indexes)
    if ((numCacheHits > 0) || (numCacheMisses > 0)) { // ie some ents were grabbed
        int cacheSize = 0;
        if (null != currCache) {
            cacheSize = currCache.getEntityCacheSize();
        }
        StringBuffer logMsg = new StringBuffer() // (should append key, but don't have that...)
                .append(" ent_agg_time_ms=").append(new Date().getTime() - entityAggregationTime)
                .append(" total_ents=").append(entFeatures.size()).append(" new_ents=").append(numNewEntities)
                .append(" cache_misses=").append(numCacheMisses).append(" cache_hits=").append(numCacheHits)
                .append(" cache_size=").append(cacheSize);
        logger.info(logMsg.toString());
    }
}
From source file:com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager.java
License:Open Source License
/**
 * Remove a doc from the data store, ensures all the fields specified in "fields" are populated (ready for index deletion)
 * @param col
 * @param doc - needs url, sourceKey set
 * @param fields - fields to retrieve (index, created), set in calling function outside of loop for performance
 *
 * CALLED FROM: removeFromDatastore_byURL(col, List<doc>, bDeleteContent) <- ADDS INDEX, CREATED TO FIELDS
 *              removeFromDataStore_byURL(List<doc>, bDeleteContent) [ALSO DELETES FROM INDEX AFTER ADDED FROM HERE]
 *              MongoDocumentTxfer.doDelete(...) <- SETS URL, SOURCE URL, SOURCE KEY, COMMUNITY ID, INDEX, _ID
 *              processDocuments(...) [ always called after harvester: have sourceUrl, sourceKey,
 *                  DON'T have _id, BUT do have updateId and index (correct except in many geo cases)]
 *              pruneSource(source, ...) <- SETS URL, SOURCE URL, SOURCE KEY, INDEX
 *              updateHarvestStatus(...)
 */
private void removeFromDatastore_byURL(DBCollection col, DocumentPojo doc, BasicDBObject fields,
        boolean bDeleteContent) {
    // 1] Create the query to soft delete the document
    BasicDBObject query = new BasicDBObject();
    query.put(DocumentPojo.url_, doc.getUrl());
    query.put(DocumentPojo.sourceKey_, SourcePojo.getDistributedKeyQueryTerm(doc.getSourceKey()));
    // 2] Delete the content if needed
    if (bDeleteContent) {
        if (docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
            if (!_diagnosticMode) {
                DbManager.getDocument().getContent().remove(query);
            } else {
                System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2), delete content: "
                        + doc.getSourceKey() + "/" + doc.getUrl());
            }
        }
    } //TESTED
    // 3] Work out which fields we have and which (if any we need to go and fetch):
    boolean needToFindAndModify = false;
    if (null == doc.getId()) { // This is called from processDocuments
        if (null != doc.getUpdateId()) { // update case...
            doc.setId(doc.getUpdateId()); // (note this is overwritten by addToDatastore later, in update case, so we're good)
            // (doc.index is populated but may not be correct because of the "many geos" workaround):
            if (DocumentPojoIndexMap.hasManyGeos(doc)) {
                doc.setIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_);
                // (note this check isn't stateless, it actually populates "locs" at the same time
                // this is handled in addToDatastore (update case), temp removed when adding to DB
            } //TESTED (2.1.2, diagnostic mode, doc2)
        } else { // Not an update case, we're going to have to grab the document after all, which is a bit slower
            needToFindAndModify = true;
        }
    } //TESTED (2.1.2, diagnostic mode, doc2)
    if (!needToFindAndModify) { // set created if we need to, since we're not grabbing it from the datastore
        if (null != doc.getUpdateId()) { // (this means we have an approx created if we don't need to go fetch the deleted doc)
            doc.setCreated(new Date(doc.getUpdateId().getTime()));
        } //TESTED (2.1.2, diagnostic mode, doc2)
    }
    // (if we're here and index is not set, then it is intended to be null)
    // 4] Update the doc_metadata collection
    BasicDBObject softDelete = getSoftDeleteUpdate();
    BasicDBObject deadDoc = null; // (not normally needed)
    if (needToFindAndModify) { // less pleasant, need to go grab the doc
        deadDoc = (BasicDBObject) col.findOne(query, fields);
    } //TESTED (2.1.2)
    if (!_diagnosticMode) {
        col.update(query, softDelete, false, true);
        // (needs to be multi- even though there's a single element for sharding reasons)
    } //TESTED (2.1.2)
    // 5] Add fields if necessary
    if (null != deadDoc) {
        doc.setCreated((Date) deadDoc.get(DocumentPojo.created_));
        // (if getting this doc anyway then might as well get the created)
        doc.setId((ObjectId) deadDoc.get(DocumentPojo._id_));
        doc.setIndex((String) deadDoc.get(DocumentPojo.index_));
        if (_diagnosticMode) {
            System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2): found " + deadDoc.toString());
        }
    } //TESTED (2.1.2)
    else if (_diagnosticMode) {
        if (!needToFindAndModify) {
            System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2): straight deleted "
                    + doc.toDb().toString());
        } else {
            System.out.println("StoreAndIndexManager.removeFromDatastore_byUrl(2): didn't find " + query.toString());
        }
    } //TESTED (2.1.2)
}
From source file:com.ikanow.infinit.e.processing.generic.synchronization.SynchronizationManager.java
License:Open Source License
/**
 * Does the DB sync, pulls all solr docs that occurred from the
 * cleanseStartTime and source and makes sure they are in the mongo db.
 *
 * @param lastCleanse 1 hour before this harvester started
 * @param sources list of sources we are syncing
 * @return The number of errors fixed (docs deleted)
 */
//TODO INF-2239 ... lol fail if syncDB isn't called then dbCache is empty and everything gets deleted...
public int syncSearch(long cleanseStartTime, Set<String> dbCache) {
    int fixcount = 0;
    StoreAndIndexManager storeManager = new StoreAndIndexManager();
    // NO LONGER NEEDED, HAVE CACHE (EXCEPT IN ONE PLACE, THE "OLD DOCS" CHECK)
    DBCollection documentDb = DbManager.getDocument().getMetadata();
    BasicDBObject queryFields = new BasicDBObject(); // (ie just _id, basically only need to know if it exists)
    try {
        //get solr entries from last cleanse point
        int source_index = 0;
        int source_count = sources.size();
        for (SourcePojo sp : sources) {
            if (bKillMeNow) {
                return fixcount;
            }
            List<DocumentPojo> docs_to_remove = new ArrayList<DocumentPojo>();
            // Get all indexes this source might use:
            StringBuffer sb = new StringBuffer("document_index");
            for (ObjectId sCommunityId : sp.getCommunityIds()) {
                sb.append(",doc_").append(sCommunityId.toString());
            }
            sb.append("/document_index");
            ElasticSearchManager esm = ElasticSearchManager.getIndex(sb.toString());
            SearchRequestBuilder searchOptions = esm.getSearchOptions();
            BoolQueryBuilder boolQuery = QueryBuilders.boolQuery();
            boolQuery.must(QueryBuilders.rangeQuery(DocumentPojo.created_).from(cleanseStartTime));
            boolQuery.must(QueryBuilders.termQuery(DocumentPojo.sourceKey_, sp.getKey()));
            searchOptions.setSize(200); // (note this is multiplied by the number of primary shards)
            searchOptions.setSearchType(SearchType.SCAN);
            searchOptions.setScroll("10m");
            SearchResponse rsp = esm.doQuery(boolQuery, searchOptions);
            String scrollId = rsp.getScrollId();
            int nSkip = 0;
            for (;;) // Until no more hits
            {
                rsp = esm.doScrollingQuery(scrollId, "10m");
                SearchHit[] docs = rsp.getHits().getHits();
                scrollId = rsp.getScrollId();
                if ((null == docs) || (0 == docs.length)) {
                    break;
                }
                if (docs.length > 100) { // (just display large checks)
                    logger.info("Checking ES docs for large source=" + sp.getKey() + " source: " + source_index
                            + "/" + source_count + " from " + nSkip + " to " + (nSkip + docs.length));
                }
                //Check all solr docs against mongodb
                for (SearchHit hit : docs) {
                    String idStr = hit.getId();
                    boolean found = true; //(fail closed!)
                    if (null == dbCache) {
                        //OBSOLETED, USE DBCACHE INSTEAD (WHERE AVAILABLE):
                        ObjectId id = new ObjectId(idStr);
                        BasicDBObject query = new BasicDBObject(DocumentPojo._id_, id);
                        query.put(DocumentPojo.sourceKey_, sp.getDistributedKeyQueryTerm());
                        // (ensures uses only the right shard)
                        DBObject dbo = documentDb.findOne(query, queryFields);
                        found = (dbo != null);
                    } //TESTED
                    else {
                        found = dbCache.contains(idStr);
                    } //TESTED
                    if (!found) {
                        ObjectId id = new ObjectId(idStr);
                        DocumentPojo doc = new DocumentPojo();
                        doc.setId(id);
                        doc.setIndex(hit.getIndex() + "/document_index");
                        docs_to_remove.add(doc);
                        logger.info("db sync removing doc: " + id + "/" + hit.getIndex() + "/" + source_index
                                + " not found in mongo");
                        fixcount++;
                    } // end if not found
                } // end loop over docs to check
                nSkip += docs.length;
            } // until no more hits
            if (!docs_to_remove.isEmpty()) {
                storeManager.removeFromSearch(docs_to_remove);
                docs_to_remove.clear();
            }
            //CHECK OLD FEEDS 10 at a time
            int iteration = 1;
            boolean removedAll = true;
            while (removedAll) {
                int rows = iteration * iteration * 10; //exponential scaling 10x^2
                iteration++;
                int oldfixes = 0;
                //get old docs from es
                SearchRequestBuilder searchOptionsOLD = esm.getSearchOptions();
                BoolQueryBuilder boolQueryOLD = QueryBuilders.boolQuery();
                boolQueryOLD.must(QueryBuilders.rangeQuery(DocumentPojo.created_).from(cleanseStartTime));
                boolQueryOLD.must(QueryBuilders.termQuery(DocumentPojo.sourceKey_, sp.getKey()));
                searchOptionsOLD.addSort(DocumentPojo.created_, SortOrder.ASC);
                searchOptionsOLD.setSize(rows);
                SearchResponse rspOLD = esm.doQuery(boolQueryOLD, searchOptionsOLD);
                SearchHit[] docsOLD = rspOLD.getHits().getHits();
                //Check all solr docs against mongodb
                for (SearchHit hit : docsOLD) {
                    String idStr = hit.getId();
                    boolean found = true;
                    if (null == dbCache) {
                        //OBSOLETED, USE DBCACHE INSTEAD (WHERE AVAILABLE):
                        ObjectId id = new ObjectId(idStr);
                        BasicDBObject queryOLD = new BasicDBObject(DocumentPojo._id_, id);
                        DBObject dbo = documentDb.findOne(queryOLD, queryFields);
                        found = (dbo != null);
                    } //TESTED
                    else {
                        found = dbCache.contains(idStr);
                    } //TESTED
                    if (!found) {
                        // Also need to check the DB since dbCache is not guaranteed to be populated with the same
                        // number of "final" docs
                        ObjectId id = new ObjectId(idStr);
                        if (rows > 10) { // (dbCache always loaded with the first 10 rows)
                            BasicDBObject queryOLD = new BasicDBObject(DocumentPojo._id_, id);
                            if (null != documentDb.findOne(queryOLD, queryFields)) { // it is actually present
                                continue;
                            }
                        }
                        DocumentPojo doc = new DocumentPojo();
                        doc.setId(id);
                        doc.setIndex(hit.getIndex() + "/document_index");
                        docs_to_remove.add(doc);
                        logger.info("db sync removing doc: " + idStr + "/" + source_index + " not found in mongo");
                        oldfixes++;
                        fixcount++;
                    }
                }
                if (!docs_to_remove.isEmpty()) {
                    storeManager.removeFromSearch(docs_to_remove);
                }
                if (oldfixes != rows)
                    removedAll = false;
            }
            source_index++;
        } // end loop over sources
    } catch (Exception e) {
        // If an exception occurs log the error
        logger.error("Exception Message: " + e.getMessage(), e);
    }
    return fixcount;
}
From source file:com.nlp.twitterstream.MongoUtil.java
License:Open Source License
/**
 * Get first document that matches selection in database. Return only
 * selected fields.
 *
 * @param collection
 *            DBCollection object
 * @return DBObject
 */
public DBObject getOneDocFields(DBCollection collection, DBObject obj1, DBObject obj2) {
    myDoc = collection.findOne(obj1, obj2);
    return myDoc;
}
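For reference, a minimal self-contained sketch of the underlying two-argument call this wrapper delegates to; the host, database, collection, and field names are placeholders, not taken from the original project.

import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;

public class FindOneProjectionExample {
    public static void main(String[] args) {
        MongoClient client = new MongoClient("localhost", 27017); // placeholder host/port
        try {
            // Placeholder database/collection names
            DBCollection tweets = client.getDB("twitterdb").getCollection("tweets");
            // Selector: match on a field; projection: 1 includes a field, 0 excludes it
            DBObject selector = new BasicDBObject("lang", "en");
            DBObject projection = new BasicDBObject("text", 1).append("_id", 0);
            // Equivalent to getOneDocFields(tweets, selector, projection) in the wrapper above
            DBObject firstMatch = tweets.findOne(selector, projection);
            if (firstMatch != null) {
                System.out.println(firstMatch.get("text"));
            }
        } finally {
            client.close();
        }
    }
}

In the projection document, 1 includes a field and 0 excludes it; _id is returned unless explicitly excluded.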
From source file:com.socialsky.mods.MongoPersistor.java
License:Apache License
private void doFindOne(Message<JsonObject> message) {
    String collection = getMandatoryString("collection", message);
    if (collection == null) {
        return;
    }
    JsonObject matcher = message.body().getObject("matcher");
    JsonObject keys = message.body().getObject("keys");
    DBCollection coll = db.getCollection(collection);
    DBObject res;
    if (matcher == null) {
        res = keys != null ? coll.findOne(null, jsonToDBObject(keys)) : coll.findOne();
    } else {
        res = keys != null ? coll.findOne(jsonToDBObject(matcher), jsonToDBObject(keys))
                : coll.findOne(jsonToDBObject(matcher));
    }
    JsonObject reply = new JsonObject();
    if (res != null) {
        JsonObject m = new JsonObject(res.toMap());
        reply.putObject("result", m);
    }
    sendOK(message, reply);
}
From source file:com.softinstigate.restheart.db.CollectionDAO.java
License:Open Source License
/**
 * Deletes a collection.
 *
 * @param dbName the database name of the collection
 * @param collName the collection name
 * @param etag the entity tag. must match to allow actual write (otherwise
 *            http error code is returned)
 * @return the HttpStatus code to set in the http response
 */
public static int deleteCollection(String dbName, String collName, ObjectId etag) {
    DBCollection coll = getCollection(dbName, collName);
    BasicDBObject checkEtag = new BasicDBObject("_id", "_properties");
    checkEtag.append("_etag", etag);
    DBObject exists = coll.findOne(checkEtag, fieldsToReturn);
    if (exists == null) {
        return HttpStatus.SC_PRECONDITION_FAILED;
    } else {
        coll.drop();
        return HttpStatus.SC_NO_CONTENT;
    }
}