List of usage examples for com.mongodb DBCollection find
public DBCursor find(@Nullable final DBObject query, final DBCollectionFindOptions options)
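The signature above is the DBCollectionFindOptions variant; the examples that follow all exercise the older two-DBObject overload, find(query, projection), where the second argument restricts which fields are returned. Here is a minimal, self-contained sketch of that call, assuming a local mongod and a hypothetical "test.users" collection (none of these names come from the examples below):

    // Minimal sketch of find(query, projection) with the legacy com.mongodb driver.
    // Host, database, collection and field names are illustrative only.
    import com.mongodb.BasicDBObject;
    import com.mongodb.DB;
    import com.mongodb.DBCollection;
    import com.mongodb.DBCursor;
    import com.mongodb.DBObject;
    import com.mongodb.MongoClient;

    public class FindWithProjectionExample {
        public static void main(String[] args) {
            MongoClient mongoClient = new MongoClient("localhost", 27017);
            try {
                DB db = mongoClient.getDB("test");
                DBCollection users = db.getCollection("users");

                DBObject query = new BasicDBObject("status", "active");              // filter
                DBObject projection = new BasicDBObject("name", 1).append("_id", 0); // fields to return

                DBCursor cursor = users.find(query, projection).limit(10);
                try {
                    while (cursor.hasNext()) {
                        System.out.println(cursor.next());
                    }
                } finally {
                    cursor.close();
                }
            } finally {
                mongoClient.close();
            }
        }
    }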
From source file:com.hangum.tadpole.mongodb.core.test.MongoTestNotEqualsStmt.java
License:Open Source License
/**
 * @param args
 */
public static void main(String[] args) throws Exception {
    ConAndAuthentication testMongoCls = new ConAndAuthentication();
    Mongo mongo = testMongoCls.connection(ConAndAuthentication.serverurl, ConAndAuthentication.port);
    DB db = mongo.getDB("test");
    DBCollection myColl = db.getCollection("test_table");

    // BasicDBObject myAndQuery = new BasicDBObject();
    // myAndQuery.append("rental_id", new BasicDBObject("$ne", 1));

    BasicDBObject basicFields = new BasicDBObject();
    BasicDBObject basicWhere = new BasicDBObject();
    BasicDBObject basicSort = new BasicDBObject();

    // find(query, fields): the where/query object goes first, the field projection second
    DBCursor myCursor = myColl.find(basicWhere, basicFields).sort(basicSort).limit(999);
    while (myCursor.hasNext()) {
        System.out.println(myCursor.next());
    }

    mongo.close();
}
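The commented-out lines above hint at the not-equals filter this test class is named for. A minimal sketch of wiring that filter into the same find(query, fields) call, reusing myColl from the example and the "rental_id" field from the commented code (the sort key is an assumption):

    // Hypothetical variant: pass the commented-out $ne filter as the query argument
    BasicDBObject notEqualsQuery = new BasicDBObject("rental_id", new BasicDBObject("$ne", 1));
    BasicDBObject noProjection = new BasicDBObject();   // empty projection = return all fields

    DBCursor cursor = myColl.find(notEqualsQuery, noProjection)
            .sort(new BasicDBObject("rental_id", 1))
            .limit(999);
    while (cursor.hasNext()) {
        System.out.println(cursor.next());
    }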
From source file:com.ikanow.infinit.e.api.knowledge.QueryHandler.java
License:Open Source License
private DBCursor getDocIds(DBCollection docDb, ObjectId[] ids, int nFromServerLimit,
        AdvancedQueryPojo.QueryOutputPojo output, AdvancedQueryPojo.QueryScorePojo score) {
    DBCursor docdCursor = null;
    try {
        BasicDBObject query = new BasicDBObject();
        query.put("_id", new BasicDBObject("$in", ids));

        BasicDBObject fields = new BasicDBObject(DocumentPojo.fullText_, 0);
        // (used to discard community ids -plus legacy versions-, now need it)
        if (!output.docs.metadata) {
            fields.put(DocumentPojo.metadata_, 0);
        }
        boolean bNotAggEnts = ((output.aggregation == null) || (output.aggregation.entsNumReturn == null)
                || (output.aggregation.entsNumReturn == 0));
        if (bNotAggEnts && (null != score) && (null != score.sigWeight) && (score.sigWeight > 0.0)) {
            bNotAggEnts = false; // (special case, use agg entities to score docs)
        }
        if (!output.docs.ents && bNotAggEnts) {
            fields.put(DocumentPojo.entities_, 0);
        }
        boolean bNotAggEvents = ((output.aggregation == null) || (output.aggregation.eventsNumReturn == null)
                || (output.aggregation.eventsNumReturn == 0));
        boolean bNotAggFacts = ((output.aggregation == null) || (output.aggregation.factsNumReturn == null)
                || (output.aggregation.factsNumReturn == 0));
        boolean bNoStandaloneEvents = (null == output.docs.eventsTimeline)
                || (null == output.docs.numEventsTimelineReturn) || (output.docs.numEventsTimelineReturn == 0);
        if (!output.docs.events && !output.docs.facts && !output.docs.summaries && bNoStandaloneEvents
                && bNotAggEvents && bNotAggFacts) {
            fields.put(DocumentPojo.associations_, 0);
        } //TESTED

        //cm = new CollectionManager();
        boolean bPrimary = true;
        if (_replicaSetDistributionRatio > 0) {
            if (0 != (new Date().getTime() % _replicaSetDistributionRatio)) {
                bPrimary = false;
            }
        }
        if (bPrimary) { // Get from the primary
            docdCursor = docDb.find(query, fields).batchSize(nFromServerLimit);
        } else { // Try and get from the secondary if possible
            docdCursor = docDb.find(query, fields).batchSize(nFromServerLimit)
                    .setReadPreference(ReadPreference.secondaryPreferred());
        }
    } catch (Exception e) {
        // If an exception occurs log the error
        _logger.error("Address Exception Message: " + e.getMessage(), e);
    }
    return docdCursor;
}
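Stripped of the application-specific projection logic, the read-distribution pattern above reduces to roughly the following sketch (method, collection and field names are placeholders, not from the source):

    import com.mongodb.BasicDBObject;
    import com.mongodb.DBCollection;
    import com.mongodb.DBCursor;
    import com.mongodb.DBObject;
    import com.mongodb.ReadPreference;
    import org.bson.types.ObjectId;

    // Sketch: fetch a batch of documents by _id, optionally routing the read to a secondary
    static DBCursor findDocsById(DBCollection docs, ObjectId[] ids, int batchSize, boolean readFromPrimary) {
        DBObject query = new BasicDBObject("_id", new BasicDBObject("$in", ids));
        DBObject projection = new BasicDBObject("fullText", 0);    // exclude the largest field on the wire
        DBCursor cursor = docs.find(query, projection).batchSize(batchSize);
        if (!readFromPrimary) {
            cursor = cursor.setReadPreference(ReadPreference.secondaryPreferred()); // offload to a secondary
        }
        return cursor;
    }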
From source file:com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java
License:Open Source License
/**
 * Tests to see if duplicates might exist.
 * If it is not a duplicate, true is returned. If it is a duplicate,
 * the modified date is then checked to see if the file has been updated.
 * True is returned if the file has been updated, false otherwise.
 *
 * @param collection
 * @param modifiedDate
 * @param url
 * @param title
 * @return boolean (true/false)
 */
public boolean needsUpdated_SourceUrl(Date modifiedDate, String sourceUrl, SourcePojo source) {
    // Performance shortcut:
    if (!_bCalculatedMostRecentlyModifiedFile) {
        _bCalculatedMostRecentlyModifiedFile = true;

        // Get date of most recently modified file:
        try {
            if ((null != source.getHarvestStatus())
                    && (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) {
                BasicDBObject mostRecentQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                        source.getDistributedKeyQueryTerm());
                BasicDBObject mostRecentSort = new BasicDBObject(DocumentPojo._id_, -1);
                BasicDBObject mostRecentFields = new BasicDBObject(DocumentPojo.modified_, 1);
                if (null != source.getDistributionFactor()) { // (need the created date also)
                    mostRecentFields.put(DocumentPojo.created_, 1);
                }
                DBCursor mostRecentDocs = MongoDbManager.getDocument().getMetadata()
                        .find(mostRecentQuery, mostRecentFields).sort(mostRecentSort).limit(1);
                if (mostRecentDocs.hasNext()) {
                    BasicDBObject mostRecentDocDbo = (BasicDBObject) mostRecentDocs.next();
                    _mostRecentlyModifiedFile = (Date) mostRecentDocDbo.get(DocumentPojo.modified_);
                    _mostRecentlyModifiedDocId = (ObjectId) mostRecentDocDbo.get(DocumentPojo._id_);

                    if (null != source.getDistributionFactor()) {
                        // This is a slightly more complex case because other threads for this source could be
                        // writing documents asynchronously ... so we're just going to disable everything
                        // if the most recent doc is _after_ our last harvest time (since this means we've
                        // already started harvesting the new source)
                        Date mostRecentlyModifedFile_createdTime = (Date) mostRecentDocDbo
                                .get(DocumentPojo.created_);
                        if ((null != source.getHarvestStatus()) && (null != source.getHarvestStatus().getHarvested()
                                && (null != mostRecentlyModifedFile_createdTime))) {
                            if (mostRecentlyModifedFile_createdTime
                                    .after(source.getHarvestStatus().getHarvested())) {
                                _mostRecentlyModifiedFile = null;
                                _mostRecentlyModifiedDocId = null;
                            }
                        } else { // If we don't have a date then force a "slow" dedup
                            _mostRecentlyModifiedFile = null;
                            _mostRecentlyModifiedDocId = null;
                        }
                    } //TESTED
                } //(found docs)
            } //(success mode)
        } catch (Exception e) {
            // If anything goes wrong will just check all files (slower)
        }
    } //TESTED

    if (null != _mostRecentlyModifiedFile) { // Use shortcut...
        long nMostRecentlyModifiedTime = _mostRecentlyModifiedFile.getTime() / 1000L;
        long nFileTime = modifiedDate.getTime() / 1000L;
        if (nFileTime <= nMostRecentlyModifiedTime) {
            return false;
        }
    } //TESTED
    else if (null == sourceUrl) {
        return true;
        // (for custom checking - if we couldn't get a cached value to compare against
        //  then assume we are inspecting)
    }

    // No shortcut, go the long way round:
    DBCollection collection = DbManager.getDocument().getMetadata();
    boolean ret = true;
    BasicDBObject query = new BasicDBObject();
    query.put(DocumentPojo.sourceUrl_, sourceUrl);
    query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm());
    BasicDBObject hint = new BasicDBObject(DocumentPojo.sourceUrl_, 2);
    BasicDBObject fields = new BasicDBObject(DocumentPojo.modified_, 1);

    DBCursor dbc = collection.find(query, fields).hint(hint).limit(1);
    // (this should be very fast since sourceUrl is indexed ... order doesn't matter as all docs
    //  should have the same modified)
    //TODO (INF-1922): at some point should look into making (sparse) sourceUrl be compounded with
    // sourceKey - this is a bit risky

    if (!dbc.hasNext()) { // if there is no record, return true
        ret = true;
        modifiedDate.setTime(0);
    } else { // (all docs should have same modified, though this is ~ time ordered anyway)
        BasicDBObject dbo = (BasicDBObject) dbc.iterator().next();
        Date oldModified = (Date) dbo.get(DocumentPojo.modified_);
        ret = ((modifiedDate.getTime() / 1000) != (oldModified.getTime() / 1000));
        // ie if different -> true -> update docs from sourceUrl
        // ^^ note granularity seems only to be guaranteed to 1s somewhere in the system (not sure where)
        // (this is just backwards compatible for a deployment where this has happened for some %
        //  -probably 100- of the docs; once an RPM >=5955 is deployed this will no longer be necessary)
    }
    return ret;
}
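Condensed, the "long way round" in the example above is just a hinted, projected, limit(1) find followed by a timestamp comparison. A rough sketch under assumed names (the index spec, field names and 1-second granularity mirror the example but are otherwise illustrative):

    import com.mongodb.BasicDBObject;
    import com.mongodb.DBCollection;
    import com.mongodb.DBCursor;
    import java.util.Date;

    // Sketch: does a stored document for this sourceUrl/sourceKey need re-harvesting?
    static boolean isStale(DBCollection docs, String sourceUrl, String sourceKey, Date newModified) {
        BasicDBObject query = new BasicDBObject("sourceUrl", sourceUrl).append("sourceKey", sourceKey);
        BasicDBObject fields = new BasicDBObject("modified", 1);   // only the modified date is needed
        BasicDBObject hint = new BasicDBObject("sourceUrl", 1);    // force the sourceUrl index

        DBCursor dbc = docs.find(query, fields).hint(hint).limit(1);
        if (!dbc.hasNext()) {
            return true;                                           // never seen -> needs harvesting
        }
        Date oldModified = (Date) dbc.next().get("modified");
        // compare at 1s granularity, as the example does
        return (newModified.getTime() / 1000) != (oldModified.getTime() / 1000);
    }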
From source file:com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java
License:Open Source License
public boolean needsUpdated_Url(Date modifiedDate, String url, SourcePojo source) {
    // Performance shortcut:
    if (!_bCalculatedMostRecentlyModifiedFile) {
        _bCalculatedMostRecentlyModifiedFile = true;

        // Get date of most recently modified file:
        try {
            if ((null != source.getHarvestStatus())
                    && (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) {
                BasicDBObject mostRecentQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                        source.getDistributedKeyQueryTerm());
                if (null != source.getDistributionFactor()) { // if distributed, then apply extra term
                    if ((null != source.getHarvestStatus())
                            && (null != source.getHarvestStatus().getDistributedLastCompletedCycle())) {
                        Date d = source.getHarvestStatus().getDistributedLastCompletedCycle();
                        mostRecentQuery.put(DocumentPojo._id_,
                                new BasicDBObject(DbManager.lte_, new ObjectId(d)));
                    }
                } //TESTED
                BasicDBObject mostRecentSort = new BasicDBObject(DocumentPojo._id_, -1);
                BasicDBObject mostRecentFields = new BasicDBObject(DocumentPojo.modified_, 1);
                if (null != source.getDistributionFactor()) { // (need the created date also)
                    mostRecentFields.put(DocumentPojo.created_, 1);
                }
                DBCursor mostRecentDocs = MongoDbManager.getDocument().getMetadata()
                        .find(mostRecentQuery, mostRecentFields).sort(mostRecentSort).limit(1);
                if (mostRecentDocs.hasNext()) {
                    BasicDBObject mostRecentDocDbo = (BasicDBObject) mostRecentDocs.next();
                    _mostRecentlyModifiedFile = (Date) mostRecentDocDbo.get(DocumentPojo.modified_);
                    _mostRecentlyModifiedDocId = (ObjectId) mostRecentDocDbo.get(DocumentPojo._id_);
                } //TESTED (found docs)
                //DEBUG
                //if (null != _mostRecentlyModifiedDocId)
                //    System.out.println("DEDUP: " + mostRecentQuery + ": RESULTS IN " + new Date(_mostRecentlyModifiedDocId.getTime()));
            } //(success mode)
        } catch (Exception e) {
            // If anything goes wrong will just check all files (slower)
        }
    } //TESTED

    if (null != _mostRecentlyModifiedFile) { // Use shortcut...
        long nMostRecentlyModifiedTime = _mostRecentlyModifiedFile.getTime() / 1000L;
        long nFileTime = modifiedDate.getTime() / 1000L;
        if (nFileTime <= nMostRecentlyModifiedTime) {
            return false;
        }
    } //TESTED

    if (null == url) { // use this call with url==null to just check the modified file...
        return true;
    }

    // No shortcut, go the long way round:
    DBCollection collection = DbManager.getDocument().getMetadata();
    boolean ret = true;
    BasicDBObject query = new BasicDBObject();
    query.put(DocumentPojo.url_, url);
    query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm());
    BasicDBObject fields = new BasicDBObject(DocumentPojo.modified_, 1);

    DBCursor dbc = collection.find(query, fields).limit(2); // (will normally return 0 or 1)
    boolean foundMatch = dbc.hasNext();
    if (!foundMatch) { // if there is no record, return true
        ret = true;
    } else {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        Date oldModified = (Date) dbo.get(DocumentPojo.modified_);
        if ((modifiedDate.getTime() / 1000) != (oldModified.getTime() / 1000)) { // times don't match
            if (!dbc.hasNext()) { // 1 matching doc, different modified times so update
                ret = true;
            } //TESTED
            else {
                // Not sure about this case, multiple docs, are any of them the same? (Shouldn't ever occur)
                // (slightly slow but should be OK because not going to happen very often)
                int nCount = dbc.count();
                query.put(DocumentPojo.modified_, modifiedDate);
                ret = !(collection.find(query).limit(1).count() == nCount);
            } //TOTEST (shouldn't ever occur)
        } else { // Doc has same modified time so don't update
            ret = false;
        } //TESTED
    }
    return ret;
}
From source file:com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java
License:Open Source License
private LinkedList<String> getCandidateDuplicates(BasicDBObject query, String parentSourceKey, boolean bUpdate) {
    _modifiedTimeOfActualDuplicate = null;
    _duplicateId = null;
    LinkedList<String> returnVal = new LinkedList<String>();

    DBCollection collection = DbManager.getDocument().getMetadata();
    BasicDBObject fields = new BasicDBObject(DocumentPojo.sourceKey_, 1);
    if (bUpdate) {
        fields.put(DocumentPojo.modified_, 1);
        fields.put(DocumentPojo.updateId_, 1);
    } //TESTED

    boolean bPrimary = true;
    if (_replicaSetDistributionRatio > 0) {
        // (distribute based on source key, should ensure some reasonable cache grouping...)
        if (0 != (parentSourceKey.hashCode() % _replicaSetDistributionRatio)) {
            bPrimary = false;
        }
    }
    DBCursor dbc = null;
    if (bPrimary) {
        dbc = collection.find(query, fields);
    } else {
        dbc = collection.find(query, fields).setReadPreference(ReadPreference.secondaryPreferred());
    }
    while (dbc.hasNext()) {
        DBObject dbo = dbc.next();
        String sourceKey = DocumentPojo.getSourceKey((String) dbo.get(DocumentPojo.sourceKey_));
        if (null != sourceKey) {
            // Check for exact duplicates, in which case can bypass horrible functional duplicate logic:
            boolean bFoundExactDuplicate = sourceKey.equals(parentSourceKey);

            // Update logic:
            if (bUpdate && bFoundExactDuplicate) {
                _modifiedTimeOfActualDuplicate = (Date) dbo.get(DocumentPojo.modified_);
                _duplicateId = (ObjectId) dbo.get(DocumentPojo.updateId_);
                if (null == _duplicateId) { // first time, use the _id
                    _duplicateId = (ObjectId) dbo.get(DocumentPojo._id_);
                }
            } //TESTED

            if (bFoundExactDuplicate) { // Found exact duplicate, so return just that for performance
                returnVal.clear();
            }
            returnVal.add(sourceKey);

            if (bFoundExactDuplicate) { // Found exact duplicate, we're done here
                return returnVal;
            }
        } //(if doc has source key, else is malformed, ignore)
    } //(end loop over URL-duplicates)
    return returnVal;
}
From source file:com.ikanow.infinit.e.processing.custom.utils.CustomApiUtils.java
License:Apache License
public static void getJobResults(ResponsePojo rp, CustomMapReduceJobPojo cmr, int limit, String fields,
        String findStr, String sortStr, boolean bCsv) {

    BasicDBObject queryDbo = null;
    if (null != findStr) {
        queryDbo = (BasicDBObject) com.mongodb.util.JSON.parse(findStr);
    } else {
        queryDbo = new BasicDBObject();
    } //TOTEST

    BasicDBObject fieldsDbo = new BasicDBObject();
    if (null != fields) {
        fieldsDbo = (BasicDBObject) com.mongodb.util.JSON.parse("{" + fields + "}");
    }

    // Return the results:
    // Need to handle sorting...
    BasicDBObject sort = null;
    if (null != sortStr) { // override
        sort = (BasicDBObject) com.mongodb.util.JSON.parse(sortStr);
    } else { // defaults
        String sortField = "_id";
        int sortDir = 1;
        BasicDBObject postProcObject = (BasicDBObject) com.mongodb.util.JSON.parse(
                InfiniteHadoopUtils.getQueryOrProcessing(cmr.query, InfiniteHadoopUtils.QuerySpec.POSTPROC));
        if (postProcObject != null) {
            sortField = postProcObject.getString("sortField", "_id");
            sortDir = postProcObject.getInt("sortDirection", 1);
        } //TESTED (post proc and no post proc)
        sort = new BasicDBObject(sortField, sortDir);
    } //TOTEST

    // Case 1: DB
    rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", true,
            "Map reduce job completed at: " + cmr.lastCompletionTime));
    if ((null == cmr.exportToHdfs) || !cmr.exportToHdfs) {
        DBCursor resultCursor = null;
        DBCollection coll = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollection);
        DBDecoderFactory defaultDecoder = coll.getDBDecoderFactory();
        CsvGeneratingBsonDecoder csvDecoder = null;
        SizeReportingBasicBSONDecoder sizeDecoder = null;
        CustomMapReduceResultPojo cmrr = new CustomMapReduceResultPojo();
        try {
            if (bCsv) {
                coll.setDBDecoderFactory((csvDecoder = new CsvGeneratingBsonDecoder()));
            } else {
                coll.setDBDecoderFactory((sizeDecoder = new SizeReportingBasicBSONDecoder()));
            }
            if (limit > 0) {
                resultCursor = coll.find(queryDbo, fieldsDbo).sort(sort).limit(limit);
            } else {
                resultCursor = coll.find(queryDbo, fieldsDbo).sort(sort);
            }
            LinkedList<BasicDBObject> list = null;
            if (!bCsv) {
                list = new LinkedList<BasicDBObject>();
            }
            final int MAX_SIZE_CSV = 80 * 1024 * 1024;  // (80MB)
            final int MAX_SIZE_JSON = 80 * 1024 * 1024; // (80MB)
            while (resultCursor.hasNext()) {
                BasicDBObject x = (BasicDBObject) resultCursor.next();
                if (!bCsv) {
                    list.add(x);
                }
                if (null != csvDecoder) {
                    if (csvDecoder.getCsv().length() > MAX_SIZE_CSV) {
                        break;
                    }
                } else if (null != sizeDecoder) {
                    if (sizeDecoder.getSize() > MAX_SIZE_JSON) {
                        break;
                    }
                }
            }
            cmrr.results = list;
        } finally {
            coll.setDBDecoderFactory(defaultDecoder);
        }
        cmrr.lastCompletionTime = cmr.lastCompletionTime;
        if (null != csvDecoder) {
            StringBuffer header = new StringBuffer();
            for (String field : csvDecoder.getOrderedFields()) {
                if (0 != header.length()) {
                    header.append(',');
                }
                header.append('"');
                header.append(field.replace("\"", "\\\""));
                header.append("\"");
            }
            header.append('\n');
            header.append(csvDecoder.getCsv().toString());
            cmrr.results = header.toString();
        }
        rp.setData(cmrr);
    } //TESTED
    else { // Case 2: HDFS
        if ((null != cmr.outputKey) && (null != cmr.outputValue)
                && cmr.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text")
                && cmr.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) {
            // special case, text file
            try {
                rp.setData(HadoopUtils.getBsonFromTextFiles(cmr, limit, fields),
                        (BasePojoApiMap<BasicDBList>) null);
            } catch (Exception e) {
                rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", false,
                        "Files don't appear to be in text file format, did you run the job before changing the output to Text/Text?"));
            }
        } //TESTED
        else { // sequence file
            try {
                rp.setData(HadoopUtils.getBsonFromSequenceFile(cmr, limit, fields),
                        (BasePojoApiMap<BasicDBList>) null);
            } catch (Exception e) {
                rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", false,
                        "Files don't appear to be in sequence file format, did you run the job with Text/Text?"));
            }
        } //TESTED
    } //TESTED
}
From source file:com.ikanow.infinit.e.processing.generic.synchronization.SynchronizationManager.java
License:Open Source License
/**
 * Does the DB sync, pulls all mongo docs that occurred from the
 * cleanseStartTime and source and makes sure they are in the search db.
 *
 * @param lastCleanse 1 hour before this harvester started
 * @param sources list of sources we are syncing
 * @return The number of errors fixed (docs deleted)
 */
// DON'T USE THIS UNTIL REWRITTEN - IT SHOULD TRANSFER DOCS ACROSS, NOT LEAVE THEM ALONE
@Deprecated
public int syncDB(long cleanseStartTime, Set<String> dbCache) {
    dbCache.clear();
    int fixcount = 0;
    DBCollection contentDb = DbManager.getDocument().getContent();
    DBCollection documentDb = DbManager.getDocument().getMetadata();
    StoreAndIndexManager storeManager = new StoreAndIndexManager();

    for (SourcePojo sp : sources) {
        // Don't combine the sources (apart from unusual multi-community case), because
        // that prevents you from using the compound sourceKey/_id index
        List<String> sourceKeyList = new ArrayList<String>();
        sourceKeyList.addAll(sp.getDistributedKeys());
        try {
            List<DocumentPojo> docs_to_remove = new ArrayList<DocumentPojo>();

            // FIRST DO ALL NEW FEEDS
            BasicDBObject query = new BasicDBObject();
            query.put(DocumentPojo._id_, new BasicDBObject(MongoDbManager.gt_,
                    new ObjectId((int) (cleanseStartTime / 1000), 0, 0))); // time aspect
            query.put(DocumentPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, sourceKeyList)); // source aspect
            BasicDBObject queryFields = new BasicDBObject();
            queryFields.append(DocumentPojo.url_, 1);
            queryFields.append(DocumentPojo.index_, 1);
            queryFields.append(DocumentPojo.sourceKey_, 1);

            DBCursor cur = documentDb.find(query, queryFields).batchSize(100);
            ElasticSearchManager esm = null;
            ElasticSearchManager esm_base = ElasticSearchManager.getIndex("document_index");
            String sIndex = null;

            while (cur.hasNext()) {
                if (bKillMeNow) {
                    return fixcount;
                }
                DocumentPojo doc = DocumentPojo.fromDb(cur.next(), DocumentPojo.class);
                if (null != doc.getId()) {
                    dbCache.add(doc.getId().toString());
                }

                // Get index of doc to check in:
                String sNewIndex = doc.getIndex();
                if (null == sNewIndex) {
                    sIndex = null;
                    esm = esm_base;
                } else if ((null == sIndex) || (!sNewIndex.equals(sIndex))) {
                    sIndex = sNewIndex;
                    if (sNewIndex.equals("document_index")) {
                        esm = esm_base;
                    } else {
                        esm = ElasticSearchManager.getIndex(sNewIndex + "/document_index");
                    }
                }

                // Compare mongo doc to search doc
                Map<String, GetField> results = esm.getDocument(doc.getId().toString(), DocumentPojo.url_);
                if (null == results || results.isEmpty()) {
                    // either too many entries (duplicates) or no entry
                    // delete this doc from both
                    logger.info("db sync removing doc: " + doc.getId() + "/" + doc.getSourceKey()
                            + " not found in search (or duplicate)");
                    docs_to_remove.add(doc);
                    documentDb.remove(new BasicDBObject(DocumentPojo._id_, doc.getId()));
                    BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
                    contentQ.put(CompressedFullTextPojo.sourceKey_,
                            new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
                    contentDb.remove(contentQ);
                    fixcount++;
                }
            } // end loop over new docs for this source
            storeManager.removeFromSearch(docs_to_remove);

            // NOW VERIFY ALL OLD FEEDS
            int iteration = 1;
            boolean removedAll = true;
            docs_to_remove.clear();
            while (removedAll) {
                int rows = iteration * iteration * 10; // 10x^2 exponentially check more docs
                int oldfixes = 0;
                BasicDBObject queryOLD = new BasicDBObject();
                queryOLD.put(DocumentPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, sourceKeyList)); // source aspect
                BasicDBObject sortOLD = new BasicDBObject(DocumentPojo._id_, 1);

                DBCursor curOLD = documentDb.find(queryOLD, queryFields).sort(sortOLD).limit(rows);
                while (curOLD.hasNext()) {
                    DocumentPojo doc = DocumentPojo.fromDb(curOLD.next(), DocumentPojo.class);
                    if (null != doc.getId()) {
                        dbCache.add(doc.getId().toString());
                    }

                    // Get index of doc to check in:
                    String sNewIndex = doc.getIndex();
                    if (null == sNewIndex) {
                        sIndex = null;
                        esm = esm_base;
                    } else if ((null == sIndex) || (!sNewIndex.equals(sIndex))) {
                        sIndex = sNewIndex;
                        if (sNewIndex.equals("document_index")) {
                            esm = esm_base;
                        } else {
                            esm = ElasticSearchManager.getIndex(sNewIndex + "/document_index");
                        }
                    }

                    // Compare mongo doc to search doc
                    Map<String, GetField> results = esm.getDocument(doc.getId().toString(), DocumentPojo.url_);
                    if (null == results || results.isEmpty()) {
                        // either too many entries (duplicates) or no entry
                        // delete this doc from both
                        logger.info("db sync removing doc: " + doc.getId() + "/" + doc.getSourceKey()
                                + " not found in search (or duplicate)");
                        docs_to_remove.add(doc);
                        documentDb.remove(new BasicDBObject(DocumentPojo._id_, doc.getId()));
                        contentDb.remove(new BasicDBObject(DocumentPojo.url_, doc.getUrl()));
                        fixcount++;
                        oldfixes++;
                    }
                }
                if (oldfixes != rows)
                    removedAll = false;
            } // (end loop over old docs for this source)
            storeManager.removeFromSearch(docs_to_remove);
        } catch (Exception e) {
            // If an exception occurs log the error
            logger.error("Exception Message: " + e.getMessage(), e);
        }
    }
    return fixcount;
}
From source file:com.ikanow.infinit.e.utility.MongoDocumentTxfer.java
License:Apache License
private void doTransfer(BasicDBObject query, int nSkip, int nLimit, boolean bAggregate, BasicDBObject chunk)
        throws IOException {
    PropertiesManager pm = new PropertiesManager();
    int nMaxContentSize_bytes = pm.getMaxContentSize();

    // Initialize the DB:
    DBCollection docsDB = DbManager.getDocument().getMetadata();
    DBCollection contentDB = DbManager.getDocument().getContent();
    DBCollection sourcesDB = DbManager.getIngest().getSource();
    ElasticSearchManager.setDefaultClusterName("infinite-aws");

    // 1. Get the documents from the DB (combining data + metadata and refreshing source meta)
    // (Ignore soft-deleted records:)
    if (null == query) {
        query = new BasicDBObject();
    }
    Object sourceKeyQueryTerm = query.remove(DocumentPojo.sourceKey_);
    if (null != sourceKeyQueryTerm) {
        if (query.toString()
                .contains(new StringBuffer('"').append(DocumentPojo.sourceKey_).append('"').toString())) {
            throw new RuntimeException(
                    "Can't specify sourceKey as part of complex query term: " + query.toString());
        } //TESTED (by hand, "{ \"sourceKey\": \"x\", \"$or\": [ { \"sourceKey\": \"x\" } ] }")

        if (sourceKeyQueryTerm instanceof String) {
            query.put(DocumentPojo.sourceKey_,
                    SourcePojo.getDistributedKeyQueryTerm((String) sourceKeyQueryTerm));
        } //TESTED (by hand, "{\"sourceKey\": \"feeds.arstechnica.com.arstechnica.index.11.2.\" }")
        else if (sourceKeyQueryTerm instanceof DBObject) {
            // find all the _sources_ matching this term, and convert to a big list including distribution
            BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1);
            fields.put(SourcePojo.highestDistributionFactorStored_, 1);
            DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.key_, sourceKeyQueryTerm), fields);
            LinkedList<String> sourceKeys = new LinkedList<String>();
            for (DBObject dbo : dbc) {
                String key = (String) dbo.get(SourcePojo.key_);
                Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
                Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key, distributionFactor);
                sourceKeys.addAll(sourceKeysForSource);
            }
            query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));
        } //TESTED (by hand, "{\"sourceKey\": { \"$gt\": \"dev.ikanow\" } }")
        else {
            throw new RuntimeException("Can't specify sourceKey as part of complex query term");
        } //(actually not possible, just included here for mathematical completeness...)
    } else {
        if (query.toString()
                .contains(new StringBuffer('"').append(DocumentPojo.sourceKey_).append('"').toString())) {
            throw new RuntimeException("Can't specify sourceKey as part of complex query term");
        } //TESTED (by hand, "{ \"$or\": [ { \"sourceKey\": \"x\" } ] }")

        // Optimize communityId into sourceKeys...
        if (null != query.get(DocumentPojo.communityId_)) {
            try {
                ObjectId commId = query.getObjectId(DocumentPojo.communityId_);
                BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1);
                fields.put(SourcePojo.highestDistributionFactorStored_, 1);
                DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.communityIds_, commId), fields);
                LinkedList<String> sourceKeys = new LinkedList<String>();
                int added = 0;
                for (DBObject dbo : dbc) {
                    String key = (String) dbo.get(SourcePojo.key_);
                    Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
                    Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key, distributionFactor);
                    sourceKeys.addAll(sourceKeysForSource);
                    added += sourceKeysForSource.size();
                }
                query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));
                System.out.println("(Optimized simple community query to " + added + " source key(s))");
            } catch (Exception e) {
                //DEBUG
                //e.printStackTrace();
                System.out.println("(Can't optimize complex community query: " + e.getMessage());
            }
        } //TESTED (by hand - including distributed source version)
    }

    // Ignore soft-deleted objects
    Object urlQuery = query.get(DocumentPojo.url_);
    if (null == urlQuery) {
        query.put(DocumentPojo.url_, Pattern.compile("^[^?]")); // (ie nothing starting with ?)
    } //TESTED
    else if (urlQuery instanceof BasicDBObject) {
        ((BasicDBObject) urlQuery).append("$regex", "^[^?]");
    } //TESTED
    //DEBUG
    //System.out.println("COMBINED QUERY= " + query.toString());

    // If aggregating, kick off the background aggregation thread
    if (bAggregate) {
        EntityBackgroundAggregationManager.startThread();
        AssociationBackgroundAggregationManager.startThread();
    }

    //Debug:
    DBCursor dbc = null;
    dbc = docsDB.find(query);
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        }
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
        }
    }
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
        System.out.println(
                "Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...
            return;
        }
    }

    byte[] storageArray = new byte[200000];
    int nSynced = 0;
    LinkedList<DocumentPojo> docsToTransfer = new LinkedList<DocumentPojo>();
    Map<ObjectId, LinkedList<DocumentPojo>> communityList = null;
    ObjectId currCommunityId = null;
    while (dbc.hasNext()) {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);
        String sDocIndex = doc.getIndex();
        if (null == sDocIndex) {
            sDocIndex = "document_index";
        }
        if ((null != _deletedIndex) && !_deletedIndex.contains(sDocIndex)) {
            _deletedIndex.add(sDocIndex);
            rebuildIndex(sDocIndex);
            try { // (Just in case the index requires some time to sort itself out)
                Thread.sleep(1000);
            } catch (InterruptedException e) {
            }
        }

        //Debug:
        //System.out.println("Getting content..." + feed.getTitle() + " / " + feed.getUrl());

        // Get the content:
        if ((0 != nMaxContentSize_bytes)
                && StoreAndIndexManager.docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
            BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
            contentQ.put(CompressedFullTextPojo.sourceKey_,
                    new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
            BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
            fields.put(CompressedFullTextPojo.sourceKey_, 1);

            DBCursor dbcGzip = contentDB.find(contentQ, fields);
            while (dbcGzip.hasNext()) {
                BasicDBObject dboContent = (BasicDBObject) dbcGzip.next();
                if (!dboContent.containsField(CompressedFullTextPojo.sourceKey_)) {
                    // If this has another version then ignore this one...
                    if (dbc.hasNext()) {
                        continue;
                    } //TESTED (by hand)
                }
                byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_));
                ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
                GZIPInputStream gzip = new GZIPInputStream(in);
                int nRead = 0;
                StringBuffer output = new StringBuffer();
                while (nRead >= 0) {
                    nRead = gzip.read(storageArray, 0, 200000);
                    if (nRead > 0) {
                        String s = new String(storageArray, 0, nRead, "UTF-8");
                        output.append(s);
                    }
                }
                doc.setFullText(output.toString());
            }
        }
        // (else document has full text already)

        // Get tags, if necessary:
        // Always overwrite tags - one of the reasons we might choose to migrate
        // Also may need source in order to support source index filtering
        SourcePojo src = _sourceCache.get(doc.getSourceKey());
        if (null == src) {
            //TODO (INF-2265): handle search index settings in pipeline mode... (also didn't seem to work?)
            BasicDBObject srcDbo = (BasicDBObject) sourcesDB
                    .findOne(new BasicDBObject(SourcePojo.key_, doc.getSourceKey()));
            if (null != srcDbo) {
                src = SourcePojo.fromDb(srcDbo, SourcePojo.class);
                if (null != src.getProcessingPipeline()) {
                    try {
                        // Set the index settings
                        HarvestController hc = new HarvestController();
                        HarvestControllerPipeline hcPipe = new HarvestControllerPipeline();
                        hcPipe.extractSource_preProcessingPipeline(src, hc);
                    } catch (Exception e) {
                        //DEBUG
                        e.printStackTrace();
                    }
                } //TESTED (by hand)

                _sourceCache.put(doc.getSourceKey(), src);
            }
        }
        doc.setTempSource(src); // (needed for source index filtering)
        if (null != src) {
            if (null != src.getTags()) {
                Set<String> tagsTidied = new TreeSet<String>();
                for (String s : src.getTags()) {
                    String ss = s.trim().toLowerCase();
                    tagsTidied.add(ss);
                }

                // May also want to write this back to the DB:
                //TODO (INF-2223): Handle append tags or not in the pipeline...
                if ((null == src.getAppendTagsToDocs()) || src.getAppendTagsToDocs()) {
                    if ((null == doc.getTags()) || (doc.getTags().size() < tagsTidied.size())) {
                        BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                                doc.getRawSourceKey()); // (ie including the # if there is one)
                        updateQuery.put(DocumentPojo._id_, doc.getId());
                        docsDB.update(updateQuery, new BasicDBObject(DbManager.addToSet_, new BasicDBObject(
                                DocumentPojo.tags_, new BasicDBObject(DbManager.each_, tagsTidied))));
                    }
                    doc.setTags(tagsTidied); // (just copy ptr across)
                }
            }
        }

        // 2. Update the index with the new document
        // (Optionally also update entity and assoc features)
        if (bAggregate) {
            if (null == currCommunityId) {
                currCommunityId = doc.getCommunityId();
            } else if (!currCommunityId.equals(doc.getCommunityId())) {
                LinkedList<DocumentPojo> perCommunityDocList = null;
                if (null == communityList) { // (very first time we see > 1 community)
                    communityList = new TreeMap<ObjectId, LinkedList<DocumentPojo>>();
                    perCommunityDocList = new LinkedList<DocumentPojo>();
                    perCommunityDocList.addAll(docsToTransfer);
                    //(NOT including doc, this hasn't been added to docsToTransfer yet)
                    communityList.put(currCommunityId, perCommunityDocList);
                }
                currCommunityId = doc.getCommunityId();
                perCommunityDocList = communityList.get(currCommunityId);
                if (null == perCommunityDocList) {
                    perCommunityDocList = new LinkedList<DocumentPojo>();
                    communityList.put(currCommunityId, perCommunityDocList);
                }
                perCommunityDocList.add(doc);
            }
        } //TESTED

        nSynced++;
        docsToTransfer.add(doc);
        if (0 == (nSynced % 10000)) {
            StoreAndIndexManager manager = new StoreAndIndexManager();
            if (bAggregate) {
                // Loop over communities and aggregate each one then store the modified entities/assocs
                doAggregation(communityList, docsToTransfer);
                communityList = null; // (in case the next 10,000 docs are all in the same community!)
                currCommunityId = null;
            } //TOTEST

            manager.addToSearch(docsToTransfer);
            docsToTransfer.clear();
            System.out.println("(Synced " + nSynced + " records)");
        }
    } // (End loop over docs)

    // Sync remaining docs
    if (!docsToTransfer.isEmpty()) {
        if (bAggregate) {
            // Loop over communities and aggregate each one then store the modified entities/assocs
            doAggregation(communityList, docsToTransfer);
        }
        StoreAndIndexManager manager = new StoreAndIndexManager();
        manager.addToSearch(docsToTransfer);
    }
    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");
    }
    if (bAggregate) {
        System.out.println("Completed. You can hit CTRL+C at any time.");
        System.out.println(
                "By default it will keep running for 5 minutes while the background aggregation runs to update the documents' entities.");
        try {
            Thread.sleep(300000);
        } catch (InterruptedException e) {
        }

        // Turn off so we can exit
        EntityBackgroundAggregationManager.stopThreadAndWait();
        AssociationBackgroundAggregationManager.stopThreadAndWait();
    }
}
From source file:com.ikanow.infinit.e.utility.MongoEntityFeatureTxfer.java
License:Apache License
static void doDelete(BasicDBObject query, int nLimit, boolean automatedRequest) {
    try {
        // Initialize the DB:
        DBCollection entityFeatureDB = DbManager.getFeature().getEntity();
        ElasticSearchManager elasticManager = ElasticSearchManager.getIndex("entity_index");

        BasicDBObject fields = new BasicDBObject();
        fields.put(EntityFeaturePojo.index_, 1);
        fields.put(EntityFeaturePojo.communityId_, 1);

        DBCursor cur = entityFeatureDB.find(query, fields).limit(nLimit);
        // (this internally works in batches of 1000)
        if (automatedRequest) {
            System.out.println("Found " + cur.count() + " records to delete from _id list");
        } else {
            System.out.println("Found " + cur.count() + " records to delete from " + query.toString());
        }
        if (nLimit > 0) {
            System.out.println("(limited to " + nLimit + " records)");
        }
        int nArraySize = (cur.count() > 1000) ? 1000 : cur.count();
        ArrayList<EntityFeaturePojo> batchList = new ArrayList<EntityFeaturePojo>(nArraySize);

        while (cur.hasNext()) {
            EntityFeaturePojo gp = EntityFeaturePojo.fromDb(cur.next(), EntityFeaturePojo.class);
            batchList.add(gp);
            if (batchList.size() >= nArraySize) {
                internalDelete(batchList, elasticManager);
                batchList.clear();
            }
        }
        if (!batchList.isEmpty()) {
            internalDelete(batchList, elasticManager);
        }
        entityFeatureDB.remove(query);
    } catch (NumberFormatException e) {
        e.printStackTrace();
    } catch (MongoException e) {
        e.printStackTrace();
    } finally {
    }
}
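The pattern above - stream a projected cursor, buffer into fixed-size batches, flush each batch to a secondary store, then remove the matching documents from Mongo - reduces to roughly this sketch (the batch handler and all names are placeholders, not from the source):

    import com.mongodb.BasicDBObject;
    import com.mongodb.DBCollection;
    import com.mongodb.DBCursor;
    import com.mongodb.DBObject;
    import java.util.ArrayList;
    import java.util.List;

    // Sketch: cursor-driven batched deletion with a minimal projection
    static void deleteInBatches(DBCollection coll, DBObject query, int batchSize) {
        DBObject keysOnly = new BasicDBObject("_id", 1);   // project only what the handler needs
        DBCursor cur = coll.find(query, keysOnly);

        List<DBObject> batch = new ArrayList<DBObject>(batchSize);
        while (cur.hasNext()) {
            batch.add(cur.next());
            if (batch.size() >= batchSize) {
                handleBatch(batch);                         // e.g. mirror the delete into a search index
                batch.clear();
            }
        }
        if (!batch.isEmpty()) {
            handleBatch(batch);
        }
        coll.remove(query);                                 // finally remove the documents from Mongo
    }

    static void handleBatch(List<DBObject> batch) {
        // placeholder for whatever per-batch work the caller needs
    }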
From source file:com.impetus.client.mongodb.DefaultMongoDBDataHandler.java
License:Apache License
/**
 * Retrieves a collection of embedded objects within a document that match the
 * criteria specified in <code>query</code>. TODO: This code requires a
 * serious overhaul. Currently it assumes that the user query is in the form
 * "Select alias.columnName from EntityName alias". However, the correct query
 * to be supported is
 * "Select alias.superColumnName.columnName from EntityName alias"
 *
 * @param dbCollection
 *            the db collection
 * @param m
 *            the m
 * @param documentName
 *            the document name
 * @param mongoQuery
 *            the mongo query
 * @param result
 *            the result
 * @param orderBy
 *            the order by
 * @param maxResult
 * @return the embedded object list
 * @throws PropertyAccessException
 *             the property access exception
 * @throws IllegalAccessException
 * @throws InstantiationException
 */
public List getEmbeddedObjectList(DBCollection dbCollection, EntityMetadata m, String documentName,
        BasicDBObject mongoQuery, String result, BasicDBObject orderBy, int maxResult, int firstResult,
        BasicDBObject keys, final KunderaMetadata kunderaMetadata)
        throws PropertyAccessException, InstantiationException, IllegalAccessException {
    List list = new ArrayList(); // List of embedded objects to be returned

    // Specified after entity alias in query
    String columnName = result;

    // Something the user didn't specify and we have to derive
    // TODO: User must specify this in query and remove this logic once query format is changed
    String enclosingDocumentName = null;

    MetamodelImpl metaModel = (MetamodelImpl) kunderaMetadata.getApplicationMetadata()
            .getMetamodel(m.getPersistenceUnit());

    EntityType entityType = metaModel.entity(m.getEntityClazz());
    EmbeddableType superColumn = null;
    Set<Attribute> columns = null;
    Attribute attrib = null;
    try {
        attrib = entityType.getAttribute(columnName);
        Map<String, EmbeddableType> embeddables = metaModel.getEmbeddables(m.getEntityClazz());
        for (String key : embeddables.keySet()) {
            superColumn = embeddables.get(key);
            columns = superColumn.getAttributes();

            for (Attribute column : columns) {
                if (((AbstractAttribute) column).getJPAColumnName().equals(columnName)) {
                    enclosingDocumentName = key;
                    break;
                }
            }
        }
    } catch (IllegalArgumentException iax) {
        if (log.isWarnEnabled()) {
            log.warn("No column found for: " + columnName);
        }
    }

    // Query for fetching entities based on user specified criteria
    DBCursor cursor = orderBy != null ? dbCollection.find(mongoQuery, keys).sort(orderBy)
            : dbCollection.find(mongoQuery, keys).limit(maxResult).skip(firstResult);

    if (superColumn != null) {
        Field superColumnField = (Field) attrib.getJavaMember();
        while (cursor.hasNext()) {
            DBObject fetchedDocument = cursor.next();

            Object embeddedDocumentObject = fetchedDocument.get(superColumnField.getName());

            if (embeddedDocumentObject != null) {
                if (embeddedDocumentObject instanceof BasicDBList) {
                    Class embeddedObjectClass = PropertyAccessorHelper.getGenericClass(superColumnField);
                    for (Object dbObj : (BasicDBList) embeddedDocumentObject) {
                        Object obj = embeddedObjectClass.newInstance();
                        Object embeddedObject = new DocumentObjectMapper().getObjectFromDocument(metaModel,
                                (BasicDBObject) dbObj, superColumn.getAttributes(), obj);
                        Object fieldValue = PropertyAccessorHelper.getObject(embeddedObject, columnName);
                    }
                } else if (embeddedDocumentObject instanceof BasicDBObject) {
                    Object obj = superColumn.getJavaType().newInstance();
                    Object embeddedObject = DocumentObjectMapper.getObjectFromDocument(metaModel,
                            (BasicDBObject) embeddedDocumentObject, superColumn.getAttributes(), obj);
                    list.add(embeddedObject);
                } else {
                    throw new PersistenceException("Can't retrieve embedded object from MONGODB document coz "
                            + "it wasn't stored as BasicDBObject, possible problem in format.");
                }
            }
        }
    }
    return list;
}