List of usage examples for com.mongodb BasicDBObject getLong
public long getLong(final String key, final long def)
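The method looks up key in the object and coerces the stored value (any java.lang.Number) to a long; if the key is absent it returns the supplied default instead of throwing. A minimal standalone sketch of that behavior (class and field names here are illustrative, not taken from the source files below):

import com.mongodb.BasicDBObject;

public class GetLongDefaultDemo {
    public static void main(String[] args) {
        BasicDBObject dbo = new BasicDBObject("doccount", 42);
        // Present key: the stored Integer is widened to a long (42)
        long count = dbo.getLong("doccount", 0L);
        // Absent key: the default is returned rather than an exception (-1)
        long missing = dbo.getLong("no_such_key", -1L);
        System.out.println(count + " " + missing);
    }
}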
From source file: com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License: Apache License
/**
 * Checks if the new params MAX_SPLITS and MAX_DOCS_PER_SPLIT are set
 * in the config. If they are it will use those to do splits via limit/skip,
 * otherwise it will call the previous chunking splitter in MongoSplitter.
 *
 * @param conf
 * @return
 */
public static List<InputSplit> calculateSplits(InfiniteMongoConfig conf) {
    // First off: What is our sharding scheme?
    boolean shardingPolicyNew = false;
    try {
        BasicDBObject shardQuery = new BasicDBObject("_id", "doc_metadata.metadata");
        BasicDBObject shardInfo = (BasicDBObject) DbManager.getCollection("config", "collections")
                .findOne(shardQuery);
        if (null != shardInfo) {
            BasicDBObject shardInfoKey = (BasicDBObject) shardInfo.get("key");
            if (null != shardInfoKey) {
                shardingPolicyNew = (shardInfoKey.size() > 1);
            }
        }
    } //TESTED (new and old)
    catch (Exception e) {
    } // stick with the old sharding, it's probably going to die soon after though, honestly

    // conf.getQuery returns a new copy of the query, so get once and use everywhere...
    BasicDBObject confQuery = (BasicDBObject) conf.getQuery();
    BasicDBObject srcTagsQuery = (BasicDBObject) conf.getSourceTags();

    String collection = conf.getInputURI().getCollection();
    if (!collection.equals(DbManager.getDocument().getContent().getName())
            && !collection.equals(DbManager.getDocument().getMetadata().getName())) {
        // Case 1: feature table or custom table
        // Just run legacy code
        return calculateSplits_phase2(conf, confQuery, false, false, null);
    } else { // complex cases...
        boolean simpleOtherIndex = false;
        // Check whether a simple query has been performed on a different indexed field
        if (null == srcTagsQuery) { // (if srcTags specified, then going to want to use sourceKey as the index)
            for (String s : Arrays.asList(EntityPojo.docQuery_index_, DocumentPojo.url_)) {
                Object selector = confQuery.get(s);
                if (selector instanceof String) {
                    simpleOtherIndex = true;
                    break;
                } else if (selector instanceof DBObject) {
                    DBObject selectorDbo = (DBObject) selector;
                    if (selectorDbo.containsField(DbManager.in_)) {
                        simpleOtherIndex = true;
                        break;
                    }
                }
            } //TESTED (both types, plus check complex indexes don't work)
            // ALLOWED: {"entities.index": { "$in": [ "xxx", "yyy"] }, {"entities.index": "xxx" }, ditto for "url"
            // NOT ALLOWED: { "entities.index": { "$ne": "xxx" } }
        } //TESTED check ignored if eg entity_index specified

        if (simpleOtherIndex) {
            // Case 2: we have a simple query on an indexed field
            // Just run legacy code
            return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
        } //TESTED
        else if (conf.getLimit() > 0) { // debug
            // Case 3: Ensure we have small sets of sources to search over
            BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery, srcTagsQuery,
                    conf.getMaxDocsPerSplit());

            final List<InputSplit> splits = new ArrayList<InputSplit>();

            boolean queryNonTrivial = isQueryNonTrivial(confQuery);
            if (!queryNonTrivial) {
                // Case 3a: query is trivial, so can just create splits directly from the split pre-calcs
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();

                    int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                    int toGet = (docCount > toProcess) ? toProcess : docCount;
                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), modQuery,
                                conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                        toProcess -= docCount;
                    }
                } //TESTED
            } else {
                // Case 3b: annoying, some extra query terms, gonna need to do it the hard way...
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                DBCollection coll = InfiniteMongoConfigUtil.getCollection(conf.getInputURI());
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();
                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        int docsCounted = (int) coll.getCount(modQuery, null, toProcess, 0);
                        int toGet = (docsCounted > toProcess) ? toProcess : docsCounted;
                        if (docsCounted > 0) {
                            splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), modQuery,
                                    conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                            toProcess -= docsCounted;
                        }
                    } //TESTED
                }
            } //TESTED
            return splits;
        } else { // More complex cases:
            if (shardingPolicyNew) {
                // Case 4a: NEW SHARDING SCHEME
                // Always fetch the new sources, eg convert communityId to sourceKeys
                try {
                    splitPrecalculations_newShardScheme(confQuery, srcTagsQuery); // (modifies confQuery if returns true)
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);

                    return calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null);
                    // (ie trivial query => always use chunks, bypass skip/limit test)
                } //TESTED (trivial + non-trivial)
                catch (Exception e) {
                    // Didn't match any sources, no problem
                    return new ArrayList<InputSplit>();
                } //TESTED
            } //TESTED
            else {
                BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery, srcTagsQuery,
                        conf.getMaxDocsPerSplit());

                if (null == collectionOfSplits) {
                    // Case 4b: OLD SHARDING SCHEME can't get a partition by source keys, just back off to old code
                    return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
                } //TESTED (old code)
                else {
                    conf.setMaxDocsPerSplit(2 * conf.getMaxDocsPerSplit());
                    // (because we stop creating splits when they exceed the size)

                    // Case 4c: OLD SHARDING SCHEME, have a source key partition
                    int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits();
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);
                    final List<InputSplit> splits = new ArrayList<InputSplit>();

                    BasicDBObject savedQuery = confQuery;

                    Iterator<Object> itSplit = collectionOfSplits.iterator();
                    BasicDBList bigSplit = null;
                    while (itSplit.hasNext()) {
                        BasicDBObject split = (BasicDBObject) itSplit.next();
                        int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                        if (docCount < nMaxCount) { // small split, will use skip/limit
                            BasicDBObject modQuery = convertQuery(savedQuery, split.get(DocumentPojo.sourceKey_));
                            if (null != modQuery) {
                                final int SPLIT_THRESHOLD = 3;
                                // A few cases:
                                if ((docCount < (SPLIT_THRESHOLD * conf.getMaxDocsPerSplit())) || !queryNonTrivial) {
                                    splits.addAll(calculateSplits_phase2(conf, modQuery, false, shardingPolicyNew,
                                            (Integer) docCount));
                                } //TESTED (based on limit, based on query)
                                else {
                                    // My guess at the point at which you might as well do the full query in the hope
                                    // you're going to save some (empty) splits
                                    splits.addAll(calculateSplits_phase2(conf, modQuery, false, shardingPolicyNew, null));
                                } //TESTED
                            } //TESTED
                        } else { // large split, combine all these guys into an array of source keys
                            if (null == bigSplit) {
                                bigSplit = new BasicDBList();
                            }
                            bigSplit.add(split.get(DocumentPojo.sourceKey_)); // (guaranteed to be a single element)
                        }
                    } //(end loop over collections)

                    if (null != bigSplit) {
                        // If we have a big left over community then create a set of splits for that - always chunks if query trivial
                        if (1 == bigSplit.size()) {
                            confQuery.put(DocumentPojo.sourceKey_, bigSplit.iterator().next());
                        } else {
                            confQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, bigSplit));
                        }
                        splits.addAll(calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null));
                    } //TESTED: singleton+trivial (sandy), array+trivial (sentiment/enron), array+non-trivial (sentiment/enron, docGeo), singleton+non-trivial (sandy, docGeo)

                    return splits;
                } //TESTED: end if Cases 4a, 4b, 4c
            } //(end if old vs new sharding policy)
        } //(non-debug case)
    } //(content or metadata table are most complex)
}
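Both getLong calls above read a precomputed per-source document count out of a split descriptor via split.getLong(SourceHarvestStatusPojo.doccount_, 0L), so a missing count degrades safely to zero documents. A simplified sketch of that budgeting pattern in isolation (the literal "doccount" field name stands in for the pojo constant, and the budget arithmetic is condensed from the original):

import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;

public class SplitBudgetSketch {
    // Walk split descriptors and spend a document "budget", as calculateSplits does
    public static int countBudgeted(BasicDBList collectionOfSplits, int limit) {
        int toProcess = limit;
        for (Object o : collectionOfSplits) {
            BasicDBObject split = (BasicDBObject) o;
            // A split with no "doccount" field falls back to 0L and is skipped harmlessly
            int docCount = (int) split.getLong("doccount", 0L);
            int toGet = (docCount > toProcess) ? toProcess : docCount;
            toProcess -= toGet;
            if (toProcess <= 0) {
                break;
            }
        }
        return limit - toProcess; // documents actually allocated to splits
    }
}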
From source file: com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License: Apache License
@SuppressWarnings("unchecked")
public static BasicDBList splitPrecalculations_oldShardSchemeOrDebug(BasicDBObject query,
        BasicDBObject srcTagsQuery, int maxCountPerTask) {
    // Get the communityIds from the query
    Collection<ObjectId> communityIds = null;
    try {
        BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_);
        communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
        if (null == communityIds) {
            return null;
        }
    } catch (Exception e) {
        return null; // back out
    }

    BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_,
            new BasicDBObject(DbManager.in_, communityIds));
    BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
    keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
    BasicDBObject sortFields = new BasicDBObject(SourcePojo.key_, 1);

    // Get and remove the sourceKey information, incorporate into source query:
    Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);
    if (null != sourceKeyQueryTerm) {
        keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
    } //TESTED
    if (null != srcTagsQuery) {
        keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
    } //TESTED

    DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(sortFields);
    // (note the sort is needed so that the potentially expensive doc query has a sensibly ordered $in clause)
    if (dbc.count() > 5000) {
        // (too many source keys to process, just going to leave well alone... note this means $srctags will fail open)
        return null;
    } else {
        //TreeMap<String, Long> sourceKeys = new TreeMap<String, Long>();
        // Build collections of objects of format { sourceKey: string or [], totalDocs }
        BasicDBList sourceKeyListCollection = new BasicDBList();
        BasicDBList sourceKeyList = null;
        int runningDocs = 0;
        int runningSources = 0;
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            String sourceKey = (String) dbo.get(SourcePojo.key_);
            if (null != sourceKey) {
                long docCount = 0L;
                try {
                    BasicDBObject harvestStatus = (BasicDBObject) dbo.get(SourcePojo.harvest_);
                    if (null != harvestStatus) {
                        docCount = harvestStatus.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                    }
                } catch (Exception e) {
                }

                //DEBUG
                //System.out.println("SOURCE=" + sourceKey + " DOC_COUNT=" + docCount + " RUNNING=" + runningDocs + "," + runningSources + ": " + sourceKeyList);

                if (docCount > maxCountPerTask) { // source is large enough by itself
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKey);
                    collection.put(SourceHarvestStatusPojo.doccount_, docCount);
                    sourceKeyListCollection.add(collection);
                    // (leaving running* alone, can keep building that)
                } //TESTED (by eye, system community of demo cluster)
                else if ((runningDocs + docCount) > maxCountPerTask) { // have now got a large enough collection of sources
                    if (null == sourceKeyList) {
                        sourceKeyList = new BasicDBList();
                    }
                    sourceKeyList.add(sourceKey);
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKeyList);
                    collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
                    sourceKeyListCollection.add(collection);
                    sourceKeyList = null;
                    runningDocs = 0;
                    runningSources = 0;
                } //TESTED (by eye, system community of demo cluster)
                else if (runningSources >= 15) { // have a limit on the number of sources per query, to keep the queries manageable
                    sourceKeyList.add(sourceKey);
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKeyList);
                    collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
                    sourceKeyListCollection.add(collection);
                    sourceKeyList = null;
                    runningDocs = 0;
                    runningSources = 0;
                } //TESTED (by eye, system community of demo cluster)
                else { // (keep) build(ing) list
                    if (null == sourceKeyList) {
                        sourceKeyList = new BasicDBList();
                    }
                    sourceKeyList.add(sourceKey);
                    runningDocs += docCount;
                    runningSources++;
                } //TESTED (by eye, system community of demo cluster)
            } //(end if has source key)
        } //(end loop over cursor)

        // Finish off:
        if (null != sourceKeyList) {
            // Create collection
            BasicDBObject collection = new BasicDBObject();
            collection.put(DocumentPojo.sourceKey_, sourceKeyList);
            collection.put(SourceHarvestStatusPojo.doccount_, runningDocs);
            sourceKeyListCollection.add(collection);
        } //TESTED (by eye, system community of demo cluster)

        if (sourceKeyListCollection.isEmpty()) { // query returns empty
            throw new RuntimeException("Communities contain no sources");
        }
        return sourceKeyListCollection;
    } // (end if too many source keys across the communities)
}
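In this second example the count lives in a nested sub-document: the code fetches the harvest status object first, null-checks it, and only then calls getLong with a 0L default, so both a missing sub-document and a missing field yield zero. That guard pattern in isolation (the literal "harvest" and "doccount" field names stand in for the SourcePojo/SourceHarvestStatusPojo constants):

import com.mongodb.BasicDBObject;

public class NestedGetLongSketch {
    // Safely read source.harvest.doccount, defaulting to 0 at every level
    public static long docCount(BasicDBObject sourceDbo) {
        BasicDBObject harvestStatus = (BasicDBObject) sourceDbo.get("harvest");
        // getLong's default covers a missing field; the null check covers a missing sub-document
        return (null != harvestStatus) ? harvestStatus.getLong("doccount", 0L) : 0L;
    }
}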