List of usage examples for com.mongodb DBCollection getCount
@Deprecated public long getCount(@Nullable final DBObject query, @Nullable final DBObject projection, final long limit, final long skip)
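This overload returns the number of documents matching query, after skipping skip results and counting at most limit (0 means no bound); projection may be null since no field data is returned. It is deprecated in the legacy driver in favor of countDocuments. A minimal standalone sketch, assuming a local mongod and placeholder database/collection names:

import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.MongoClient;

public class GetCountExample {
    public static void main(String[] args) {
        MongoClient client = new MongoClient("localhost", 27017); // placeholder host/port
        try {
            DBCollection coll = client.getDB("test").getCollection("docs"); // placeholder names
            BasicDBObject query = new BasicDBObject("status", "published");
            // Count matching documents, but scan at most 1000 (limit=1000, skip=0):
            long n = coll.getCount(query, null, 1000, 0);
            System.out.println("matched (capped at 1000): " + n);
        } finally {
            client.close();
        }
    }
}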
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License:Apache License
/**
 * Checks if the new params MAX_SPLITS and MAX_DOCS_PER_SPLIT are set in the config.
 * If they are, it will use those to do splits via limit/skip;
 * otherwise it will call the previous chunking splitter in MongoSplitter.
 *
 * @param conf
 * @return
 */
public static List<InputSplit> calculateSplits(InfiniteMongoConfig conf) {
    // First off: What is our sharding scheme?
    boolean shardingPolicyNew = false;
    try {
        BasicDBObject shardQuery = new BasicDBObject("_id", "doc_metadata.metadata");
        BasicDBObject shardInfo = (BasicDBObject) DbManager.getCollection("config", "collections")
                .findOne(shardQuery);
        if (null != shardInfo) {
            BasicDBObject shardInfoKey = (BasicDBObject) shardInfo.get("key");
            if (null != shardInfoKey) {
                shardingPolicyNew = (shardInfoKey.size() > 1);
            }
        }
    } //TESTED (new and old)
    catch (Exception e) {
    } // stick with the old sharding; it's probably going to die soon after though, honestly

    // conf.getQuery returns a new copy of the query, so get once and use everywhere...
    BasicDBObject confQuery = (BasicDBObject) conf.getQuery();
    BasicDBObject srcTagsQuery = (BasicDBObject) conf.getSourceTags();

    String collection = conf.getInputURI().getCollection();
    if (!collection.equals(DbManager.getDocument().getContent().getName())
            && !collection.equals(DbManager.getDocument().getMetadata().getName())) {
        // Case 1: feature table or custom table
        // Just run legacy code
        return calculateSplits_phase2(conf, confQuery, false, false, null);
    } else { // complex cases...
        boolean simpleOtherIndex = false;
        // Check whether a simple query has been performed on a different indexed field
        if (null == srcTagsQuery) { // (if srcTags specified, then going to want to use sourceKey as the index)
            for (String s : Arrays.asList(EntityPojo.docQuery_index_, DocumentPojo.url_)) {
                Object selector = confQuery.get(s);
                if (selector instanceof String) {
                    simpleOtherIndex = true;
                    break;
                } else if (selector instanceof DBObject) {
                    DBObject selectorDbo = (DBObject) selector;
                    if (selectorDbo.containsField(DbManager.in_)) {
                        simpleOtherIndex = true;
                        break;
                    }
                }
            } //TESTED (both types, plus check complex indexes don't work)
            // ALLOWED: {"entities.index": { "$in": [ "xxx", "yyy"] }, {"entities.index": "xxx" }, ditto for "url"
            // NOT ALLOWED: { "entities.index": { "$ne": "xxx" } }
        } //TESTED check ignored if eg entity_index specified

        if (simpleOtherIndex) {
            // Case 2: we have a simple query on an indexed field
            // Just run legacy code
            return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
        } //TESTED
        else if (conf.getLimit() > 0) { // debug
            // Case 3: Ensure we have small sets of sources to search over
            BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery, srcTagsQuery,
                    conf.getMaxDocsPerSplit());
            final List<InputSplit> splits = new ArrayList<InputSplit>();

            boolean queryNonTrivial = isQueryNonTrivial(confQuery);
            if (!queryNonTrivial) {
                // Case 3a: query is trivial, so can just create splits directly from the split pre-calcs
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();

                    int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                    int toGet = (docCount > toProcess) ? toProcess : docCount;
                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), modQuery,
                                conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                        toProcess -= docCount;
                    }
                } //TESTED
            } else {
                // Case 3b: annoying, some extra query terms, gonna need to do it the hard way...
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                DBCollection coll = InfiniteMongoConfigUtil.getCollection(conf.getInputURI());
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();
                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        // Bounded count: never counts past the remaining budget (limit=toProcess, skip=0)
                        int docsCounted = (int) coll.getCount(modQuery, null, toProcess, 0);
                        int toGet = (docsCounted > toProcess) ? toProcess : docsCounted;

                        if (docsCounted > 0) {
                            splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(),
                                    modQuery, conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                            toProcess -= docsCounted;
                        }
                    } //TESTED
                }
            } //TESTED
            return splits;
        } else { // More complex cases:
            if (shardingPolicyNew) {
                // Case 4a: NEW SHARDING SCHEME
                // Always fetch the new sources, eg convert communityId to sourceKeys
                try {
                    splitPrecalculations_newShardScheme(confQuery, srcTagsQuery); // (modifies confQuery if returns true)
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);

                    return calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null);
                    // (ie trivial query => always use chunks, bypass skip/limit test)
                } //TESTED (trivial + non-trivial)
                catch (Exception e) { // Didn't match any sources, no problem
                    return new ArrayList<InputSplit>();
                } //TESTED
            } //TESTED
            else {
                BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery,
                        srcTagsQuery, conf.getMaxDocsPerSplit());

                if (null == collectionOfSplits) {
                    // Case 4b: OLD SHARDING SCHEME, can't get a partition by source keys, just back off to old code
                    return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
                } //TESTED (old code)
                else {
                    conf.setMaxDocsPerSplit(2 * conf.getMaxDocsPerSplit());
                    // (because we stop creating splits when they exceed the size)

                    // Case 4c: OLD SHARDING SCHEME, have a source key partition
                    int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits();
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);
                    final List<InputSplit> splits = new ArrayList<InputSplit>();

                    BasicDBObject savedQuery = confQuery;

                    Iterator<Object> itSplit = collectionOfSplits.iterator();
                    BasicDBList bigSplit = null;
                    while (itSplit.hasNext()) {
                        BasicDBObject split = (BasicDBObject) itSplit.next();
                        int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                        if (docCount < nMaxCount) { // small split, will use skip/limit
                            BasicDBObject modQuery = convertQuery(savedQuery, split.get(DocumentPojo.sourceKey_));
                            if (null != modQuery) {
                                final int SPLIT_THRESHOLD = 3;
                                // A few cases:
                                if ((docCount < (SPLIT_THRESHOLD * conf.getMaxDocsPerSplit()))
                                        || !queryNonTrivial) {
                                    splits.addAll(calculateSplits_phase2(conf, modQuery, false, shardingPolicyNew,
                                            (Integer) docCount));
                                } //TESTED (based on limit, based on query)
                                else {
                                    // My guess at the point at which you might as well do the full query in the
                                    // hope you're going to save some (empty) splits
                                    splits.addAll(
                                            calculateSplits_phase2(conf, modQuery, false, shardingPolicyNew, null));
                                } //TESTED
                            } //TESTED
                        } else { // large split, combine all these guys into an array of source keys
                            if (null == bigSplit) {
                                bigSplit = new BasicDBList();
                            }
                            bigSplit.add(split.get(DocumentPojo.sourceKey_)); // (guaranteed to be a single element)
                        }
                    } //(end loop over collections)

                    if (null != bigSplit) {
                        // If we have a big left-over community then create a set of splits for that - always chunks if query trivial
                        if (1 == bigSplit.size()) {
                            confQuery.put(DocumentPojo.sourceKey_, bigSplit.iterator().next());
                        } else {
                            confQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, bigSplit));
                        }
                        splits.addAll(calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew,
                                null));
                    } //TESTED: singleton+trivial (sandy), array+trivial (sentiment/enron), array+non-trivial (sentiment/enron, docGeo), singleton+non-trivial (sandy, docGeo)

                    return splits;
                } //TESTED: end if Cases 4a, 4b, 4c
            } //(end if old vs new sharding policy)
        } //(non-debug case)
    } //(content or metadata table are most complex)
}
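Both getCount calls in Case 3b above pass the remaining document budget as the limit argument, so the server stops counting once the budget is reached rather than scanning every match. The same pattern, distilled into a standalone helper (the class and method names are illustrative, not from the source):

import com.mongodb.DBCollection;
import com.mongodb.DBObject;

public final class BoundedCount {
    private BoundedCount() {
    }

    /**
     * Counts documents matching query, but never more than budget.
     * Passing budget as the limit lets the server stop early instead
     * of scanning every matching document.
     */
    public static int countUpTo(DBCollection coll, DBObject query, int budget) {
        if (budget <= 0) {
            return 0;
        }
        return (int) coll.getCount(query, null, budget, 0); // projection unused, so null
    }
}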
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License:Apache License
@SuppressWarnings("unchecked") public static List<InputSplit> calculateSplits_phase2(InfiniteMongoConfig conf, BasicDBObject confQuery, boolean alwaysUseChunks, boolean newShardScheme, Integer splitDocCount) { alwaysUseChunks &= (conf.getMaxSplits() != MAX_SPLITS); // (in standalone mode, never use chunks) MongoURI uri = conf.getInputURI();/*from w w w .ja v a 2 s. com*/ DBCollection coll = InfiniteMongoConfigUtil.getCollection(uri); if (conf.getLimit() > 0) { return calculateManualSplits(conf, confQuery, 1, conf.getLimit(), coll); } else { if (!alwaysUseChunks) { int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits(); int count = 0; if (null == splitDocCount) { if (nMaxCount <= 1) { nMaxCount = 0; } else { //DEBUG //System.out.println(coll.find(confQuery).limit(1).explain()); count = (int) coll.getCount(confQuery, null, nMaxCount, 0); if (0 == count) { return new ArrayList<InputSplit>(); } } //TESTED } else { count = splitDocCount; } //if maxdocssplit and maxsplits is set and there are less documents than splits*docspersplit then use the new splitter //otherwise use the old splitter if (conf.getMaxDocsPerSplit() > 0 && conf.getMaxSplits() > 0 && (count < nMaxCount)) { _logger.debug("Calculating splits manually"); int splits_needed = (count / conf.getMaxDocsPerSplit()) + 1; return calculateManualSplits(conf, confQuery, splits_needed, conf.getMaxDocsPerSplit(), coll); } //TESTED } if (newShardScheme && !confQuery.containsField(DocumentPojo.sourceKey_)) { // OK if we're going to do the sharded version then we will want to calculate splitPrecalculations_newShardScheme(confQuery, null); // (modifies confQuery if returns true) } //TESTED: checked did nothing when had sourceKey, added sourceKey when necessary (eg entities.index case) if (!newShardScheme) { // unlike new sharding scheme, in this case the query is fixed, so overwrite now: conf.setQuery(confQuery); } List<InputSplit> splits = MongoSplitter.calculateSplits(conf); // (unless manually set, like above, runs with the _original_ query) int initialSplitSize = splits.size(); // We have the MongoDB-calculated splits, now calculate their intersection vs the query @SuppressWarnings("rawtypes") Map<String, TreeSet<Comparable>> orderedArraySet = new HashMap<String, TreeSet<Comparable>>(); @SuppressWarnings("rawtypes") Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin = new HashMap<String, NavigableSet<Comparable>>(); BasicDBObject originalQuery = confQuery; ArrayList<InputSplit> newsplits = new ArrayList<InputSplit>(splits.size()); Iterator<InputSplit> splitIt = splits.iterator(); while (splitIt.hasNext()) { try { orderedArraySet_afterMin.clear(); MongoInputSplit mongoSplit = (MongoInputSplit) splitIt.next(); BasicDBObject min = (BasicDBObject) mongoSplit.getQuerySpec().get("$min"); BasicDBObject max = (BasicDBObject) mongoSplit.getQuerySpec().get("$max"); //DEBUG //_logger.info("+----------------- NEW SPLIT ----------------: " + min + " /" + max); //System.out.println("+----------------- NEW SPLIT ----------------: " + min + " /" + max); if (null != min) { // How does the min fit in with the general query try { if (compareFields(-1, originalQuery, min, max, orderedArraySet, orderedArraySet_afterMin) < 0) { splitIt.remove(); continue; } } catch (Exception e) { } // do nothing probably just some comparable issue } //TESTED if (null != max) { // How does the min fit in with the general query try { if (compareFields(1, originalQuery, max, min, orderedArraySet, orderedArraySet_afterMin) > 0) { splitIt.remove(); continue; } 
} catch (Exception e) { } // do nothing probably just some comparable issue } //TESTED //DEBUG //_logger.info("(retained split)"); //System.out.println("(retained split)"); // (don't worry about edge cases, won't happen very often and will just result in a spurious empty mapper) //////////////////////////////// // Now some infinit.e specific processing... if (newShardScheme) { @SuppressWarnings("rawtypes") TreeSet<Comparable> sourceKeyOrderedArray = orderedArraySet.get(DocumentPojo.sourceKey_); if ((null != sourceKeyOrderedArray) && !sourceKeyOrderedArray.isEmpty()) { @SuppressWarnings("rawtypes") Comparable minSourceKey = null; Object minSourceKeyObj = (null == min) ? null : min.get(DocumentPojo.sourceKey_); if (minSourceKeyObj instanceof String) { minSourceKey = (String) minSourceKeyObj; } if (null == minSourceKey) { minSourceKey = sourceKeyOrderedArray.first(); } //TESTED @SuppressWarnings("rawtypes") Comparable maxSourceKey = null; Object maxSourceKeyObj = (null == max) ? null : max.get(DocumentPojo.sourceKey_); if (maxSourceKeyObj instanceof String) { maxSourceKey = (String) maxSourceKeyObj; } if (null == maxSourceKey) { maxSourceKey = sourceKeyOrderedArray.last(); } //TESTED DBObject splitQuery = mongoSplit.getQuerySpec(); BasicDBObject splitQueryQuery = new BasicDBObject( (BasicBSONObject) splitQuery.get("$query")); if (0 == minSourceKey.compareTo(maxSourceKey)) { // single matching sourceKEy splitQueryQuery.put(DocumentPojo.sourceKey_, maxSourceKey); } //TESTED (array of sources, only one matches) else { // multiple matching source keys splitQueryQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeyOrderedArray.subSet(minSourceKey, true, maxSourceKey, true))); } //TESTED (array of sources, multiple match) newsplits.add( new InfiniteMongoInputSplit(mongoSplit, splitQueryQuery, conf.isNoTimeout())); } else { // original query is of sufficient simplicity newsplits.add( new InfiniteMongoInputSplit(mongoSplit, originalQuery, conf.isNoTimeout())); } //TESTED (no change to existing source) } //TESTED else { // old sharding scheme, remove min/max and replace with normal _id based query where possible DBObject splitQuery = mongoSplit.getQuerySpec(); // Step 1: create a query range for _id: BasicDBObject idRange = null; Object idMin = (min == null) ? null : min.get(DocumentPojo._id_); Object idMax = (max == null) ? 
null : max.get(DocumentPojo._id_); if (!(idMin instanceof ObjectId)) idMin = null; if (!(idMax instanceof ObjectId)) idMax = null; if ((null != idMin) || (null != idMax)) { idRange = new BasicDBObject(); if (null != idMin) { idRange.put(DbManager.gte_, idMin); } if (null != idMax) { idRange.put(DbManager.lt_, idMax); } } //TESTED // Step 2: merge with whatever we have at the moment: if (null != idRange) { BasicDBObject splitQueryQuery = new BasicDBObject( (BasicBSONObject) splitQuery.get("$query")); Object idQueryElement = splitQueryQuery.get(DocumentPojo._id_); boolean convertedAwayFromMinMax = false; if (null == idQueryElement) { // nice and easy, add _id range splitQueryQuery.put(DocumentPojo._id_, idRange); convertedAwayFromMinMax = true; } //TESTED else if (!splitQueryQuery.containsField(DbManager.and_)) { // OK we're going to just going to make life easy splitQueryQuery.remove(DocumentPojo._id_); splitQueryQuery.put(DbManager.and_, Arrays.asList(new BasicDBObject(DocumentPojo._id_, idQueryElement), new BasicDBObject(DocumentPojo._id_, idRange))); convertedAwayFromMinMax = true; } //TESTED // (else stick with min/max) if (convertedAwayFromMinMax) { // can construct an _id query splitQuery.removeField("$min"); splitQuery.removeField("$max"); } //TESTED splitQuery.put("$query", splitQueryQuery); } newsplits.add(new InfiniteMongoInputSplit(mongoSplit, conf.isNoTimeout())); } //TESTED } catch (Exception e) { //DEBUG //e.printStackTrace(); } // do nothing must be some other type of input split } //TESTED //DEBUG //System.out.println("Calculating splits via mongo-hadoop: " + initialSplitSize + " reduced to " + splits.size()); _logger.info("Calculating (converted) splits via mongo-hadoop: " + initialSplitSize + " reduced to " + newsplits.size()); return newsplits; } }
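Both examples rely on the deprecated getCount(query, projection, limit, skip) overload. In the modern driver API the equivalent bounded count goes through MongoCollection.countDocuments with CountOptions; a minimal sketch, assuming a local mongod and placeholder database/collection names:

import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoClients;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.model.CountOptions;
import com.mongodb.client.model.Filters;
import org.bson.Document;

public class CountDocumentsExample {
    public static void main(String[] args) {
        // Connection string and database/collection names are placeholders
        try (MongoClient client = MongoClients.create("mongodb://localhost:27017")) {
            MongoCollection<Document> coll = client.getDatabase("test").getCollection("docs");
            // Equivalent of the legacy coll.getCount(query, null, 1000, 0):
            long count = coll.countDocuments(Filters.eq("status", "published"),
                    new CountOptions().limit(1000).skip(0));
            System.out.println("bounded count: " + count);
        }
    }
}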