/*******************************************************************************
 * Copyright 2012 The Infinit.e Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.ikanow.infinit.e.data_model.custom;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableSet;
import java.util.TreeSet;

import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.log4j.Logger;
import org.bson.BasicBSONObject;
import org.bson.types.ObjectId;

import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SourceHarvestStatusPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.MongoURI;
import com.mongodb.hadoop.input.MongoInputSplit;
import com.mongodb.hadoop.util.MongoSplitter;

public class InfiniteMongoSplitter
{
	public static final int MAX_SPLITS = 10000000;

	private static Logger _logger = Logger.getLogger(InfiniteMongoSplitter.class);

	/**
	 * Checks if the new params MAX_SPLITS and MAX_DOCS_PER_SPLIT are set
	 * in the config. If they are, it will use those to create splits via limit/skip;
	 * otherwise it will call the previous chunking splitter in MongoSplitter.
	 *
	 * @param conf
	 * @return the list of input splits
	 */
	public static List<InputSplit> calculateSplits(InfiniteMongoConfig conf)
	{
		// First off: what is our sharding scheme?

		boolean shardingPolicyNew = false;
		try {
			BasicDBObject shardQuery = new BasicDBObject("_id", "doc_metadata.metadata");
			BasicDBObject shardInfo = (BasicDBObject) DbManager.getCollection("config", "collections").findOne(shardQuery);
			if (null != shardInfo) {
				BasicDBObject shardInfoKey = (BasicDBObject) shardInfo.get("key");
				if (null != shardInfoKey) {
					shardingPolicyNew = (shardInfoKey.size() > 1);
				}
			}
		} //TESTED (new and old)
		catch (Exception e) {} // stick with the old sharding, it's probably going to die soon after though, honestly
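
		// Worked example (illustrative documents only - the old-scheme key is an assumption,
		// only its size matters; the new-scheme key is confirmed by the TEST INFO further down):
		// under the old sharding scheme the config.collections entry looks something like
		//   { "_id": "doc_metadata.metadata", "key": { "_id": 1 } }                 (1 key field  -> old policy)
		// whereas under the new scheme (shardKey = { sourceKey: 1, _id: 1 })
		//   { "_id": "doc_metadata.metadata", "key": { "sourceKey": 1, "_id": 1 } } (2 key fields -> new policy)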

		// conf.getQuery returns a new copy of the query, so get it once and use it everywhere...
		BasicDBObject confQuery = (BasicDBObject) conf.getQuery();
		BasicDBObject srcTagsQuery = (BasicDBObject) conf.getSourceTags();

		String collection = conf.getInputURI().getCollection();
		if (!collection.equals(DbManager.getDocument().getContent().getName())
				&& !collection.equals(DbManager.getDocument().getMetadata().getName()))
		{
			// Case 1: feature table or custom table
			// Just run the legacy code
			return calculateSplits_phase2(conf, confQuery, false, false, null);
		}
		else { // complex cases...
			boolean simpleOtherIndex = false;
			// Check whether a simple query has been performed on a different indexed field
			if (null == srcTagsQuery) { // (if srcTags is specified then we're going to want to use sourceKey as the index)
				for (String s : Arrays.asList(EntityPojo.docQuery_index_, DocumentPojo.url_)) {
					Object selector = confQuery.get(s);
					if (selector instanceof String) {
						simpleOtherIndex = true;
						break;
					}
					else if (selector instanceof DBObject) {
						DBObject selectorDbo = (DBObject) selector;
						if (selectorDbo.containsField(DbManager.in_)) {
							simpleOtherIndex = true;
							break;
						}
					}
				} //TESTED (both types, plus check that complex indexes don't work)
				// ALLOWED: {"entities.index": { "$in": [ "xxx", "yyy"] }}, {"entities.index": "xxx" }, ditto for "url"
				// NOT ALLOWED: { "entities.index": { "$ne": "xxx" } }
			} //TESTED (check ignored if eg entity_index specified)

			if (simpleOtherIndex) {
				// Case 2: we have a simple query on an indexed field
				// Just run the legacy code
				return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
			} //TESTED
			else if (conf.getLimit() > 0) { // debug
				// Case 3: ensure we have small sets of sources to search over
				BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery, srcTagsQuery, conf.getMaxDocsPerSplit());
				final List<InputSplit> splits = new ArrayList<InputSplit>();

				boolean queryNonTrivial = isQueryNonTrivial(confQuery);
				if (!queryNonTrivial) {
					// Case 3a: query is trivial, so can just create splits directly from the split pre-calcs
					int toProcess = conf.getLimit();
					Iterator<Object> itSplit = collectionOfSplits.iterator();
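					// Worked example for the loop below (illustrative numbers only): limit=1000 and
					// two pre-calculated splits with doc counts 600 and 700:
					//  - split 1: toGet = min(600, 1000) = 600, toProcess -> 400
					//  - split 2: toGet = min(700, 400) = 400, toProcess -> -300, so the loop then stops
					// (note toProcess is decremented by the full docCount, not by toGet)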
					while ((toProcess > 0) && (itSplit.hasNext())) {
						BasicDBObject split = (BasicDBObject) itSplit.next();
						int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
						int toGet = (docCount > toProcess) ? toProcess : docCount;
						BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
						if (null != modQuery) {
							splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), modQuery,
									conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
							toProcess -= docCount;
						}
					} //TESTED
				}
				else {
					// Case 3b: annoying - some extra query terms, so going to need to do it the hard way...
					int toProcess = conf.getLimit();
					Iterator<Object> itSplit = collectionOfSplits.iterator();
					DBCollection coll = InfiniteMongoConfigUtil.getCollection(conf.getInputURI());
					while ((toProcess > 0) && (itSplit.hasNext())) {
						BasicDBObject split = (BasicDBObject) itSplit.next();
						BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
						if (null != modQuery) {
							int docsCounted = (int) coll.getCount(modQuery, null, toProcess, 0);
							int toGet = (docsCounted > toProcess) ? toProcess : docsCounted;
							if (docsCounted > 0) {
								splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), modQuery,
										conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
								toProcess -= docsCounted;
							}
						} //TESTED
					}
				} //TESTED
				return splits;
			}
			else { // More complex cases:

				if (shardingPolicyNew) {
					// Case 4a: NEW SHARDING SCHEME
					// Always fetch the new sources, eg convert communityId to sourceKeys
					try {
						splitPrecalculations_newShardScheme(confQuery, srcTagsQuery); // (modifies confQuery if it returns true)
						boolean queryNonTrivial = isQueryNonTrivial(confQuery);

						return calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null);
						// (ie trivial query => always use chunks, bypass the skip/limit test)
					} //TESTED (trivial + non-trivial)
					catch (Exception e) {
						// Didn't match any sources - no problem
						return new ArrayList<InputSplit>();
					} //TESTED
				} //TESTED
				else { // OLD SHARDING SCHEME
					BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery, srcTagsQuery, conf.getMaxDocsPerSplit());
					if (null == collectionOfSplits) {
						// Case 4b: OLD SHARDING SCHEME, can't get a partition by source keys, so just back off to the old code
						return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
					} //TESTED (old code)
					else {
						conf.setMaxDocsPerSplit(2 * conf.getMaxDocsPerSplit());
						// (because we stop creating splits when they exceed the size)

						// Case 4c: OLD SHARDING SCHEME, have a source key partition
						int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits();
						boolean queryNonTrivial = isQueryNonTrivial(confQuery);
						final List<InputSplit> splits = new ArrayList<InputSplit>();
						BasicDBObject savedQuery = confQuery;

						Iterator<Object> itSplit = collectionOfSplits.iterator();
						BasicDBList bigSplit = null;
						while (itSplit.hasNext()) {
							BasicDBObject split = (BasicDBObject) itSplit.next();
							int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
							if (docCount < nMaxCount) { // small split, will use skip/limit
								BasicDBObject modQuery = convertQuery(savedQuery, split.get(DocumentPojo.sourceKey_));
								if (null != modQuery) {
									final int SPLIT_THRESHOLD = 3;
									// A few cases:
									if ((docCount < (SPLIT_THRESHOLD * conf.getMaxDocsPerSplit())) || !queryNonTrivial) {
										splits.addAll(calculateSplits_phase2(conf, modQuery, false, shardingPolicyNew, (Integer) docCount));
									} //TESTED (based on limit, based on query)
									else {
										// My guess at the point at which you might as well do the full query in the hope
										// that you're going to save some (empty) splits
										splits.addAll(calculateSplits_phase2(conf, modQuery, false, shardingPolicyNew, null));
									} //TESTED
								} //TESTED
							}
							else { // large split, combine all these guys into an array of source keys
								if (null == bigSplit) {
									bigSplit = new BasicDBList();
								}
								bigSplit.add(split.get(DocumentPojo.sourceKey_)); // (guaranteed to be a single element)
							}
						} //(end loop over collections)

						if (null != bigSplit) {
							// If we have a big left over community then create a set of splits for that - always chunks if query trivial
							if (1 == bigSplit.size()) {
								confQuery.put(DocumentPojo.sourceKey_, bigSplit.iterator().next());
							}
							else {
								confQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, bigSplit));
							}
							splits.addAll(calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null));
						} //TESTED: singleton+trivial (sandy), array+trivial (sentiment/enron), array+non-trivial (sentiment/enron, docGeo), singleton+non-trivial (sandy, docGeo)

						return splits;
					} //TESTED: end if Cases 4a, 4b, 4c
				} //(end if old vs new sharding policy)
			} //(non-debug case)
		} //(content or metadata table are most complex)
	}
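
	// Usage sketch (illustrative only - exactly how InfiniteMongoConfig is constructed from
	// the Hadoop job configuration is an assumption here, it isn't shown in this file):
	//   InfiniteMongoConfig conf = new InfiniteMongoConfig(hadoopConfiguration);
	//   List<InputSplit> splits = InfiniteMongoSplitter.calculateSplits(conf);
	//   // each split is an InfiniteMongoInputSplit bounded either by skip/limit or by chunk ranges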

	@SuppressWarnings("unchecked")
	public static List<InputSplit> calculateSplits_phase2(InfiniteMongoConfig conf, BasicDBObject confQuery,
			boolean alwaysUseChunks, boolean newShardScheme, Integer splitDocCount)
	{
		alwaysUseChunks &= (conf.getMaxSplits() != MAX_SPLITS); // (in standalone mode, never use chunks)

		MongoURI uri = conf.getInputURI();
		DBCollection coll = InfiniteMongoConfigUtil.getCollection(uri);
		if (conf.getLimit() > 0) {
			return calculateManualSplits(conf, confQuery, 1, conf.getLimit(), coll);
		}
		else {
			if (!alwaysUseChunks) {
				int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits();
				int count = 0;
				if (null == splitDocCount) {
					if (nMaxCount <= 1) {
						nMaxCount = 0;
					}
					else {
						//DEBUG
						//System.out.println(coll.find(confQuery).limit(1).explain());

						count = (int) coll.getCount(confQuery, null, nMaxCount, 0);
						if (0 == count) {
							return new ArrayList<InputSplit>();
						}
					} //TESTED
				}
				else {
					count = splitDocCount;
				}

				// If maxdocssplit and maxsplits are set and there are fewer documents than splits*docspersplit
				// then use the new splitter, otherwise use the old splitter
				if (conf.getMaxDocsPerSplit() > 0 && conf.getMaxSplits() > 0 && (count < nMaxCount)) {
					_logger.debug("Calculating splits manually");
					int splits_needed = (count / conf.getMaxDocsPerSplit()) + 1;
					return calculateManualSplits(conf, confQuery, splits_needed, conf.getMaxDocsPerSplit(), coll);
				} //TESTED
			}

			if (newShardScheme && !confQuery.containsField(DocumentPojo.sourceKey_)) {
				// OK, if we're going to do the sharded version then we will want to calculate the source keys
				splitPrecalculations_newShardScheme(confQuery, null); // (modifies confQuery if it returns true)
			} //TESTED: checked did nothing when had sourceKey, added sourceKey when necessary (eg entities.index case)

			if (!newShardScheme) { // unlike the new sharding scheme, in this case the query is fixed, so overwrite now:
				conf.setQuery(confQuery);
			}

			List<InputSplit> splits = MongoSplitter.calculateSplits(conf);
			// (unless manually set, like above, runs with the _original_ query)
			int initialSplitSize = splits.size();

			// We have the MongoDB-calculated splits, now calculate their intersection vs the query
			@SuppressWarnings("rawtypes")
			Map<String, TreeSet<Comparable>> orderedArraySet = new HashMap<String, TreeSet<Comparable>>();
			@SuppressWarnings("rawtypes")
			Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin = new HashMap<String, NavigableSet<Comparable>>();
			BasicDBObject originalQuery = confQuery;

			ArrayList<InputSplit> newsplits = new ArrayList<InputSplit>(splits.size());
			Iterator<InputSplit> splitIt = splits.iterator();
			while (splitIt.hasNext()) {
				try {
					orderedArraySet_afterMin.clear();

					MongoInputSplit mongoSplit = (MongoInputSplit) splitIt.next();
					BasicDBObject min = (BasicDBObject) mongoSplit.getQuerySpec().get("$min");
					BasicDBObject max = (BasicDBObject) mongoSplit.getQuerySpec().get("$max");

					//DEBUG
					//_logger.info("+----------------- NEW SPLIT ----------------: " + min + " /" + max);
					//System.out.println("+----------------- NEW SPLIT ----------------: " + min + " /" + max);

					if (null != min) { // How does the min fit in with the general query?
						try {
							if (compareFields(-1, originalQuery, min, max, orderedArraySet, orderedArraySet_afterMin) < 0) {
								splitIt.remove();
								continue;
							}
						}
						catch (Exception e) {} // do nothing, probably just some comparable issue
					} //TESTED

					if (null != max) { // How does the max fit in with the general query?
						try {
							if (compareFields(1, originalQuery, max, min, orderedArraySet, orderedArraySet_afterMin) > 0) {
								splitIt.remove();
								continue;
							}
						}
						catch (Exception e) {} // do nothing, probably just some comparable issue
					} //TESTED

					//DEBUG
					//_logger.info("(retained split)");
					//System.out.println("(retained split)");

					// (don't worry about edge cases, they won't happen very often and will just result in a spurious empty mapper)

					////////////////////////////////

					// Now some infinit.e specific processing...

					if (newShardScheme) {
						@SuppressWarnings("rawtypes")
						TreeSet<Comparable> sourceKeyOrderedArray = orderedArraySet.get(DocumentPojo.sourceKey_);
						if ((null != sourceKeyOrderedArray) && !sourceKeyOrderedArray.isEmpty()) {
							@SuppressWarnings("rawtypes")
							Comparable minSourceKey = null;
							Object minSourceKeyObj = (null == min) ? null : min.get(DocumentPojo.sourceKey_);
							if (minSourceKeyObj instanceof String) {
								minSourceKey = (String) minSourceKeyObj;
							}
							if (null == minSourceKey) {
								minSourceKey = sourceKeyOrderedArray.first();
							} //TESTED

							@SuppressWarnings("rawtypes")
							Comparable maxSourceKey = null;
							Object maxSourceKeyObj = (null == max) ? null : max.get(DocumentPojo.sourceKey_);
							if (maxSourceKeyObj instanceof String) {
								maxSourceKey = (String) maxSourceKeyObj;
							}
							if (null == maxSourceKey) {
								maxSourceKey = sourceKeyOrderedArray.last();
							} //TESTED

							DBObject splitQuery = mongoSplit.getQuerySpec();
							BasicDBObject splitQueryQuery = new BasicDBObject((BasicBSONObject) splitQuery.get("$query"));
							if (0 == minSourceKey.compareTo(maxSourceKey)) { // single matching sourceKey
								splitQueryQuery.put(DocumentPojo.sourceKey_, maxSourceKey);
							} //TESTED (array of sources, only one matches)
							else { // multiple matching source keys
								splitQueryQuery.put(DocumentPojo.sourceKey_,
										new BasicDBObject(DbManager.in_, sourceKeyOrderedArray.subSet(minSourceKey, true, maxSourceKey, true)));
							} //TESTED (array of sources, multiple match)
							newsplits.add(new InfiniteMongoInputSplit(mongoSplit, splitQueryQuery, conf.isNoTimeout()));
						}
						else { // original query is of sufficient simplicity
							newsplits.add(new InfiniteMongoInputSplit(mongoSplit, originalQuery, conf.isNoTimeout()));
						} //TESTED (no change to existing source)
					} //TESTED
					else { // old sharding scheme, remove min/max and replace with a normal _id based query where possible
						DBObject splitQuery = mongoSplit.getQuerySpec();

						// Step 1: create a query range for _id:
						BasicDBObject idRange = null;
						Object idMin = (min == null) ? null : min.get(DocumentPojo._id_);
						Object idMax = (max == null) ? null : max.get(DocumentPojo._id_);
						if (!(idMin instanceof ObjectId)) idMin = null;
						if (!(idMax instanceof ObjectId)) idMax = null;
						if ((null != idMin) || (null != idMax)) {
							idRange = new BasicDBObject();
							if (null != idMin) {
								idRange.put(DbManager.gte_, idMin);
							}
							if (null != idMax) {
								idRange.put(DbManager.lt_, idMax);
							}
						} //TESTED

						// Step 2: merge with whatever we have at the moment:
						if (null != idRange) {
							BasicDBObject splitQueryQuery = new BasicDBObject((BasicBSONObject) splitQuery.get("$query"));
							Object idQueryElement = splitQueryQuery.get(DocumentPojo._id_);
							boolean convertedAwayFromMinMax = false;
							if (null == idQueryElement) { // nice and easy, add an _id range
								splitQueryQuery.put(DocumentPojo._id_, idRange);
								convertedAwayFromMinMax = true;
							} //TESTED
							else if (!splitQueryQuery.containsField(DbManager.and_)) { // OK, we're just going to make life easy
								splitQueryQuery.remove(DocumentPojo._id_);
								splitQueryQuery.put(DbManager.and_,
										Arrays.asList(new BasicDBObject(DocumentPojo._id_, idQueryElement),
												new BasicDBObject(DocumentPojo._id_, idRange)));
								convertedAwayFromMinMax = true;
							} //TESTED
							// (else stick with min/max)

							if (convertedAwayFromMinMax) { // can construct an _id query
								splitQuery.removeField("$min");
								splitQuery.removeField("$max");
							} //TESTED
							splitQuery.put("$query", splitQueryQuery);
						}
						newsplits.add(new InfiniteMongoInputSplit(mongoSplit, conf.isNoTimeout()));
					} //TESTED
				}
				catch (Exception e) {
					//DEBUG
					//e.printStackTrace();
				} // do nothing, must be some other type of input split
			} //TESTED

			//DEBUG
			//System.out.println("Calculating splits via mongo-hadoop: " + initialSplitSize + " reduced to " + splits.size());

			_logger.info("Calculating (converted) splits via mongo-hadoop: " + initialSplitSize + " reduced to " + newsplits.size());
			return newsplits;
		}
	}//TESTED

	/**
	 * Creates numSplits splits with limit items in each split,
	 * using limit and skip to determine the sets
	 *
	 * @param conf
	 * @param numSplits
	 * @param limit
	 * @param coll
	 * @return the list of manual splits
	 */
	private static List<InputSplit> calculateManualSplits(InfiniteMongoConfig conf, BasicDBObject confQuery,
			int numSplits, int limit, DBCollection coll)
	{
		final List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
		_logger.debug("using a limit of " + limit + " for " + numSplits + " splits");
		for (int i = 0; i < numSplits; i++) {
			splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), confQuery,
					conf.getFields(), conf.getSort(), limit, i * limit, conf.isNoTimeout()));
		}
		return splits;
	}
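
	// Worked example for the manual split path above (illustrative numbers only):
	// with count=25000 matching docs and maxDocsPerSplit=10000, calculateSplits_phase2 computes
	// splits_needed = (25000 / 10000) + 1 = 3, and calculateManualSplits then emits splits of
	// (limit=10000, skip=0), (limit=10000, skip=10000), (limit=10000, skip=20000)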

	///////////////////////////////////////////////////////////
	// UTILITY CODE

	// Comparison code to calculate if there is a non-zero intersection between the query and the chunk
	// Note that (eg) if you have [key:A, _id:B] as your min (/max)
	// then _id>B only applies if key==A ... if key>A then the entire _id space is allowed

	@SuppressWarnings({ "unchecked", "rawtypes" })
	private static int compareFields(int direction, BasicDBObject query, BasicDBObject minOrMax, BasicDBObject maxOrMin,
			Map<String, TreeSet<Comparable>> orderedArraySet, Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin)
	{
		for (String field : minOrMax.keySet()) {
			//DEBUG
			//System.out.println("1] Compare: " + field + ": " + direction);

			try {
				Object queryOfThisField = query.get(field);
				Object minField = minOrMax.get(field);
				if ((null != queryOfThisField) && (minField instanceof Comparable)) {
					int result = 0;
					Comparable comparableMinOrMaxElement = (Comparable) minField;
					if (queryOfThisField instanceof BasicDBObject) {
						result = compareComplexObject(field, direction, (BasicDBObject) queryOfThisField,
								comparableMinOrMaxElement, orderedArraySet, orderedArraySet_afterMin);
					} //TESTED
					else { // -1 if comparableQueryElement < comparableMinOrMaxElement
						Comparable comparableQueryElement = (Comparable) queryOfThisField;
						result = comparableQueryElement.compareTo(comparableMinOrMaxElement);
						//DEBUG
						//System.out.println("3] Vals: " + comparableQueryElement + " vs " + comparableMinOrMaxElement + " = " + result);
					} //TESTED
					if (result != 0) { // if we ever get a strict inequality then stop checking fields...
						if ((result == direction) || !minOrMax.equals(maxOrMin)) {
							// (fail) (pass, but min/max keys are different so no point checking any more)
							return result;
						} //TESTED
					}
					// else equality - pass, but keep checking fields
				}
			}
			catch (Exception e) {
				//DEBUG
				//e.printStackTrace();
			} // do nothing, probably some odd comparable issue
		}
		return -direction; // (ie pass by default)
	}//TESTED

	// Returns direction to pass without checking further fields, 0 to pass but check further fields,
	// -direction to fail immediately.
	// In practice it won't ever return 0 (because it's not trivial to work out exact equality with complex operators)
	@SuppressWarnings({ "rawtypes", "unchecked" })
	private static int compareComplexObject(String parentField, int direction, BasicDBObject complexQueryElement,
			Comparable minOrMaxElement, Map<String, TreeSet<Comparable>> orderedArraySet,
			Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin)
	{
		for (String field : complexQueryElement.keySet()) {
			//DEBUG
			//System.out.println("2] Compare operator: " + field + ", vs " + minOrMaxElement);

			if (field.equals(MongoDbManager.in_)) {
				NavigableSet<Comparable> orderedArray = null;
				if (1 == direction) { // try orderedArraySet_afterMin first...
					orderedArray = orderedArraySet_afterMin.get(parentField);
					//DEBUG
					//System.out.println("2.0] Found ordered sub-array for: " + parentField + ", size= " + orderedArray.size());
				} //TESTED
				if (null == orderedArray) { // (min, or max but min didn't set a sub-array)
					orderedArray = orderedArraySet.get(parentField);
					if (null == orderedArray) { // First time for this field, order the $in for easy comparison
						orderedArray = new TreeSet<Comparable>();
						Collection queryList = (Collection) complexQueryElement.get(MongoDbManager.in_);
						for (Object o : queryList) {
							Comparable c = (Comparable) o;
							orderedArray.add(c);
						}
						//DEBUG
						//System.out.println("2.1] Created ordered array for: " + parentField + ", size= " + orderedArray.size());
						//DEBUG:
						//if (!orderedArray.isEmpty()) {
						//	System.out.println("2.1.1] Head: " + orderedArray.iterator().next());
						//	System.out.println("2.1.2] Tail: " + orderedArray.descendingIterator().next());
						//}

						orderedArraySet.put(parentField, (TreeSet<Comparable>) orderedArray);
						// (know this cast is valid by construction)
					} //TESTED
				}
				if (-1 == direction) { // comparing vs min
					//DEBUG
					//System.out.println("2.2] tailSet: " + orderedArray.tailSet(minOrMaxElement, true).size());

					NavigableSet<Comparable> minElements = orderedArray.tailSet(minOrMaxElement, true); // (elements >= minElement)
					if (minElements.isEmpty()) {
						return direction; // will always fail
					}
					else {
						orderedArraySet_afterMin.put(parentField, minElements);
					} //TESTED
				} //TESTED
				else if (1 == direction) { // comparing vs max
					//DEBUG
					//System.out.println("2.2] headSet: " + orderedArray.headSet(minOrMaxElement, true).size());

					if (orderedArray.headSet(minOrMaxElement, true).isEmpty()) { // (elements <= maxElement)
						return direction; // will always fail
					}
				} //TESTED
			}
			else if (field.equals(MongoDbManager.gt_) || field.equals(MongoDbManager.gte_)) {
				// (don't worry about the boundaries, it just results in spurious empty chunks)
				if (1 == direction) { // (can't do anything about $gt vs min)
					Comparable comparableQueryElement = (Comparable) complexQueryElement.get(field);
					//DEBUG
					//System.out.println("2.3.1] GT Vals: " + comparableQueryElement + " vs " + minOrMaxElement + " = " + comparableQueryElement.compareTo(minOrMaxElement));

					if (comparableQueryElement.compareTo(minOrMaxElement) > 0) // ie query _lower_ limit > chunk max
						return direction; // ie fail
				}
			} //TESTED
			else if (field.equals(MongoDbManager.lt_) || field.equals(MongoDbManager.lte_)) {
				// (don't worry about the boundaries, it just results in spurious empty chunks)
				if (-1 == direction) { // (can't do anything about $lt vs max)
					Comparable comparableQueryElement = (Comparable) complexQueryElement.get(field);
					//DEBUG
					//System.out.println("2.3.2] LT Vals: " + comparableQueryElement + " vs " + minOrMaxElement + " = " + comparableQueryElement.compareTo(minOrMaxElement));

					if (comparableQueryElement.compareTo(minOrMaxElement) < 0) // ie query upper limit < chunk min
						return direction; // ie fail
				}
			} //TESTED
		}
		return -direction; // (ie pass by default; don't check other fields unless they have the same min/max)
	}//TESTED (tested $in, $gte?, $lte?, $gte?/$lte? combinations)
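
	// Worked example for the $in handling above (illustrative values only):
	// query { sourceKey: { $in: ["a","c","e"] } }, chunk min { sourceKey: "b" }, max { sourceKey: "d" }:
	//  - vs min (direction=-1): tailSet("b") over {"a","c","e"} = {"c","e"}, non-empty, so not rejected
	//    (the sub-array is cached in orderedArraySet_afterMin for the max comparison)
	//  - vs max (direction=1): headSet("d") over {"c","e"} = {"c"}, non-empty, so the chunk is retained
	// calculateSplits_phase2 then rewrites the split's query to sourceKey: { $in: ["c"] } via subSet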

	//TEST INFO:
	// shardKey = { sourceKey: 1, _id: 1 }
	// FIRST TESTED AGAINST $in with 114 different keys starting with jdbc*
	// THEN query={"sourceKey": "jdbc.oracle.thin.@ec2-54-205-223-166.compute-1.amazonaws.com.152.1438"} ... left 226 chunks, hand checked
	// THEN query={"sourceKey": "jdbc.oracle.thin.@ec2-54-205-223-166.compute-1.amazonaws.com.152.1438", _id: { $oid: "52702a06e4b0b912ee0615f1" } } ... left 1 chunk, hand checked
	// THEN query={"sourceKey": "jdbc.oracle.thin.@ec2-54-205-223-166.compute-1.amazonaws.com.152.1438", _id: {"$gte": {"$oid": "52702a06e4b0b912ee0615f0"}, "$lt": {"$oid": "52753c1fe4b019e585827285"} } } ... left 3 chunks, hand checked
	// THEN query={_id: {"$gte": {"$oid": "52702a06e4b0b912ee0615f0"}, "$lt": {"$oid": "52753c1fe4b019e585827285"} } } ... left 89 chunks, hand checked a few

	////////////////////////////////////////////////////////////////////////
	////////////////////////////////////////////////////////////////////////

	// Util - rewrites the query's communityId term into an explicit sourceKey term (new shard scheme);
	// returns false if it backed out and left the query alone
	@SuppressWarnings("unchecked")
	public static boolean splitPrecalculations_newShardScheme(BasicDBObject query, BasicDBObject srcTagsQuery)
	{
		// Get the communityIds from the query
		Collection<ObjectId> communityIds = null;
		try {
			BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_);
			communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
			if (null == communityIds) {
				return false;
			}
		}
		catch (Exception e) {
			//DEBUG
			//e.printStackTrace();

			return false; // back out
		}

		BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_, new BasicDBObject(DbManager.in_, communityIds));
		BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
		keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
		keyFields.put(SourcePojo.highestDistributionFactorStored_, 1);

		// Get and remove the sourceKey information, and incorporate it into the source query,
		// so it's nice and simple by the time it gets to the actual query
		Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);
		if (null != srcTagsQuery) { // Simpler case: src tags specified, so going to get a list of all the sources regardless
			if (null != sourceKeyQueryTerm) {
				keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
			}
			keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
		} //TESTED (including $all, to test that "$srctags":{"$all": ["tagtest","db"]} matches on tags: ["tagtest","db","tagtest2"])
		else if (null != sourceKeyQueryTerm) {
			boolean sourceKeyQueryComplex = false;
			if (sourceKeyQueryTerm instanceof BasicDBObject) {
				BasicDBObject sourceKeyQueryTermDbo = (BasicDBObject) sourceKeyQueryTerm;
				if (sourceKeyQueryTermDbo.size() <= 2) { // every term must be in/lt/lte/gt/gte
					for (String sourceKeyQueryTermEl : sourceKeyQueryTermDbo.keySet()) {
						if (!sourceKeyQueryTermEl.equals(DbManager.in_)
								&& !sourceKeyQueryTermEl.equals(DbManager.lt_) && !sourceKeyQueryTermEl.equals(DbManager.lte_)
								&& !sourceKeyQueryTermEl.equals(DbManager.gt_) && !sourceKeyQueryTermEl.equals(DbManager.gte_))
						{
							sourceKeyQueryComplex = true;
							break;
						} //TESTED (eg $ne)
						else if (sourceKeyQueryTermEl.equals(DbManager.in_) && (1 != sourceKeyQueryTermDbo.size())) {
							sourceKeyQueryComplex = true;
							break;
						} //TESTED (($lt,$in))
					}
				} //TESTED: ($in, ($gte,$lt), $ne)
				else {
					sourceKeyQueryComplex = true;
				} //TESTED ({ "sourceKey": { "$in": ["test"], "$gt": "alex", "$lte": "test" } })
			}
			else if (sourceKeyQueryTerm instanceof java.util.regex.Pattern) { // probably a regex
				sourceKeyQueryComplex = true;
			} //TESTED ($regex)

			if (sourceKeyQueryComplex) {
				keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm); // ie we'll simplify it below
			}
			else {
				return false; // already have a perfectly good source key specification
			}
		} //TESTED (see combinations above)

		DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(keyFields);
		int count = dbc.count();
		if (count > 5000) {
			// (too many source keys to process, just going to leave well alone...
			//  note this will mean that $srctags fails open)
			return false;
		}
		else {
			ArrayList<String> sources = new ArrayList<String>(count);
			while (dbc.hasNext()) {
				BasicDBObject dbo = (BasicDBObject) dbc.next();
				String sourceKey = (String) dbo.get(SourcePojo.key_);
				Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
				sources.addAll(SourcePojo.getDistributedKeys(sourceKey, distributionFactor));
			}
			if (sources.isEmpty()) {
				throw new RuntimeException(); // will just return no splits at all, no problem
			} //TESTED
			if (1 == sources.size()) {
				query.put(DocumentPojo.sourceKey_, sources.get(0));
			} //TESTED
			else {
				query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sources));
			} //TESTED
			return true;
		}
	}//TESTED (see combinations above)
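
	// Worked example for the method above (illustrative values only - the exact distributed
	// key format is determined by SourcePojo.getDistributedKeys, which isn't shown in this file):
	//   before: { "communityId": { "$in": [ ObjectId("...") ] } }
	//   after:  { "communityId": { "$in": [ ObjectId("...") ] },
	//             "sourceKey": { "$in": [ <distributed keys of each matching source> ] } }
	// (or a single string in place of the $in clause if only one key matches)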

	// Util for creating a useful object containing source info (old sharding, _id - or new sharding but debug mode)
	@SuppressWarnings("unchecked")
	public static BasicDBList splitPrecalculations_oldShardSchemeOrDebug(BasicDBObject query, BasicDBObject srcTagsQuery,
			int maxCountPerTask)
	{
		// Get the communityIds from the query
		Collection<ObjectId> communityIds = null;
		try {
			BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_);
			communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
			if (null == communityIds) {
				return null;
			}
		}
		catch (Exception e) {
			return null; // back out
		}

		BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_, new BasicDBObject(DbManager.in_, communityIds));
		BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
		keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
		BasicDBObject sortFields = new BasicDBObject(SourcePojo.key_, 1);

		// Get and remove the sourceKey information, incorporate into source query:
		Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);
		if (null != sourceKeyQueryTerm) {
			keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
		} //TESTED
		if (null != srcTagsQuery) {
			keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
		} //TESTED

		DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(sortFields);
		// (note the sort is needed so that the potentially expensive doc query has a sensibly ordered $in clause)
		if (dbc.count() > 5000) {
			// (too many source keys to process, just going to leave well alone...
			//  note this means $srctags will fail open)
			return null;
		}
		else {
			//TreeMap<String, Long> sourceKeys = new TreeMap<String, Long>();
			// Build collections of objects of format { sourceKey: string or [], totalDocs }
			BasicDBList sourceKeyListCollection = new BasicDBList();
			BasicDBList sourceKeyList = null;
			int runningDocs = 0;
			int runningSources = 0;
			while (dbc.hasNext()) {
				BasicDBObject dbo = (BasicDBObject) dbc.next();
				String sourceKey = (String) dbo.get(SourcePojo.key_);
				if (null != sourceKey) {
					long docCount = 0L;
					try {
						BasicDBObject harvestStatus = (BasicDBObject) dbo.get(SourcePojo.harvest_);
						if (null != harvestStatus) {
							docCount = harvestStatus.getLong(SourceHarvestStatusPojo.doccount_, 0L);
						}
					}
					catch (Exception e) {}

					//DEBUG
					//System.out.println("SOURCE=" + sourceKey + " DOC_COUNT=" + docCount + " RUNNING=" + runningDocs + "," + runningSources + ": " + sourceKeyList);

					if (docCount > maxCountPerTask) { // source is large enough by itself
						// Create collection
						BasicDBObject collection = new BasicDBObject();
						collection.put(DocumentPojo.sourceKey_, sourceKey);
						collection.put(SourceHarvestStatusPojo.doccount_, docCount);
						sourceKeyListCollection.add(collection);
						// (leaving running* alone, can keep building that)
					} //TESTED (by eye, system community of demo cluster)
					else if ((runningDocs + docCount) > maxCountPerTask) { // have now got a large enough collection of sources
						if (null == sourceKeyList) {
							sourceKeyList = new BasicDBList();
						}
						sourceKeyList.add(sourceKey);
						// Create collection
						BasicDBObject collection = new BasicDBObject();
						collection.put(DocumentPojo.sourceKey_, sourceKeyList);
						collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
						sourceKeyListCollection.add(collection);
						sourceKeyList = null;
						runningDocs = 0;
						runningSources = 0;
					} //TESTED (by eye, system community of demo cluster)
					else if (runningSources >= 15) { // have a limit on the number of sources per query, to keep the queries manageable
						sourceKeyList.add(sourceKey);
						// Create collection
						BasicDBObject collection = new BasicDBObject();
						collection.put(DocumentPojo.sourceKey_, sourceKeyList);
						collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
						sourceKeyListCollection.add(collection);
						sourceKeyList = null;
						runningDocs = 0;
						runningSources = 0;
					} //TESTED (by eye, system community of demo cluster)
					else { // (keep) build(ing) list
						if (null == sourceKeyList) {
							sourceKeyList = new BasicDBList();
						}
						sourceKeyList.add(sourceKey);
						runningDocs += docCount;
						runningSources++;
					} //TESTED (by eye, system community of demo cluster)
				} //(end if has source key)
			} //(end loop over cursor)

			// Finish off:
			if (null != sourceKeyList) {
				// Create collection
				BasicDBObject collection = new BasicDBObject();
				collection.put(DocumentPojo.sourceKey_, sourceKeyList);
				collection.put(SourceHarvestStatusPojo.doccount_, runningDocs);
				sourceKeyListCollection.add(collection);
			} //TESTED (by eye, system community of demo cluster)

			if (sourceKeyListCollection.isEmpty()) { // query returns empty
				throw new RuntimeException("Communities contain no sources");
			}
			return sourceKeyListCollection;
		} // (end if too many source keys across the communities)
	}//TESTED

	// Utility - has user specified fields other than community Id, index, or sourceKey
	private static boolean isQueryNonTrivial(BasicDBObject query)
	{
		if ((query.size() > 3) || ((query.size() > 2) && !query.containsField(DocumentPojo.sourceKey_))) {
			return true;
		}
		return false;
	}//TESTED
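
	// Worked examples for isQueryNonTrivial above (illustrative queries only):
	//   { communityId: {...} }                                        -> trivial
	//   { communityId: {...}, sourceKey: "k", "entities.index": "i" } -> trivial (3 fields, sourceKey present)
	//   { communityId: {...}, "entities.index": "i", url: "u" }       -> non-trivial (3 fields, no sourceKey)
	//   anything with 4+ top-level fields                             -> non-trivial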

	// Utility - create new query with overwritten sourceKey
	private static BasicDBObject convertQuery(BasicDBObject originalQuery, Object sourceObj)
	{
		BasicDBObject modQuery = null;
		if (null != sourceObj) {
			if (sourceObj instanceof Collection) {
				modQuery = new BasicDBObject(originalQuery.toMap());
				@SuppressWarnings("rawtypes")
				Collection sources = (Collection) sourceObj;
				modQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sources));
			} //TESTED
			else if (sourceObj instanceof String) {
				modQuery = new BasicDBObject(originalQuery.toMap());
				String source = (String) sourceObj;
				modQuery.put(DocumentPojo.sourceKey_, source);
			} //TESTED
		}
		return modQuery;
	}//TESTED
}