/*******************************************************************************
 * Copyright 2012 The Infinit.e Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.ikanow.infinit.e.data_model.custom;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableSet;
import java.util.TreeSet;

import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.log4j.Logger;
import org.bson.BasicBSONObject;
import org.bson.types.ObjectId;

import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SourceHarvestStatusPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.MongoURI;
import com.mongodb.hadoop.input.MongoInputSplit;
import com.mongodb.hadoop.util.MongoSplitter;

public class InfiniteMongoSplitter
{
	public static final int MAX_SPLITS = 10000000;

	private static Logger _logger = Logger.getLogger(InfiniteMongoSplitter.class);

	/**
	 * Checks if the new params MAX_SPLITS and MAX_DOCS_PER_SPLIT are set
	 * in the config. If they are, it will use those to create splits via limit/skip;
	 * otherwise it will call the previous chunking splitter in MongoSplitter.
	 *
	 * @param conf
	 * @return the list of input splits
	 */
	public static List<InputSplit> calculateSplits(InfiniteMongoConfig conf)
	{
		// First off: what is our sharding scheme?

		boolean shardingPolicyNew = false;
		try {
			BasicDBObject shardQuery = new BasicDBObject("_id", "doc_metadata.metadata");
			BasicDBObject shardInfo = (BasicDBObject) DbManager.getCollection("config", "collections").findOne(shardQuery);
			if (null != shardInfo) {
				BasicDBObject shardInfoKey = (BasicDBObject) shardInfo.get("key");
				if (null != shardInfoKey) {
					shardingPolicyNew = (shardInfoKey.size() > 1);
				}
			}
		} //TESTED (new and old)
		catch (Exception e) {} // stick with the old sharding, it's probably going to die soon after though, honestly
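
		// Worked example (illustrative documents only - the old-scheme key is an assumption,
		// only its size matters; the new-scheme key is confirmed by the TEST INFO further down):
		// under the old sharding scheme the config.collections entry looks something like
		//   { "_id": "doc_metadata.metadata", "key": { "_id": 1 } }                 (1 key field  -> old policy)
		// whereas under the new scheme (shardKey = { sourceKey: 1, _id: 1 })
		//   { "_id": "doc_metadata.metadata", "key": { "sourceKey": 1, "_id": 1 } } (2 key fields -> new policy)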

		// conf.getQuery returns a new copy of the query, so get it once and use it everywhere...
		BasicDBObject confQuery = (BasicDBObject) conf.getQuery();
		BasicDBObject srcTagsQuery = (BasicDBObject) conf.getSourceTags();

		String collection = conf.getInputURI().getCollection();
		if (!collection.equals(DbManager.getDocument().getContent().getName())
				&& !collection.equals(DbManager.getDocument().getMetadata().getName()))
		{
			// Case 1: feature table or custom table
			// Just run the legacy code
			return calculateSplits_phase2(conf, confQuery, false, false, null);
		}
		else { // complex cases...
			boolean simpleOtherIndex = false;
			// Check whether a simple query has been performed on a different indexed field
			if (null == srcTagsQuery) { // (if srcTags is specified then we're going to want to use sourceKey as the index)
				for (String s : Arrays.asList(EntityPojo.docQuery_index_, DocumentPojo.url_)) {
					Object selector = confQuery.get(s);
					if (selector instanceof String) {
						simpleOtherIndex = true;
						break;
					}
					else if (selector instanceof DBObject) {
						DBObject selectorDbo = (DBObject) selector;
						if (selectorDbo.containsField(DbManager.in_)) {
							simpleOtherIndex = true;
							break;
						}
					}
				} //TESTED (both types, plus check that complex indexes don't work)
				// ALLOWED: {"entities.index": { "$in": [ "xxx", "yyy"] }}, {"entities.index": "xxx" }, ditto for "url"
				// NOT ALLOWED: { "entities.index": { "$ne": "xxx" } }
			} //TESTED (check ignored if eg entity_index specified)

			if (simpleOtherIndex) {
				// Case 2: we have a simple query on an indexed field
				// Just run the legacy code
				return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
			} //TESTED
			else if (conf.getLimit() > 0) { // debug
				// Case 3: ensure we have small sets of sources to search over
				BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery, srcTagsQuery, conf.getMaxDocsPerSplit());
				final List<InputSplit> splits = new ArrayList<InputSplit>();

				boolean queryNonTrivial = isQueryNonTrivial(confQuery);
				if (!queryNonTrivial) {
					// Case 3a: query is trivial, so can just create splits directly from the split pre-calcs
					int toProcess = conf.getLimit();
					Iterator<Object> itSplit = collectionOfSplits.iterator();
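					// Worked example for the loop below (illustrative numbers only): limit=1000 and
					// two pre-calculated splits with doc counts 600 and 700:
					//  - split 1: toGet = min(600, 1000) = 600, toProcess -> 400
					//  - split 2: toGet = min(700, 400) = 400, toProcess -> -300, so the loop then stops
					// (note toProcess is decremented by the full docCount, not by toGet)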
					while ((toProcess > 0) && (itSplit.hasNext())) {
						BasicDBObject split = (BasicDBObject) itSplit.next();
						int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
						int toGet = (docCount > toProcess) ? toProcess : docCount;
						BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
						if (null != modQuery) {
							splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), modQuery,
									conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
							toProcess -= docCount;
						}
					} //TESTED
				}
				else {
					// Case 3b: annoying - some extra query terms, so going to need to do it the hard way...
					int toProcess = conf.getLimit();
					Iterator<Object> itSplit = collectionOfSplits.iterator();
					DBCollection coll = InfiniteMongoConfigUtil.getCollection(conf.getInputURI());
					while ((toProcess > 0) && (itSplit.hasNext())) {
						BasicDBObject split = (BasicDBObject) itSplit.next();
						BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
						if (null != modQuery) {
							int docsCounted = (int) coll.getCount(modQuery, null, toProcess, 0);
							int toGet = (docsCounted > toProcess) ? toProcess : docsCounted;
							if (docsCounted > 0) {
								splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), modQuery,
										conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
								toProcess -= docsCounted;
							}
						} //TESTED
					}
				} //TESTED
				return splits;
			}
			else { // More complex cases:

				if (shardingPolicyNew) {
					// Case 4a: NEW SHARDING SCHEME
					// Always fetch the new sources, eg convert communityId to sourceKeys
					try {
						splitPrecalculations_newShardScheme(confQuery, srcTagsQuery); // (modifies confQuery if it returns true)
						boolean queryNonTrivial = isQueryNonTrivial(confQuery);

						return calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null);
						// (ie trivial query => always use chunks, bypass the skip/limit test)
					} //TESTED (trivial + non-trivial)
					catch (Exception e) {
						// Didn't match any sources - no problem
						return new ArrayList<InputSplit>();
					} //TESTED
				} //TESTED
				else { // OLD SHARDING SCHEME
					BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery, srcTagsQuery, conf.getMaxDocsPerSplit());
					if (null == collectionOfSplits) {
						// Case 4b: OLD SHARDING SCHEME, can't get a partition by source keys, so just back off to the old code
						return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
					} //TESTED (old code)
					else {
						conf.setMaxDocsPerSplit(2 * conf.getMaxDocsPerSplit());
						// (because we stop creating splits when they exceed the size)

						// Case 4c: OLD SHARDING SCHEME, have a source key partition
						int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits();
						boolean queryNonTrivial = isQueryNonTrivial(confQuery);
						final List<InputSplit> splits = new ArrayList<InputSplit>();
						BasicDBObject savedQuery = confQuery;

						Iterator<Object> itSplit = collectionOfSplits.iterator();
						BasicDBList bigSplit = null;
						while (itSplit.hasNext()) {
							BasicDBObject split = (BasicDBObject) itSplit.next();
							int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
							if (docCount < nMaxCount) { // small split, will use skip/limit
								BasicDBObject modQuery = convertQuery(savedQuery, split.get(DocumentPojo.sourceKey_));
								if (null != modQuery) {
									final int SPLIT_THRESHOLD = 3;
									// A few cases:
									if ((docCount < (SPLIT_THRESHOLD * conf.getMaxDocsPerSplit())) || !queryNonTrivial) {
										splits.addAll(calculateSplits_phase2(conf, modQuery, false, shardingPolicyNew, (Integer) docCount));
									} //TESTED (based on limit, based on query)
									else {
										// My guess at the point at which you might as well do the full query in the hope
										// that you're going to save some (empty) splits
										splits.addAll(calculateSplits_phase2(conf, modQuery, false, shardingPolicyNew, null));
									} //TESTED
								} //TESTED
							}
							else { // large split, combine all these guys into an array of source keys
								if (null == bigSplit) {
									bigSplit = new BasicDBList();
								}
								bigSplit.add(split.get(DocumentPojo.sourceKey_)); // (guaranteed to be a single element)
							}
						} //(end loop over collections)

						if (null != bigSplit) {
							// If we have a big left over community then create a set of splits for that - always chunks if query trivial
							if (1 == bigSplit.size()) {
								confQuery.put(DocumentPojo.sourceKey_, bigSplit.iterator().next());
							}
							else {
								confQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, bigSplit));
							}
							splits.addAll(calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null));
						} //TESTED: singleton+trivial (sandy), array+trivial (sentiment/enron), array+non-trivial (sentiment/enron, docGeo), singleton+non-trivial (sandy, docGeo)

						return splits;
					} //TESTED: end if Cases 4a, 4b, 4c
				} //(end if old vs new sharding policy)
			} //(non-debug case)
		} //(content or metadata table are most complex)
	}
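
	// Usage sketch (illustrative only - exactly how InfiniteMongoConfig is constructed from
	// the Hadoop job configuration is an assumption here, it isn't shown in this file):
	//   InfiniteMongoConfig conf = new InfiniteMongoConfig(hadoopConfiguration);
	//   List<InputSplit> splits = InfiniteMongoSplitter.calculateSplits(conf);
	//   // each split is an InfiniteMongoInputSplit bounded either by skip/limit or by chunk ranges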

	@SuppressWarnings("unchecked")
	public static List<InputSplit> calculateSplits_phase2(InfiniteMongoConfig conf, BasicDBObject confQuery,
			boolean alwaysUseChunks, boolean newShardScheme, Integer splitDocCount)
	{
		alwaysUseChunks &= (conf.getMaxSplits() != MAX_SPLITS); // (in standalone mode, never use chunks)

		MongoURI uri = conf.getInputURI();
		DBCollection coll = InfiniteMongoConfigUtil.getCollection(uri);
		if (conf.getLimit() > 0) {
			return calculateManualSplits(conf, confQuery, 1, conf.getLimit(), coll);
		}
		else {
			if (!alwaysUseChunks) {
				int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits();
				int count = 0;
				if (null == splitDocCount) {
					if (nMaxCount <= 1) {
						nMaxCount = 0;
					}
					else {
						//DEBUG
						//System.out.println(coll.find(confQuery).limit(1).explain());

						count = (int) coll.getCount(confQuery, null, nMaxCount, 0);
						if (0 == count) {
							return new ArrayList<InputSplit>();
						}
					} //TESTED
				}
				else {
					count = splitDocCount;
				}

				// If maxdocssplit and maxsplits are set and there are fewer documents than splits*docspersplit
				// then use the new splitter, otherwise use the old splitter
				if (conf.getMaxDocsPerSplit() > 0 && conf.getMaxSplits() > 0 && (count < nMaxCount)) {
					_logger.debug("Calculating splits manually");
					int splits_needed = (count / conf.getMaxDocsPerSplit()) + 1;
					return calculateManualSplits(conf, confQuery, splits_needed, conf.getMaxDocsPerSplit(), coll);
				} //TESTED
			}

			if (newShardScheme && !confQuery.containsField(DocumentPojo.sourceKey_)) {
				// OK, if we're going to do the sharded version then we will want to calculate the source keys
				splitPrecalculations_newShardScheme(confQuery, null); // (modifies confQuery if it returns true)
			} //TESTED: checked did nothing when had sourceKey, added sourceKey when necessary (eg entities.index case)

			if (!newShardScheme) { // unlike the new sharding scheme, in this case the query is fixed, so overwrite now:
				conf.setQuery(confQuery);
			}

			List<InputSplit> splits = MongoSplitter.calculateSplits(conf);
			// (unless manually set, like above, runs with the _original_ query)
			int initialSplitSize = splits.size();

			// We have the MongoDB-calculated splits, now calculate their intersection vs the query
			@SuppressWarnings("rawtypes")
			Map<String, TreeSet<Comparable>> orderedArraySet = new HashMap<String, TreeSet<Comparable>>();
			@SuppressWarnings("rawtypes")
			Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin = new HashMap<String, NavigableSet<Comparable>>();
			BasicDBObject originalQuery = confQuery;

			ArrayList<InputSplit> newsplits = new ArrayList<InputSplit>(splits.size());
			Iterator<InputSplit> splitIt = splits.iterator();
			while (splitIt.hasNext()) {
				try {
					orderedArraySet_afterMin.clear();

					MongoInputSplit mongoSplit = (MongoInputSplit) splitIt.next();
					BasicDBObject min = (BasicDBObject) mongoSplit.getQuerySpec().get("$min");
					BasicDBObject max = (BasicDBObject) mongoSplit.getQuerySpec().get("$max");

					//DEBUG
					//_logger.info("+----------------- NEW SPLIT ----------------: " + min + " /" + max);
					//System.out.println("+----------------- NEW SPLIT ----------------: " + min + " /" + max);

					if (null != min) { // How does the min fit in with the general query?
						try {
							if (compareFields(-1, originalQuery, min, max, orderedArraySet, orderedArraySet_afterMin) < 0) {
								splitIt.remove();
								continue;
							}
						}
						catch (Exception e) {} // do nothing, probably just some comparable issue
					} //TESTED

					if (null != max) { // How does the max fit in with the general query?
						try {
							if (compareFields(1, originalQuery, max, min, orderedArraySet, orderedArraySet_afterMin) > 0) {
								splitIt.remove();
								continue;
							}
						}
						catch (Exception e) {} // do nothing, probably just some comparable issue
					} //TESTED

					//DEBUG
					//_logger.info("(retained split)");
					//System.out.println("(retained split)");

					// (don't worry about edge cases, they won't happen very often and will just result in a spurious empty mapper)

					////////////////////////////////

					// Now some infinit.e specific processing...

					if (newShardScheme) {
						@SuppressWarnings("rawtypes")
						TreeSet<Comparable> sourceKeyOrderedArray = orderedArraySet.get(DocumentPojo.sourceKey_);
						if ((null != sourceKeyOrderedArray) && !sourceKeyOrderedArray.isEmpty()) {
							@SuppressWarnings("rawtypes")
							Comparable minSourceKey = null;
							Object minSourceKeyObj = (null == min) ? null : min.get(DocumentPojo.sourceKey_);
							if (minSourceKeyObj instanceof String) {
								minSourceKey = (String) minSourceKeyObj;
							}
							if (null == minSourceKey) {
								minSourceKey = sourceKeyOrderedArray.first();
							} //TESTED

							@SuppressWarnings("rawtypes")
							Comparable maxSourceKey = null;
							Object maxSourceKeyObj = (null == max) ? null : max.get(DocumentPojo.sourceKey_);
							if (maxSourceKeyObj instanceof String) {
								maxSourceKey = (String) maxSourceKeyObj;
							}
							if (null == maxSourceKey) {
								maxSourceKey = sourceKeyOrderedArray.last();
							} //TESTED

							DBObject splitQuery = mongoSplit.getQuerySpec();
							BasicDBObject splitQueryQuery = new BasicDBObject((BasicBSONObject) splitQuery.get("$query"));
							if (0 == minSourceKey.compareTo(maxSourceKey)) { // single matching sourceKey
								splitQueryQuery.put(DocumentPojo.sourceKey_, maxSourceKey);
							} //TESTED (array of sources, only one matches)
							else { // multiple matching source keys
								splitQueryQuery.put(DocumentPojo.sourceKey_,
										new BasicDBObject(DbManager.in_, sourceKeyOrderedArray.subSet(minSourceKey, true, maxSourceKey, true)));
							} //TESTED (array of sources, multiple match)
							newsplits.add(new InfiniteMongoInputSplit(mongoSplit, splitQueryQuery, conf.isNoTimeout()));
						}
						else { // original query is of sufficient simplicity
							newsplits.add(new InfiniteMongoInputSplit(mongoSplit, originalQuery, conf.isNoTimeout()));
						} //TESTED (no change to existing source)
					} //TESTED
					else { // old sharding scheme, remove min/max and replace with a normal _id based query where possible
						DBObject splitQuery = mongoSplit.getQuerySpec();

						// Step 1: create a query range for _id:
						BasicDBObject idRange = null;
						Object idMin = (min == null) ? null : min.get(DocumentPojo._id_);
						Object idMax = (max == null) ? null : max.get(DocumentPojo._id_);
						if (!(idMin instanceof ObjectId)) idMin = null;
						if (!(idMax instanceof ObjectId)) idMax = null;
						if ((null != idMin) || (null != idMax)) {
							idRange = new BasicDBObject();
							if (null != idMin) {
								idRange.put(DbManager.gte_, idMin);
							}
							if (null != idMax) {
								idRange.put(DbManager.lt_, idMax);
							}
						} //TESTED

						// Step 2: merge with whatever we have at the moment:
						if (null != idRange) {
							BasicDBObject splitQueryQuery = new BasicDBObject((BasicBSONObject) splitQuery.get("$query"));
							Object idQueryElement = splitQueryQuery.get(DocumentPojo._id_);
							boolean convertedAwayFromMinMax = false;
							if (null == idQueryElement) { // nice and easy, add an _id range
								splitQueryQuery.put(DocumentPojo._id_, idRange);
								convertedAwayFromMinMax = true;
							} //TESTED
							else if (!splitQueryQuery.containsField(DbManager.and_)) { // OK, we're just going to make life easy
								splitQueryQuery.remove(DocumentPojo._id_);
								splitQueryQuery.put(DbManager.and_,
										Arrays.asList(new BasicDBObject(DocumentPojo._id_, idQueryElement),
												new BasicDBObject(DocumentPojo._id_, idRange)));
								convertedAwayFromMinMax = true;
							} //TESTED
							// (else stick with min/max)

							if (convertedAwayFromMinMax) { // can construct an _id query
								splitQuery.removeField("$min");
								splitQuery.removeField("$max");
							} //TESTED
							splitQuery.put("$query", splitQueryQuery);
						}
						newsplits.add(new InfiniteMongoInputSplit(mongoSplit, conf.isNoTimeout()));
					} //TESTED
				}
				catch (Exception e) {
					//DEBUG
					//e.printStackTrace();
				} // do nothing, must be some other type of input split
			} //TESTED

			//DEBUG
			//System.out.println("Calculating splits via mongo-hadoop: " + initialSplitSize + " reduced to " + splits.size());

			_logger.info("Calculating (converted) splits via mongo-hadoop: " + initialSplitSize + " reduced to " + newsplits.size());
			return newsplits;
		}
	}//TESTED

	/**
	 * Creates numSplits splits with limit items in each split,
	 * using limit and skip to determine the sets
	 *
	 * @param conf
	 * @param numSplits
	 * @param limit
	 * @param coll
	 * @return the list of manual splits
	 */
	private static List<InputSplit> calculateManualSplits(InfiniteMongoConfig conf, BasicDBObject confQuery,
			int numSplits, int limit, DBCollection coll)
	{
		final List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
		_logger.debug("using a limit of " + limit + " for " + numSplits + " splits");
		for (int i = 0; i < numSplits; i++) {
			splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), confQuery,
					conf.getFields(), conf.getSort(), limit, i * limit, conf.isNoTimeout()));
		}
		return splits;
	}
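
	// Worked example for the manual split path above (illustrative numbers only):
	// with count=25000 matching docs and maxDocsPerSplit=10000, calculateSplits_phase2 computes
	// splits_needed = (25000 / 10000) + 1 = 3, and calculateManualSplits then emits splits of
	// (limit=10000, skip=0), (limit=10000, skip=10000), (limit=10000, skip=20000)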

	///////////////////////////////////////////////////////////
	// UTILITY CODE

	// Comparison code to calculate if there is a non-zero intersection between the query and the chunk
	// Note that (eg) if you have [key:A, _id:B] as your min (/max)
	// then _id>B only applies if key==A ... if key>A then the entire _id space is allowed

	@SuppressWarnings({ "unchecked", "rawtypes" })
	private static int compareFields(int direction, BasicDBObject query, BasicDBObject minOrMax, BasicDBObject maxOrMin,
			Map<String, TreeSet<Comparable>> orderedArraySet, Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin)
	{
		for (String field : minOrMax.keySet()) {
			//DEBUG
			//System.out.println("1] Compare: " + field + ": " + direction);

			try {
				Object queryOfThisField = query.get(field);
				Object minField = minOrMax.get(field);
				if ((null != queryOfThisField) && (minField instanceof Comparable)) {
					int result = 0;
					Comparable comparableMinOrMaxElement = (Comparable) minField;
					if (queryOfThisField instanceof BasicDBObject) {
						result = compareComplexObject(field, direction, (BasicDBObject) queryOfThisField,
								comparableMinOrMaxElement, orderedArraySet, orderedArraySet_afterMin);
					} //TESTED
					else { // -1 if comparableQueryElement < comparableMinOrMaxElement
						Comparable comparableQueryElement = (Comparable) queryOfThisField;
						result = comparableQueryElement.compareTo(comparableMinOrMaxElement);
						//DEBUG
						//System.out.println("3] Vals: " + comparableQueryElement + " vs " + comparableMinOrMaxElement + " = " + result);
					} //TESTED
					if (result != 0) { // if we ever get a strict inequality then stop checking fields...
						if ((result == direction) || !minOrMax.equals(maxOrMin)) {
							// (fail) (pass, but min/max keys are different so no point checking any more)
							return result;
						} //TESTED
					}
					// else equality - pass, but keep checking fields
				}
			}
			catch (Exception e) {
				//DEBUG
				//e.printStackTrace();
			} // do nothing, probably some odd comparable issue
		}
		return -direction; // (ie pass by default)
	}//TESTED

	// Returns direction to pass without checking further fields, 0 to pass but check further fields,
	// -direction to fail immediately.
	// In practice it won't ever return 0 (because it's not trivial to work out exact equality with complex operators)
	@SuppressWarnings({ "rawtypes", "unchecked" })
	private static int compareComplexObject(String parentField, int direction, BasicDBObject complexQueryElement,
			Comparable minOrMaxElement, Map<String, TreeSet<Comparable>> orderedArraySet,
			Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin)
	{
		for (String field : complexQueryElement.keySet()) {
			//DEBUG
			//System.out.println("2] Compare operator: " + field + ", vs " + minOrMaxElement);

			if (field.equals(MongoDbManager.in_)) {
				NavigableSet<Comparable> orderedArray = null;
				if (1 == direction) { // try orderedArraySet_afterMin first...
					orderedArray = orderedArraySet_afterMin.get(parentField);
					//DEBUG
					//System.out.println("2.0] Found ordered sub-array for: " + parentField + ", size= " + orderedArray.size());
				} //TESTED
				if (null == orderedArray) { // (min, or max but min didn't set a sub-array)
					orderedArray = orderedArraySet.get(parentField);
					if (null == orderedArray) { // First time for this field, order the $in for easy comparison
						orderedArray = new TreeSet<Comparable>();
						Collection queryList = (Collection) complexQueryElement.get(MongoDbManager.in_);
						for (Object o : queryList) {
							Comparable c = (Comparable) o;
							orderedArray.add(c);
						}
						//DEBUG
						//System.out.println("2.1] Created ordered array for: " + parentField + ", size= " + orderedArray.size());
						//DEBUG:
						//if (!orderedArray.isEmpty()) {
						//	System.out.println("2.1.1] Head: " + orderedArray.iterator().next());
						//	System.out.println("2.1.2] Tail: " + orderedArray.descendingIterator().next());
						//}

						orderedArraySet.put(parentField, (TreeSet<Comparable>) orderedArray);
						// (know this cast is valid by construction)
					} //TESTED
				}
				if (-1 == direction) { // comparing vs min
					//DEBUG
					//System.out.println("2.2] tailSet: " + orderedArray.tailSet(minOrMaxElement, true).size());

					NavigableSet<Comparable> minElements = orderedArray.tailSet(minOrMaxElement, true); // (elements >= minElement)
					if (minElements.isEmpty()) {
						return direction; // will always fail
					}
					else {
						orderedArraySet_afterMin.put(parentField, minElements);
					} //TESTED
				} //TESTED
				else if (1 == direction) { // comparing vs max
					//DEBUG
					//System.out.println("2.2] headSet: " + orderedArray.headSet(minOrMaxElement, true).size());

					if (orderedArray.headSet(minOrMaxElement, true).isEmpty()) { // (elements <= maxElement)
						return direction; // will always fail
					}
				} //TESTED
			}
			else if (field.equals(MongoDbManager.gt_) || field.equals(MongoDbManager.gte_)) {
				// (don't worry about the boundaries, it just results in spurious empty chunks)
				if (1 == direction) { // (can't do anything about $gt vs min)
					Comparable comparableQueryElement = (Comparable) complexQueryElement.get(field);
					//DEBUG
					//System.out.println("2.3.1] GT Vals: " + comparableQueryElement + " vs " + minOrMaxElement + " = " + comparableQueryElement.compareTo(minOrMaxElement));

					if (comparableQueryElement.compareTo(minOrMaxElement) > 0) // ie query _lower_ limit > chunk max
						return direction; // ie fail
				}
			} //TESTED
			else if (field.equals(MongoDbManager.lt_) || field.equals(MongoDbManager.lte_)) {
				// (don't worry about the boundaries, it just results in spurious empty chunks)
				if (-1 == direction) { // (can't do anything about $lt vs max)
					Comparable comparableQueryElement = (Comparable) complexQueryElement.get(field);
					//DEBUG
					//System.out.println("2.3.2] LT Vals: " + comparableQueryElement + " vs " + minOrMaxElement + " = " + comparableQueryElement.compareTo(minOrMaxElement));

					if (comparableQueryElement.compareTo(minOrMaxElement) < 0) // ie query upper limit < chunk min
						return direction; // ie fail
				}
			} //TESTED
		}
		return -direction; // (ie pass by default; don't check other fields unless they have the same min/max)
	}//TESTED (tested $in, $gte?, $lte?, $gte?/$lte? combinations)
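
	// Worked example for the $in handling above (illustrative values only):
	// query { sourceKey: { $in: ["a","c","e"] } }, chunk min { sourceKey: "b" }, max { sourceKey: "d" }:
	//  - vs min (direction=-1): tailSet("b") over {"a","c","e"} = {"c","e"}, non-empty, so not rejected
	//    (the sub-array is cached in orderedArraySet_afterMin for the max comparison)
	//  - vs max (direction=1): headSet("d") over {"c","e"} = {"c"}, non-empty, so the chunk is retained
	// calculateSplits_phase2 then rewrites the split's query to sourceKey: { $in: ["c"] } via subSet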

	//TEST INFO:
	// shardKey = { sourceKey: 1, _id: 1 }
	// FIRST TESTED AGAINST $in with 114 different keys starting with jdbc*
	// THEN query={"sourceKey": "jdbc.oracle.thin.@ec2-54-205-223-166.compute-1.amazonaws.com.152.1438"} ... left 226 chunks, hand checked
	// THEN query={"sourceKey": "jdbc.oracle.thin.@ec2-54-205-223-166.compute-1.amazonaws.com.152.1438", _id: { $oid: "52702a06e4b0b912ee0615f1" } } ... left 1 chunk, hand checked
	// THEN query={"sourceKey": "jdbc.oracle.thin.@ec2-54-205-223-166.compute-1.amazonaws.com.152.1438", _id: {"$gte": {"$oid": "52702a06e4b0b912ee0615f0"}, "$lt": {"$oid": "52753c1fe4b019e585827285"} } } ... left 3 chunks, hand checked
	// THEN query={_id: {"$gte": {"$oid": "52702a06e4b0b912ee0615f0"}, "$lt": {"$oid": "52753c1fe4b019e585827285"} } } ... left 89 chunks, hand checked a few

	////////////////////////////////////////////////////////////////////////
	////////////////////////////////////////////////////////////////////////

	// Util - rewrites the query's communityId term into an explicit sourceKey term (new shard scheme);
	// returns false if it backed out and left the query alone
	@SuppressWarnings("unchecked")
	public static boolean splitPrecalculations_newShardScheme(BasicDBObject query, BasicDBObject srcTagsQuery)
	{
		// Get the communityIds from the query
		Collection<ObjectId> communityIds = null;
		try {
			BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_);
			communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
			if (null == communityIds) {
				return false;
			}
		}
		catch (Exception e) {
			//DEBUG
			//e.printStackTrace();

			return false; // back out
		}

		BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_, new BasicDBObject(DbManager.in_, communityIds));
		BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
		keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
		keyFields.put(SourcePojo.highestDistributionFactorStored_, 1);

		// Get and remove the sourceKey information, and incorporate it into the source query,
		// so it's nice and simple by the time it gets to the actual query
		Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);
		if (null != srcTagsQuery) { // Simpler case: src tags specified, so going to get a list of all the sources regardless
			if (null != sourceKeyQueryTerm) {
				keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
			}
			keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
		} //TESTED (including $all, to test that "$srctags":{"$all": ["tagtest","db"]} matches on tags: ["tagtest","db","tagtest2"])
		else if (null != sourceKeyQueryTerm) {
			boolean sourceKeyQueryComplex = false;
			if (sourceKeyQueryTerm instanceof BasicDBObject) {
				BasicDBObject sourceKeyQueryTermDbo = (BasicDBObject) sourceKeyQueryTerm;
				if (sourceKeyQueryTermDbo.size() <= 2) { // every term must be in/lt/lte/gt/gte
					for (String sourceKeyQueryTermEl : sourceKeyQueryTermDbo.keySet()) {
						if (!sourceKeyQueryTermEl.equals(DbManager.in_)
								&& !sourceKeyQueryTermEl.equals(DbManager.lt_) && !sourceKeyQueryTermEl.equals(DbManager.lte_)
								&& !sourceKeyQueryTermEl.equals(DbManager.gt_) && !sourceKeyQueryTermEl.equals(DbManager.gte_))
						{
							sourceKeyQueryComplex = true;
							break;
						} //TESTED (eg $ne)
						else if (sourceKeyQueryTermEl.equals(DbManager.in_) && (1 != sourceKeyQueryTermDbo.size())) {
							sourceKeyQueryComplex = true;
							break;
						} //TESTED (($lt,$in))
					}
				} //TESTED: ($in, ($gte,$lt), $ne)
				else {
					sourceKeyQueryComplex = true;
				} //TESTED ({ "sourceKey": { "$in": ["test"], "$gt": "alex", "$lte": "test" } })
			}
			else if (sourceKeyQueryTerm instanceof java.util.regex.Pattern) { // probably a regex
				sourceKeyQueryComplex = true;
			} //TESTED ($regex)

			if (sourceKeyQueryComplex) {
				keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm); // ie we'll simplify it below
			}
			else {
				return false; // already have a perfectly good source key specification
			}
		} //TESTED (see combinations above)

		DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(keyFields);
		int count = dbc.count();
		if (count > 5000) {
			// (too many source keys to process, just going to leave well alone...
			//  note this will mean that $srctags fails open)
			return false;
		}
		else {
			ArrayList<String> sources = new ArrayList<String>(count);
			while (dbc.hasNext()) {
				BasicDBObject dbo = (BasicDBObject) dbc.next();
				String sourceKey = (String) dbo.get(SourcePojo.key_);
				Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
				sources.addAll(SourcePojo.getDistributedKeys(sourceKey, distributionFactor));
			}
			if (sources.isEmpty()) {
				throw new RuntimeException(); // will just return no splits at all, no problem
			} //TESTED
			if (1 == sources.size()) {
				query.put(DocumentPojo.sourceKey_, sources.get(0));
			} //TESTED
			else {
				query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sources));
			} //TESTED
			return true;
		}
	}//TESTED (see combinations above)
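
	// Worked example for the method above (illustrative values only - the exact distributed
	// key format is determined by SourcePojo.getDistributedKeys, which isn't shown in this file):
	//   before: { "communityId": { "$in": [ ObjectId("...") ] } }
	//   after:  { "communityId": { "$in": [ ObjectId("...") ] },
	//             "sourceKey": { "$in": [ <distributed keys of each matching source> ] } }
	// (or a single string in place of the $in clause if only one key matches)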

	// Util for creating a useful object containing source info (old sharding, _id - or new sharding but debug mode)
	@SuppressWarnings("unchecked")
	public static BasicDBList splitPrecalculations_oldShardSchemeOrDebug(BasicDBObject query, BasicDBObject srcTagsQuery,
			int maxCountPerTask)
	{
		// Get the communityIds from the query
		Collection<ObjectId> communityIds = null;
		try {
			BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_);
			communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
			if (null == communityIds) {
				return null;
			}
		}
		catch (Exception e) {
			return null; // back out
		}

		BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_, new BasicDBObject(DbManager.in_, communityIds));
		BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
		keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
		BasicDBObject sortFields = new BasicDBObject(SourcePojo.key_, 1);

		// Get and remove the sourceKey information, incorporate into source query:
		Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);
		if (null != sourceKeyQueryTerm) {
			keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
		} //TESTED
		if (null != srcTagsQuery) {
			keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
		} //TESTED

		DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(sortFields);
		// (note the sort is needed so that the potentially expensive doc query has a sensibly ordered $in clause)
		if (dbc.count() > 5000) {
			// (too many source keys to process, just going to leave well alone...
			//  note this means $srctags will fail open)
			return null;
		}
		else {
			//TreeMap<String, Long> sourceKeys = new TreeMap<String, Long>();
			// Build collections of objects of format { sourceKey: string or [], totalDocs }
			BasicDBList sourceKeyListCollection = new BasicDBList();
			BasicDBList sourceKeyList = null;
			int runningDocs = 0;
			int runningSources = 0;
			while (dbc.hasNext()) {
				BasicDBObject dbo = (BasicDBObject) dbc.next();
				String sourceKey = (String) dbo.get(SourcePojo.key_);
				if (null != sourceKey) {
					long docCount = 0L;
					try {
						BasicDBObject harvestStatus = (BasicDBObject) dbo.get(SourcePojo.harvest_);
						if (null != harvestStatus) {
							docCount = harvestStatus.getLong(SourceHarvestStatusPojo.doccount_, 0L);
						}
					}
					catch (Exception e) {}

					//DEBUG
					//System.out.println("SOURCE=" + sourceKey + " DOC_COUNT=" + docCount + " RUNNING=" + runningDocs + "," + runningSources + ": " + sourceKeyList);

					if (docCount > maxCountPerTask) { // source is large enough by itself
						// Create collection
						BasicDBObject collection = new BasicDBObject();
						collection.put(DocumentPojo.sourceKey_, sourceKey);
						collection.put(SourceHarvestStatusPojo.doccount_, docCount);
						sourceKeyListCollection.add(collection);
						// (leaving running* alone, can keep building that)
					} //TESTED (by eye, system community of demo cluster)
					else if ((runningDocs + docCount) > maxCountPerTask) { // have now got a large enough collection of sources
						if (null == sourceKeyList) {
							sourceKeyList = new BasicDBList();
						}
						sourceKeyList.add(sourceKey);
						// Create collection
						BasicDBObject collection = new BasicDBObject();
						collection.put(DocumentPojo.sourceKey_, sourceKeyList);
						collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
						sourceKeyListCollection.add(collection);
						sourceKeyList = null;
						runningDocs = 0;
						runningSources = 0;
					} //TESTED (by eye, system community of demo cluster)
					else if (runningSources >= 15) { // have a limit on the number of sources per query, to keep the queries manageable
						sourceKeyList.add(sourceKey);
						// Create collection
						BasicDBObject collection = new BasicDBObject();
						collection.put(DocumentPojo.sourceKey_, sourceKeyList);
						collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
						sourceKeyListCollection.add(collection);
						sourceKeyList = null;
						runningDocs = 0;
						runningSources = 0;
					} //TESTED (by eye, system community of demo cluster)
					else { // (keep) build(ing) list
						if (null == sourceKeyList) {
							sourceKeyList = new BasicDBList();
						}
						sourceKeyList.add(sourceKey);
						runningDocs += docCount;
						runningSources++;
					} //TESTED (by eye, system community of demo cluster)
				} //(end if has source key)
			} //(end loop over cursor)

			// Finish off:
			if (null != sourceKeyList) {
				// Create collection
				BasicDBObject collection = new BasicDBObject();
				collection.put(DocumentPojo.sourceKey_, sourceKeyList);
				collection.put(SourceHarvestStatusPojo.doccount_, runningDocs);
				sourceKeyListCollection.add(collection);
			} //TESTED (by eye, system community of demo cluster)

			if (sourceKeyListCollection.isEmpty()) { // query returns empty
				throw new RuntimeException("Communities contain no sources");
			}
			return sourceKeyListCollection;
		} // (end if too many source keys across the communities)
	}//TESTED

	// Utility - has user specified fields other than community Id, index, or sourceKey
	private static boolean isQueryNonTrivial(BasicDBObject query)
	{
		if ((query.size() > 3) || ((query.size() > 2) && !query.containsField(DocumentPojo.sourceKey_))) {
			return true;
		}
		return false;
	}//TESTED
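
	// Worked examples for isQueryNonTrivial above (illustrative queries only):
	//   { communityId: {...} }                                        -> trivial
	//   { communityId: {...}, sourceKey: "k", "entities.index": "i" } -> trivial (3 fields, sourceKey present)
	//   { communityId: {...}, "entities.index": "i", url: "u" }       -> non-trivial (3 fields, no sourceKey)
	//   anything with 4+ top-level fields                             -> non-trivial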

	// Utility - create new query with overwritten sourceKey
	private static BasicDBObject convertQuery(BasicDBObject originalQuery, Object sourceObj)
	{
		BasicDBObject modQuery = null;
		if (null != sourceObj) {
			if (sourceObj instanceof Collection) {
				modQuery = new BasicDBObject(originalQuery.toMap());
				@SuppressWarnings("rawtypes")
				Collection sources = (Collection) sourceObj;
				modQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sources));
			} //TESTED
			else if (sourceObj instanceof String) {
				modQuery = new BasicDBObject(originalQuery.toMap());
				String source = (String) sourceObj;
				modQuery.put(DocumentPojo.sourceKey_, source);
			} //TESTED
		}
		return modQuery;
	}//TESTED
}