/*
 * com.mongodb.hadoop.util.MongoSplitter.java — source code listing.
 * (Java tutorial introduction: here is the source code for
 * com.mongodb.hadoop.util.MongoSplitter.java.)
 */

package com.mongodb.hadoop.util;

import com.mongodb.*;
import com.mongodb.hadoop.*;
import com.mongodb.hadoop.input.*;
import org.apache.commons.logging.*;
import org.apache.hadoop.mapreduce.*;
import org.bson.types.MinKey;
import org.bson.types.MaxKey;

import java.net.UnknownHostException;
import java.util.*;

/**
 * Copyright (c) 2010, 2011 10gen, Inc. <http://10gen.com>
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */

public class MongoSplitter {

    /**
     * Entry point for split calculation: inspects the job configuration and the
     * target collection, then dispatches to the sharded, unsharded-multi-split,
     * or single-split strategy.
     *
     * @param conf job configuration holding the input URI, query, and split options
     * @return the calculated input splits (may be a single whole-collection split)
     * @throws IllegalArgumentException if skip() or limit() is configured, which
     *         is unsupported for split calculation
     */
    public static List<InputSplit> calculateSplits(MongoConfig conf) {

        if (conf.getLimit() > 0 || conf.getSkip() > 0) {
            // TODO - If they specify skip or limit we create only one input split
            throw new IllegalArgumentException(
                    "skip() and limit() is not currently supported due to input split issues.");
        }

        // On the jobclient side we want *ONLY* the min and max ids for each
        // split; actual querying is done by the individual mappers.
        final MongoURI inputURI = conf.getInputURI();
        final DBCollection collection = MongoConfigUtil.getCollection(inputURI);
        final DB database = collection.getDB();
        final Mongo connection = database.getMongo();

        // Authenticate against a separate auth database when one is configured
        // and it differs from the input database.
        final MongoURI authURI = conf.getAuthURI();
        if (authURI != null && authURI.getUsername() != null && authURI.getPassword() != null
                && !authURI.getDatabase().equals(database.getName())) {
            connection.getDB(authURI.getDatabase()).authenticate(authURI.getUsername(), authURI.getPassword());
        }

        final CommandResult collectionStats = collection.getStats();
        final boolean collectionIsSharded = collectionStats.getBoolean("sharded", false);

        //connecting to the individual backend mongods is not safe, do not do so by default
        final boolean readFromShards = conf.canReadSplitsFromShards();
        final boolean splitOnChunks = conf.isShardChunkedSplittingEnabled();
        final boolean secondaryOk = conf.canReadSplitsFromSecondary();
        final boolean rangeQueries = conf.isRangeQueryEnabled();

        log.info("MongoSplitter calculating splits");
        log.info("use shards: " + readFromShards);
        log.info("use chunks: " + splitOnChunks);
        log.info("collection sharded: " + collectionIsSharded);
        log.info("use range queries: " + rangeQueries);

        final List<InputSplit> splits;
        if (!conf.createInputSplits()) {
            log.info("Creation of Input Splits is disabled; Non-Split mode calculation entering.");
            splits = calculateSingleSplit(conf);
        } else if (collectionIsSharded && (readFromShards || splitOnChunks)) {
            log.info("Creation of Input Splits is enabled.");
            log.info("Sharding mode calculation entering.");
            splits = calculateShardedSplits(conf, readFromShards, splitOnChunks, secondaryOk, inputURI, connection);
        } else {
            log.info("Creation of Input Splits is enabled.");
            // perfectly ok for sharded setups to run with a normally calculated split.
            // May even be more efficient for some cases
            log.info("Using Unsharded Split mode (Calculating multiple splits though)");
            splits = calculateUnshardedSplits(conf, secondaryOk, inputURI, collection);
        }

        if (splits == null) {
            log.info("MongoSplitter returning null InputSplits.");
        } else {
            log.info("MongoSplitter found " + splits.size() + " splits.");
        }
        return splits;

    }

    /**
     * Calculates input splits for an unsharded collection by running the server's
     * {@code splitVector} command, which returns boundary keys carving the
     * namespace into chunks of roughly {@code splitSize} MB each.
     *
     * @param conf    job configuration (supplies the split key, split size, and query)
     * @param slaveOk whether reads from secondaries are permitted (currently unused here)
     * @param uri     the input URI (currently unused here; the collection handle is used)
     * @param coll    the target collection
     * @return one split per boundary interval, or a single whole-collection split
     *         when the server produced no usable boundaries
     * @throws IllegalArgumentException if the splitVector command fails
     */
    private static List<InputSplit> calculateUnshardedSplits(MongoConfig conf, boolean slaveOk, MongoURI uri,
            DBCollection coll) {
        final List<InputSplit> splits = new ArrayList<InputSplit>();
        final DBObject splitKey = conf.getInputSplitKey(); // a bit slower but forces validation of the JSON
        final int splitSize = conf.getSplitSize(); // in MB
        final String ns = coll.getFullName();
        final DBObject q = conf.getQuery();

        log.info("Calculating unsharded input splits on namespace '" + ns + "' with Split Key '"
                + splitKey.toString() + "' and a split size of '" + splitSize + "'mb per");

        final DBObject cmd = BasicDBObjectBuilder.start("splitVector", ns).add("keyPattern", splitKey)
                .add("force", false). // force:True is misbehaving it seems
                add("maxChunkSize", splitSize).get();

        log.trace("Issuing Command: " + cmd);
        CommandResult data = coll.getDB().command(cmd);

        if (data.containsField("$err"))
            throw new IllegalArgumentException("Error calculating splits: " + data);
        else if ((Double) data.get("ok") != 1.0)
            throw new IllegalArgumentException(
                    "Unable to calculate input splits: " + ((String) data.get("errmsg")));

        // Comes in a format where "min" and "max" are implicit and each entry is just a boundary key; not ranged
        BasicDBList splitData = (BasicDBList) data.get("splitKeys");

        // FIX: guard against a response with no "splitKeys" field at all (e.g. an
        // unexpected server version); previously this dereferenced splitData and
        // threw a NullPointerException. Treat it the same as "no boundaries".
        if (splitData == null || splitData.size() <= 1) {
            if (splitData == null || splitData.size() < 1)
                log.warn("WARNING: No Input Splits were calculated by the split code. "
                        + "Proceeding with a *single* split. Data may be too small, try lowering 'mongo.input.split_size' "
                        + "if this is undesirable.");
            splits.add(_split(conf, q, null, null)); // no splits really. Just do the whole thing data is likely small
        } else {
            log.info("Calculated " + splitData.size() + " splits.");

            // Boundaries are open-ended on each side: the first split runs from
            // implicit MinKey to the first boundary, and the last from the final
            // boundary to implicit MaxKey.
            DBObject lastKey = (DBObject) splitData.get(0);

            splits.add(_split(conf, q, null, lastKey)); // first "min" split

            for (int i = 1; i < splitData.size(); i++) {
                final DBObject _tKey = (DBObject) splitData.get(i);
                splits.add(_split(conf, q, lastKey, _tKey));
                lastKey = _tKey;
            }

            splits.add(_split(conf, q, lastKey, null)); // last "max" split
        }

        return splits;

    }

    /**
     * Builds one {@link MongoInputSplit} covering the key range [min, max)
     * (either end may be null, meaning unbounded on that side).
     *
     * @param conf job configuration supplying URI, fields, sort, limit, skip, etc.
     * @param q    the user's query filter, or null for "match everything"
     * @param min  lower split boundary key, or null
     * @param max  upper split boundary key, or null
     */
    private static MongoInputSplit _split(MongoConfig conf, DBObject q, DBObject min, DBObject max) {
        // FIX: consistent with the chunk-based path, we must put *something* under
        // $query — a null user query is replaced with an empty document instead of
        // serializing {"$query": null}.
        final DBObject effectiveQuery = (q != null) ? q : new BasicDBObject();
        BasicDBObjectBuilder b = BasicDBObjectBuilder.start("$query", effectiveQuery);
        final DBObject query = b.get();
        log.trace("Assembled Query: " + query);

        return new MongoInputSplit(conf.getInputURI(), conf.getInputKey(), query, conf.getFields(), conf.getSort(),
                min, max, conf.getLimit(), conf.getSkip(), conf.isNoTimeout());
    }

    /**
     * Builds exactly one split spanning the entire collection — used when input
     * split creation is disabled in the configuration.
     *
     * @param conf job configuration supplying URI, query, fields, sort, limit, skip
     * @return a singleton list holding the whole-collection split
     */
    private static List<InputSplit> calculateSingleSplit(MongoConfig conf) {
        // no splits, no sharding — one split with unbounded min/max
        final MongoInputSplit wholeCollection = new MongoInputSplit(conf.getInputURI(), conf.getInputKey(),
                conf.getQuery(), conf.getFields(), conf.getSort(), null, null, conf.getLimit(), conf.getSkip(),
                conf.isNoTimeout());

        final List<InputSplit> splits = new ArrayList<InputSplit>(1);
        splits.add(wholeCollection);

        log.info("Calculated " + splits.size() + " split objects.");
        log.debug("Dump of calculated splits ... ");
        for (final InputSplit split : splits) {
            log.debug("\t Split: " + split.toString());
        }

        return splits;
    }

    /**
     * Dispatches sharded split calculation: chunk-boundary splits when enabled,
     * otherwise direct-to-shard splits. Any failure is wrapped and rethrown as an
     * {@link IllegalStateException}.
     *
     * @param conf      job configuration
     * @param useShards whether splits may connect directly to backend shards
     * @param useChunks whether splits should follow chunk boundaries
     * @param slaveOk   whether secondary reads are permitted
     * @param uri       the input URI
     * @param mongo     connection to the mongos / cluster
     * @return the calculated splits (never null)
     */
    private static List<InputSplit> calculateShardedSplits(MongoConfig conf, boolean useShards, boolean useChunks,
            boolean slaveOk, MongoURI uri, Mongo mongo) {
        try {
            final List<InputSplit> splits;
            if (useChunks) {
                splits = fetchSplitsViaChunks(conf, uri, mongo, useShards, slaveOk);
            } else if (useShards) {
                log.warn("Fetching Input Splits directly from shards is potentially dangerous for data "
                        + "consistency should migrations occur during the retrieval.");
                splits = fetchSplitsFromShards(conf, uri, mongo, slaveOk);
            } else {
                throw new IllegalStateException(
                        "Neither useChunks nor useShards enabled; failed to pick a valid state. ");
            }

            if (splits == null) {
                throw new IllegalStateException(
                        "Failed to create/calculate Input Splits from Shard Chunks; final splits content is 'null'.");
            }

            if (log.isDebugEnabled()) {
                log.debug("Calculated splits and returning them - splits: " + splits);
            }

            return splits;
        } catch (Exception e) {
            log.error("Could not get splits (use_shards: " + useShards + ", use_chunks: " + useChunks + ")", e);
            throw new IllegalStateException(e);
        }
    }

    /**
     * This gets the URIs to the backend {@code mongod}s and returns splits that connect directly to those backends (one
     * split for each backend). There are two potential problems with this: <ol><li>clients that can connect to {@code
     * mongos} can't necessarily connect to the individual {@code mongod}s. <li>there concurrency issues (if chunks are
     * in the process of getting moved around). </ol>
     */
    private static List<InputSplit> fetchSplitsFromShards(final MongoConfig conf, MongoURI uri, Mongo mongo,
            Boolean slaveOk) {
        log.warn("WARNING getting splits that connect directly to the backend mongods"
                + " is risky and might not produce correct results");

        // Collect the distinct host strings of every shard from config.shards.
        final Set<String> shardHosts = new HashSet<String>();
        final DBCursor shardCursor = mongo.getDB("config").getCollection("shards").find();
        try {
            while (shardCursor.hasNext()) {
                final BasicDBObject shardDoc = (BasicDBObject) shardCursor.next();
                String host = shardDoc.getString("host");
                // replica-set hosts look like "setname/host1:port,host2:port" — drop the set name
                final int slash = host.indexOf('/');
                if (slash > 0) {
                    host = host.substring(slash + 1);
                }
                shardHosts.add(host);
            }
        } finally {
            shardCursor.close();
        }

        //todo: using stats only get the shards that actually host data for this collection
        final List<InputSplit> splits = new ArrayList<InputSplit>(shardHosts.size());
        for (final String host : shardHosts) {
            // One split per shard, pointed directly at that shard's host(s).
            final MongoURI shardURI = getNewURI(uri, host, slaveOk);
            splits.add(new MongoInputSplit(shardURI, conf.getInputKey(), conf.getQuery(), conf.getFields(),
                    conf.getSort(), null, null, conf.getLimit(), conf.getSkip(), conf.isNoTimeout())); // TODO - Should the input Key be the shard key?
        }
        return splits;
    }

    /**
     * This constructs splits using the chunk boundaries recorded in the cluster's
     * {@code config.chunks} collection — one split per chunk. When range queries
     * are enabled, the chunk bounds are folded into the query as {@code $gte}/{@code $lt}
     * instead of being passed as explicit min/max keys.
     *
     * @param conf      job configuration
     * @param uri       input URI (used to derive the namespace and per-shard URIs)
     * @param mongo     connection to the cluster (must be able to read the config db)
     * @param useShards when true, each split connects directly to the shard hosting its chunk
     * @param slaveOk   whether secondary reads are permitted on per-shard URIs
     * @return one split per chunk of the target namespace
     * @throws IllegalArgumentException if range queries are enabled but the split
     *         boundaries use a compound key or conflict with the query filter
     */
    private static List<InputSplit> fetchSplitsViaChunks(final MongoConfig conf, MongoURI uri, Mongo mongo,
            boolean useShards, Boolean slaveOk) {
        DBObject originalQuery = conf.getQuery();

        if (useShards)
            log.warn("WARNING getting splits that connect directly to the backend mongods"
                    + " is risky and might not produce correct results");

        if (conf.isRangeQueryEnabled()) {
            log.warn("WARNING using range queries can produce incorrect results if values"
                    + " stored under the splitting key have different types.");
        }

        if (log.isDebugEnabled()) {
            log.debug("getSplitsUsingChunks(): originalQuery: " + originalQuery);
        }

        DB configDB = mongo.getDB("config");

        Map<String, String> shardMap = null; //key: shardname, value: host

        if (useShards) {
            // Build shard-name -> host mapping from config.shards so each chunk's
            // split can be pointed at the shard that owns it.
            shardMap = new HashMap<String, String>();
            DBCollection shardsCollection = configDB.getCollection("shards");
            DBCursor cur = shardsCollection.find();
            try {
                while (cur.hasNext()) {
                    final BasicDBObject row = (BasicDBObject) cur.next();

                    String host = row.getString("host");

                    // for replica sets host will look like: "setname/localhost:20003,localhost:20004"
                    int slashIndex = host.indexOf('/');

                    if (slashIndex > 0)
                        host = host.substring(slashIndex + 1);

                    shardMap.put((String) row.get("_id"), host);
                }
            } finally {
                if (cur != null)
                    cur.close();
            }
        }

        if (log.isDebugEnabled()) {
            log.debug("MongoInputFormat.getSplitsUsingChunks(): shard map is: " + shardMap);
        }

        DBCollection chunksCollection = configDB.getCollection("chunks");

        /* Chunks looks like:
        { "_id" : "test.lines-_id_ObjectId('4d60b839874a8ad69ad8adf6')", "lastmod" : { "t" : 3000, "i" : 1 }, "ns" : "test.lines", "min" : { "_id" : ObjectId("4d60b839874a8ad69ad8adf6") }, "max" : { "_id" : ObjectId("4d60b83a874a8ad69ad8d1a9") }, "shard" : "shard0000" }
        { "_id" : "test.lines-_id_ObjectId('4d60b848874a8ad69ada8756')", "lastmod" : { "t" : 3000, "i" : 19 }, "ns" : "test.lines", "min" : { "_id" : ObjectId("4d60b848874a8ad69ada8756") }, "max" : { "_id" : { $maxKey : 1 } }, "shard" : "shard0002" }
        */

        BasicDBObject query = new BasicDBObject();
        query.put("ns", uri.getDatabase() + "." + uri.getCollection());

        DBCursor cur = chunksCollection.find(query);

        try {
            int numChunks = 0;
            final int numExpectedChunks = cur.size();

            final List<InputSplit> splits = new ArrayList<InputSplit>(numExpectedChunks);
            while (cur.hasNext()) {
                numChunks++;
                final BasicDBObject row = (BasicDBObject) cur.next();
                DBObject minObj = ((DBObject) row.get("min"));
                DBObject shardKeyQuery = new BasicDBObject();
                BasicDBObject min = new BasicDBObject();
                BasicDBObject max = new BasicDBObject();

                // Strip the MinKey/MaxKey sentinels from the chunk bounds; an
                // end-of-range chunk contributes no entry for that side.
                for (String keyName : minObj.keySet()) {
                    Object tMin = minObj.get(keyName);
                    Object tMax = ((DBObject) row.get("max")).get(keyName);
                    /** The shard key can be of any possible type, so this must be kept as Object */
                    if (!(tMin == SplitFriendlyDBCallback.MIN_KEY_TYPE || tMin.equals("MinKey")))
                        min.put(keyName, tMin);
                    if (!(tMax == SplitFriendlyDBCallback.MAX_KEY_TYPE || tMax.equals("MaxKey")))
                        max.put(keyName, tMax);
                }

                /** We have to put something for $query or we'll fail; if no original query use an empty DBObj */
                if (originalQuery == null)
                    originalQuery = new BasicDBObject();

                DBObject splitQuery = originalQuery;

                boolean useMinMax = true;
                if (conf.isRangeQueryEnabled()) {
                    // Only single-field (non-compound) boundaries can be folded into the query.
                    Map.Entry<String, Object> minKey = min.size() == 1 ? min.entrySet().iterator().next() : null;
                    Map.Entry<String, Object> maxKey = max.size() == 1 ? max.entrySet().iterator().next() : null;
                    if (minKey == null && maxKey == null) {
                        throw new IllegalArgumentException(
                                "Range query is enabled but one or more split boundaries contains a compound key:\n"
                                        + "minKey:  " + min.toString() + "\n" + "maxKey:  " + max.toString());
                    }

                    if ((minKey != null && originalQuery.containsKey(minKey.getKey()))
                            || (maxKey != null && originalQuery.containsKey(maxKey.getKey()))) {
                        throw new IllegalArgumentException(
                                "Range query is enabled but split key conflicts with query filter:\n" + "minKey:  "
                                        + min.toString() + "\n" + "maxKey:  " + max.toString() + "\n" + "query:  "
                                        + originalQuery.toString());
                    }
                    BasicDBObject rangeObj = new BasicDBObject();
                    if (minKey != null)
                        rangeObj.put("$gte", minKey.getValue());
                    if (maxKey != null)
                        rangeObj.put("$lt", maxKey.getValue());
                    // FIX: the first chunk of a collection always has a MinKey lower
                    // bound, which leaves minKey null here; the old code dereferenced
                    // minKey unconditionally and threw a NullPointerException. Use
                    // whichever boundary entry is present — the guard above guarantees
                    // at least one of minKey/maxKey is non-null.
                    final String rangeKeyName = (minKey != null) ? minKey.getKey() : maxKey.getKey();
                    splitQuery = new BasicDBObject();
                    splitQuery.putAll(originalQuery);
                    splitQuery.put(rangeKeyName, rangeObj);
                    useMinMax = false;
                }

                // shardKeyQuery is only used for the debug log line below.
                shardKeyQuery.put("$query", originalQuery);

                if (log.isDebugEnabled()) {
                    log.debug("[" + numChunks + "/" + numExpectedChunks + "] new query is: " + shardKeyQuery);
                }

                MongoURI inputURI = conf.getInputURI();

                if (useShards) {
                    final String shardname = row.getString("shard");

                    String host = shardMap.get(shardname);

                    inputURI = getNewURI(inputURI, host, slaveOk);
                }
                if (useMinMax) {
                    MongoInputSplit split = new MongoInputSplit(inputURI, conf.getInputKey(), splitQuery,
                            conf.getFields(), conf.getSort(), // TODO - should inputKey be the shard key?
                            min, max, conf.getLimit(), conf.getSkip(), conf.isNoTimeout());
                    splits.add(split);
                } else {
                    // Range is already encoded in splitQuery, so min/max stay null.
                    MongoInputSplit split = new MongoInputSplit(inputURI, conf.getInputKey(), splitQuery,
                            conf.getFields(), conf.getSort(), // TODO - should inputKey be the shard key?
                            null, null, conf.getLimit(), conf.getSkip(), conf.isNoTimeout());
                    splits.add(split);
                }
            }

            if (log.isDebugEnabled()) {
                log.debug("MongoInputFormat.getSplitsUsingChunks(): There were " + numChunks + " chunks, returning "
                        + splits.size() + " splits: " + splits);
            }

            return splits;

        } finally {
            if (cur != null)
                cur.close();
        }
    }

    /**
     * Rewrites the server portion of a mongodb:// URI to point at a different
     * host (e.g. a specific shard), optionally appending a slaveok option.
     *
     * @param originalUri  the URI to rewrite
     * @param newServerUri replacement host (or comma-separated host list)
     * @param slaveok      slaveok option to append, or null to leave options untouched
     * @return a new MongoURI with the server portion replaced
     */
    private static MongoURI getNewURI(MongoURI originalUri, String newServerUri, Boolean slaveok) {

        // uris look like: mongodb://fred:foobar@server1[,server2]/path?options
        final String fullUri = originalUri.toString();
        final String withoutPrefix = fullUri.substring(MongoURI.MONGODB_PREFIX.length());

        // the server list ends at the last '/' (start of the db path) or at end of string
        final int pathIdx = withoutPrefix.lastIndexOf("/");
        final int serverEnd = (pathIdx < 0) ? withoutPrefix.length() : pathIdx;

        // the server list starts just past the credentials '@', when present
        final int atIdx = withoutPrefix.indexOf("@");
        final int serverStart = (atIdx > 0) ? atIdx + 1 : 0;

        final StringBuilder rewritten = new StringBuilder(withoutPrefix);
        rewritten.replace(serverStart, serverEnd, newServerUri);

        if (slaveok != null) {
            //If uri already contains options append option to end of uri.
            //This will override any slaveok option already in the uri
            if (withoutPrefix.contains("?")) {
                rewritten.append("&slaveok=").append(slaveok);
            } else {
                rewritten.append("?slaveok=").append(slaveok);
            }
        }

        final String ans = MongoURI.MONGODB_PREFIX + rewritten.toString();
        log.debug("getNewURI(): original " + originalUri + " new uri: " + ans);
        return new MongoURI(ans);
    }

    // Shared commons-logging logger used by every split-calculation path in this class.
    private static final Log log = LogFactory.getLog(MongoSplitter.class);
}