public Object get(final String key) 

Gets a value from this object


From source file:com.ikanow.infinit.e.application.utils.LogstashConfigUtils.java

License:Open Source License

/**
 * Validates a Logstash configuration and returns an outline of its block structure.
 *
 * <p>The text is first neutralised for scanning: escaped quote characters are replaced by
 * {@code X}, quoted strings by {@code VALUE}, and {@code #} comments are removed. The
 * result is then walked with the {@code _navigateLogstash} pattern (declared elsewhere),
 * recording top-level {@code input}/{@code filter} blocks, the processing blocks inside
 * them, and the attribute names of those processing blocks.
 *
 * <p>NOTE(review): this listing is not compilable as shown - several closing braces and
 * at least one call expression are absent, and {@code depth} is never updated. The inline
 * NOTE(review) comments mark the places that need restoring from the original file.
 *
 * @param configFile the Logstash configuration text to check
 * @param error receives a description of the first validation failure found
 * @return the tree keyed by top-level block name, or {@code null} when validation fails
 */
public static BasicDBObject parseLogstashConfig(String configFile, StringBuffer error) {

    BasicDBObject tree = new BasicDBObject();

    // Stage 0: remove escaped "s and 's (for the purpose of the validation):
    // (prevents tricksies with escaped "s and then #s)
    // (http://stackoverflow.com/questions/5082398/regex-to-replace-single-backslashes-excluding-those-followed-by-certain-chars)
    configFile = configFile.replaceAll("(?<!\\\\)(?:((\\\\\\\\)*)\\\\)[\"']", "X");
    //TESTED (by hand - using last 2 fields of success_2_1)

    // Stage 1: remove #s, and anything in quotes (for the purpose of the validation)
    configFile = configFile.replaceAll("(?m)(?:([\"'])(?:(?!\\1).)*\\1)", "VALUE").replaceAll("(?m)(?:#.*$)",
            "");
    //TESTED (2_1 - including with a # inside the ""s - Event_Date -> Event_#Date)
    //TESTED (2_2 - various combinations of "s nested inside 's) ... yes that is a negative lookahead up there - yikes!

    // Stage 2: get a nested list of objects
    int depth = 0;
    int ifdepth = -1;
    Stack<Integer> ifStack = new Stack<Integer>();
    BasicDBObject inputOrFilter = null;
    // (_navigateLogstash is declared elsewhere; only its groups 1, 3 and 4 are read below)
    Matcher m = _navigateLogstash.matcher(configFile);
    // State:
    String currTopLevelBlockName = null;
    String currSecondLevelBlockName = null;
    BasicDBObject currSecondLevelBlock = null;
    while (m.find()) {
        boolean simpleField = false;

        //System.out.println("--DEPTH="+depth + " GROUP=" + m.group() + " IFS" + Arrays.toString(ifStack.toArray()));
        //System.out.println("STATES: " + currTopLevelBlockName + " AND " + currSecondLevelBlockName);

        if (m.group().equals("}")) {

            if (ifdepth == depth) { // closing an if statement
                // NOTE(review): ifStack is only inspected (isEmpty/peek) in this listing, never
                // pushed or popped - the push/pop statements look to have been lost; TODO confirm
                if (ifStack.isEmpty()) {
                    ifdepth = -1;
                } else {
                    ifdepth = ifStack.peek();
            } //TESTED (1_1bc, 2_1)
            else { // closing a processing block

                // NOTE(review): depth is never modified anywhere in this listing, so this check
                // cannot fire as shown - the decrement appears to be missing; TODO restore
                if (depth < 0) { // {} Mismatch
                    error.append("{} Mismatch (})");
                    return null;
                } //TESTED (1_1abc)
        } else { // new attribute!

            // Group 1 holds the block type name when the match is a processing block
            String typeName = m.group(1);
            if (null == typeName) { // it's an if statement or a string value
                typeName = m.group(4);
                if (null != typeName) {
                    simpleField = true;
            } else if (typeName.equalsIgnoreCase("else")) { // It's an if statement..
                typeName = null;
            if (null == typeName) { // if statement after all
                // Just keep track of ifs so we can ignore them
                ifdepth = depth;
                // (don't increment depth)
            } //TESTED (1_1bc, 2_1)
            else { // processing block
                String subTypeName = m.group(3);
                if (null != subTypeName) { // eg codec.multiline
                    typeName = typeName + "." + subTypeName;
                } //TESTED (2_1, 2_3)

                if (depth == 0) { // has to be one of input/output/filter)
                    String topLevelType = typeName.toLowerCase();
                    if (topLevelType.equalsIgnoreCase("input") || topLevelType.equalsIgnoreCase("filter")) {
                        // Each of input/filter may appear at most once
                        if (tree.containsField(topLevelType)) {
                            error.append("Multiple input or filter blocks: " + topLevelType);
                            return null;
                        } //TESTED (1_3ab)
                        else {
                            inputOrFilter = new BasicDBObject();
                            tree.put(topLevelType, inputOrFilter);

                            // Store state:
                            currTopLevelBlockName = topLevelType;
                        } //TESTED (*)
                    } else {
                        if (topLevelType.equalsIgnoreCase("output")) {
                            // NOTE(review): the call that consumed the string literal below is missing
                            // from the listing (presumably error.append, as in the sibling branch) - TODO restore
                                    "Not allowed output blocks - these are appended automatically by the logstash harvester");
                        } else {
                            error.append("Unrecognized processing block: " + topLevelType);
                        return null;
                    } //TESTED (1_4a)
                } else if (depth == 1) { // processing blocks
                    String subElType = typeName.toLowerCase();

                    // Some validation: can't include a type called "filter" anywhere
                    if ((null != currTopLevelBlockName) && currTopLevelBlockName.equals("input")) {
                        if (subElType.equals("filter") || subElType.endsWith(".filter")) {
                            error.append("Not allowed sub-elements of input called 'filter' (1)");
                            return null;
                    } //TESTED (1_5b)

                    // Blocks of the same type are collected in one list under the top-level block
                    BasicDBList subElements = (BasicDBList) inputOrFilter.get(subElType);
                    if (null == subElements) {
                        subElements = new BasicDBList();
                        inputOrFilter.put(subElType, subElements);
                    // NOTE(review): newEl is never added to subElements in this listing - TODO confirm
                    BasicDBObject newEl = new BasicDBObject();

                    // Store state:
                    currSecondLevelBlockName = subElType;
                    currSecondLevelBlock = newEl;
                } //TESTED (*)
                else if (depth == 2) { // attributes of processing blocks
                    // we'll just store the field names for these and do any simple validation that was too complicated for the regexes
                    String subSubElType = typeName.toLowerCase();

                    // Validation:
                    if (null != currTopLevelBlockName) {
                        // 1] sincedb path
                        if (currTopLevelBlockName.equals("input") && (null != currSecondLevelBlockName)) {
                            // (don't care what the second level block name is - no sincedb allowed)
                            if (subSubElType.equalsIgnoreCase("sincedb_path")) {
                                error.append("Not allowed sincedb_path in input.* block");
                                return null;
                            } //TESTED (1_5a)
                              // 2] no sub-(-sub etc)-elements of input called filter
                            if (subSubElType.equals("filter") || subSubElType.endsWith(".filter")) {
                                error.append("Not allowed sub-elements of input called 'filter' (2)");
                                return null;
                            } //TESTED (1_5c)

                    // Store in map:
                    if (null != currSecondLevelBlock) {
                        currSecondLevelBlock.put(subSubElType, new BasicDBObject());
                // (won't go any deeper than this)
                // NOTE(review): the body of this if is missing from the listing - presumably
                // where depth is incremented for non-simple fields; TODO confirm
                if (!simpleField) {

    // Any block still open at the end of the input is reported as a brace mismatch
    if (0 != depth) {
        error.append("{} Mismatch ({)");
        return null;
    } //TESTED (1_2a)

    return tree;

From source file:com.ikanow.infinit.e.core.mapreduce.HadoopJobRunner.java

License:Open Source License

/**
 * Writes the Hadoop job configuration XML for a map/reduce job to {@code out}.
 *
 * <p>For non-custom-table inputs the JSON {@code query} is rewritten first so that it is
 * restricted by the supplied communities (either directly by community id, or by the
 * source keys looked up for those communities). For custom tables {@code input} is
 * instead resolved through {@code getCustomDbAndCollection}.
 *
 * <p>NOTE(review): this listing appears to have lost lines (unbalanced braces, an empty
 * if-body, and nothing after the final comment) - restore from the original file before
 * relying on it.
 *
 * @param out destination for the generated XML
 * @param title written as the value of {@code mongo.job.name}
 * @param input database/collection path used in {@code mongo.input.uri}; replaced via
 *        {@code getCustomDbAndCollection} when {@code isCustomTable} is true
 * @param fields JSON field specification; {@code null} is written as an empty value
 * @param isCustomTable when false, the query is augmented with community restrictions
 * @param outputDatabase database part of the output URI
 * @param output ignored on entry: overwritten with
 *        {@code outputDatabase + "." + tempOutputCollection}
 * @param tempOutputCollection collection part of the output URI
 * @param mapper written as the value of {@code mongo.job.mapper}
 * @param reducer written as the value of {@code mongo.job.reducer}
 * @param combiner written as the value of {@code mongo.job.combiner}
 * @param query query text; parsed as JSON when it starts with an opening brace, otherwise
 *        treated as an empty query (non-custom-table path only)
 * @param communityIds community ids used to restrict the query
 * @param outputKey written as the value of {@code mongo.job.output.key}
 * @param outputValue written as the value of {@code mongo.job.output.value}
 * @param arguments user arguments, XML-escaped before writing; {@code null} becomes empty
 * @throws IOException if writing to {@code out} fails
 */
private void createConfigXML(Writer out, String title, String input, String fields, boolean isCustomTable,
        String outputDatabase, String output, String tempOutputCollection, String mapper, String reducer,
        String combiner, String query, List<ObjectId> communityIds, String outputKey, String outputValue,
        String arguments) throws IOException {
    String dbserver = prop_general.getDatabaseServer();
    // (the incoming value of the output parameter is discarded here)
    output = outputDatabase + "." + tempOutputCollection;

    // Defaults written to max.splits / max.docs.per.split unless the admin override below changes them
    int nSplits = 8;
    int nDocsPerSplit = 12500;

    //add communities to query if this is not a custom table
    if (!isCustomTable) {
        // Start with the old query:
        BasicDBObject oldQueryObj = null;
        // NOTE(review): query is dereferenced without a null check on this path - confirm callers never pass null
        if (query.startsWith("{")) {
            oldQueryObj = (BasicDBObject) com.mongodb.util.JSON.parse(query);
        } else {
            oldQueryObj = new BasicDBObject();

        // Community Ids aren't indexed in the metadata collection, but source keys are, so we need to transform to that
        BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_,
                new BasicDBObject(DbManager.in_, communityIds));
        boolean bAdminOverride = false;
        if (oldQueryObj.containsField("admin")) { // For testing only...
            // The override only applies when exactly one community is given and adminLookup accepts it
            if (1 == communityIds.size()) {
                ObjectId communityId = communityIds.get(0);
                if (RESTTools.adminLookup(communityId.toString())) {
                    bAdminOverride = true;
                    if (oldQueryObj.containsField("max.splits")) {
                        nSplits = oldQueryObj.getInt("max.splits");
                    if (oldQueryObj.containsField("max.docs.per.split")) {
                        nDocsPerSplit = oldQueryObj.getInt("max.docs.per.split");
        } //(end diagnostic/benchmarking/test code for admins only part 1)
        if (bAdminOverride) {
            // The nested "admin" object replaces the whole query
            oldQueryObj = (BasicDBObject) oldQueryObj.get("admin");
            //(end diagnostic/benchmarking/test code for admins only part 2)
        } else if (oldQueryObj.containsField(DocumentPojo.sourceKey_) || input.startsWith("feature.")) {
            // Source Key specified by user, stick communityIds check in for security
            oldQueryObj.put(DocumentPojo.communityId_, new BasicDBObject(DbManager.in_, communityIds));
        } else { // Source key not specified by user, transform communities->sourcekeys
            BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
            DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields);
            if (dbc.count() > 500) {
                // (too many source keys let's keep the query size sensible...)
                oldQueryObj.put(DocumentPojo.communityId_, new BasicDBObject(DbManager.in_, communityIds));
            } else {
                HashSet<String> sourceKeys = new HashSet<String>();
                while (dbc.hasNext()) {
                    DBObject dbo = dbc.next();
                    String sourceKey = (String) dbo.get(SourcePojo.key_);
                    // NOTE(review): the body of this if is missing from the listing - presumably
                    // where sourceKey is added to sourceKeys (nothing else populates the set); TODO restore
                    if (null != sourceKey) {
                if (sourceKeys.isEmpty()) { // query returns empty
                    throw new RuntimeException("Communities contain no sources");
                BasicDBObject newQueryClauseObj = new BasicDBObject(DbManager.in_, sourceKeys);
                // Now combine the queries...
                oldQueryObj.put(DocumentPojo.sourceKey_, newQueryClauseObj);

            } // (end if too many source keys across the communities)
        } //(end if need to break source keys down into communities)
        query = oldQueryObj.toString();
    } else {
        //get the custom table (and database)
        input = getCustomDbAndCollection(input);
    if (arguments == null)
        arguments = "";

    // Generic configuration
    out.write("<?xml version=\"1.0\"?>\n<configuration>");

    // Mongo specific configuration

    // NOTE(review): only arguments is XML-escaped (further down); title, query, fields and the
    // class names are concatenated as-is - confirm callers never pass markup characters
    out.write("\n\t<property><!-- name of job shown in jobtracker --><name>mongo.job.name</name><value>" + title
            + "</value></property>"
            + "\n\t<property><!-- run the job verbosely ? --><name>mongo.job.verbose</name><value>true</value></property>"
            + "\n\t<property><!-- Run the job in the foreground and wait for response, or background it? --><name>mongo.job.background</name><value>false</value></property>"
            + "\n\t<property><!-- If you are reading from mongo, the URI --><name>mongo.input.uri</name><value>mongodb://"
            + dbserver + "/" + input + "</value></property>"
            + "\n\t<property><!-- If you are writing to mongo, the URI --><name>mongo.output.uri</name><value>mongodb://"
            + dbserver + "/" + output + "</value>  </property>"
            + "\n\t<property><!-- The query, in JSON, to execute [OPTIONAL] --><name>mongo.input.query</name><value>"
            + query + "</value></property>"
            + "\n\t<property><!-- The fields, in JSON, to read [OPTIONAL] --><name>mongo.input.fields</name><value>"
            + ((fields == null) ? ("") : fields) + "</value></property>"
            + "\n\t<property><!-- A JSON sort specification for read [OPTIONAL] --><name>mongo.input.sort</name><value></value></property>"
            + "\n\t<property><!-- The number of documents to limit to for read [OPTIONAL] --><name>mongo.input.limit</name><value>0</value><!-- 0 == no limit --></property>"
            + "\n\t<property><!-- The number of documents to skip in read [OPTIONAL] --><!-- TODO - Are we running limit() or skip() first? --><name>mongo.input.skip</name><value>0</value> <!-- 0 == no skip --></property>"
            + "\n\t<property><!-- Class for the mapper --><name>mongo.job.mapper</name><value>" + mapper
            + "</value></property>"
            + "\n\t<property><!-- Reducer class --><name>mongo.job.reducer</name><value>" + reducer
            + "</value></property>"
            + "\n\t<property><!-- InputFormat Class --><name>mongo.job.input.format</name><value>com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat</value></property>"
            + "\n\t<property><!-- OutputFormat Class --><name>mongo.job.output.format</name><value>com.mongodb.hadoop.MongoOutputFormat</value></property>"
            + "\n\t<property><!-- Output key class for the output format --><name>mongo.job.output.key</name><value>"
            + outputKey + "</value></property>"
            + "\n\t<property><!-- Output value class for the output format --><name>mongo.job.output.value</name><value>"
            + outputValue + "</value></property>"
            + "\n\t<property><!-- Output key class for the mapper [optional] --><name>mongo.job.mapper.output.key</name><value></value></property>"
            + "\n\t<property><!-- Output value class for the mapper [optional] --><name>mongo.job.mapper.output.value</name><value></value></property>"
            + "\n\t<property><!-- Class for the combiner [optional] --><name>mongo.job.combiner</name><value>"
            + combiner + "</value></property>"
            + "\n\t<property><!-- Partitioner class [optional] --><name>mongo.job.partitioner</name><value></value></property>"
            + "\n\t<property><!-- Sort Comparator class [optional] --><name>mongo.job.sort_comparator</name><value></value></property>"
            + "\n\t<property><!-- Split Size [optional] --><name>mongo.input.split_size</name><value>32</value></property>");

    // Infinit.e specific configuration

    out.write("\n\t<property><!-- User Arguments [optional] --><name>arguments</name><value>"
            + StringEscapeUtils.escapeXml(arguments) + "</value></property>"
            + "\n\t<property><!-- Maximum number of splits [optional] --><name>max.splits</name><value>"
            + nSplits + "</value></property>"
            + "\n\t<property><!-- Maximum number of docs per split [optional] --><name>max.docs.per.split</name><value>"
            + nDocsPerSplit + "</value></property>");

    // Closing thoughts:
    // NOTE(review): nothing follows this comment in the listing, and no closing configuration
    // tag is written by the visible code - TODO restore the tail of the method


From source file:com.ikanow.infinit.e.core.utils.SourceUtils.java

License:Open Source License

public static boolean checkDbSyncLock() {
    DBCursor dbc = DbManager.getFeature().getSyncLock().find();
    if (!dbc.hasNext()) {
        return false; // working fine
    }/*from   w w  w .j  a v a2s  .  c o  m*/
    Date now = new Date();
    while (dbc.hasNext()) {
        BasicDBObject sync_lock = (BasicDBObject) dbc.next();
        Object lastSyncObj = sync_lock.get("last_sync");
        if (null != lastSyncObj) {
            try {
                Date last_sync = (Date) lastSyncObj;
                if (last_sync.getTime() + _ONEDAY > now.getTime()) {

                    return true; // (ie sync object exists and is < 1 day old)
            } catch (Exception e) {
                // class cast, do nothing
    } // (end "loop over" 1 object in sync_lock DB)

    return false;

From source file:com.ikanow.infinit.e.core.utils.SourceUtils.java

License:Open Source License

/**
 * Records the outcome of a harvest on the source's database record and keeps the related
 * document counts up to date.
 *
 * <p>Visible steps: build a {@code $set}/{@code $inc} update for the source (status,
 * extracted time, message, document count), let the distribution logic adjust it when the
 * source has distribution tokens, apply it with {@code findAndModify}, prune documents when
 * the source exceeds its maximum or has a time-to-live, and finally update the
 * per-community document counts.
 *
 * <p>NOTE(review): this listing has lost text in several places (truncated constructor
 * calls, a call with no receiver, missing closing braces). The inline NOTE(review)
 * comments mark them; restore from the original file before relying on it.
 *
 * @param source the source that was harvested
 * @param harvestStatus outcome of the harvest; {@code success} is promoted to
 *        {@code success_iteration} when the source reached its max docs
 * @param added documents added by this harvest; may be {@code null}
 * @param nDocsDeleted number of documents deleted during this harvest
 * @param extraMessage optional text for the harvest message; may be {@code null}
 */
public static void updateHarvestStatus(SourcePojo source, HarvestEnum harvestStatus, List<DocumentPojo> added,
        long nDocsDeleted, String extraMessage) {
    // Handle successful harvests where the max docs were reached, so don't want to respect the searchCycle
    if ((harvestStatus == HarvestEnum.success) && (source.reachedMaxDocs())) {
        harvestStatus = HarvestEnum.success_iteration;
    }
    // Always update status object in order to release the "in_progress" lock
    // (make really really sure we don't exception out before doing this!)

    BasicDBObject query = new BasicDBObject(SourcePojo._id_, source.getId());
    BasicDBObject setClause = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvest_status_,
    // NOTE(review): the constructor call above is cut off in the listing (second argument and
    // closing parenthesis missing) - TODO restore
    if ((null != added) && !added.isEmpty()) {
        setClause.put(SourceHarvestStatusPojo.sourceQuery_extracted_, new Date());
    if (null != extraMessage) {
        if ((null == source.getHarvestStatus()) || (null == source.getHarvestStatus().getHarvest_message())) {
            setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_message_, extraMessage);
        } else {
            // NOTE(review): the receiver of the call below is missing from the listing - TODO restore
                    .setHarvest_message(source.getHarvestStatus().getHarvest_message() + "\n" + extraMessage);
    BasicDBObject update = new BasicDBObject(MongoDbManager.set_, setClause);

    int docsAdded = 0;
    if (null != added) {
        docsAdded = added.size();
    // Net change in the source's document count for this harvest
    BasicDBObject incClause = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_doccount_,
            docsAdded - nDocsDeleted);
    update.put(MongoDbManager.inc_, incClause);

    // Special case, if searchCycle_secs == 0 and not success_iteration, then suspend:
    if ((harvestStatus != HarvestEnum.success_iteration) && (null != source.getSearchCycle_secs())
            && (0 == source.getSearchCycle_secs())) {
        setClause.put(SourcePojo.searchCycle_secs_, -1);

    if (null != source.getDistributionTokens()) { // Distribution logic (specified and also enabled - eg ignore Feed/DB)
        updateHarvestDistributionState_tokenComplete(source, harvestStatus, incClause, setClause);
    // NOTE(review): the body of the if below is empty in the listing - whatever handled an
    // emptied setClause appears to be missing; TODO restore
    if (setClause.isEmpty()) { // (ie got removed by the distribution logic above)
    } //TESTED

    long nTotalDocsAfterInsert = 0;
    BasicDBObject fieldsToReturn = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
    BasicDBObject updatedSource = (BasicDBObject) DbManager.getIngest().getSource().findAndModify(query,
            fieldsToReturn, null, false, update, true, false);
    // NOTE(review): updatedSource is dereferenced without a null check - confirm the source
    // record is guaranteed to exist at this point
    BasicDBObject harvestStatusObj = (BasicDBObject) updatedSource.get(SourcePojo.harvest_);
    if (null != harvestStatusObj) {
        Long docCount = harvestStatusObj.getLong(SourceHarvestStatusPojo.doccount_);
        if (null != docCount) {
            nTotalDocsAfterInsert = docCount;

    // Prune documents if necessary
    if ((null != source.getMaxDocs()) && (nTotalDocsAfterInsert > source.getMaxDocs())) {
        long nToPrune = (nTotalDocsAfterInsert - source.getMaxDocs());
        SourceUtils.pruneSource(source, (int) nToPrune, -1);
        nDocsDeleted += nToPrune;

        // And update to reflect that it now has max docs...
        BasicDBObject update2_1 = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_doccount_,
        // NOTE(review): the constructor call above is cut off in the listing (value argument missing) - TODO restore
        BasicDBObject update2 = new BasicDBObject(DbManager.set_, update2_1);
        DbManager.getIngest().getSource().update(query, update2);

    if ((null != source.getTimeToLive_days())) {
        nDocsDeleted += SourceUtils.pruneSource(source, Integer.MAX_VALUE, source.getTimeToLive_days());
    } //TODO: TOTEST

    // (OK now the only thing we really had to do is complete, add some handy metadata)

    // Also update the document count table in doc_metadata:
    if (docsAdded > 0) {
        if (1 == source.getCommunityIds().size()) { // (simple/usual case, just 1 community)
            query = new BasicDBObject(DocCountPojo._id_, source.getCommunityIds().iterator().next());
            update = new BasicDBObject(MongoDbManager.inc_,
                    new BasicDBObject(DocCountPojo.doccount_, docsAdded - nDocsDeleted));
            // (docsAdded is already known to be positive here, so this condition always holds)
            if ((docsAdded != 0) || (nDocsDeleted != 0)) {
                update.put(DbManager.set_, new BasicDBObject(DocCountPojo.extracted_, new Date()));
            DbManager.getDocument().getCounts().update(query, update, true, false);
        } else if (!source.getCommunityIds().isEmpty()) { // Complex case since docs can belong to diff communities (but they're usually somewhat grouped)
            // Tally the added documents per community id
            Map<ObjectId, Integer> communityMap = new HashMap<ObjectId, Integer>();
            for (DocumentPojo doc : added) {
                ObjectId communityId = doc.getCommunityId();
                Integer count = communityMap.get(communityId);
                communityMap.put(communityId, (count == null ? 1 : count + 1));
            } //end loop over added documents (updating the separate community counts)
            long nDocsDeleted_byCommunity = nDocsDeleted / source.getCommunityIds().size();
            // (can't do better than assume a uniform distribution - the whole thing gets recalculated weekly anyway...)

            for (Map.Entry<ObjectId, Integer> communityInfo : communityMap.entrySet()) {
                query = new BasicDBObject(DocCountPojo._id_, communityInfo.getKey());
                update = new BasicDBObject(MongoDbManager.inc_, new BasicDBObject(DocCountPojo.doccount_,
                        communityInfo.getValue() - nDocsDeleted_byCommunity));
                if ((communityInfo.getValue() != 0) || (nDocsDeleted_byCommunity != 0)) {
                    update.put(DbManager.set_, new BasicDBObject(DocCountPojo.extracted_, new Date()));
                DbManager.getDocument().getCounts().update(query, update, true, false);
                // (true for upsert, false for multi add)
        } //(never called in practice - tested up until 5/2/2014)

From source file:com.ikanow.infinit.e.core.utils.SourceUtils.java

License:Open Source License

/**
 * Marks this harvester's distribution tokens as complete on the source record and reports
 * whether that completes the whole source.
 *
 * <p>The completed-token counter is incremented by the number of tokens held by
 * {@code source} and the updated record is read back. When the counter equals the source's
 * distribution factor the counter and the reached-limit flag are reset through
 * {@code setClause}; otherwise only the reached-limit flag may be set.
 *
 * <p>NOTE(review): this listing has lost statements (a dangling call tail and several
 * empty status branches); the inline NOTE(review) comments mark them. {@code incClause}
 * is only referenced by commented-out debug output in the visible code.
 *
 * @param source the source whose tokens have been processed
 * @param harvestStatus outcome of this harvester's run
 * @param incClause the caller's {@code $inc} clause
 * @param setClause the caller's {@code $set} clause; modified in place
 * @return {@code true} when all tokens are now complete; {@code false} when the source is
 *         still incomplete or the source record/harvest status could not be read back
 */
private static boolean updateHarvestDistributionState_tokenComplete(SourcePojo source,
        HarvestEnum harvestStatus, BasicDBObject incClause, BasicDBObject setClause) {

    // Update tokens complete, and retrieve modified version 
    int nTokensToBeCleared = source.getDistributionTokens().size();
    BasicDBObject query = new BasicDBObject(SourcePojo._id_, source.getId());
    BasicDBObject modify = new BasicDBObject(MongoDbManager.inc_, new BasicDBObject(
            SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_, nTokensToBeCleared));
    // Only the three harvest fields needed below are requested back
    BasicDBObject fields = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_,
            1);
    fields.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, 1);
    fields.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, 1);
    BasicDBObject partial = (BasicDBObject) MongoDbManager.getIngest().getSource().findAndModify(query, fields,
            null, false, modify, true, false);
    //(return new version - ensures previous increments have been taken into account)

    // Two cases: source complete (all tokens obtained), source incomplete:

    if (null != partial) { // (else yikes!)
        BasicDBObject partialStatus = (BasicDBObject) partial.get(SourcePojo.harvest_);
        if (null != partialStatus) { // (else yikes!)
            int nTokensComplete = partialStatus.getInt(SourceHarvestStatusPojo.distributionTokensComplete_, 0);
            // (note after increment)

            // COMPLETE: reset parameters, status -> error (if anything has errored), success (all done), success_iteration (more to do)

            if (nTokensComplete == source.getDistributionFactor()) {
                if (!source.reachedMaxDocs()) { // (Can only do this if we've finished the source...
                    //...else the different threads can be at different points, so the most recent doc for one thread might be
                    // before the most recent doc of another)
                    // NOTE(review): the start of the statement below is missing from the listing
                    // (only the trailing Date argument survives) - TODO restore
                            new Date());

                setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_, 0);
                setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, false); // (resetting this)
                // This source is now complete
                String status = partialStatus.getString(SourceHarvestStatusPojo.harvest_status_, null);
                Boolean reachedLimit = partialStatus.getBoolean(
                        SourceHarvestStatusPojo.distributionReachedLimit_, false) || source.reachedMaxDocs();

                // NOTE(review): both branches below are empty in the listing - the statements
                // that set the resulting status appear to be missing; TODO restore
                if ((null != status) && ((status.equalsIgnoreCase(HarvestEnum.error.toString())
                        || (HarvestEnum.error == harvestStatus)))) {
                } //TESTED (current and previous state == error)
                else if (reachedLimit || (HarvestEnum.success_iteration == harvestStatus)) {

                } //TESTED (from previous or current state)

                // (else leave with default of success)

                //System.out.println(Thread.currentThread().getName() + " COMPLETE_SRC COMPLETE_TOKEN=" + source.getKey() + " / " + setClause.toString() + " / " + incClause.toString() + " / " + nTokensComplete);

                return true;

            } //TESTED
            else { // Not complete

                // If we're here then we're only allowed to update the status to error
                // NOTE(review): the body of the if below is empty in the listing - TODO restore
                if (HarvestEnum.error != harvestStatus) {
                } //TESTED
                if (source.reachedMaxDocs()) {
                    setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, true);
                } //TESTED

                //System.out.println(Thread.currentThread().getName() + " COMPLETE_TOKEN=" + source.getKey() + " / " + setClause.toString() + " / " + incClause.toString() + " / " + nTokensComplete);

                return false;

            } //(end is complete or not)
              //TESTED (reached max limit)

        } //(end found partial source status, else catastrophic failure)
    } //(end found partial source, else catastrophic failure)

    return false;


From source file:com.ikanow.infinit.e.data_model.api.ResponsePojo.java

License:Apache License

/**
 * Rebuilds a {@code ResponsePojo} from its BSON form.
 *
 * <p>The {@code stats} and {@code response} fields are copied into a temporary object and
 * deserialised through {@code fromApi}; the remaining fields are then read from
 * {@code bson} individually and the main {@code data} list is re-attached (an empty list
 * is used when {@code data} is absent).
 *
 * <p>NOTE(review): most of the values read below (events timeline, facets, entities,
 * events, facts, summaries, sources, source meta tags/types, other) are never used in this
 * listing - the corresponding setter calls appear to have been lost, along with some
 * closing braces; TODO restore from the original file.
 *
 * @param bson the stored response document
 * @return the reconstructed response object
 */
public static ResponsePojo fromDb(BasicDBObject bson) {
    BasicDBObject bson2 = new BasicDBObject();
    bson2.put("stats", bson.get("stats"));
    bson2.put("response", bson.get("response"));
    ResponsePojo rp = ResponsePojo.fromApi(bson2.toString(), ResponsePojo.class);

    // Now all the elements!
    Object evtTimeline = null, facets = null, times = null, entities = null, events = null, facts = null,
            summaries = null, sources = null, sourceMetaTags = null, sourceMetaTypes = null, moments = null,
            other = null;

    evtTimeline = bson.get("eventsTimeline");
    facets = bson.get("facets");
    times = bson.get("times");
    entities = bson.get("entities");
    events = bson.get("events");
    facts = bson.get("facts");
    summaries = bson.get("summaries");
    sources = bson.get("sources");
    // (note the key casing differs between the next two: "sourceMetatags" vs "sourceMetaTypes")
    sourceMetaTags = bson.get("sourceMetatags");
    sourceMetaTypes = bson.get("sourceMetaTypes");
    moments = bson.get("moments");
    other = bson.get("other");

    // A missing time interval is passed on as 0
    rp.setTimes(times, rp.getTimeInterval() == null ? 0 : rp.getTimeInterval());
    // NOTE(review): unlike the time interval above, the moment interval is passed without a
    // null guard - confirm setMoments accepts null
    rp.setMoments(moments, rp.getMomentInterval());

    // The main data object is discarded in the original fromApi() call, so put it back now
    Object docData = bson.get("data");
    if (null != docData) {
        rp.setData((BasicDBList) docData, (BasePojoApiMap<BasicDBList>) null);
    } else { // (ensure there's always an empty list)
        rp.setData(new ArrayList<BasicDBObject>(0), (BasePojoApiMap<BasicDBObject>) null);
    return rp;

From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java

License:Apache License

/**
 * Checks if the new params MAX_SPLITS and MAX_DOCS_PER_SPLIT are set
 * in the config.  If they are it will use those to do splits via limit/skip
 * otherwise it will call the previous chunking splitter in MongoSplitter.
 *
 * @param conf the job configuration to compute splits for
 * @return the input splits for the job
 */

public static List<InputSplit> calculateSplits(InfiniteMongoConfig conf) {
    // Overview (summarizing the case comments below): picks a split strategy based on the input
    // collection, the query, the debug limit and the sharding scheme:
    //   Case 1  - collection is neither the content nor the metadata table: legacy code
    //   Case 2  - simple (String or DbManager.in_) selector on another indexed field: legacy code
    //   Case 3  - conf.getLimit() > 0 (debug): one split per source, capped at the limit
    //   Case 4a - new sharding scheme; Cases 4b/4c - old sharding scheme
    // NOTE(review): several closing braces and the tails of a few statements are not present
    // in this excerpt (flagged below), so the block does not parse as shown.

    // First off: What is our sharding scheme?

    boolean shardingPolicyNew = false;
    try {
        BasicDBObject shardQuery = new BasicDBObject("_id", "doc_metadata.metadata");
        BasicDBObject shardInfo = (BasicDBObject) DbManager.getCollection("config", "collections")
        // NOTE(review): the statement above is cut off in this excerpt (no lookup call using
        // shardQuery and no terminator are visible)
        if (null != shardInfo) {
            BasicDBObject shardInfoKey = (BasicDBObject) shardInfo.get("key");
            if (null != shardInfoKey) {
                // "new" policy == the shard key object has more than one field
                shardingPolicyNew = (shardInfoKey.size() > 1);
    } //TESTED (new and old)
    catch (Exception e) {
    } // stick with the old sharding, it's probably going to die soon after though, honestly

    // conf.getQuery returns a new copy of the query, so get once and use everywhere...
    BasicDBObject confQuery = (BasicDBObject) conf.getQuery();

    BasicDBObject srcTagsQuery = (BasicDBObject) conf.getSourceTags();

    String collection = conf.getInputURI().getCollection();
    if (!collection.equals(DbManager.getDocument().getContent().getName())
            && !collection.equals(DbManager.getDocument().getMetadata().getName())) {
        // Case 1: feature table or custom table
        // Just run legacy code
        return calculateSplits_phase2(conf, confQuery, false, false, null);
    } else { // complex cases...
        boolean simpleOtherIndex = false;
        // Check whether a simple query has been performed on a different indexed field
        if (null == srcTagsQuery) { // (if srcTags specified, then going to want to use sourceKey as the index)
            for (String s : Arrays.asList(EntityPojo.docQuery_index_, DocumentPojo.url_)) {
                Object selector = confQuery.get(s);
                if (selector instanceof String) {
                    simpleOtherIndex = true;
                } else if (selector instanceof DBObject) {
                    DBObject selectorDbo = (DBObject) selector;
                    if (selectorDbo.containsField(DbManager.in_)) {
                        simpleOtherIndex = true;
            } //TESTED (both types, plus check complex indexes don't work)
              // ALLOWED: {"entities.index": { "$in": [ "xxx", "yyy"] }, {"entities.index": "xxx" }, ditto for "url"
              // NOT ALLOWED: { "entities.index": { "$ne": "xxx" } }
        //TESTED check ignored if eg entity_index specified

        if (simpleOtherIndex) {
            // Case 2: we have a simple query on an indexed field 
            // Just run legacy code

            return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
        } //TESTED
        else if (conf.getLimit() > 0) { // debug
            //Case 3: Ensure we have small sets of sources to search over
            BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery, srcTagsQuery,
            // NOTE(review): the remaining argument(s) of the call above are missing from this excerpt
            final List<InputSplit> splits = new ArrayList<InputSplit>();

            boolean queryNonTrivial = isQueryNonTrivial(confQuery);
            if (!queryNonTrivial) {
                //Case 3a: query is trivial, so can just create splits directly from the split pre-calcs
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();

                    int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                    // Never ask a split for more than what is left of the limit
                    int toGet = (docCount > toProcess) ? toProcess : docCount;
                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), modQuery,
                                conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                        // The full docCount (not toGet) is subtracted, so toProcess can go
                        // negative, which ends the loop
                        toProcess -= docCount;
                } //TESTED
            } else {
                // Case 3b: annoying, some extra query terms, gonna need to do it the hard way...
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                DBCollection coll = InfiniteMongoConfigUtil.getCollection(conf.getInputURI());
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();

                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        // Here the count comes from the collection rather than the pre-calculated
                        // doccount_ (presumably capped at toProcess by the third argument - TODO confirm)
                        int docsCounted = (int) coll.getCount(modQuery, null, toProcess, 0);
                        int toGet = (docsCounted > toProcess) ? toProcess : docsCounted;
                        if (docsCounted > 0) {
                            splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(),
                                    modQuery, conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                            toProcess -= docsCounted;
                    } //TESTED
            } //TESTED

            return splits;
        } else { // More complex cases:

            if (shardingPolicyNew) {
                // Case 4a: NEW SHARDING SCHEME

                // Always fetch the new sources, eg convert communityId to sourceKeys
                try {
                    splitPrecalculations_newShardScheme(confQuery, srcTagsQuery); // (modifies confQuery if returns true)
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);

                    return calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null);

                    // (ie trivial query => always use chunks, bypass skip/limit test)
                } //TESTED (trivial + non-trivial)
                catch (Exception e) { // Didn't match any sources, no problem
                    return new ArrayList<InputSplit>();
                } //TESTED

            } //TESTED
            else {

                BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery,
                        srcTagsQuery, conf.getMaxDocsPerSplit());

                if (null == collectionOfSplits) {
                    // Case 4b: OLD SHARDING SCHEME can't get a partition by source keys, just back off to old code
                    return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
                } //TESTED (old code)
                else {
                    // (note: this doubles the value stored on conf itself)
                    conf.setMaxDocsPerSplit(2 * conf.getMaxDocsPerSplit());
                    // (because we stop creating splits when the exceed the size)

                    // Case 4c: OLD SHARDING SCHEME, have a source key partition
                    int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits();
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);
                    final List<InputSplit> splits = new ArrayList<InputSplit>();

                    // (same object reference as confQuery, not a copy)
                    BasicDBObject savedQuery = confQuery;

                    Iterator<Object> itSplit = collectionOfSplits.iterator();
                    BasicDBList bigSplit = null;
                    while (itSplit.hasNext()) {
                        BasicDBObject split = (BasicDBObject) itSplit.next();
                        int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                        if (docCount < nMaxCount) { // small split, will use skip/limit
                            BasicDBObject modQuery = convertQuery(savedQuery,
                            // NOTE(review): the second argument of the call above is missing from this excerpt
                            if (null != modQuery) {

                                final int SPLIT_THRESHOLD = 3;
                                // A few cases:
                                if ((docCount < (SPLIT_THRESHOLD * conf.getMaxDocsPerSplit()))
                                        || !queryNonTrivial) {
                                    // Pass the known docCount so phase 2 can skip its own count query
                                    splits.addAll(calculateSplits_phase2(conf, modQuery, false,
                                            shardingPolicyNew, (Integer) docCount));
                                } //TESTED (based on limit, based on query)
                                else {
                                    // My guess at the point at which you might as well as do the full query in the hope you're going
                                    // to save some (empty) splits
                                    splits.addAll(calculateSplits_phase2(conf, modQuery, false,
                                            shardingPolicyNew, null));
                                } //TESTED
                            } //TESTED
                        } else { // large split, combine all these guys into an array of source keys
                            if (null == bigSplit) {
                                bigSplit = new BasicDBList();
                            // NOTE(review): nothing is added to bigSplit in this excerpt; the line
                            // doing so appears to be missing
                            // (guaranteed to be a single element)
                    } //(end loop over collections)

                    if (null != bigSplit) {

                        // If we have a big left over community then create a set of splits for that - always chunks if query trivial
                        if (1 == bigSplit.size()) {
                            confQuery.put(DocumentPojo.sourceKey_, bigSplit.iterator().next());
                        } else {
                            confQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, bigSplit));
                        splits.addAll(calculateSplits_phase2(conf, confQuery, !queryNonTrivial,
                                shardingPolicyNew, null));
                    } //TESTED: singleton+trivial (sandy), array+trivial (sentiment/enron), array+non-trivial (sentiment/enron, docGeo), singleton+non-trivial (sandy, docGeo)

                    return splits;

                } //TESTED: end if Cases 4a, 4b, 4c

            } //(end if old vs new sharding policy)

        } //(non-debug case)
    } //(content or metadata table are most complex)

From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java

License:Apache License

/**
 * Second phase of split calculation: either builds splits manually (limit/skip) or takes
 * the splits from {@code MongoSplitter.calculateSplits(conf)} and converts them into
 * {@code InfiniteMongoInputSplit}s collected in a new list.
 *
 * <p>NOTE(review): several closing braces and a few statements (flagged inline) are not
 * present in this excerpt, so the block does not parse as shown.
 *
 * @param conf the job configuration (limit, max splits, max docs per split, input URI)
 * @param confQuery the query to split; per the inline comment it may be modified by
 *        {@code splitPrecalculations_newShardScheme}
 * @param alwaysUseChunks if still true after being ANDed with
 *        {@code conf.getMaxSplits() != MAX_SPLITS}, the manual-split count check is skipped
 * @param newShardScheme selects the source-key (new) or {@code _id}-range (old)
 *        post-processing of each split
 * @param splitDocCount if non-null, used as the document count instead of querying the
 *        collection
 * @return the manually calculated splits, an empty list when the count query finds no
 *         documents, or the converted mongo-hadoop splits
 */
public static List<InputSplit> calculateSplits_phase2(InfiniteMongoConfig conf, BasicDBObject confQuery,
        boolean alwaysUseChunks, boolean newShardScheme, Integer splitDocCount) {
    alwaysUseChunks &= (conf.getMaxSplits() != MAX_SPLITS);
    // (in standalone mode, never use chunks)

    MongoURI uri = conf.getInputURI();
    DBCollection coll = InfiniteMongoConfigUtil.getCollection(uri);
    if (conf.getLimit() > 0) {
        // Debug limit set: a single manual split of at most getLimit() documents
        return calculateManualSplits(conf, confQuery, 1, conf.getLimit(), coll);
    } else {
        if (!alwaysUseChunks) {
            int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits();
            int count = 0;
            if (null == splitDocCount) {
                if (nMaxCount <= 1) {
                    // With nMaxCount == 0 and count == 0 the (count < nMaxCount) test below is false
                    nMaxCount = 0;
                } else {

                    count = (int) coll.getCount(confQuery, null, nMaxCount, 0);
                    if (0 == count) {
                        return new ArrayList<InputSplit>();
                } //TESTED
            } else {
                // Caller already knows the document count for this query
                count = splitDocCount;

            //if maxdocssplit and maxsplits is set and there are less documents than splits*docspersplit then use the new splitter
            //otherwise use the old splitter
            if (conf.getMaxDocsPerSplit() > 0 && conf.getMaxSplits() > 0 && (count < nMaxCount)) {
                _logger.debug("Calculating splits manually");
                int splits_needed = (count / conf.getMaxDocsPerSplit()) + 1;

                return calculateManualSplits(conf, confQuery, splits_needed, conf.getMaxDocsPerSplit(), coll);
            } //TESTED
        if (newShardScheme && !confQuery.containsField(DocumentPojo.sourceKey_)) {
            // OK if we're going to do the sharded version then we will want to calculate
            splitPrecalculations_newShardScheme(confQuery, null); // (modifies confQuery if returns true)
        } //TESTED: checked did nothing when had sourceKey, added sourceKey when necessary (eg entities.index case)

        if (!newShardScheme) { // unlike new sharding scheme, in this case the query is fixed, so overwrite now:
        // NOTE(review): the body of the branch above is not present in this excerpt

        List<InputSplit> splits = MongoSplitter.calculateSplits(conf);
        // (unless manually set, like above, runs with the _original_ query)
        int initialSplitSize = splits.size();

        // We have the MongoDB-calculated splits, now calculate their intersection vs the query
        // (both maps are keyed by field name and are shared across all splits in the loop below)
        Map<String, TreeSet<Comparable>> orderedArraySet = new HashMap<String, TreeSet<Comparable>>();
        Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin = new HashMap<String, NavigableSet<Comparable>>();
        // (same object reference as confQuery, not a copy)
        BasicDBObject originalQuery = confQuery;

        ArrayList<InputSplit> newsplits = new ArrayList<InputSplit>(splits.size());
        Iterator<InputSplit> splitIt = splits.iterator();
        while (splitIt.hasNext()) {
            try {

                MongoInputSplit mongoSplit = (MongoInputSplit) splitIt.next();
                BasicDBObject min = (BasicDBObject) mongoSplit.getQuerySpec().get("$min");
                BasicDBObject max = (BasicDBObject) mongoSplit.getQuerySpec().get("$max");

                //_logger.info("+----------------- NEW SPLIT ----------------: " + min + " /" + max);
                //System.out.println("+----------------- NEW SPLIT ----------------: " + min + " /" + max);

                if (null != min) { // How does the min fit in with the general query
                    try {
                        if (compareFields(-1, originalQuery, min, max, orderedArraySet,
                                orderedArraySet_afterMin) < 0) {
                        // NOTE(review): the action taken when the min check fails is not
                        // present in this excerpt
                    } catch (Exception e) {
                    } // do nothing probably just some comparable issue
                } //TESTED

                if (null != max) { // How does the min fit in with the general query
                    try {
                        if (compareFields(1, originalQuery, max, min, orderedArraySet,
                                orderedArraySet_afterMin) > 0) {
                        // NOTE(review): the action taken when the max check fails is not
                        // present in this excerpt
                    } catch (Exception e) {
                    } // do nothing probably just some comparable issue
                } //TESTED

                //_logger.info("(retained split)");
                //System.out.println("(retained split)");

                // (don't worry about edge cases, won't happen very often and will just result in a spurious empty mapper)


                // Now some infinit.e specific processing...

                if (newShardScheme) {
                    TreeSet<Comparable> sourceKeyOrderedArray = orderedArraySet.get(DocumentPojo.sourceKey_);
                    if ((null != sourceKeyOrderedArray) && !sourceKeyOrderedArray.isEmpty()) {
                        // Use the split's own min/max source key when it is a String, otherwise
                        // fall back to the first/last element of the ordered source key set
                        Comparable minSourceKey = null;
                        Object minSourceKeyObj = (null == min) ? null : min.get(DocumentPojo.sourceKey_);
                        if (minSourceKeyObj instanceof String) {
                            minSourceKey = (String) minSourceKeyObj;
                        if (null == minSourceKey) {
                            minSourceKey = sourceKeyOrderedArray.first();
                        } //TESTED
                        Comparable maxSourceKey = null;
                        Object maxSourceKeyObj = (null == max) ? null : max.get(DocumentPojo.sourceKey_);
                        if (maxSourceKeyObj instanceof String) {
                            maxSourceKey = (String) maxSourceKeyObj;
                        if (null == maxSourceKey) {
                            maxSourceKey = sourceKeyOrderedArray.last();
                        } //TESTED

                        DBObject splitQuery = mongoSplit.getQuerySpec();
                        BasicDBObject splitQueryQuery = new BasicDBObject(
                                (BasicBSONObject) splitQuery.get("$query"));
                        if (0 == minSourceKey.compareTo(maxSourceKey)) { // single matching sourceKEy
                            splitQueryQuery.put(DocumentPojo.sourceKey_, maxSourceKey);
                        } //TESTED (array of sources, only one matches)
                        else { // multiple matching source keys
                            // (both bounds inclusive)
                            splitQueryQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_,
                                    sourceKeyOrderedArray.subSet(minSourceKey, true, maxSourceKey, true)));
                        } //TESTED (array of sources, multiple match)
                        // NOTE(review): the start of the next statement is missing from this
                        // excerpt (the new split is presumably added to newsplits - confirm)
                                new InfiniteMongoInputSplit(mongoSplit, splitQueryQuery, conf.isNoTimeout()));
                    } else { // original query is of sufficient simplicity
                        // NOTE(review): as above, the start of this statement is missing
                                new InfiniteMongoInputSplit(mongoSplit, originalQuery, conf.isNoTimeout()));
                    } //TESTED (no change to existing source)

                } //TESTED
                else { // old sharding scheme, remove min/max and replace with normal _id based query where possible

                    DBObject splitQuery = mongoSplit.getQuerySpec();
                    // Step 1: create a query range for _id:
                    BasicDBObject idRange = null;
                    Object idMin = (min == null) ? null : min.get(DocumentPojo._id_);
                    Object idMax = (max == null) ? null : max.get(DocumentPojo._id_);
                    // Bounds that are not ObjectIds are ignored
                    if (!(idMin instanceof ObjectId))
                        idMin = null;
                    if (!(idMax instanceof ObjectId))
                        idMax = null;

                    if ((null != idMin) || (null != idMax)) {
                        idRange = new BasicDBObject();
                        if (null != idMin) {
                            idRange.put(DbManager.gte_, idMin);
                        if (null != idMax) {
                            idRange.put(DbManager.lt_, idMax);
                    } //TESTED

                    // Step 2: merge with whatever we have at the moment:
                    if (null != idRange) {
                        BasicDBObject splitQueryQuery = new BasicDBObject(
                                (BasicBSONObject) splitQuery.get("$query"));
                        Object idQueryElement = splitQueryQuery.get(DocumentPojo._id_);
                        boolean convertedAwayFromMinMax = false;
                        if (null == idQueryElement) { // nice and easy, add _id range
                            splitQueryQuery.put(DocumentPojo._id_, idRange);
                            convertedAwayFromMinMax = true;
                        } //TESTED
                        else if (!splitQueryQuery.containsField(DbManager.and_)) { // OK we're going to just going to make life easy
                            // NOTE(review): the start of the next statement is missing from this
                            // excerpt (presumably the two _id clauses are stored under
                            // DbManager.and_ - confirm)
                                    Arrays.asList(new BasicDBObject(DocumentPojo._id_, idQueryElement),
                                            new BasicDBObject(DocumentPojo._id_, idRange)));
                            convertedAwayFromMinMax = true;
                        } //TESTED
                          // (else stick with min/max)

                        if (convertedAwayFromMinMax) { // can construct an _id query
                            // NOTE(review): the body of this branch is empty in this excerpt
                        } //TESTED
                        splitQuery.put("$query", splitQueryQuery);
                    newsplits.add(new InfiniteMongoInputSplit(mongoSplit, conf.isNoTimeout()));
                } //TESTED
            } catch (Exception e) {
            } // do nothing must be some other type of input split
        } //TESTED

        //System.out.println("Calculating splits via mongo-hadoop: " + initialSplitSize + " reduced to " + splits.size());

        _logger.info("Calculating (converted) splits via mongo-hadoop: " + initialSplitSize + " reduced to "
                + newsplits.size());
        return newsplits;

From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java

License:Apache License

/**
 * Compares the query's value for each field of a split boundary against that boundary.
 *
 * <p>For every field in {@code minOrMax} that also appears in {@code query} and whose
 * boundary value is {@code Comparable}, the query value is compared with the boundary:
 * nested {@code BasicDBObject} selectors are delegated to {@code compareComplexObject},
 * plain values use {@code compareTo}. Exceptions raised while comparing a field are
 * swallowed.
 *
 * <p>NOTE(review): the closing braces of the inner blocks, the loop and the method are
 * not present in this excerpt.
 *
 * @param direction -1 when {@code minOrMax} is a split's min, 1 when it is the max (as
 *        passed by the calls in {@code calculateSplits_phase2}); per the inline comments a
 *        result equal to {@code direction} means "fail"
 * @param query the query whose fields are checked
 * @param minOrMax the boundary being checked
 * @param maxOrMin the opposite boundary, only compared for equality with {@code minOrMax}
 * @param orderedArraySet per-field sorted sets, passed through to {@code compareComplexObject}
 * @param orderedArraySet_afterMin per-field sub-sets, passed through to
 *        {@code compareComplexObject}
 * @return the first non-zero comparison result that either equals {@code direction} or
 *         occurs while {@code minOrMax} differs from {@code maxOrMin}; otherwise
 *         {@code -direction}
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
private static int compareFields(int direction, BasicDBObject query, BasicDBObject minOrMax,
        BasicDBObject maxOrMin, Map<String, TreeSet<Comparable>> orderedArraySet,
        Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin) {
    for (String field : minOrMax.keySet()) {
        //DEBUG
        //System.out.println("1] Compare: " + field + ": " + direction);

        try {
            Object queryOfThisField = query.get(field);
            Object minField = minOrMax.get(field);
            // Fields absent from the query, or with non-Comparable boundary values, are skipped
            if ((null != queryOfThisField) && (minField instanceof Comparable)) {
                int result = 0;
                Comparable comparableMinOrMaxElement = (Comparable) minField;
                if (queryOfThisField instanceof BasicDBObject) {
                    result = compareComplexObject(field, direction, (BasicDBObject) queryOfThisField,
                            comparableMinOrMaxElement, orderedArraySet, orderedArraySet_afterMin);
                } //TESTED
                else { // -1 if comparableQueryElement < comparableMinOrMaxElement
                    Comparable comparableQueryElement = (Comparable) queryOfThisField;
                    result = comparableQueryElement.compareTo(comparableMinOrMaxElement);
                    //System.out.println("3] Vals: " + comparableQueryElement + " vs " + comparableMinOrMaxElement + " = " + result);
                } //TESTED
                if (result != 0) { // if we ever get a strict inequality then stop checking fields..
                    if ((result == direction) || !minOrMax.equals(maxOrMin)) {
                        // (fail)                 (pass but min/max keys different so not point checking any more)
                        return result;
                    } //TESTED
                // else equality, pass but keep checking fields
        } catch (Exception e) {
        } // do nothing probably some odd comparable issue
    return -direction; // (ie pass by default)

From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java

License:Apache License

/**
 * Compares a nested query selector against a single split boundary value.
 *
 * <p>Handles the operators {@code MongoDbManager.in_}, {@code gt_}/{@code gte_} and
 * {@code lt_}/{@code lte_}; any other operator key is ignored. For {@code in_}, the
 * values are held in a sorted set cached in {@code orderedArraySet} under
 * {@code parentField}; when checking a min, the elements at or above the boundary are
 * cached in {@code orderedArraySet_afterMin} and preferred when the max is checked.
 *
 * <p>NOTE(review): several closing braces are not present in this excerpt, and the loop
 * that fills the sorted set never adds to it as shown (flagged inline).
 *
 * @param parentField the query field that owns {@code complexQueryElement}; used as the
 *        cache key
 * @param direction -1 when comparing against a min boundary, 1 against a max
 * @param complexQueryElement the operator object, e.g. the value of an {@code in_} or
 *        range selector
 * @param minOrMaxElement the boundary value to compare against
 * @param orderedArraySet cache of sorted {@code in_} values per field (may be updated)
 * @param orderedArraySet_afterMin cache of the {@code in_} values at or above the min per
 *        field (may be updated)
 * @return {@code direction} when the inline comments mark the check as failing,
 *         otherwise {@code -direction}
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private static int compareComplexObject(String parentField, int direction, BasicDBObject complexQueryElement,
        Comparable minOrMaxElement, Map<String, TreeSet<Comparable>> orderedArraySet,
        Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin) {
    for (String field : complexQueryElement.keySet()) {
        //DEBUG
        //System.out.println("2] Compare operator: " + field + ", vs " + minOrMaxElement);

        if (field.equals(MongoDbManager.in_)) {

            NavigableSet<Comparable> orderedArray = null;
            if (1 == direction) { // try orderedArraySet_afterMin first...
                orderedArray = orderedArraySet_afterMin.get(parentField);
                //System.out.println("2.0] Found orderered sub-array for: " + parentField + ", size= " + orderedArray.size());
            } //TESTED
            if (null == orderedArray) { // (min, or max but min didn't set a sub-array)
                orderedArray = orderedArraySet.get(parentField);
                if (null == orderedArray) {
                    // First time for this field, order the $in for easy comparison
                    orderedArray = new TreeSet<Comparable>();
                    Collection queryList = (Collection) complexQueryElement.get(MongoDbManager.in_);
                    for (Object o : queryList) {
                        Comparable c = (Comparable) o;
                        // NOTE(review): c is never added to orderedArray in this excerpt; the
                        // line doing so appears to be missing
                    //System.out.println("2.1] Created orderered array for: " + parentField + ", size= " + orderedArray.size());

                    //                  if (!orderedArray.isEmpty()) {
                    //                     System.out.println("2.1.1] Head: " + orderedArray.iterator().next());
                    //                     System.out.println("2.1.2] Tail: " + orderedArray.descendingIterator().next());
                    //                  }

                    orderedArraySet.put(parentField, (TreeSet<Comparable>) orderedArray);
                    // (know this cast is valid by construction)
                } //TESTED
            if (-1 == direction) { // comparing vs min
                //System.out.println("2.2] tailSet: " + orderedArray.tailSet(minOrMaxElement, true).size());
                NavigableSet<Comparable> minElements = orderedArray.tailSet(minOrMaxElement, true);
                if (minElements.isEmpty()) { // (elements >= minElement)
                    return direction; // will always fail
                } else {
                    // Remember the surviving elements so the max check only looks at those
                    orderedArraySet_afterMin.put(parentField, minElements);
                } //TESTED
            } //TESTED
            else if (1 == direction) { // comparing vs max
                //System.out.println("2.2] headSet: " + orderedArray.headSet(minOrMaxElement, true).size());

                if (orderedArray.headSet(minOrMaxElement, true).isEmpty()) { // (elements <= maxElement)
                    return direction; // will always fail
            } //TESTED
        } else if (field.equals(MongoDbManager.gt_) || field.equals(MongoDbManager.gte_)) { // (don't worry about the boundaries, just results in spurious empty chunks)
            if (1 == direction) { // can't do anything about $gt vs min
                Comparable comparableQueryElement = (Comparable) complexQueryElement.get(field);
                //System.out.println("2.3.1] GT Vals: " + comparableQueryElement + " vs " + minOrMaxElement + " = " + comparableQueryElement.compareTo(minOrMaxElement));

                if (comparableQueryElement.compareTo(minOrMaxElement) > 0) // ie query _lower_ limit > chunk max
                    return direction; // ie fail
        } //TESTED
        else if (field.equals(MongoDbManager.lt_) || field.equals(MongoDbManager.lte_)) { // (don't worry about the boundaries, just results in spurious empty chunks)
            if (-1 == direction) { // can't do anything about $lt vs max
                Comparable comparableQueryElement = (Comparable) complexQueryElement.get(field);
                //System.out.println("2.3.2] LT Vals: " + comparableQueryElement + " vs " + minOrMaxElement + " = " + comparableQueryElement.compareTo(minOrMaxElement));

                if (comparableQueryElement.compareTo(minOrMaxElement) < 0) // ie query upper limit < chunk min
                    return direction; // ie fail
        } //TESTED
    return -direction; // (ie pass by default, don't check other fields unless they have the same min/max)