List of usage examples for com.mongodb BasicDBObject get
public Object get(final String key)
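Before the full examples below, a minimal sketch of the typical get() pattern (field names here are hypothetical, not taken from any of the projects listed): get() returns Object, or null if the key is absent, so callers normally null-check and cast.

import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;

public class BasicDBObjectGetExample {
    public static void main(String[] args) {
        BasicDBObject doc = new BasicDBObject("title", "example")
                .append("tags", new BasicDBList());

        // get() returns Object (or null if the key is absent), so cast defensively:
        Object titleObj = doc.get("title");
        if (titleObj instanceof String) {
            System.out.println("title = " + (String) titleObj);
        }

        // Nested/list values come back as DBObject/BasicDBList and need the same treatment:
        BasicDBList tags = (BasicDBList) doc.get("tags");
        if (null == tags) {
            tags = new BasicDBList();
            doc.put("tags", tags);
        }
        tags.add("first-tag");
    }
}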
From source file: com.ikanow.infinit.e.application.utils.LogstashConfigUtils.java
License: Open Source License
public static BasicDBObject parseLogstashConfig(String configFile, StringBuffer error) {
    BasicDBObject tree = new BasicDBObject();

    // Stage 0: remove escaped "s and 's (for the purpose of the validation):
    // (prevents tricksies with escaped "s and then #s)
    // (http://stackoverflow.com/questions/5082398/regex-to-replace-single-backslashes-excluding-those-followed-by-certain-chars)
    configFile = configFile.replaceAll("(?<!\\\\)(?:((\\\\\\\\)*)\\\\)[\"']", "X");
    //TESTED (by hand - using last 2 fields of success_2_1)

    // Stage 1: remove #s, and anything in quotes (for the purpose of the validation)
    configFile = configFile.replaceAll("(?m)(?:([\"'])(?:(?!\\1).)*\\1)", "VALUE").replaceAll("(?m)(?:#.*$)", "");
    //TESTED (2_1 - including with a # inside the ""s - Event_Date -> Event_#Date)
    //TESTED (2_2 - various combinations of "s nested inside 's) ... yes that is a negative lookahead up there - yikes!

    // Stage 2: get a nested list of objects
    int depth = 0;
    int ifdepth = -1;
    Stack<Integer> ifStack = new Stack<Integer>();
    BasicDBObject inputOrFilter = null;
    Matcher m = _navigateLogstash.matcher(configFile);
    // State:
    String currTopLevelBlockName = null;
    String currSecondLevelBlockName = null;
    BasicDBObject currSecondLevelBlock = null;
    while (m.find()) {
        boolean simpleField = false;

        //DEBUG
        //System.out.println("--DEPTH="+depth + " GROUP=" + m.group() + " IFS" + Arrays.toString(ifStack.toArray()));
        //System.out.println("STATES: " + currTopLevelBlockName + " AND " + currSecondLevelBlockName);

        if (m.group().equals("}")) {
            if (ifdepth == depth) { // closing an if statement
                ifStack.pop();
                if (ifStack.isEmpty()) {
                    ifdepth = -1;
                }
                else {
                    ifdepth = ifStack.peek();
                }
            } //TESTED (1_1bc, 2_1)
            else { // closing a processing block
                depth--;
                if (depth < 0) { // {} Mismatch
                    error.append("{} Mismatch (})");
                    return null;
                } //TESTED (1_1abc)
            }
        }
        else { // new attribute!
            String typeName = m.group(1);
            if (null == typeName) { // it's an if statement or a string value
                typeName = m.group(4);
                if (null != typeName) {
                    simpleField = true;
                }
            }
            else if (typeName.equalsIgnoreCase("else")) { // It's an if statement..
                typeName = null;
            }
            if (null == typeName) { // if statement after all
                // Just keep track of ifs so we can ignore them
                ifStack.push(depth);
                ifdepth = depth;
                // (don't increment depth)
            } //TESTED (1_1bc, 2_1)
            else { // processing block
                String subTypeName = m.group(3);
                if (null != subTypeName) { // eg codec.multiline
                    typeName = typeName + "." + subTypeName;
                } //TESTED (2_1, 2_3)

                if (depth == 0) { // has to be one of input/output/filter)
                    String topLevelType = typeName.toLowerCase();
                    if (topLevelType.equalsIgnoreCase("input") || topLevelType.equalsIgnoreCase("filter")) {
                        if (tree.containsField(topLevelType)) {
                            error.append("Multiple input or filter blocks: " + topLevelType);
                            return null;
                        } //TESTED (1_3ab)
                        else {
                            inputOrFilter = new BasicDBObject();
                            tree.put(topLevelType, inputOrFilter);

                            // Store state:
                            currTopLevelBlockName = topLevelType;
                        } //TESTED (*)
                    }
                    else {
                        if (topLevelType.equalsIgnoreCase("output")) {
                            error.append("Not allowed output blocks - these are appended automatically by the logstash harvester");
                        }
                        else {
                            error.append("Unrecognized processing block: " + topLevelType);
                        }
                        return null;
                    } //TESTED (1_4a)
                }
                else if (depth == 1) { // processing blocks
                    String subElType = typeName.toLowerCase();

                    // Some validation: can't include a type called "filter" anywhere
                    if ((null != currTopLevelBlockName) && currTopLevelBlockName.equals("input")) {
                        if (subElType.equals("filter") || subElType.endsWith(".filter")) {
                            error.append("Not allowed sub-elements of input called 'filter' (1)");
                            return null;
                        }
                    } //TESTED (1_5b)

                    BasicDBList subElements = (BasicDBList) inputOrFilter.get(subElType);
                    if (null == subElements) {
                        subElements = new BasicDBList();
                        inputOrFilter.put(subElType, subElements);
                    }
                    BasicDBObject newEl = new BasicDBObject();
                    subElements.add(newEl);

                    // Store state:
                    currSecondLevelBlockName = subElType;
                    currSecondLevelBlock = newEl;
                } //TESTED (*)
                else if (depth == 2) { // attributes of processing blocks
                    // we'll just store the field names for these and do any simple validation that was too complicated for the regexes
                    String subSubElType = typeName.toLowerCase();

                    // Validation:
                    if (null != currTopLevelBlockName) {
                        // 1] sincedb path
                        if (currTopLevelBlockName.equals("input") && (null != currSecondLevelBlockName)) {
                            // (don't care what the second level block name is - no sincedb allowed)
                            if (subSubElType.equalsIgnoreCase("sincedb_path")) {
                                error.append("Not allowed sincedb_path in input.* block");
                                return null;
                            } //TESTED (1_5a)
                            // 2] no sub-(-sub etc)-elements of input called filter
                            if (subSubElType.equals("filter") || subSubElType.endsWith(".filter")) {
                                error.append("Not allowed sub-elements of input called 'filter' (2)");
                                return null;
                            } //TESTED (1_5c)
                        }
                    }
                    // Store in map:
                    if (null != currSecondLevelBlock) {
                        currSecondLevelBlock.put(subSubElType, new BasicDBObject());
                    }
                }
                // (won't go any deeper than this)
                if (!simpleField) {
                    depth++;
                }
            }
        }
    }
    if (0 != depth) {
        error.append("{} Mismatch ({)");
        return null;
    } //TESTED (1_2a)

    return tree;
}
From source file: com.ikanow.infinit.e.core.mapreduce.HadoopJobRunner.java
License: Open Source License
private void createConfigXML(Writer out, String title, String input, String fields, boolean isCustomTable,
        String outputDatabase, String output, String tempOutputCollection, String mapper, String reducer,
        String combiner, String query, List<ObjectId> communityIds, String outputKey, String outputValue,
        String arguments) throws IOException {
    String dbserver = prop_general.getDatabaseServer();
    output = outputDatabase + "." + tempOutputCollection;

    int nSplits = 8;
    int nDocsPerSplit = 12500;

    //add communities to query if this is not a custom table
    if (!isCustomTable) {
        // Start with the old query:
        BasicDBObject oldQueryObj = null;
        if (query.startsWith("{")) {
            oldQueryObj = (BasicDBObject) com.mongodb.util.JSON.parse(query);
        }
        else {
            oldQueryObj = new BasicDBObject();
        }

        // Community Ids aren't indexed in the metadata collection, but source keys are, so we need to transform to that
        BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_,
                new BasicDBObject(DbManager.in_, communityIds));
        boolean bAdminOverride = false;
        if (oldQueryObj.containsField("admin")) { // For testing only...
            if (1 == communityIds.size()) {
                ObjectId communityId = communityIds.get(0);
                if (RESTTools.adminLookup(communityId.toString())) {
                    bAdminOverride = true;
                    if (oldQueryObj.containsField("max.splits")) {
                        nSplits = oldQueryObj.getInt("max.splits");
                    }
                    if (oldQueryObj.containsField("max.docs.per.split")) {
                        nDocsPerSplit = oldQueryObj.getInt("max.docs.per.split");
                    }
                }
            }
        } //(end diagnostic/benchmarking/test code for admins only part 1)
        if (bAdminOverride) {
            oldQueryObj = (BasicDBObject) oldQueryObj.get("admin");
            //(end diagnostic/benchmarking/test code for admins only part 2)
        }
        else if (oldQueryObj.containsField(DocumentPojo.sourceKey_) || input.startsWith("feature.")) {
            // Source Key specified by user, stick communityIds check in for security
            oldQueryObj.put(DocumentPojo.communityId_, new BasicDBObject(DbManager.in_, communityIds));
        }
        else { // Source key not specified by user, transform communities->sourcekeys
            BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
            DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields);
            if (dbc.count() > 500) {
                // (too many source keys let's keep the query size sensible...)
                oldQueryObj.put(DocumentPojo.communityId_, new BasicDBObject(DbManager.in_, communityIds));
            }
            else {
                HashSet<String> sourceKeys = new HashSet<String>();
                while (dbc.hasNext()) {
                    DBObject dbo = dbc.next();
                    String sourceKey = (String) dbo.get(SourcePojo.key_);
                    if (null != sourceKey) {
                        sourceKeys.add(sourceKey);
                    }
                }
                if (sourceKeys.isEmpty()) { // query returns empty
                    throw new RuntimeException("Communities contain no sources");
                }
                BasicDBObject newQueryClauseObj = new BasicDBObject(DbManager.in_, sourceKeys);

                // Now combine the queries...
                oldQueryObj.put(DocumentPojo.sourceKey_, newQueryClauseObj);
            } // (end if too many source keys across the communities)
        } //(end if need to break source keys down into communities)
        query = oldQueryObj.toString();
    }
    else {
        //get the custom table (and database)
        input = getCustomDbAndCollection(input);
    }
    if (arguments == null)
        arguments = "";

    // Generic configuration
    out.write("<?xml version=\"1.0\"?>\n<configuration>");

    // Mongo specific configuration
    out.write("\n\t<property><!-- name of job shown in jobtracker --><name>mongo.job.name</name><value>" + title
            + "</value></property>"
            + "\n\t<property><!-- run the job verbosely ? --><name>mongo.job.verbose</name><value>true</value></property>"
            + "\n\t<property><!-- Run the job in the foreground and wait for response, or background it? --><name>mongo.job.background</name><value>false</value></property>"
            + "\n\t<property><!-- If you are reading from mongo, the URI --><name>mongo.input.uri</name><value>mongodb://"
            + dbserver + "/" + input + "</value></property>"
            + "\n\t<property><!-- If you are writing to mongo, the URI --><name>mongo.output.uri</name><value>mongodb://"
            + dbserver + "/" + output + "</value> </property>"
            + "\n\t<property><!-- The query, in JSON, to execute [OPTIONAL] --><name>mongo.input.query</name><value>"
            + query + "</value></property>"
            + "\n\t<property><!-- The fields, in JSON, to read [OPTIONAL] --><name>mongo.input.fields</name><value>"
            + ((fields == null) ? ("") : fields) + "</value></property>"
            + "\n\t<property><!-- A JSON sort specification for read [OPTIONAL] --><name>mongo.input.sort</name><value></value></property>"
            + "\n\t<property><!-- The number of documents to limit to for read [OPTIONAL] --><name>mongo.input.limit</name><value>0</value><!-- 0 == no limit --></property>"
            + "\n\t<property><!-- The number of documents to skip in read [OPTIONAL] --><!-- TODO - Are we running limit() or skip() first? --><name>mongo.input.skip</name><value>0</value> <!-- 0 == no skip --></property>"
            + "\n\t<property><!-- Class for the mapper --><name>mongo.job.mapper</name><value>" + mapper
            + "</value></property>"
            + "\n\t<property><!-- Reducer class --><name>mongo.job.reducer</name><value>" + reducer
            + "</value></property>"
            + "\n\t<property><!-- InputFormat Class --><name>mongo.job.input.format</name><value>com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat</value></property>"
            + "\n\t<property><!-- OutputFormat Class --><name>mongo.job.output.format</name><value>com.mongodb.hadoop.MongoOutputFormat</value></property>"
            + "\n\t<property><!-- Output key class for the output format --><name>mongo.job.output.key</name><value>"
            + outputKey + "</value></property>"
            + "\n\t<property><!-- Output value class for the output format --><name>mongo.job.output.value</name><value>"
            + outputValue + "</value></property>"
            + "\n\t<property><!-- Output key class for the mapper [optional] --><name>mongo.job.mapper.output.key</name><value></value></property>"
            + "\n\t<property><!-- Output value class for the mapper [optional] --><name>mongo.job.mapper.output.value</name><value></value></property>"
            + "\n\t<property><!-- Class for the combiner [optional] --><name>mongo.job.combiner</name><value>" + combiner
            + "</value></property>"
            + "\n\t<property><!-- Partitioner class [optional] --><name>mongo.job.partitioner</name><value></value></property>"
            + "\n\t<property><!-- Sort Comparator class [optional] --><name>mongo.job.sort_comparator</name><value></value></property>"
            + "\n\t<property><!-- Split Size [optional] --><name>mongo.input.split_size</name><value>32</value></property>");

    // Infinit.e specific configuration
    out.write("\n\t<property><!-- User Arguments [optional] --><name>arguments</name><value>"
            + StringEscapeUtils.escapeXml(arguments) + "</value></property>"
            + "\n\t<property><!-- Maximum number of splits [optional] --><name>max.splits</name><value>" + nSplits
            + "</value></property>"
            + "\n\t<property><!-- Maximum number of docs per split [optional] --><name>max.docs.per.split</name><value>"
            + nDocsPerSplit + "</value></property>");

    // Closing thoughts:
    out.write("\n</configuration>");

    out.flush();
    out.close();
}
From source file: com.ikanow.infinit.e.core.utils.SourceUtils.java
License: Open Source License
public static boolean checkDbSyncLock() {
    DBCursor dbc = DbManager.getFeature().getSyncLock().find();
    if (!dbc.hasNext()) {
        return false; // working fine
    }
    Date now = new Date();
    while (dbc.hasNext()) {
        BasicDBObject sync_lock = (BasicDBObject) dbc.next();
        Object lastSyncObj = sync_lock.get("last_sync");
        if (null != lastSyncObj) {
            try {
                Date last_sync = (Date) lastSyncObj;
                if (last_sync.getTime() + _ONEDAY > now.getTime()) {
                    return true; // (ie sync object exists and is < 1 day old)
                }
            }
            catch (Exception e) {
                // class cast, do nothing
            }
        }
    } // (end "loop over" 1 object in sync_lock DB)
    return false;
}
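A brief follow-up on the pattern above: because get() is untyped, the example guards the Date cast with a try/catch. The same check can be written with an instanceof guard instead; the sketch below is standalone and uses a hypothetical lock document, not the DbManager collections from the original project.

import java.util.Date;
import com.mongodb.BasicDBObject;

public class SyncLockCheckSketch {
    private static final long ONE_DAY_MS = 24L * 60L * 60L * 1000L;

    public static boolean isLockFresh(BasicDBObject syncLock) {
        Object lastSyncObj = syncLock.get("last_sync");
        if (lastSyncObj instanceof Date) { // absent or wrong type => treat as stale
            Date lastSync = (Date) lastSyncObj;
            return (lastSync.getTime() + ONE_DAY_MS) > System.currentTimeMillis();
        }
        return false;
    }

    public static void main(String[] args) {
        BasicDBObject lock = new BasicDBObject("last_sync", new Date());
        System.out.println("fresh? " + isLockFresh(lock));
    }
}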
From source file: com.ikanow.infinit.e.core.utils.SourceUtils.java
License: Open Source License
public static void updateHarvestStatus(SourcePojo source, HarvestEnum harvestStatus, List<DocumentPojo> added,
        long nDocsDeleted, String extraMessage) {
    // Handle successful harvests where the max docs were reached, so don't want to respect the searchCycle
    if ((harvestStatus == HarvestEnum.success) && (source.reachedMaxDocs())) {
        harvestStatus = HarvestEnum.success_iteration;
    }

    // Always update status object in order to release the "in_progress" lock
    // (make really really sure we don't exception out before doing this!)
    BasicDBObject query = new BasicDBObject(SourcePojo._id_, source.getId());
    BasicDBObject setClause = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvest_status_,
            harvestStatus.toString());
    if ((null != added) && !added.isEmpty()) {
        setClause.put(SourceHarvestStatusPojo.sourceQuery_extracted_, new Date());
    }
    if (null != extraMessage) {
        if ((null == source.getHarvestStatus()) || (null == source.getHarvestStatus().getHarvest_message())) {
            setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_message_, extraMessage);
        }
        else {
            source.getHarvestStatus()
                    .setHarvest_message(source.getHarvestStatus().getHarvest_message() + "\n" + extraMessage);
            setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_message_,
                    source.getHarvestStatus().getHarvest_message());
        }
    }
    BasicDBObject update = new BasicDBObject(MongoDbManager.set_, setClause);
    int docsAdded = 0;
    if (null != added) {
        docsAdded = added.size();
    }
    BasicDBObject incClause = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_doccount_,
            docsAdded - nDocsDeleted);
    update.put(MongoDbManager.inc_, incClause);

    // Special case, if searchCycle_secs == 0 and not success_iteration, then suspend:
    if ((harvestStatus != HarvestEnum.success_iteration) && (null != source.getSearchCycle_secs())
            && (0 == source.getSearchCycle_secs())) {
        setClause.put(SourcePojo.searchCycle_secs_, -1);
    }
    if (null != source.getDistributionTokens()) {
        // Distribution logic (specified and also enabled - eg ignore Feed/DB)
        updateHarvestDistributionState_tokenComplete(source, harvestStatus, incClause, setClause);
    }
    if (setClause.isEmpty()) { // (ie got removed by the distribution logic above)
        update.remove(MongoDbManager.set_);
    } //TESTED

    long nTotalDocsAfterInsert = 0;
    BasicDBObject fieldsToReturn = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
    BasicDBObject updatedSource = (BasicDBObject) DbManager.getIngest().getSource().findAndModify(query,
            fieldsToReturn, null, false, update, true, false);
    BasicDBObject harvestStatusObj = (BasicDBObject) updatedSource.get(SourcePojo.harvest_);
    if (null != harvestStatusObj) {
        Long docCount = harvestStatusObj.getLong(SourceHarvestStatusPojo.doccount_);
        if (null != docCount) {
            nTotalDocsAfterInsert = docCount;
        }
    } //TESTED

    // Prune documents if necessary
    if ((null != source.getMaxDocs()) && (nTotalDocsAfterInsert > source.getMaxDocs())) {
        long nToPrune = (nTotalDocsAfterInsert - source.getMaxDocs());
        SourceUtils.pruneSource(source, (int) nToPrune, -1);
        nDocsDeleted += nToPrune;

        // And update to reflect that it now has max docs...
        BasicDBObject update2_1 = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_doccount_,
                source.getMaxDocs());
        BasicDBObject update2 = new BasicDBObject(DbManager.set_, update2_1);
        DbManager.getIngest().getSource().update(query, update2);
    } //TESTED
    if ((null != source.getTimeToLive_days())) {
        nDocsDeleted += SourceUtils.pruneSource(source, Integer.MAX_VALUE, source.getTimeToLive_days());
    } //TODO: TOTEST

    // (OK now the only thing we really had to do is complete, add some handy metadata)
    // Also update the document count table in doc_metadata:
    if (docsAdded > 0) {
        if (1 == source.getCommunityIds().size()) {
            // (simple/usual case, just 1 community)
            query = new BasicDBObject(DocCountPojo._id_, source.getCommunityIds().iterator().next());
            update = new BasicDBObject(MongoDbManager.inc_,
                    new BasicDBObject(DocCountPojo.doccount_, docsAdded - nDocsDeleted));
            if ((docsAdded != 0) || (nDocsDeleted != 0)) {
                update.put(DbManager.set_, new BasicDBObject(DocCountPojo.extracted_, new Date()));
            }
            DbManager.getDocument().getCounts().update(query, update, true, false);
        }
        else if (!source.getCommunityIds().isEmpty()) {
            // Complex case since docs can belong to diff communities (but they're usually somewhat grouped)
            Map<ObjectId, Integer> communityMap = new HashMap<ObjectId, Integer>();
            for (DocumentPojo doc : added) {
                ObjectId communityId = doc.getCommunityId();
                Integer count = communityMap.get(communityId);
                communityMap.put(communityId, (count == null ? 1 : count + 1));
            } //end loop over added documents (updating the separate community counts)
            long nDocsDeleted_byCommunity = nDocsDeleted / source.getCommunityIds().size();
            // (can't do better than assume a uniform distribution - the whole thing gets recalculated weekly anyway...)
            for (Map.Entry<ObjectId, Integer> communityInfo : communityMap.entrySet()) {
                query = new BasicDBObject(DocCountPojo._id_, communityInfo.getKey());
                update = new BasicDBObject(MongoDbManager.inc_, new BasicDBObject(DocCountPojo.doccount_,
                        communityInfo.getValue() - nDocsDeleted_byCommunity));
                if ((communityInfo.getValue() != 0) || (nDocsDeleted_byCommunity != 0)) {
                    update.put(DbManager.set_, new BasicDBObject(DocCountPojo.extracted_, new Date()));
                }
                DbManager.getDocument().getCounts().update(query, update, true, false);
                // (true for upsert, false for multi add)
            }
        } //(never called in practice - tested up until 5/2/2014)
    }
}
From source file: com.ikanow.infinit.e.core.utils.SourceUtils.java
License: Open Source License
private static boolean updateHarvestDistributionState_tokenComplete(SourcePojo source,
        HarvestEnum harvestStatus, BasicDBObject incClause, BasicDBObject setClause) {

    // Update tokens complete, and retrieve modified version
    int nTokensToBeCleared = source.getDistributionTokens().size();
    BasicDBObject query = new BasicDBObject(SourcePojo._id_, source.getId());
    BasicDBObject modify = new BasicDBObject(MongoDbManager.inc_, new BasicDBObject(
            SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_, nTokensToBeCleared));
    BasicDBObject fields = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_, 1);
    fields.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, 1);
    fields.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, 1);
    BasicDBObject partial = (BasicDBObject) MongoDbManager.getIngest().getSource().findAndModify(query, fields,
            null, false, modify, true, false);
    //(return new version - ensures previous increments have been taken into account)

    // Two cases: source complete (all tokens obtained), source incomplete:

    if (null != partial) { // (else yikes!)
        BasicDBObject partialStatus = (BasicDBObject) partial.get(SourcePojo.harvest_);
        if (null != partialStatus) { // (else yikes!)
            int nTokensComplete = partialStatus.getInt(SourceHarvestStatusPojo.distributionTokensComplete_, 0);
            // (note after increment)

            // COMPLETE: reset parameters, status -> error (if anything has errored), success (all done), success_iteration (more to do)

            if (nTokensComplete == source.getDistributionFactor()) {
                if (!source.reachedMaxDocs()) {
                    // (Can only do this if we've finished the source...
                    //...else the different threads can be at different points, so the most recent doc for one thread might be
                    // before the most recent doc of another)
                    setClause.put(SourceHarvestStatusPojo.sourceQuery_distributedLastCompletedCycle_, new Date());
                }

                setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionTokensComplete_, 0);
                setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionTokensFree_,
                        source.getDistributionFactor());
                setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, false); // (resetting this)

                // This source is now complete
                String status = partialStatus.getString(SourceHarvestStatusPojo.harvest_status_, null);
                Boolean reachedLimit = partialStatus.getBoolean(
                        SourceHarvestStatusPojo.distributionReachedLimit_, false) || source.reachedMaxDocs();

                if ((null != status)
                        && ((status.equalsIgnoreCase(HarvestEnum.error.toString()) || (HarvestEnum.error == harvestStatus)))) {
                    setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_,
                            HarvestEnum.error.toString());
                } //TESTED (current and previous state == error)
                else if (reachedLimit || (HarvestEnum.success_iteration == harvestStatus)) {
                    setClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_,
                            HarvestEnum.success_iteration.toString());
                } //TESTED (from previous or current state)
                // (else leave with default of success)

                //DEBUG
                //System.out.println(Thread.currentThread().getName() + " COMPLETE_SRC COMPLETE_TOKEN=" + source.getKey() + " / " + setClause.toString() + " / " + incClause.toString() + " / " + nTokensComplete);

                return true;
            } //TESTED
            else { // Not complete

                // If we're here then we're only allowed to update the status to error
                if (HarvestEnum.error != harvestStatus) {
                    setClause.remove(SourceHarvestStatusPojo.sourceQuery_harvest_status_);
                } //TESTED
                if (source.reachedMaxDocs()) {
                    setClause.put(SourceHarvestStatusPojo.sourceQuery_distributionReachedLimit_, true);
                } //TESTED

                //DEBUG
                //System.out.println(Thread.currentThread().getName() + " COMPLETE_TOKEN=" + source.getKey() + " / " + setClause.toString() + " / " + incClause.toString() + " / " + nTokensComplete);

                return false;
            } //(end is complete or not)
            //TESTED (reached max limit)
        } //(end found partial source status, else catastrophic failure)
    } //(end found partial source, else catastrophic failure)

    return false;
}
From source file: com.ikanow.infinit.e.data_model.api.ResponsePojo.java
License: Apache License
public static ResponsePojo fromDb(BasicDBObject bson) {
    BasicDBObject bson2 = new BasicDBObject();
    bson2.put("stats", bson.get("stats"));
    bson2.put("response", bson.get("response"));
    ResponsePojo rp = ResponsePojo.fromApi(bson2.toString(), ResponsePojo.class);

    // Now all the elements!
    Object evtTimeline = null, facets = null, times = null, entities = null, events = null, facts = null,
            summaries = null, sources = null, sourceMetaTags = null, sourceMetaTypes = null, moments = null,
            other = null;

    evtTimeline = bson.get("eventsTimeline");
    facets = bson.get("facets");
    times = bson.get("times");
    entities = bson.get("entities");
    events = bson.get("events");
    facts = bson.get("facts");
    summaries = bson.get("summaries");
    sources = bson.get("sources");
    sourceMetaTags = bson.get("sourceMetatags");
    sourceMetaTypes = bson.get("sourceMetaTypes");
    moments = bson.get("moments");
    other = bson.get("other");

    rp.setEventsTimeline(evtTimeline);
    rp.setFacets(facets);
    rp.setTimes(times, rp.getTimeInterval() == null ? 0 : rp.getTimeInterval());
    rp.setEntities(entities);
    rp.setEvents(events);
    rp.setFacts(facts);
    rp.setSummaries(summaries);
    rp.setSources(sources);
    rp.setSourceMetaTags(sourceMetaTags);
    rp.setSourceMetaTypes(sourceMetaTypes);
    rp.setMoments(moments, rp.getMomentInterval());
    rp.setOther(other);

    // The main data object is discarded in the original fromApi() call, so put it back now
    Object docData = bson.get("data");
    if (null != docData) {
        rp.setData((BasicDBList) docData, (BasePojoApiMap<BasicDBList>) null);
    }
    else { // (ensure there's always an empty list)
        rp.setData(new ArrayList<BasicDBObject>(0), (BasePojoApiMap<BasicDBObject>) null);
    }
    return rp;
}
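The fromDb() example above pulls a dozen untyped fields with get(). When many such reads need a cast, a small generic helper can centralize the cast-and-null-check; the sketch below is an illustration only and is not part of the Infinit.e codebase.

import com.mongodb.BasicDBObject;

public class DbObjectGetHelper {
    /** Returns the value for key cast to clazz, or null if it is absent or of another type. */
    public static <T> T getAs(BasicDBObject dbo, String key, Class<T> clazz) {
        Object val = dbo.get(key);
        return clazz.isInstance(val) ? clazz.cast(val) : null;
    }

    public static void main(String[] args) {
        BasicDBObject bson = new BasicDBObject("stats", new BasicDBObject("found", 42));
        BasicDBObject stats = getAs(bson, "stats", BasicDBObject.class);
        Integer missing = getAs(bson, "doesNotExist", Integer.class); // returns null, no exception
        System.out.println("stats=" + stats + ", missing=" + missing);
    }
}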
From source file: com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License: Apache License
/**
 * Checks if the new params MAX_SPLITS and MAX_DOCS_PER_SPLIT are set
 * in the config. If they are it will use those to do splits via limit/skip
 * otherwise it will call the previous chunking splitter in MongoSplitter.
 *
 * @param conf
 * @return
 */
public static List<InputSplit> calculateSplits(InfiniteMongoConfig conf) {
    // First off: What is our sharding scheme?

    boolean shardingPolicyNew = false;
    try {
        BasicDBObject shardQuery = new BasicDBObject("_id", "doc_metadata.metadata");
        BasicDBObject shardInfo = (BasicDBObject) DbManager.getCollection("config", "collections")
                .findOne(shardQuery);
        if (null != shardInfo) {
            BasicDBObject shardInfoKey = (BasicDBObject) shardInfo.get("key");
            if (null != shardInfoKey) {
                shardingPolicyNew = (shardInfoKey.size() > 1);
            }
        }
    } //TESTED (new and old)
    catch (Exception e) {
    } // stick with the old sharding, it's probably going to die soon after though, honestly

    // conf.getQuery returns a new copy of the query, so get once and use everywhere...
    BasicDBObject confQuery = (BasicDBObject) conf.getQuery();

    BasicDBObject srcTagsQuery = (BasicDBObject) conf.getSourceTags();

    String collection = conf.getInputURI().getCollection();
    if (!collection.equals(DbManager.getDocument().getContent().getName())
            && !collection.equals(DbManager.getDocument().getMetadata().getName())) {
        // Case 1: feature table or custom table
        // Just run legacy code
        return calculateSplits_phase2(conf, confQuery, false, false, null);
    }
    else { // complex cases...
        boolean simpleOtherIndex = false;
        // Check whether a simple query has been performed on a different indexed field
        if (null == srcTagsQuery) { // (if srcTags specified, then going to want to use sourceKey as the index)
            for (String s : Arrays.asList(EntityPojo.docQuery_index_, DocumentPojo.url_)) {
                Object selector = confQuery.get(s);
                if (selector instanceof String) {
                    simpleOtherIndex = true;
                    break;
                }
                else if (selector instanceof DBObject) {
                    DBObject selectorDbo = (DBObject) selector;
                    if (selectorDbo.containsField(DbManager.in_)) {
                        simpleOtherIndex = true;
                        break;
                    }
                }
            } //TESTED (both types, plus check complex indexes don't work)
            // ALLOWED: {"entities.index": { "$in": [ "xxx", "yyy"] }, {"entities.index": "xxx" }, ditto for "url"
            // NOT ALLOWED: { "entities.index": { "$ne": "xxx" } }
        } //TESTED check ignored if eg entity_index specified

        if (simpleOtherIndex) {
            // Case 2: we have a simple query on an indexed field
            // Just run legacy code
            return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
        } //TESTED
        else if (conf.getLimit() > 0) { // debug
            //Case 3: Ensure we have small sets of sources to search over
            BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery, srcTagsQuery,
                    conf.getMaxDocsPerSplit());
            final List<InputSplit> splits = new ArrayList<InputSplit>();

            boolean queryNonTrivial = isQueryNonTrivial(confQuery);
            if (!queryNonTrivial) {
                //Case 3a: query is trivial, so can just create splits directly from the split pre-calcs
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();

                    int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                    int toGet = (docCount > toProcess) ? toProcess : docCount;
                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), modQuery,
                                conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                        toProcess -= docCount;
                    }
                } //TESTED
            }
            else {
                // Case 3b: annoying, some extra query terms, gonna need to do it the hard way...
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                DBCollection coll = InfiniteMongoConfigUtil.getCollection(conf.getInputURI());
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();
                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        int docsCounted = (int) coll.getCount(modQuery, null, toProcess, 0);
                        int toGet = (docsCounted > toProcess) ? toProcess : docsCounted;
                        if (docsCounted > 0) {
                            splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(),
                                    modQuery, conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                            toProcess -= docsCounted;
                        }
                    } //TESTED
                }
            } //TESTED

            return splits;
        }
        else { // More complex cases:

            if (shardingPolicyNew) {
                // Case 4a: NEW SHARDING SCHEME

                // Always fetch the new sources, eg convert communityId to sourceKeys
                try {
                    splitPrecalculations_newShardScheme(confQuery, srcTagsQuery); // (modifies confQuery if returns true)
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);

                    return calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null);
                    // (ie trivial query => always use chunks, bypass skip/limit test)
                } //TESTED (trivial + non-trivial)
                catch (Exception e) { // Didn't match any sources, no problem
                    return new ArrayList<InputSplit>();
                } //TESTED
            } //TESTED
            else {
                BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery,
                        srcTagsQuery, conf.getMaxDocsPerSplit());

                if (null == collectionOfSplits) {
                    // Case 4b: OLD SHARDING SCHEME can't get a partition by source keys, just back off to old code
                    return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
                } //TESTED (old code)
                else {
                    conf.setMaxDocsPerSplit(2 * conf.getMaxDocsPerSplit());
                    // (because we stop creating splits when the exceed the size)

                    // Case 4c: OLD SHARDING SCHEME, have a source key partition
                    int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits();
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);
                    final List<InputSplit> splits = new ArrayList<InputSplit>();

                    BasicDBObject savedQuery = confQuery;

                    Iterator<Object> itSplit = collectionOfSplits.iterator();
                    BasicDBList bigSplit = null;
                    while (itSplit.hasNext()) {
                        BasicDBObject split = (BasicDBObject) itSplit.next();
                        int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                        if (docCount < nMaxCount) { // small split, will use skip/limit
                            BasicDBObject modQuery = convertQuery(savedQuery, split.get(DocumentPojo.sourceKey_));
                            if (null != modQuery) {

                                final int SPLIT_THRESHOLD = 3;
                                // A few cases:
                                if ((docCount < (SPLIT_THRESHOLD * conf.getMaxDocsPerSplit()))
                                        || !queryNonTrivial) {
                                    splits.addAll(calculateSplits_phase2(conf, modQuery, false,
                                            shardingPolicyNew, (Integer) docCount));
                                } //TESTED (based on limit, based on query)
                                else {
                                    // My guess at the point at which you might as well as do the full query in the hope you're going
                                    // to save some (empty) splits
                                    splits.addAll(calculateSplits_phase2(conf, modQuery, false,
                                            shardingPolicyNew, null));
                                } //TESTED
                            } //TESTED
                        }
                        else { // large split, combine all these guys into an array of source keys
                            if (null == bigSplit) {
                                bigSplit = new BasicDBList();
                            }
                            bigSplit.add(split.get(DocumentPojo.sourceKey_));
                            // (guaranteed to be a single element)
                        }
                    } //(end loop over collections)

                    if (null != bigSplit) {
                        // If we have a big left over community then create a set of splits for that - always chunks if query trivial
                        if (1 == bigSplit.size()) {
                            confQuery.put(DocumentPojo.sourceKey_, bigSplit.iterator().next());
                        }
                        else {
                            confQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, bigSplit));
                        }
                        splits.addAll(calculateSplits_phase2(conf, confQuery, !queryNonTrivial,
                                shardingPolicyNew, null));
                    } //TESTED: singleton+trivial (sandy), array+trivial (sentiment/enron), array+non-trivial (sentiment/enron, docGeo), singleton+non-trivial (sandy, docGeo)

                    return splits;
                } //TESTED: end if Cases 4a, 4b, 4c
            } //(end if old vs new sharding policy)
        } //(non-debug case)
    } //(content or metadata table are most complex)
}
From source file: com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License: Apache License
@SuppressWarnings("unchecked")
public static List<InputSplit> calculateSplits_phase2(InfiniteMongoConfig conf, BasicDBObject confQuery,
        boolean alwaysUseChunks, boolean newShardScheme, Integer splitDocCount) {
    alwaysUseChunks &= (conf.getMaxSplits() != MAX_SPLITS);
    // (in standalone mode, never use chunks)

    MongoURI uri = conf.getInputURI();
    DBCollection coll = InfiniteMongoConfigUtil.getCollection(uri);
    if (conf.getLimit() > 0) {
        return calculateManualSplits(conf, confQuery, 1, conf.getLimit(), coll);
    }
    else {
        if (!alwaysUseChunks) {
            int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits();
            int count = 0;
            if (null == splitDocCount) {
                if (nMaxCount <= 1) {
                    nMaxCount = 0;
                }
                else {
                    //DEBUG
                    //System.out.println(coll.find(confQuery).limit(1).explain());

                    count = (int) coll.getCount(confQuery, null, nMaxCount, 0);
                    if (0 == count) {
                        return new ArrayList<InputSplit>();
                    }
                } //TESTED
            }
            else {
                count = splitDocCount;
            }

            //if maxdocssplit and maxsplits is set and there are less documents than splits*docspersplit then use the new splitter
            //otherwise use the old splitter
            if (conf.getMaxDocsPerSplit() > 0 && conf.getMaxSplits() > 0 && (count < nMaxCount)) {
                _logger.debug("Calculating splits manually");
                int splits_needed = (count / conf.getMaxDocsPerSplit()) + 1;

                return calculateManualSplits(conf, confQuery, splits_needed, conf.getMaxDocsPerSplit(), coll);
            } //TESTED
        }

        if (newShardScheme && !confQuery.containsField(DocumentPojo.sourceKey_)) {
            // OK if we're going to do the sharded version then we will want to calculate
            splitPrecalculations_newShardScheme(confQuery, null); // (modifies confQuery if returns true)
        } //TESTED: checked did nothing when had sourceKey, added sourceKey when necessary (eg entities.index case)

        if (!newShardScheme) { // unlike new sharding scheme, in this case the query is fixed, so overwrite now:
            conf.setQuery(confQuery);
        }

        List<InputSplit> splits = MongoSplitter.calculateSplits(conf);
        // (unless manually set, like above, runs with the _original_ query)
        int initialSplitSize = splits.size();

        // We have the MongoDB-calculated splits, now calculate their intersection vs the query
        @SuppressWarnings("rawtypes")
        Map<String, TreeSet<Comparable>> orderedArraySet = new HashMap<String, TreeSet<Comparable>>();
        @SuppressWarnings("rawtypes")
        Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin = new HashMap<String, NavigableSet<Comparable>>();
        BasicDBObject originalQuery = confQuery;

        ArrayList<InputSplit> newsplits = new ArrayList<InputSplit>(splits.size());
        Iterator<InputSplit> splitIt = splits.iterator();
        while (splitIt.hasNext()) {
            try {
                orderedArraySet_afterMin.clear();

                MongoInputSplit mongoSplit = (MongoInputSplit) splitIt.next();
                BasicDBObject min = (BasicDBObject) mongoSplit.getQuerySpec().get("$min");
                BasicDBObject max = (BasicDBObject) mongoSplit.getQuerySpec().get("$max");

                //DEBUG
                //_logger.info("+----------------- NEW SPLIT ----------------: " + min + " /" + max);
                //System.out.println("+----------------- NEW SPLIT ----------------: " + min + " /" + max);

                if (null != min) { // How does the min fit in with the general query
                    try {
                        if (compareFields(-1, originalQuery, min, max, orderedArraySet,
                                orderedArraySet_afterMin) < 0) {
                            splitIt.remove();
                            continue;
                        }
                    }
                    catch (Exception e) {
                    } // do nothing probably just some comparable issue
                } //TESTED

                if (null != max) { // How does the min fit in with the general query
                    try {
                        if (compareFields(1, originalQuery, max, min, orderedArraySet,
                                orderedArraySet_afterMin) > 0) {
                            splitIt.remove();
                            continue;
                        }
                    }
                    catch (Exception e) {
                    } // do nothing probably just some comparable issue
                } //TESTED

                //DEBUG
                //_logger.info("(retained split)");
                //System.out.println("(retained split)");

                // (don't worry about edge cases, won't happen very often and will just result in a spurious empty mapper)

                ////////////////////////////////

                // Now some infinit.e specific processing...

                if (newShardScheme) {
                    @SuppressWarnings("rawtypes")
                    TreeSet<Comparable> sourceKeyOrderedArray = orderedArraySet.get(DocumentPojo.sourceKey_);
                    if ((null != sourceKeyOrderedArray) && !sourceKeyOrderedArray.isEmpty()) {
                        @SuppressWarnings("rawtypes")
                        Comparable minSourceKey = null;
                        Object minSourceKeyObj = (null == min) ? null : min.get(DocumentPojo.sourceKey_);
                        if (minSourceKeyObj instanceof String) {
                            minSourceKey = (String) minSourceKeyObj;
                        }
                        if (null == minSourceKey) {
                            minSourceKey = sourceKeyOrderedArray.first();
                        } //TESTED
                        @SuppressWarnings("rawtypes")
                        Comparable maxSourceKey = null;
                        Object maxSourceKeyObj = (null == max) ? null : max.get(DocumentPojo.sourceKey_);
                        if (maxSourceKeyObj instanceof String) {
                            maxSourceKey = (String) maxSourceKeyObj;
                        }
                        if (null == maxSourceKey) {
                            maxSourceKey = sourceKeyOrderedArray.last();
                        } //TESTED

                        DBObject splitQuery = mongoSplit.getQuerySpec();
                        BasicDBObject splitQueryQuery = new BasicDBObject(
                                (BasicBSONObject) splitQuery.get("$query"));
                        if (0 == minSourceKey.compareTo(maxSourceKey)) { // single matching sourceKEy
                            splitQueryQuery.put(DocumentPojo.sourceKey_, maxSourceKey);
                        } //TESTED (array of sources, only one matches)
                        else { // multiple matching source keys
                            splitQueryQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_,
                                    sourceKeyOrderedArray.subSet(minSourceKey, true, maxSourceKey, true)));
                        } //TESTED (array of sources, multiple match)
                        newsplits.add(
                                new InfiniteMongoInputSplit(mongoSplit, splitQueryQuery, conf.isNoTimeout()));
                    }
                    else { // original query is of sufficient simplicity
                        newsplits.add(
                                new InfiniteMongoInputSplit(mongoSplit, originalQuery, conf.isNoTimeout()));
                    } //TESTED (no change to existing source)
                } //TESTED
                else { // old sharding scheme, remove min/max and replace with normal _id based query where possible

                    DBObject splitQuery = mongoSplit.getQuerySpec();
                    // Step 1: create a query range for _id:
                    BasicDBObject idRange = null;
                    Object idMin = (min == null) ? null : min.get(DocumentPojo._id_);
                    Object idMax = (max == null) ? null : max.get(DocumentPojo._id_);
                    if (!(idMin instanceof ObjectId))
                        idMin = null;
                    if (!(idMax instanceof ObjectId))
                        idMax = null;

                    if ((null != idMin) || (null != idMax)) {
                        idRange = new BasicDBObject();
                        if (null != idMin) {
                            idRange.put(DbManager.gte_, idMin);
                        }
                        if (null != idMax) {
                            idRange.put(DbManager.lt_, idMax);
                        }
                    } //TESTED

                    // Step 2: merge with whatever we have at the moment:
                    if (null != idRange) {
                        BasicDBObject splitQueryQuery = new BasicDBObject(
                                (BasicBSONObject) splitQuery.get("$query"));
                        Object idQueryElement = splitQueryQuery.get(DocumentPojo._id_);
                        boolean convertedAwayFromMinMax = false;
                        if (null == idQueryElement) { // nice and easy, add _id range
                            splitQueryQuery.put(DocumentPojo._id_, idRange);
                            convertedAwayFromMinMax = true;
                        } //TESTED
                        else if (!splitQueryQuery.containsField(DbManager.and_)) {
                            // OK we're going to just going to make life easy
                            splitQueryQuery.remove(DocumentPojo._id_);
                            splitQueryQuery.put(DbManager.and_,
                                    Arrays.asList(new BasicDBObject(DocumentPojo._id_, idQueryElement),
                                            new BasicDBObject(DocumentPojo._id_, idRange)));
                            convertedAwayFromMinMax = true;
                        } //TESTED
                        // (else stick with min/max)

                        if (convertedAwayFromMinMax) { // can construct an _id query
                            splitQuery.removeField("$min");
                            splitQuery.removeField("$max");
                        } //TESTED
                        splitQuery.put("$query", splitQueryQuery);
                    }
                    newsplits.add(new InfiniteMongoInputSplit(mongoSplit, conf.isNoTimeout()));
                } //TESTED
            }
            catch (Exception e) {
                //DEBUG
                //e.printStackTrace();
            } // do nothing must be some other type of input split
        } //TESTED

        //DEBUG
        //System.out.println("Calculating splits via mongo-hadoop: " + initialSplitSize + " reduced to " + splits.size());

        _logger.info("Calculating (converted) splits via mongo-hadoop: " + initialSplitSize + " reduced to "
                + newsplits.size());
        return newsplits;
    }
}
From source file: com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License: Apache License
@SuppressWarnings({ "unchecked", "rawtypes" })
private static int compareFields(int direction, BasicDBObject query, BasicDBObject minOrMax,
        BasicDBObject maxOrMin, Map<String, TreeSet<Comparable>> orderedArraySet,
        Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin) {
    for (String field : minOrMax.keySet()) {
        //DEBUG
        //System.out.println("1] Compare: " + field + ": " + direction);

        try {
            Object queryOfThisField = query.get(field);
            Object minField = minOrMax.get(field);
            if ((null != queryOfThisField) && (minField instanceof Comparable)) {
                int result = 0;
                Comparable comparableMinOrMaxElement = (Comparable) minField;

                if (queryOfThisField instanceof BasicDBObject) {
                    result = compareComplexObject(field, direction, (BasicDBObject) queryOfThisField,
                            comparableMinOrMaxElement, orderedArraySet, orderedArraySet_afterMin);
                } //TESTED
                else { // -1 if comparableQueryElement < comparableMinOrMaxElement
                    Comparable comparableQueryElement = (Comparable) queryOfThisField;
                    result = comparableQueryElement.compareTo(comparableMinOrMaxElement);

                    //DEBUG
                    //System.out.println("3] Vals: " + comparableQueryElement + " vs " + comparableMinOrMaxElement + " = " + result);
                } //TESTED
                if (result != 0) { // if we ever get a strict inequality then stop checking fields..
                    if ((result == direction) || !minOrMax.equals(maxOrMin)) {
                        // (fail) (pass but min/max keys different so not point checking any more)
                        return result;
                    } //TESTED
                }
                // else equality, pass but keep checking fields
            }
        }
        catch (Exception e) {
            //DEBUG
            //e.printStackTrace();
        } // do nothing probably some odd comparable issue
    }
    return -direction; // (ie pass by default)
}
From source file: com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License: Apache License
@SuppressWarnings({ "rawtypes", "unchecked" })
private static int compareComplexObject(String parentField, int direction, BasicDBObject complexQueryElement,
        Comparable minOrMaxElement, Map<String, TreeSet<Comparable>> orderedArraySet,
        Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin) {
    for (String field : complexQueryElement.keySet()) {
        //DEBUG
        //System.out.println("2] Compare operator: " + field + ", vs " + minOrMaxElement);

        if (field.equals(MongoDbManager.in_)) {
            NavigableSet<Comparable> orderedArray = null;
            if (1 == direction) { // try orderedArraySet_afterMin first...
                orderedArray = orderedArraySet_afterMin.get(parentField);

                //DEBUG
                //System.out.println("2.0] Found orderered sub-array for: " + parentField + ", size= " + orderedArray.size());
            } //TESTED
            if (null == orderedArray) { // (min, or max but min didn't set a sub-array)
                orderedArray = orderedArraySet.get(parentField);
                if (null == orderedArray) { // First time for this field, order the $in for easy comparison
                    orderedArray = new TreeSet<Comparable>();
                    Collection queryList = (Collection) complexQueryElement.get(MongoDbManager.in_);
                    for (Object o : queryList) {
                        Comparable c = (Comparable) o;
                        orderedArray.add(c);
                    }
                    //DEBUG
                    //System.out.println("2.1] Created orderered array for: " + parentField + ", size= " + orderedArray.size());
                    //DEBUG:
                    // if (!orderedArray.isEmpty()) {
                    //     System.out.println("2.1.1] Head: " + orderedArray.iterator().next());
                    //     System.out.println("2.1.2] Tail: " + orderedArray.descendingIterator().next());
                    // }

                    orderedArraySet.put(parentField, (TreeSet<Comparable>) orderedArray);
                    // (know this cast is valid by construction)
                } //TESTED
            }
            if (-1 == direction) { // comparing vs min
                //DEBUG
                //System.out.println("2.2] tailSet: " + orderedArray.tailSet(minOrMaxElement, true).size());

                NavigableSet<Comparable> minElements = orderedArray.tailSet(minOrMaxElement, true);
                if (minElements.isEmpty()) { // (elements >= minElement)
                    return direction; // will always fail
                }
                else {
                    orderedArraySet_afterMin.put(parentField, minElements);
                } //TESTED
            } //TESTED
            else if (1 == direction) { // comparing vs max
                //DEBUG
                //System.out.println("2.2] headSet: " + orderedArray.headSet(minOrMaxElement, true).size());

                if (orderedArray.headSet(minOrMaxElement, true).isEmpty()) { // (elements <= maxElement)
                    return direction; // will always fail
                }
            } //TESTED
        }
        else if (field.equals(MongoDbManager.gt_) || field.equals(MongoDbManager.gte_)) {
            // (don't worry about the boundaries, just results in spurious empty chunks)
            if (1 == direction) { // can't do anything about $gt vs min
                Comparable comparableQueryElement = (Comparable) complexQueryElement.get(field);

                //DEBUG
                //System.out.println("2.3.1] GT Vals: " + comparableQueryElement + " vs " + minOrMaxElement + " = " + comparableQueryElement.compareTo(minOrMaxElement));

                if (comparableQueryElement.compareTo(minOrMaxElement) > 0) // ie query _lower_ limit > chunk max
                    return direction; // ie fail
            }
        } //TESTED
        else if (field.equals(MongoDbManager.lt_) || field.equals(MongoDbManager.lte_)) {
            // (don't worry about the boundaries, just results in spurious empty chunks)
            if (-1 == direction) { // can't do anything about $lt vs max
                Comparable comparableQueryElement = (Comparable) complexQueryElement.get(field);

                //DEBUG
                //System.out.println("2.3.2] LT Vals: " + comparableQueryElement + " vs " + minOrMaxElement + " = " + comparableQueryElement.compareTo(minOrMaxElement));

                if (comparableQueryElement.compareTo(minOrMaxElement) < 0) // ie query upper limit < chunk min
                    return direction; // ie fail
            }
        } //TESTED
    }
    return -direction; // (ie pass by default, don't check other fields unless they have the same min/max)
}