List of usage examples for com.mongodb.BasicDBList
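Before the per-file examples, a minimal orientation sketch (not taken from any of the source files below; class and field names are made up): a BasicDBList behaves like a java.util.List and also implements DBObject, so it can be built up element by element and then stored directly as an array-valued field of a BasicDBObject.

import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;

public class BasicDBListSketch {
    public static void main(String[] args) {
        // Build an array value element by element
        BasicDBList tags = new BasicDBList();
        tags.add("alpha");
        tags.add("beta");

        // Store it as an array field of a document
        BasicDBObject doc = new BasicDBObject("name", "example");
        doc.put("tags", tags);

        System.out.println(doc); // { "name" : "example" , "tags" : [ "alpha" , "beta" ] }
    }
}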
From source file:com.ikanow.infinit.e.api.knowledge.federated.SimpleFederatedQueryEngine.java
License:Open Source License
public BasicDBObject createDocFromJson(BasicDBList jsonList, String url, FederatedRequest request,
        SourceFederatedQueryConfigPojo endpointInfo) {
    BasicDBObject doc = null; // (don't create unless needed)
    BasicDBList ents = null;
    StringBuffer entVals = null;
    HashSet<String> entDedup = null;
    if (_testMode) { // In test mode, need to return the JSON even if no entities are specified
        doc = new BasicDBObject();
    }
    if (null != endpointInfo.docConversionMap) {
        for (Map.Entry<String, String> docInfo : endpointInfo.docConversionMap.entrySet()) {
            for (Object jsonObj : jsonList) {
                BasicDBObject json = (BasicDBObject) jsonObj;
                try {
                    String key = docInfo.getKey();
                    // (allow user to not prepend array: if they don't want to)
                    if ((1 == json.size()) && json.containsKey((Object) "array")) {
                        if (!key.startsWith("array:") && !key.startsWith(":array") && !key.startsWith("$:array")
                                && !key.startsWith("::") && !key.startsWith("$::")) {
                            if (key.startsWith(":")) { // jpath
                                key = ":array" + key;
                            } else if (key.startsWith("$:")) { // jpath
                                key = "$:array" + key.substring(1);
                            } else {
                                key = "array:" + key;
                            }
                        }
                    } //TESTED (by hand)
                    if (key.startsWith(":")) { // jpath
                        key = "$" + key;
                    }
                    // NOTE: *not* org.json.JSONArray
                    JSONArray candidateEntities = null;
                    if (key.startsWith("$")) {
                        JSONArray candidateEntities_tmp = JsonPath.read(json.toString(), key.replace(':', '.'));
                        if (null != candidateEntities_tmp) {
                            candidateEntities = new JSONArray();
                            for (Object o : candidateEntities_tmp) {
                                if (o instanceof String) {
                                    candidateEntities.add(o);
                                } else if (o instanceof JSONArray) {
                                    candidateEntities.addAll((JSONArray) o);
                                }
                            } //TESTED (displayUrl vs entities, 3.2)
                        }
                        //DEBUG
                        //System.out.println(candidateEntities);
                    } //(TESTED (permutations above by hand))
                    else {
                        String s = (String) MongoDbUtil.getProperty(json, key.replace(':', '.'));
                        if (null != s) {
                            candidateEntities = new JSONArray();
                            candidateEntities.add(s);
                        }
                    } //TESTED (3.1)
                    if (null != candidateEntities)
                        for (int i = 0; i < candidateEntities.size(); ++i) {
                            Object o = candidateEntities.get(i);
                            if (!(o instanceof String)) {
                                continue;
                            }
                            String s = o.toString();
                            if (null == doc) {
                                doc = new BasicDBObject();
                                //(various fields added below)
                            }
                            if (docInfo.getValue().equalsIgnoreCase(DocumentPojo.displayUrl_)) {
                                doc.put(DocumentPojo.displayUrl_, s);
                            } //TESTED (3.1, 4.*)
                            else { // Entities!
                                if (null == ents) {
                                    ents = new BasicDBList();
                                }
                                String index = s.toLowerCase() + "/" + docInfo.getValue().toLowerCase();
                                if (null == entDedup) {
                                    entDedup = new HashSet<String>();
                                } else if (entDedup.contains(index)) { // Entity deduplication
                                    continue;
                                } //TESTED (3.2)
                                entDedup.add(index);
                                if (null == entVals) {
                                    entVals = new StringBuffer(": ");
                                } else {
                                    entVals.append(", ");
                                }
                                entVals.append(s);
                                String dimension = null;
                                if (null != endpointInfo.typeToDimensionMap) {
                                    try {
                                        dimension = EntityPojo.Dimension
                                                .valueOf(endpointInfo.typeToDimensionMap.get(docInfo.getValue()))
                                                .toString();
                                    } catch (Exception e) {
                                    }
                                }
                                if (null == dimension) {
                                    dimension = EntityPojo.Dimension.What.toString();
                                } //TESTED (by hand)
                                // (alternative to "made up" values would be to go looking in the existing docs/ents?)
                                // (we'll try to avoid that for now...)
                                BasicDBObject ent = new BasicDBObject();
                                ent.put(EntityPojo.disambiguated_name_, s);
                                ent.put(EntityPojo.type_, docInfo.getValue());
                                ent.put(EntityPojo.dimension_, dimension);
                                ent.put(EntityPojo.relevance_, 1.0);
                                ent.put(EntityPojo.doccount_, 1L); // (ie relative to this query)
                                ent.put(EntityPojo.averageFreq_, 1.0);
                                ent.put(EntityPojo.datasetSignificance_, 10.0); // (ie relative to this query)
                                ent.put(EntityPojo.significance_, 10.0); // (ie relative to this query)
                                ent.put(EntityPojo.frequency_, 1.0);
                                ent.put(EntityPojo.index_, index);
                                ent.put(EntityPojo.queryCoverage_, 100.0); // (ie relative to this query)
                                ent.put(EntityPojo.totalfrequency_, 1.0); // (ie relative to this query)
                                ents.add(ent);
                            } //TESTED (3.1, 4.*)
                        }
                } catch (Exception e) {
                    //(do nothing? null or the wrong type)
                    //e.printStackTrace();
                }
            } //end loop over various JSON objects retrieved
        } //(End loop over doc conversion elements)
    } //TESTED (3.*, 4.*)
    if ((null == ents) && !_testMode) { // don't return unless there are any entities
        return null;
    } else if (null != doc) {
        // Insert mandatory fields:
        // (Note the query format is a little bit different, the following fields are converted to arrays:
        //  sourceKey, source, communityId, mediaType)
        doc.put(DocumentPojo._id_, new ObjectId());
        doc.put(DocumentPojo.url_, url);
        doc.put(DocumentPojo.created_, new Date());
        doc.put(DocumentPojo.modified_, new Date());
        doc.put(DocumentPojo.publishedDate_, new Date());
        doc.put(DocumentPojo.sourceKey_, endpointInfo.parentSource.getKey());
        doc.put(DocumentPojo.source_, endpointInfo.parentSource.getTitle());
        doc.put(DocumentPojo.communityId_, new ObjectId(request.communityIdStrs[0]));
        doc.put(DocumentPojo.mediaType_, endpointInfo.parentSource.getMediaType());
        doc.put(DocumentPojo.metadata_, new BasicDBObject("json", jsonList.toArray()));
        if ((null != entVals) && (entVals.length() > 165)) { // (arbitrary length)
            entVals.setLength(165);
            entVals.append("...");
        }
        doc.put(DocumentPojo.title_, new StringBuffer(endpointInfo.titlePrefix).append(": ")
                .append(request.requestParameter).append(entVals).toString());
        doc.put(DocumentPojo.entities_, ents);
        Gson gson = new GsonBuilder().setPrettyPrinting().create();
        JsonParser jp = new JsonParser();
        JsonElement je = jp.parse(jsonList.toString());
        doc.put(DocumentPojo.description_, gson.toJson(je)); // (prettified JSON)
    } //TESTED (3.*, 4.*)
    return doc;
}
From source file:com.ikanow.infinit.e.api.knowledge.processing.AggregationUtils.java
License:Open Source License
public static void loadAggregationResults(ResponsePojo rp, Facets facets, Aggregations aggs,
        AggregationOutputPojo aggOutParams, ScoringUtils scoreStats, AliasLookupTable aliasLookup,
        String[] entityTypeFilterStrings, String[] assocVerbFilterStrings,
        AggregationUtils.GeoContainer extraAliasAggregatedGeo) {
    HashMap<String, List<? extends Object>> moments = null;
    if ((null != facets) && (null != facets.getFacets()))
        for (Map.Entry<String, Facet> facet : facets.getFacets().entrySet()) {
            // Geo
            if (facet.getKey().equals("geo")) {
                TermsFacet geoFacet = (TermsFacet) facet.getValue();
                Set<GeoAggregationPojo> geoCounts = null;
                int nHighestCount = -1;
                int nLowestCount = Integer.MAX_VALUE;
                // If we've got some geotags from the alias masters then start with them:
                if ((null != extraAliasAggregatedGeo) && (null != extraAliasAggregatedGeo.geotags)) {
                    geoCounts = extraAliasAggregatedGeo.geotags;
                    nHighestCount = (int) extraAliasAggregatedGeo.minCount;
                    nLowestCount = (int) extraAliasAggregatedGeo.maxCount;
                } else {
                    geoCounts = new TreeSet<GeoAggregationPojo>();
                }
                for (TermsFacet.Entry geo : geoFacet.getEntries()) {
                    String geohash = FacetUtils.getTerm(geo).substring(2);
                    double[] loc = GeoHashUtils.decode(geohash);
                    GeoAggregationPojo geoObj = new GeoAggregationPojo(loc[0], loc[1]);
                    geoObj.count = geo.getCount();
                    geoObj.type = GeoOntologyMapping.decodeOntologyCode(FacetUtils.getTerm(geo).charAt(0));
                    geoCounts.add(geoObj);
                    // (note this aggregates geo points whose decoded lat/logns are the same, which can result in slightly fewer records than requested)
                    // (note the aggregation writes the aggregated count into geoObj.count)
                    if (geoObj.count > nHighestCount) { // (the counts can be modified by the add command above)
                        nHighestCount = geo.getCount();
                    }
                    if (geoObj.count < nLowestCount) {
                        nLowestCount = geo.getCount();
                    }
                }
                rp.setGeo(geoCounts, nHighestCount, nLowestCount);
            } //(TESTED)
            if (facet.getKey().equals("time")) {
                DateHistogramFacet timeFacet = (DateHistogramFacet) facet.getValue();
                rp.setTimes(timeFacet.getEntries(), QueryHandler.getInterval(aggOutParams.timesInterval, 'm'));
            } //(TESTED)
            if (facet.getKey().equals("events")) {
                TermsFacet eventsFacet = (TermsFacet) facet.getValue();
                rp.setEvents(parseEventAggregationOutput("Event", eventsFacet, scoreStats, aliasLookup,
                        entityTypeFilterStrings, assocVerbFilterStrings));
            }
            if (facet.getKey().equals("facts")) {
                TermsFacet factsFacet = (TermsFacet) facet.getValue();
                rp.setFacts(parseEventAggregationOutput("Fact", factsFacet, scoreStats, aliasLookup,
                        entityTypeFilterStrings, assocVerbFilterStrings));
            } //TESTED x2
            if (facet.getKey().equals("sourceTags")) {
                TermsFacet tagsFacet = (TermsFacet) facet.getValue();
                rp.setSourceMetaTags(tagsFacet.getEntries());
            }
            if (facet.getKey().equals("sourceTypes")) {
                TermsFacet typesFacet = (TermsFacet) facet.getValue();
                rp.setSourceMetaTypes(typesFacet.getEntries());
            }
            if (facet.getKey().equals("sourceKeys")) {
                TermsFacet keysFacet = (TermsFacet) facet.getValue();
                rp.setSources(keysFacet.getEntries());
            } //TESTED x3
            // Moments (basic functionality)
            if (facet.getKey().startsWith("moments.")) {
                DateHistogramFacet momentFacet = (DateHistogramFacet) facet.getValue();
                if (null == moments) {
                    moments = new HashMap<String, List<? extends Object>>();
                }
                moments.put(facet.getKey().substring(8), momentFacet.getEntries());
            } //TESTED
        } //(end loop over generated facets)
    if ((null != aggs) && (null != aggs.asMap()))
        for (Map.Entry<String, Aggregation> agg : aggs.asMap().entrySet()) {
            if (agg.getKey().equals("moments")) {
                if (null == moments) {
                    moments = new HashMap<String, List<? extends Object>>();
                }
                DateHistogram val = (DateHistogram) agg.getValue();
                //TODO (INF-2688): Finalize format
                BasicDBList dbl = new BasicDBList();
                for (DateHistogram.Bucket dateBucket : val.getBuckets()) {
                    if (dateBucket.getKeyAsNumber().longValue() > 0) {
                        BasicDBObject dataBucketDbo = new BasicDBObject();
                        dataBucketDbo.put("time", dateBucket.getKeyAsNumber().longValue());
                        dataBucketDbo.put("count", dateBucket.getDocCount());
                        for (Map.Entry<String, Aggregation> dateAggs : dateBucket.getAggregations().asMap()
                                .entrySet()) {
                            if (dateAggs.getKey().equals("geo")) {
                                BasicDBList dbl_geo = new BasicDBList();
                                MultiBucketsAggregation geoVal = (MultiBucketsAggregation) dateAggs.getValue();
                                long nHighestCount = Long.MIN_VALUE;
                                for (MultiBucketsAggregation.Bucket geoBucket : geoVal.getBuckets()) {
                                    String geohash = geoBucket.getKey().substring(2);
                                    double[] loc = GeoHashUtils.decode(geohash);
                                    GeoAggregationPojo geoObj = new GeoAggregationPojo(loc[0], loc[1]);
                                    BasicDBObject geoDbo = new BasicDBObject(4);
                                    geoDbo.put("lat", geoObj.lat);
                                    geoDbo.put("lon", geoObj.lon);
                                    geoDbo.put("count", geoBucket.getDocCount());
                                    geoDbo.put("type",
                                            GeoOntologyMapping.decodeOntologyCode(geoBucket.getKey().charAt(0)));
                                    dbl_geo.add(geoDbo);
                                    if (geoBucket.getDocCount() > nHighestCount) { // (the counts can be modified by the add command above)
                                        nHighestCount = geoBucket.getDocCount();
                                    }
                                }
                                dataBucketDbo.put("maxGeoCount", nHighestCount);
                                dataBucketDbo.put("geo", dbl_geo);
                            }
                        }
                        dbl.add(dataBucketDbo);
                    }
                }
                moments.put("times", dbl);
            }
        } //(end loop over generated aggregations)
    if ((null != moments) && !moments.isEmpty()) {
        rp.setMoments(moments, QueryHandler.getInterval(aggOutParams.moments.timesInterval, 'm'));
    }
}
From source file:com.ikanow.infinit.e.application.utils.LogstashConfigUtils.java
License:Open Source License
public static BasicDBObject parseLogstashConfig(String configFile, StringBuffer error) {
    BasicDBObject tree = new BasicDBObject();

    // Stage 0: remove escaped "s and 's (for the purpose of the validation):
    // (prevents tricksies with escaped "s and then #s)
    // (http://stackoverflow.com/questions/5082398/regex-to-replace-single-backslashes-excluding-those-followed-by-certain-chars)
    configFile = configFile.replaceAll("(?<!\\\\)(?:((\\\\\\\\)*)\\\\)[\"']", "X");
    //TESTED (by hand - using last 2 fields of success_2_1)

    // Stage 1: remove #s, and anything in quotes (for the purpose of the validation)
    configFile = configFile.replaceAll("(?m)(?:([\"'])(?:(?!\\1).)*\\1)", "VALUE").replaceAll("(?m)(?:#.*$)", "");
    //TESTED (2_1 - including with a # inside the ""s - Event_Date -> Event_#Date)
    //TESTED (2_2 - various combinations of "s nested inside 's) ... yes that is a negative lookahead up there - yikes!

    // Stage 2: get a nested list of objects
    int depth = 0;
    int ifdepth = -1;
    Stack<Integer> ifStack = new Stack<Integer>();
    BasicDBObject inputOrFilter = null;
    Matcher m = _navigateLogstash.matcher(configFile);
    // State:
    String currTopLevelBlockName = null;
    String currSecondLevelBlockName = null;
    BasicDBObject currSecondLevelBlock = null;
    while (m.find()) {
        boolean simpleField = false;

        //DEBUG
        //System.out.println("--DEPTH="+depth + " GROUP=" + m.group() + " IFS" + Arrays.toString(ifStack.toArray()));
        //System.out.println("STATES: " + currTopLevelBlockName + " AND " + currSecondLevelBlockName);

        if (m.group().equals("}")) {
            if (ifdepth == depth) { // closing an if statement
                ifStack.pop();
                if (ifStack.isEmpty()) {
                    ifdepth = -1;
                } else {
                    ifdepth = ifStack.peek();
                }
            } //TESTED (1_1bc, 2_1)
            else { // closing a processing block
                depth--;
                if (depth < 0) { // {} Mismatch
                    error.append("{} Mismatch (})");
                    return null;
                } //TESTED (1_1abc)
            }
        } else { // new attribute!
            String typeName = m.group(1);
            if (null == typeName) { // it's an if statement or a string value
                typeName = m.group(4);
                if (null != typeName) {
                    simpleField = true;
                }
            } else if (typeName.equalsIgnoreCase("else")) { // It's an if statement..
                typeName = null;
            }
            if (null == typeName) { // if statement after all
                // Just keep track of ifs so we can ignore them
                ifStack.push(depth);
                ifdepth = depth;
                // (don't increment depth)
            } //TESTED (1_1bc, 2_1)
            else { // processing block
                String subTypeName = m.group(3);
                if (null != subTypeName) { // eg codec.multiline
                    typeName = typeName + "." + subTypeName;
                } //TESTED (2_1, 2_3)

                if (depth == 0) { // has to be one of input/output/filter)
                    String topLevelType = typeName.toLowerCase();
                    if (topLevelType.equalsIgnoreCase("input") || topLevelType.equalsIgnoreCase("filter")) {
                        if (tree.containsField(topLevelType)) {
                            error.append("Multiple input or filter blocks: " + topLevelType);
                            return null;
                        } //TESTED (1_3ab)
                        else {
                            inputOrFilter = new BasicDBObject();
                            tree.put(topLevelType, inputOrFilter);

                            // Store state:
                            currTopLevelBlockName = topLevelType;
                        } //TESTED (*)
                    } else {
                        if (topLevelType.equalsIgnoreCase("output")) {
                            error.append(
                                    "Not allowed output blocks - these are appended automatically by the logstash harvester");
                        } else {
                            error.append("Unrecognized processing block: " + topLevelType);
                        }
                        return null;
                    } //TESTED (1_4a)
                } else if (depth == 1) { // processing blocks
                    String subElType = typeName.toLowerCase();

                    // Some validation: can't include a type called "filter" anywhere
                    if ((null != currTopLevelBlockName) && currTopLevelBlockName.equals("input")) {
                        if (subElType.equals("filter") || subElType.endsWith(".filter")) {
                            error.append("Not allowed sub-elements of input called 'filter' (1)");
                            return null;
                        }
                    } //TESTED (1_5b)

                    BasicDBList subElements = (BasicDBList) inputOrFilter.get(subElType);
                    if (null == subElements) {
                        subElements = new BasicDBList();
                        inputOrFilter.put(subElType, subElements);
                    }
                    BasicDBObject newEl = new BasicDBObject();
                    subElements.add(newEl);

                    // Store state:
                    currSecondLevelBlockName = subElType;
                    currSecondLevelBlock = newEl;
                } //TESTED (*)
                else if (depth == 2) { // attributes of processing blocks
                    // we'll just store the field names for these and do any simple validation that was too complicated for the regexes
                    String subSubElType = typeName.toLowerCase();

                    // Validation:
                    if (null != currTopLevelBlockName) {
                        // 1] sincedb path
                        if (currTopLevelBlockName.equals("input") && (null != currSecondLevelBlockName)) {
                            // (don't care what the second level block name is - no sincedb allowed)
                            if (subSubElType.equalsIgnoreCase("sincedb_path")) {
                                error.append("Not allowed sincedb_path in input.* block");
                                return null;
                            } //TESTED (1_5a)
                            // 2] no sub-(-sub etc)-elements of input called filter
                            if (subSubElType.equals("filter") || subSubElType.endsWith(".filter")) {
                                error.append("Not allowed sub-elements of input called 'filter' (2)");
                                return null;
                            } //TESTED (1_5c)
                        }
                    }
                    // Store in map:
                    if (null != currSecondLevelBlock) {
                        currSecondLevelBlock.put(subSubElType, new BasicDBObject());
                    }
                }
                // (won't go any deeper than this)
                if (!simpleField) {
                    depth++;
                }
            }
        }
    }
    if (0 != depth) {
        error.append("{} Mismatch ({)");
        return null;
    } //TESTED (1_2a)

    return tree;
}
From source file:com.ikanow.infinit.e.data_model.api.knowledge.DocumentPojoApiMap.java
License:Apache License
public static void mapToApi(BasicDBObject doc) {
    // 1. (doc_index field)
    doc.remove(DocumentPojo.index_);

    // 2. (source title)
    String tmp = doc.getString(DocumentPojo.source_);
    if (null != tmp) {
        BasicDBList array = new BasicDBList();
        array.add(tmp);
        doc.put(DocumentPojo.source_, array);
    }
    // 3. (source key)
    tmp = DocumentPojo.getSourceKey(doc.getString(DocumentPojo.sourceKey_));
    if (null != tmp) {
        BasicDBList array = new BasicDBList();
        array.add(tmp);
        doc.put(DocumentPojo.sourceKey_, array);
    }
    // 4. (media type)
    tmp = doc.getString(DocumentPojo.mediaType_);
    if (null != tmp) {
        BasicDBList array = new BasicDBList();
        array.add(tmp);
        doc.put(DocumentPojo.mediaType_, array);
    }
}
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License:Apache License
/**
 * Checks if the new params MAX_SPLITS and MAX_DOCS_PER_SPLIT are set
 * in the config. If they are it will use those to do splits via limit/skip
 * otherwise it will call the previous chunking splitter in MongoSplitter.
 *
 * @param conf
 * @return
 */
public static List<InputSplit> calculateSplits(InfiniteMongoConfig conf) {
    // First off: What is our sharding scheme?
    boolean shardingPolicyNew = false;
    try {
        BasicDBObject shardQuery = new BasicDBObject("_id", "doc_metadata.metadata");
        BasicDBObject shardInfo = (BasicDBObject) DbManager.getCollection("config", "collections")
                .findOne(shardQuery);
        if (null != shardInfo) {
            BasicDBObject shardInfoKey = (BasicDBObject) shardInfo.get("key");
            if (null != shardInfoKey) {
                shardingPolicyNew = (shardInfoKey.size() > 1);
            }
        }
    } //TESTED (new and old)
    catch (Exception e) {
    } // stick with the old sharding, it's probably going to die soon after though, honestly

    // conf.getQuery returns a new copy of the query, so get once and use everywhere...
    BasicDBObject confQuery = (BasicDBObject) conf.getQuery();
    BasicDBObject srcTagsQuery = (BasicDBObject) conf.getSourceTags();

    String collection = conf.getInputURI().getCollection();
    if (!collection.equals(DbManager.getDocument().getContent().getName())
            && !collection.equals(DbManager.getDocument().getMetadata().getName())) {
        // Case 1: feature table or custom table
        // Just run legacy code
        return calculateSplits_phase2(conf, confQuery, false, false, null);
    } else { // complex cases...
        boolean simpleOtherIndex = false;
        // Check whether a simple query has been performed on a different indexed field
        if (null == srcTagsQuery) { // (if srcTags specified, then going to want to use sourceKey as the index)
            for (String s : Arrays.asList(EntityPojo.docQuery_index_, DocumentPojo.url_)) {
                Object selector = confQuery.get(s);
                if (selector instanceof String) {
                    simpleOtherIndex = true;
                    break;
                } else if (selector instanceof DBObject) {
                    DBObject selectorDbo = (DBObject) selector;
                    if (selectorDbo.containsField(DbManager.in_)) {
                        simpleOtherIndex = true;
                        break;
                    }
                }
            } //TESTED (both types, plus check complex indexes don't work)
            // ALLOWED: {"entities.index": { "$in": [ "xxx", "yyy"] }, {"entities.index": "xxx" }, ditto for "url"
            // NOT ALLOWED: { "entities.index": { "$ne": "xxx" } }
        } //TESTED check ignored if eg entity_index specified

        if (simpleOtherIndex) {
            // Case 2: we have a simple query on an indexed field
            // Just run legacy code
            return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
        } //TESTED
        else if (conf.getLimit() > 0) { // debug
            //Case 3: Ensure we have small sets of sources to search over
            BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery, srcTagsQuery,
                    conf.getMaxDocsPerSplit());
            final List<InputSplit> splits = new ArrayList<InputSplit>();

            boolean queryNonTrivial = isQueryNonTrivial(confQuery);
            if (!queryNonTrivial) {
                //Case 3a: query is trivial, so can just create splits directly from the split pre-calcs
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();
                    int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                    int toGet = (docCount > toProcess) ? toProcess : docCount;
                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), modQuery,
                                conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                        toProcess -= docCount;
                    }
                } //TESTED
            } else {
                // Case 3b: annoying, some extra query terms, gonna need to do it the hard way...
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                DBCollection coll = InfiniteMongoConfigUtil.getCollection(conf.getInputURI());
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();
                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        int docsCounted = (int) coll.getCount(modQuery, null, toProcess, 0);
                        int toGet = (docsCounted > toProcess) ? toProcess : docsCounted;
                        if (docsCounted > 0) {
                            splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(),
                                    modQuery, conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                            toProcess -= docsCounted;
                        }
                    } //TESTED
                }
            } //TESTED
            return splits;
        } else { // More complex cases:
            if (shardingPolicyNew) {
                // Case 4a: NEW SHARDING SCHEME
                // Always fetch the new sources, eg convert communityId to sourceKeys
                try {
                    splitPrecalculations_newShardScheme(confQuery, srcTagsQuery); // (modifies confQuery if returns true)
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);

                    return calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null);
                    // (ie trivial query => always use chunks, bypass skip/limit test)
                } //TESTED (trivial + non-trivial)
                catch (Exception e) { // Didn't match any sources, no problem
                    return new ArrayList<InputSplit>();
                } //TESTED
            } //TESTED
            else {
                BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery,
                        srcTagsQuery, conf.getMaxDocsPerSplit());
                if (null == collectionOfSplits) {
                    // Case 4b: OLD SHARDING SCHEME can't get a partition by source keys, just back off to old code
                    return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
                } //TESTED (old code)
                else {
                    conf.setMaxDocsPerSplit(2 * conf.getMaxDocsPerSplit());
                    // (because we stop creating splits when the exceed the size)

                    // Case 4c: OLD SHARDING SCHEME, have a source key partition
                    int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits();
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);
                    final List<InputSplit> splits = new ArrayList<InputSplit>();

                    BasicDBObject savedQuery = confQuery;

                    Iterator<Object> itSplit = collectionOfSplits.iterator();
                    BasicDBList bigSplit = null;
                    while (itSplit.hasNext()) {
                        BasicDBObject split = (BasicDBObject) itSplit.next();
                        int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                        if (docCount < nMaxCount) { // small split, will use skip/limit
                            BasicDBObject modQuery = convertQuery(savedQuery, split.get(DocumentPojo.sourceKey_));
                            if (null != modQuery) {
                                final int SPLIT_THRESHOLD = 3;
                                // A few cases:
                                if ((docCount < (SPLIT_THRESHOLD * conf.getMaxDocsPerSplit()))
                                        || !queryNonTrivial) {
                                    splits.addAll(calculateSplits_phase2(conf, modQuery, false, shardingPolicyNew,
                                            (Integer) docCount));
                                } //TESTED (based on limit, based on query)
                                else {
                                    // My guess at the point at which you might as well as do the full query in the hope you're going
                                    // to save some (empty) splits
                                    splits.addAll(
                                            calculateSplits_phase2(conf, modQuery, false, shardingPolicyNew, null));
                                } //TESTED
                            } //TESTED
                        } else { // large split, combine all these guys into an array of source keys
                            if (null == bigSplit) {
                                bigSplit = new BasicDBList();
                            }
                            bigSplit.add(split.get(DocumentPojo.sourceKey_)); // (guaranteed to be a single element)
                        }
                    } //(end loop over collections)

                    if (null != bigSplit) {
                        // If we have a big left over community then create a set of splits for that - always chunks if query trivial
                        if (1 == bigSplit.size()) {
                            confQuery.put(DocumentPojo.sourceKey_, bigSplit.iterator().next());
                        } else {
                            confQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, bigSplit));
                        }
                        splits.addAll(calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null));
                    } //TESTED: singleton+trivial (sandy), array+trivial (sentiment/enron), array+non-trivial (sentiment/enron, docGeo), singleton+non-trivial (sandy, docGeo)

                    return splits;
                } //TESTED: end if Cases 4a, 4b, 4c
            } //(end if old vs new sharding policy)
        } //(non-debug case)
    } //(content or metadata table are most complex)
}
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License:Apache License
@SuppressWarnings("unchecked")
public static BasicDBList splitPrecalculations_oldShardSchemeOrDebug(BasicDBObject query,
        BasicDBObject srcTagsQuery, int maxCountPerTask) {
    // Get the communityIds from the query
    Collection<ObjectId> communityIds = null;
    try {
        BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_);
        communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
        if (null == communityIds) {
            return null;
        }
    } catch (Exception e) {
        return null; // back out
    }

    BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_,
            new BasicDBObject(DbManager.in_, communityIds));
    BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
    keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
    BasicDBObject sortFields = new BasicDBObject(SourcePojo.key_, 1);

    // Get and remove the sourceKey information, incorporate into source query:
    Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);
    if (null != sourceKeyQueryTerm) {
        keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
    } //TESTED
    if (null != srcTagsQuery) {
        keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
    } //TESTED

    DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(sortFields);
    // (note the sort is needed so that the potentially expensive doc query has a sensibly ordered $in clause)
    if (dbc.count() > 5000) {
        // (too many source keys to process, just going to leave well alone... note this means $srctags will fail open)
        return null;
    } else {
        //TreeMap<String, Long> sourceKeys = new TreeMap<String, Long>();
        // Build collections of objects of format { sourceKey: string or [], totalDocs }
        BasicDBList sourceKeyListCollection = new BasicDBList();
        BasicDBList sourceKeyList = null;
        int runningDocs = 0;
        int runningSources = 0;
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            String sourceKey = (String) dbo.get(SourcePojo.key_);
            if (null != sourceKey) {
                long docCount = 0L;
                try {
                    BasicDBObject harvestStatus = (BasicDBObject) dbo.get(SourcePojo.harvest_);
                    if (null != harvestStatus) {
                        docCount = harvestStatus.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                    }
                } catch (Exception e) {
                }

                //DEBUG
                //System.out.println("SOURCE=" + sourceKey + " DOC_COUNT=" + docCount + " RUNNING=" + runningDocs +"," + runningSources + ": " + sourceKeyList);

                if (docCount > maxCountPerTask) { // source is large enough by itself
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKey);
                    collection.put(SourceHarvestStatusPojo.doccount_, docCount);
                    sourceKeyListCollection.add(collection);
                    // (leaving running* alone, can keep building that)
                } //TESTED (by eye, system community of demo cluster)
                else if ((runningDocs + docCount) > maxCountPerTask) { // have now got a large enough collection of sources
                    if (null == sourceKeyList) {
                        sourceKeyList = new BasicDBList();
                    }
                    sourceKeyList.add(sourceKey);
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKeyList);
                    collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
                    sourceKeyListCollection.add(collection);
                    sourceKeyList = null;
                    runningDocs = 0;
                    runningSources = 0;
                } //TESTED (by eye, system community of demo cluster)
                else if (runningSources >= 15) { // have a limit on the number of sources per query, to keep the queries manageable
                    sourceKeyList.add(sourceKey);
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKeyList);
                    collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
                    sourceKeyListCollection.add(collection);
                    sourceKeyList = null;
                    runningDocs = 0;
                    runningSources = 0;
                } //TESTED (by eye, system community of demo cluster)
                else { // (keep) build(ing) list
                    if (null == sourceKeyList) {
                        sourceKeyList = new BasicDBList();
                    }
                    sourceKeyList.add(sourceKey);
                    runningDocs += docCount;
                    runningSources++;
                } //TESTED (by eye, system community of demo cluster)
            } //(end if has source key)
        } //(end loop over cursor)

        // Finish off:
        if (null != sourceKeyList) {
            // Create collection
            BasicDBObject collection = new BasicDBObject();
            collection.put(DocumentPojo.sourceKey_, sourceKeyList);
            collection.put(SourceHarvestStatusPojo.doccount_, runningDocs);
            sourceKeyListCollection.add(collection);
        } //TESTED (by eye, system community of demo cluster)

        if (sourceKeyListCollection.isEmpty()) { // query returns empty
            throw new RuntimeException("Communities contain no sources");
        }
        return sourceKeyListCollection;
    } // (end if too many source keys across the communities)
}
From source file:com.ikanow.infinit.e.data_model.store.MongoDbUtil.java
License:Apache License
public static BasicDBList encodeArray(JsonArray a) {
    BasicDBList dbl = new BasicDBList();
    for (JsonElement el : a) {
        dbl.add(encodeUnknown(el));
    }
    return dbl;
}
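A hypothetical call site for the helper above (not part of the original file; the JSON literal and class name are invented, and MongoDbUtil is assumed to be on the classpath), using the same Gson classes the other examples rely on:

import com.google.gson.JsonArray;
import com.google.gson.JsonParser;
import com.mongodb.BasicDBList;

public class EncodeArrayUsage {
    public static void main(String[] args) {
        // Parse a JSON array with Gson, then convert it into a MongoDB array value
        JsonArray arr = new JsonParser().parse("[1, \"two\", {\"three\": 3}]").getAsJsonArray();
        BasicDBList dbl = MongoDbUtil.encodeArray(arr);
        System.out.println(dbl); // expect something like [ 1 , "two" , { "three" : 3 } ]
    }
}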
From source file:com.ikanow.infinit.e.harvest.enrichment.custom.JavaScriptUtils.java
License:Open Source License
public static BasicDBList parseNativeJsObject(Object returnVal, ScriptEngine engine) throws ScriptException {
    try {
        engine.put("output", returnVal);

        // Use BasicDBObject directly so I can reduce memory usage by setting the initial capacity depending on the size of the JSON array
        // BasicDBObject objFactory = new BasicDBObject();
        // engine.put("objFactory", objFactory);
        BasicDBList listFactory = new BasicDBList();
        engine.put("listFactory", listFactory);
        BasicDBList outList = new BasicDBList();
        engine.put("outList", outList);
        engine.eval("s1(output);");
        return outList;
    } catch (Exception e) {
        throw new RuntimeException("1 Cannot parse return non-JSON object: " + returnVal.getClass().toString()
                + ":" + returnVal.toString()
                + "; if embedding JAVA, considering using eg \"X = '' + X\" to convert back to native JS strings.");
    }
}
From source file:com.ikanow.infinit.e.harvest.enrichment.legacy.opencalais.ExtractorOpenCalais.java
License:Open Source License
/**
 * Takes a feed with some of the information stored in it
 * such as title, desc, etc, and needs to parse the full
 * text and add entities, events, and other metadata.
 *
 * @param partialDoc The feedpojo before extraction with fulltext field to extract on
 * @return The feedpojo after extraction with entities, events, and full metadata
 * @throws ExtractorDocumentLevelException
 */
@Override
public void extractEntities(DocumentPojo partialDoc) throws ExtractorDocumentLevelException {
    if (null == partialDoc) {
        return;
    }
    configure(partialDoc.getTempSource());
    num_extraction_requests.incrementAndGet();
    try {
        if (null == partialDoc.getFullText()) {
            return;
        }
        if (partialDoc.getFullText().length() < 32) { // Else don't waste Extractor call/error logging
            return;
        }
        PostMethod method = createPostMethod(partialDoc.getFullText());
        int responseCode = client.executeMethod(method);

        if (responseCode == HttpStatus.SC_FORBIDDEN) //INF-1101 forbidden gets thrown when too many concurrent requests occur, try 14 more times
        {
            int count = 1;
            while (count < 15 && responseCode == HttpStatus.SC_FORBIDDEN) {
                try {
                    Thread.sleep(1800);
                } catch (Exception e) {
                } // carry on...
                responseCode = client.executeMethod(method); //attempt call again
                count++;
            }
            num_extraction_collisions.addAndGet(count);
        }
        if (responseCode == HttpStatus.SC_OK) {
            byte[] responseBytes = method.getResponseBody();
            String response = new String(responseBytes, "UTF-8");
            List<EntityPojo> entities = new ArrayList<EntityPojo>();
            List<AssociationPojo> events = new ArrayList<AssociationPojo>();
            ObjectMapper mapper = new ObjectMapper();
            JsonNode root = mapper.readValue(response, JsonNode.class);
            Iterator<JsonNode> iter = root.getElements();
            Iterator<String> iterNames = root.getFieldNames();
            List<JsonNode> eventNodes = new ArrayList<JsonNode>();
            BasicDBList rawEventObjects = null;
            while (iter.hasNext()) {
                String currNodeName = iterNames.next();
                JsonNode currNode = iter.next();
                if (!currNodeName.equals("doc")) //we can assume these are the entities/topics
                {
                    String typeGroup = currNode.get("_typeGroup").getTextValue();
                    //check typegroup to see if it is an entity
                    if (typeGroup.equals("entities")) {
                        try {
                            EntityPojo ep = new EntityPojo();
                            //get what fields we can
                            ep.setType(currNode.get("_type").getTextValue());
                            try {
                                ep.setDimension(DimensionUtility.getDimensionByType(ep.getType()));
                            } catch (java.lang.IllegalArgumentException e) {
                                ep.setDimension(EntityPojo.Dimension.What);
                            }
                            String name = "";
                            JsonNode nameNode = null;
                            try {
                                nameNode = currNode.get("name");
                                name = nameNode.getTextValue();
                            } catch (Exception ex) {
                                logger.debug("Error parsing name node: " + currNode.toString());
                                continue;
                            }
                            ep.setActual_name(name);
                            ep.setRelevance(Double.parseDouble(currNode.get("relevance").getValueAsText()));
                            ep.setFrequency((long) currNode.get("instances").size());
                            //attempt to get resolutions if they exist
                            JsonNode resolutionNode = currNode.get("resolutions");
                            if (null != resolutionNode) {
                                //resolution nodes are arrays
                                JsonNode resolutionFirst = resolutionNode.get(0);
                                ep.setSemanticLinks(new ArrayList<String>());
                                ep.getSemanticLinks().add(resolutionFirst.get("id").getTextValue()); //this is a link to an alchemy page
                                ep.setDisambiguatedName(resolutionFirst.get("name").getTextValue());
                                //check if we need to create a geo object
                                if (null != resolutionFirst.get("latitude")) {
                                    GeoPojo gp = new GeoPojo();
                                    String lat = resolutionFirst.get("latitude").getValueAsText();
                                    String lon = resolutionFirst.get("longitude").getValueAsText();
                                    gp.lat = Double.parseDouble(lat);
                                    gp.lon = Double.parseDouble(lon);
                                    ep.setGeotag(gp);
                                }
                            } else {
                                ep.setDisambiguatedName(name); // use actual name)
                            }
                            entityNameMap.put(currNodeName.toLowerCase(), ep);
                            entities.add(ep);
                        } catch (Exception ex) {
                            logger.error("Error creating event pojo from OpenCalaisNode: " + ex.getMessage(), ex);
                        }
                    } else if (typeGroup.equals("relations")) {
                        eventNodes.add(currNode);
                    }
                }
            }
            //handle events
            if (bAddRawEventsToMetadata) {
                // For now just re-process these into DB objects since we know that works...
                rawEventObjects = new BasicDBList();
            }
            for (JsonNode eventNode : eventNodes) {
                AssociationPojo event = parseEvent(eventNode);
                //remove useless events (an event is useless if it only has a verb (guessing currently)
                if (null != event) {
                    event = removeUselessEvents(event);
                    if (null != event) {
                        events.add(event);
                    }
                }
                if (bAddRawEventsToMetadata) {
                    BasicDBObject eventDbo = (BasicDBObject) com.mongodb.util.JSON.parse(eventNode.toString());
                    if (null != eventDbo) {
                        BasicDBObject transformObj = new BasicDBObject();
                        for (Map.Entry<String, Object> entries : eventDbo.entrySet()) {
                            if (entries.getValue() instanceof String) {
                                String val = (String) entries.getValue();
                                EntityPojo transformVal = findMappedEntityName(val);
                                if (null != transformVal) {
                                    transformObj.put(entries.getKey(), transformVal.getIndex());
                                    transformObj.put(entries.getKey() + "__hash", val);
                                } else {
                                    transformObj.put(entries.getKey(), val);
                                }
                            } else {
                                transformObj.put(entries.getKey(), entries.getValue());
                            }
                        }
                        // (add to another list, which will get written to metadata)
                        rawEventObjects.add(transformObj);
                    }
                }
            }
            if (bAddRawEventsToMetadata) {
                partialDoc.addToMetadata("OpenCalaisEvents", rawEventObjects.toArray());
            }
            if (null != partialDoc.getEntities()) {
                partialDoc.getEntities().addAll(entities);
                partialDoc.setEntities(partialDoc.getEntities());
            } else if (null != entities) {
                partialDoc.setEntities(entities);
            }
            if (null != partialDoc.getAssociations()) {
                partialDoc.getAssociations().addAll(events);
                partialDoc.setAssociations(partialDoc.getAssociations());
            } else if (null != events) {
                partialDoc.setAssociations(events);
            }
        } else // Error back from OC, presumably the input doc is malformed/too long
        {
            throw new InfiniteEnums.ExtractorDocumentLevelException(
                    "OpenCalais HTTP error code: " + Integer.toString(responseCode));
        }
    } catch (Exception e) {
        //DEBUG
        //e.printStackTrace();
        logger.debug("OpenCalais", e);
        //there was an error, so we return null instead
        throw new InfiniteEnums.ExtractorDocumentLevelException(e.getMessage());
    }
}
From source file:com.ikanow.infinit.e.harvest.extraction.document.database.DatabaseHarvester.java
License:Open Source License
public static BasicDBList getComplexArray(String columnName, java.sql.Array a)
        throws IllegalArgumentException, SQLException {
    BasicDBList bsonArray = new BasicDBList();
    Object array = a.getArray();
    int length = Array.getLength(array);
    for (int i = 0; i < length; ++i) {
        Object o = Array.get(array, i);
        bsonArray.add(convertJdbcTypes(columnName, o));
    }
    a.free();
    return bsonArray;
}
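A hypothetical caller for the helper above (not from the original file; the table and column names, the class name, and the surrounding harvester plumbing are invented for illustration, and DatabaseHarvester is assumed to be on the classpath):

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import com.mongodb.BasicDBList;

public class ComplexArrayUsage {
    public static void printTagArrays(Connection conn) throws Exception {
        try (Statement stmt = conn.createStatement();
                ResultSet rs = stmt.executeQuery("SELECT tags FROM documents")) {
            while (rs.next()) {
                java.sql.Array sqlArray = rs.getArray("tags");
                if (null != sqlArray) {
                    // Convert the SQL array column into a BSON array via the harvester helper above
                    BasicDBList tags = DatabaseHarvester.getComplexArray("tags", sqlArray);
                    System.out.println(tags);
                }
            }
        }
    }
}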