Example usage for com.mongodb BasicDBList BasicDBList

List of usage examples for com.mongodb BasicDBList BasicDBList


On this page you can find example usages of the com.mongodb BasicDBList constructor, BasicDBList().



Source Link


From source file:com.ikanow.infinit.e.api.knowledge.federated.SimpleFederatedQueryEngine.java

License:Open Source License

/**
 * Builds a single document object from the JSON objects returned by a federated query endpoint,
 * extracting a display URL and entities as directed by {@code endpointInfo.docConversionMap}.
 * <p>
 * NOTE(review): this listing appears to have lines elided (several blocks are never closed and a few
 * statements are cut short), so it does not compile as shown. Consult the original
 * SimpleFederatedQueryEngine.java before relying on any detail below.
 *
 * @param jsonList     JSON objects retrieved from the endpoint; each element is cast to BasicDBObject
 * @param url          value stored in the document's url field
 * @param request      federated request; its first community id string becomes the document's community id
 * @param endpointInfo endpoint configuration (conversion map, type-to-dimension map, title prefix, parent source)
 * @return the populated document, or {@code null} when no entities were found and test mode is off
 */
public BasicDBObject createDocFromJson(BasicDBList jsonList, String url, FederatedRequest request,
        SourceFederatedQueryConfigPojo endpointInfo) {
    BasicDBObject doc = null; // (don't create unless needed)
    BasicDBList ents = null; // (entity list, created lazily when the first entity is found)
    StringBuffer entVals = null; // (starts as ": " and gets ", " separators; its length is checked when building the title)
    HashSet<String> entDedup = null; // (entity index strings already seen, used for deduplication)

    if (_testMode) { // In test mode, need to return the JSON even if no entities are specified
        doc = new BasicDBObject();
    if (null != endpointInfo.docConversionMap) {
        // Each map entry is: key = path into the JSON, value = either the displayUrl field name or an entity type
        for (Map.Entry<String, String> docInfo : endpointInfo.docConversionMap.entrySet()) {
            for (Object jsonObj : jsonList) {
                BasicDBObject json = (BasicDBObject) jsonObj;
                try {
                    String key = docInfo.getKey();
                    // (allow user to not prepend array: if they don't want to)
                    // (applies only when the object's single top-level field is "array")
                    if ((1 == json.size()) && json.containsKey((Object) "array")) {
                        if (!key.startsWith("array:") && !key.startsWith(":array") && !key.startsWith("$:array")
                                && !key.startsWith("::") && !key.startsWith("$::")) {
                            // Prepend the "array" root in whichever syntax the key already uses
                            if (key.startsWith(":")) { // jpath
                                key = ":array" + key;
                            } else if (key.startsWith("$:")) { // jpath
                                key = "$:array" + key.substring(1);
                            } else {
                                key = "array:" + key;
                    } //TESTED (by hand)
                    // A leading ":" is shorthand for a jpath rooted at "$"
                    if (key.startsWith(":")) { // jpath
                        key = "$" + key;
                    // NOTE: *not* org.json.JSONArray
                    JSONArray candidateEntities = null;
                    if (key.startsWith("$")) {
                        // jpath keys are evaluated against the object's JSON string, with ':' translated to '.'
                        JSONArray candidateEntities_tmp = JsonPath.read(json.toString(), key.replace(':', '.'));
                        if (null != candidateEntities_tmp) {
                            candidateEntities = new JSONArray();
                            for (Object o : candidateEntities_tmp) {
                                if (o instanceof String) {
                                } else if (o instanceof JSONArray) {
                                    // nested arrays are flattened into the candidate list
                                    candidateEntities.addAll((JSONArray) o);
                            } //TESTED (displayUrl vs entities, 3.2)

                    } //(TESTED (permutations above by hand))
                    else {
                        // Non-jpath keys are looked up as a dotted property path and cast to String
                        String s = (String) MongoDbUtil.getProperty(json, key.replace(':', '.'));
                        if (null != s) {
                            candidateEntities = new JSONArray();
                    } //TESTED (3.1)

                    if (null != candidateEntities)
                        for (int i = 0; i < candidateEntities.size(); ++i) {
                            Object o = candidateEntities.get(i);
                            if (!(o instanceof String)) {
                            String s = o.toString();
                            if (null == doc) {
                                doc = new BasicDBObject();
                                //(various fields added below)
                            if (docInfo.getValue().equalsIgnoreCase(DocumentPojo.displayUrl_)) {
                                doc.put(DocumentPojo.displayUrl_, s);
                            } //TESTED (3.1, 4.*)
                            else { // Entities!
                                if (null == ents) {
                                    ents = new BasicDBList();
                                // Entity index has the form "<lowercased value>/<lowercased type>"
                                String index = s.toLowerCase() + "/" + docInfo.getValue().toLowerCase();

                                if (null == entDedup) {
                                    entDedup = new HashSet<String>();
                                } else if (entDedup.contains(index)) { // Entity deduplication
                                } //TESTED (3.2)

                                if (null == entVals) {
                                    entVals = new StringBuffer(": ");
                                } else {
                                    entVals.append(", ");

                                String dimension = null;
                                if (null != endpointInfo.typeToDimensionMap) {
                                    try {
                                        dimension = EntityPojo.Dimension
                                    } catch (Exception e) {
                                // Fall back to the "What" dimension when none could be determined
                                if (null == dimension) {
                                    dimension = EntityPojo.Dimension.What.toString();
                                } //TESTED (by hand)

                                // (alternative to "made up" values would be to go looking in the existing docs/ents?)
                                // (we'll try to avoid that for now...)
                                BasicDBObject ent = new BasicDBObject();
                                ent.put(EntityPojo.disambiguated_name_, s);
                                ent.put(EntityPojo.type_, docInfo.getValue());
                                ent.put(EntityPojo.dimension_, dimension);
                                ent.put(EntityPojo.relevance_, 1.0);
                                ent.put(EntityPojo.doccount_, 1L); // (ie relative to this query)
                                ent.put(EntityPojo.averageFreq_, 1.0);
                                ent.put(EntityPojo.datasetSignificance_, 10.0); // (ie relative to this query)
                                ent.put(EntityPojo.significance_, 10.0); // (ie relative to this query)
                                ent.put(EntityPojo.frequency_, 1.0);
                                ent.put(EntityPojo.index_, index);
                                ent.put(EntityPojo.queryCoverage_, 100.0); // (ie relative to this query)
                                ent.put(EntityPojo.totalfrequency_, 1.0); // (ie relative to this query)
                            } //TESTED (3.1, 4.*)
                } catch (Exception e) {
                    //(do nothing? null or the wrong type)
            } //end loop over various JSON objects retrieved
        } //(End loop over doc conversion elements)
    } //TESTED (3.*, 4.*)

    if ((null == ents) && !_testMode) { // don't return unless there are any entities
        return null;
    } else if (null != doc) {
        // Insert mandatory fields:
        // (Note the query format is a little bit different, the following fields are converted to arrays:
        //  sourceKey, source, communityId, mediaType)
        doc.put(DocumentPojo._id_, new ObjectId());
        doc.put(DocumentPojo.url_, url);
        // created, modified and published dates are each set to a fresh "now"
        doc.put(DocumentPojo.created_, new Date());
        doc.put(DocumentPojo.modified_, new Date());
        doc.put(DocumentPojo.publishedDate_, new Date());
        doc.put(DocumentPojo.sourceKey_, endpointInfo.parentSource.getKey());
        doc.put(DocumentPojo.source_, endpointInfo.parentSource.getTitle());
        doc.put(DocumentPojo.communityId_, new ObjectId(request.communityIdStrs[0]));
        doc.put(DocumentPojo.mediaType_, endpointInfo.parentSource.getMediaType());
        // The raw JSON objects are kept under metadata.json
        doc.put(DocumentPojo.metadata_, new BasicDBObject("json", jsonList.toArray()));

        if ((null != entVals) && (entVals.length() > 165)) { // (arbitrary length)
        doc.put(DocumentPojo.title_, new StringBuffer(endpointInfo.titlePrefix).append(": ")
        doc.put(DocumentPojo.entities_, ents);
        // Description is the whole JSON list re-serialized with pretty printing
        Gson gson = new GsonBuilder().setPrettyPrinting().create();
        JsonParser jp = new JsonParser();
        JsonElement je = jp.parse(jsonList.toString());
        doc.put(DocumentPojo.description_, gson.toJson(je)); // (prettified JSON)
    } //TESTED (3.*, 4.*)

    return doc;

From source file:com.ikanow.infinit.e.api.knowledge.processing.AggregationUtils.java

License:Open Source License

/**
 * Copies facet and aggregation results into the response object: geo counts, times, events, facts
 * and "moments". The sourceTags/sourceTypes/sourceKeys facets are also matched, but their handling
 * is not visible in this listing.
 * <p>
 * NOTE(review): this listing appears to have lines elided (several blocks are never closed and at
 * least one statement is cut short), so it does not compile as shown. Consult the original
 * AggregationUtils.java before relying on any detail below.
 *
 * @param rp                      response object that receives the results through its setters
 * @param facets                  generated facets; skipped when null or when it has no facet map
 * @param aggs                    generated aggregations; skipped when null or when it has no map
 * @param aggOutParams            supplies the time intervals passed to QueryHandler.getInterval
 * @param scoreStats              passed through to parseEventAggregationOutput
 * @param aliasLookup             passed through to parseEventAggregationOutput
 * @param entityTypeFilterStrings passed through to parseEventAggregationOutput
 * @param assocVerbFilterStrings  passed through to parseEventAggregationOutput
 * @param extraAliasAggregatedGeo optional geotags and counts used to seed the geo results; may be null
 */
public static void loadAggregationResults(ResponsePojo rp, Facets facets, Aggregations aggs,
        AggregationOutputPojo aggOutParams, ScoringUtils scoreStats, AliasLookupTable aliasLookup,
        String[] entityTypeFilterStrings, String[] assocVerbFilterStrings,
        AggregationUtils.GeoContainer extraAliasAggregatedGeo) {
    // Created lazily; filled from "moments.*" facets and from the "moments" aggregation
    HashMap<String, List<? extends Object>> moments = null;

    if ((null != facets) && (null != facets.getFacets()))
        for (Map.Entry<String, Facet> facet : facets.getFacets().entrySet()) {
            // Geo

            if (facet.getKey().equals("geo")) {
                TermsFacet geoFacet = (TermsFacet) facet.getValue();
                Set<GeoAggregationPojo> geoCounts = null;
                int nHighestCount = -1;
                int nLowestCount = Integer.MAX_VALUE;
                // If we've got some geotags from the alias masters then start with them:
                if ((null != extraAliasAggregatedGeo) && (null != extraAliasAggregatedGeo.geotags)) {
                    geoCounts = extraAliasAggregatedGeo.geotags;
                    // NOTE(review): highest is seeded from minCount and lowest from maxCount - this looks
                    // swapped; confirm against the original source whether it is intentional
                    nHighestCount = (int) extraAliasAggregatedGeo.minCount;
                    nLowestCount = (int) extraAliasAggregatedGeo.maxCount;
                } else {
                    geoCounts = new TreeSet<GeoAggregationPojo>();
                }
                for (TermsFacet.Entry geo : geoFacet.getEntries()) {
                    // The term's first character is the ontology code; the geohash starts at index 2
                    String geohash = FacetUtils.getTerm(geo).substring(2);
                    double[] loc = GeoHashUtils.decode(geohash);
                    GeoAggregationPojo geoObj = new GeoAggregationPojo(loc[0], loc[1]);
                    geoObj.count = geo.getCount();
                    geoObj.type = GeoOntologyMapping.decodeOntologyCode(FacetUtils.getTerm(geo).charAt(0));
                    // (note this aggregates geo points whose decoded lat/logns are the same, which can result in slightly fewer records than requested)
                    // (note the aggregation writes the aggregated count into geoObj.count)

                    // NOTE(review): the comparisons use geoObj.count but the assignments use geo.getCount();
                    // if the (elided) add step changes geoObj.count these can differ - confirm which is intended
                    if (geoObj.count > nHighestCount) { // (the counts can be modified by the add command above)
                        nHighestCount = geo.getCount();
                    if (geoObj.count < nLowestCount) {
                        nLowestCount = geo.getCount();
                rp.setGeo(geoCounts, nHighestCount, nLowestCount);
            } //(TESTED)
            if (facet.getKey().equals("time")) {
                DateHistogramFacet timeFacet = (DateHistogramFacet) facet.getValue();
                rp.setTimes(timeFacet.getEntries(), QueryHandler.getInterval(aggOutParams.timesInterval, 'm'));
            } //(TESTED)

            // Events and facts share one parser, distinguished by the label passed as its first argument
            if (facet.getKey().equals("events")) {
                TermsFacet eventsFacet = (TermsFacet) facet.getValue();
                rp.setEvents(parseEventAggregationOutput("Event", eventsFacet, scoreStats, aliasLookup,
                        entityTypeFilterStrings, assocVerbFilterStrings));
            if (facet.getKey().equals("facts")) {
                TermsFacet factsFacet = (TermsFacet) facet.getValue();
                rp.setFacts(parseEventAggregationOutput("Fact", factsFacet, scoreStats, aliasLookup,
                        entityTypeFilterStrings, assocVerbFilterStrings));
            //TESTED x2

            if (facet.getKey().equals("sourceTags")) {
                TermsFacet tagsFacet = (TermsFacet) facet.getValue();
            if (facet.getKey().equals("sourceTypes")) {
                TermsFacet typesFacet = (TermsFacet) facet.getValue();
            if (facet.getKey().equals("sourceKeys")) {
                TermsFacet keysFacet = (TermsFacet) facet.getValue();
            //TESTED x3

            // Moments (basic functionality)

            if (facet.getKey().startsWith("moments.")) {
                DateHistogramFacet momentFacet = (DateHistogramFacet) facet.getValue();
                if (null == moments) {
                    moments = new HashMap<String, List<? extends Object>>();
                // substring(8) strips the 8-character "moments." prefix from the facet name
                moments.put(facet.getKey().substring(8), momentFacet.getEntries());
            } //TESTED

        } //(end loop over generated facets)

    if ((null != aggs) && (null != aggs.asMap()))
        for (Map.Entry<String, Aggregation> agg : aggs.asMap().entrySet()) {

            if (agg.getKey().equals("moments")) {
                if (null == moments) {
                    moments = new HashMap<String, List<? extends Object>>();

                DateHistogram val = (DateHistogram) agg.getValue();

                //TODO (INF-2688): Finalize format
                BasicDBList dbl = new BasicDBList();
                for (DateHistogram.Bucket dateBucket : val.getBuckets()) {
                    // Buckets whose key is not strictly positive are skipped
                    if (dateBucket.getKeyAsNumber().longValue() > 0) {
                        BasicDBObject dataBucketDbo = new BasicDBObject();
                        dataBucketDbo.put("time", dateBucket.getKeyAsNumber().longValue());
                        dataBucketDbo.put("count", dateBucket.getDocCount());
                        for (Map.Entry<String, Aggregation> dateAggs : dateBucket.getAggregations().asMap()
                                .entrySet()) {
                            if (dateAggs.getKey().equals("geo")) {

                                BasicDBList dbl_geo = new BasicDBList();
                                MultiBucketsAggregation geoVal = (MultiBucketsAggregation) dateAggs.getValue();

                                long nHighestCount = Long.MIN_VALUE;
                                for (MultiBucketsAggregation.Bucket geoBucket : geoVal.getBuckets()) {
                                    // Same key layout as the geo facet above: geohash starts at index 2
                                    String geohash = geoBucket.getKey().substring(2);
                                    double[] loc = GeoHashUtils.decode(geohash);
                                    GeoAggregationPojo geoObj = new GeoAggregationPojo(loc[0], loc[1]);
                                    BasicDBObject geoDbo = new BasicDBObject(4);
                                    geoDbo.put("lat", geoObj.lat);
                                    geoDbo.put("lon", geoObj.lon);
                                    geoDbo.put("count", geoBucket.getDocCount());
                                    geoDbo.put("type", GeoOntologyMapping

                                    if (geoBucket.getDocCount() > nHighestCount) { // (the counts can be modified by the add command above)
                                        nHighestCount = geoBucket.getDocCount();
                                dataBucketDbo.put("maxGeoCount", nHighestCount);
                                dataBucketDbo.put("geo", dbl_geo);
                // The per-bucket list is published under the "times" key of the moments map
                moments.put("times", dbl);
        } //(end loop over generated aggregations)

    if ((null != moments) && !moments.isEmpty()) {
        rp.setMoments(moments, QueryHandler.getInterval(aggOutParams.moments.timesInterval, 'm'));


From source file:com.ikanow.infinit.e.application.utils.LogstashConfigUtils.java

License:Open Source License

/**
 * Parses a logstash configuration string into a tree of its input/filter blocks, validating it
 * along the way.
 * <p>
 * NOTE(review): this listing appears to have lines elided (several blocks are never closed, the
 * depth counter is never visibly changed, and one statement is cut short), so it does not compile
 * as shown. Consult the original LogstashConfigUtils.java before relying on any detail below.
 *
 * @param configFile the logstash configuration text
 * @param error      receives a message describing the validation failure, if any
 * @return the parsed tree, or {@code null} when validation fails
 */
public static BasicDBObject parseLogstashConfig(String configFile, StringBuffer error) {

    BasicDBObject tree = new BasicDBObject();

    // Stage 0: remove escaped "s and 's (for the purpose of the validation):
    // (prevents tricksies with escaped "s and then #s)
    // (http://stackoverflow.com/questions/5082398/regex-to-replace-single-backslashes-excluding-those-followed-by-certain-chars)
    configFile = configFile.replaceAll("(?<!\\\\)(?:((\\\\\\\\)*)\\\\)[\"']", "X");
    //TESTED (by hand - using last 2 fields of success_2_1)

    // Stage 1: remove #s, and anything in quotes (for the purpose of the validation)
    // (quoted strings become the literal token VALUE first, so a # inside quotes is not treated as a comment)
    configFile = configFile.replaceAll("(?m)(?:([\"'])(?:(?!\\1).)*\\1)", "VALUE").replaceAll("(?m)(?:#.*$)",
            "");
    //TESTED (2_1 - including with a # inside the ""s - Event_Date -> Event_#Date)
    //TESTED (2_2 - various combinations of "s nested inside 's) ... yes that is a negative lookahead up there - yikes!

    // Stage 2: get a nested list of objects
    int depth = 0;
    int ifdepth = -1; // depth at which the innermost open if statement sits (-1 = none)
    Stack<Integer> ifStack = new Stack<Integer>();
    BasicDBObject inputOrFilter = null;
    Matcher m = _navigateLogstash.matcher(configFile);
    // State:
    String currTopLevelBlockName = null;
    String currSecondLevelBlockName = null;
    BasicDBObject currSecondLevelBlock = null;
    while (m.find()) {
        boolean simpleField = false;

        //System.out.println("--DEPTH="+depth + " GROUP=" + m.group() + " IFS" + Arrays.toString(ifStack.toArray()));
        //System.out.println("STATES: " + currTopLevelBlockName + " AND " + currSecondLevelBlockName);

        if (m.group().equals("}")) {

            if (ifdepth == depth) { // closing an if statement
                if (ifStack.isEmpty()) {
                    ifdepth = -1;
                } else {
                    ifdepth = ifStack.peek();
            } //TESTED (1_1bc, 2_1)
            else { // closing a processing block

                if (depth < 0) { // {} Mismatch
                    error.append("{} Mismatch (})");
                    return null;
                } //TESTED (1_1abc)
        } else { // new attribute!

            // Group 1 is the block/attribute name; group 4 is used instead for simple fields
            String typeName = m.group(1);
            if (null == typeName) { // it's an if statement or a string value
                typeName = m.group(4);
                if (null != typeName) {
                    simpleField = true;
            } else if (typeName.equalsIgnoreCase("else")) { // It's an if statement..
                typeName = null;
            if (null == typeName) { // if statement after all
                // Just keep track of ifs so we can ignore them
                ifdepth = depth;
                // (don't increment depth)
            } //TESTED (1_1bc, 2_1)
            else { // processing block
                String subTypeName = m.group(3);
                if (null != subTypeName) { // eg codec.multiline
                    typeName = typeName + "." + subTypeName;
                } //TESTED (2_1, 2_3)

                if (depth == 0) { // has to be one of input/output/filter)
                    String topLevelType = typeName.toLowerCase();
                    if (topLevelType.equalsIgnoreCase("input") || topLevelType.equalsIgnoreCase("filter")) {
                        // Only one input block and one filter block are allowed
                        if (tree.containsField(topLevelType)) {
                            error.append("Multiple input or filter blocks: " + topLevelType);
                            return null;
                        } //TESTED (1_3ab)
                        else {
                            inputOrFilter = new BasicDBObject();
                            tree.put(topLevelType, inputOrFilter);

                            // Store state:
                            currTopLevelBlockName = topLevelType;
                        } //TESTED (*)
                    } else {
                        if (topLevelType.equalsIgnoreCase("output")) {
                                    "Not allowed output blocks - these are appended automatically by the logstash harvester");
                        } else {
                            error.append("Unrecognized processing block: " + topLevelType);
                        return null;
                    } //TESTED (1_4a)
                } else if (depth == 1) { // processing blocks
                    String subElType = typeName.toLowerCase();

                    // Some validation: can't include a type called "filter" anywhere
                    if ((null != currTopLevelBlockName) && currTopLevelBlockName.equals("input")) {
                        if (subElType.equals("filter") || subElType.endsWith(".filter")) {
                            error.append("Not allowed sub-elements of input called 'filter' (1)");
                            return null;
                    } //TESTED (1_5b)

                    // Each second-level type maps to a list, created on first use
                    BasicDBList subElements = (BasicDBList) inputOrFilter.get(subElType);
                    if (null == subElements) {
                        subElements = new BasicDBList();
                        inputOrFilter.put(subElType, subElements);
                    BasicDBObject newEl = new BasicDBObject();

                    // Store state:
                    currSecondLevelBlockName = subElType;
                    currSecondLevelBlock = newEl;
                } //TESTED (*)
                else if (depth == 2) { // attributes of processing blocks
                    // we'll just store the field names for these and do any simple validation that was too complicated for the regexes
                    String subSubElType = typeName.toLowerCase();

                    // Validation:
                    if (null != currTopLevelBlockName) {
                        // 1] sincedb path
                        if (currTopLevelBlockName.equals("input") && (null != currSecondLevelBlockName)) {
                            // (don't care what the second level block name is - no sincedb allowed)
                            if (subSubElType.equalsIgnoreCase("sincedb_path")) {
                                error.append("Not allowed sincedb_path in input.* block");
                                return null;
                            } //TESTED (1_5a)
                              // 2] no sub-(-sub etc)-elements of input called filter
                            if (subSubElType.equals("filter") || subSubElType.endsWith(".filter")) {
                                error.append("Not allowed sub-elements of input called 'filter' (2)");
                                return null;
                            } //TESTED (1_5c)

                    // Store in map:
                    // (only the attribute name is recorded; its value is an empty placeholder object)
                    if (null != currSecondLevelBlock) {
                        currSecondLevelBlock.put(subSubElType, new BasicDBObject());
                // (won't go any deeper than this)
                if (!simpleField) {

    // Any depth left over after the last match means an unclosed "{"
    if (0 != depth) {
        error.append("{} Mismatch ({)");
        return null;
    } //TESTED (1_2a)

    return tree;

From source file:com.ikanow.infinit.e.data_model.api.knowledge.DocumentPojoApiMap.java

License:Apache License

/**
 * Converts a stored document into its API form in place: removes the index field and replaces the
 * source, sourceKey and mediaType fields with BasicDBList values.
 * <p>
 * NOTE(review): this listing appears to have lines elided - no closing braces are shown and each
 * list is stored empty here. Presumably the original adds {@code tmp} to each list; confirm against
 * the original DocumentPojoApiMap.java.
 *
 * @param doc the document to modify in place
 */
public static void mapToApi(BasicDBObject doc) {
    // 1. (doc_index field)
    doc.remove(DocumentPojo.index_);
    // 2. (source title)
    String tmp = doc.getString(DocumentPojo.source_);
    if (null != tmp) {
        BasicDBList array = new BasicDBList();
        doc.put(DocumentPojo.source_, array);
    // 3. (source key)
    // (the stored value is passed through DocumentPojo.getSourceKey before the null check)
    tmp = DocumentPojo.getSourceKey(doc.getString(DocumentPojo.sourceKey_));
    if (null != tmp) {
        BasicDBList array = new BasicDBList();
        doc.put(DocumentPojo.sourceKey_, array);
    // 4. (media type)
    tmp = doc.getString(DocumentPojo.mediaType_);
    if (null != tmp) {
        BasicDBList array = new BasicDBList();
        doc.put(DocumentPojo.mediaType_, array);


From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java

License:Apache License

/**
 * Checks if the new params MAX_SPLITS and MAX_DOCS_PER_SPLIT are set
 * in the config.  If they are, it will use those to do splits via limit/skip;
 * otherwise it will call the previous chunking splitter in MongoSplitter.
 *
 * @param conf the job configuration to calculate splits for
 * @return the list of input splits
 */

public static List<InputSplit> calculateSplits(InfiniteMongoConfig conf) {
    // Overview: chooses a splitting strategy for the configured input collection and query.
    //  Case 1:  not the document content/metadata collection -> legacy splitter
    //  Case 2:  simple query on another indexed field -> legacy splitter
    //  Case 3:  a limit is set (debug) -> limit-bounded splits built from per-source pre-calculations
    //  Case 4a: new sharding scheme; Cases 4b/4c: old sharding scheme without/with a source key partition
    // NOTE(review): this listing appears to have lines elided (several blocks are never closed and some
    // statements are cut short), so it does not compile as shown; consult the original
    // InfiniteMongoSplitter.java before relying on any detail below.

    // First off: What is our sharding scheme?

    boolean shardingPolicyNew = false;
    try {
        BasicDBObject shardQuery = new BasicDBObject("_id", "doc_metadata.metadata");
        BasicDBObject shardInfo = (BasicDBObject) DbManager.getCollection("config", "collections")
        if (null != shardInfo) {
            BasicDBObject shardInfoKey = (BasicDBObject) shardInfo.get("key");
            if (null != shardInfoKey) {
                // A shard key with more than one field is treated as the new sharding policy
                shardingPolicyNew = (shardInfoKey.size() > 1);
    } //TESTED (new and old)
    catch (Exception e) {
    } // stick with the old sharding, it's probably going to die soon after though, honestly

    // conf.getQuery returns a new copy of the query, so get once and use everywhere...
    BasicDBObject confQuery = (BasicDBObject) conf.getQuery();

    BasicDBObject srcTagsQuery = (BasicDBObject) conf.getSourceTags();

    String collection = conf.getInputURI().getCollection();
    if (!collection.equals(DbManager.getDocument().getContent().getName())
            && !collection.equals(DbManager.getDocument().getMetadata().getName())) {
        // Case 1: feature table or custom table
        // Just run legacy code
        return calculateSplits_phase2(conf, confQuery, false, false, null);
    } else { // complex cases...
        boolean simpleOtherIndex = false;
        // Check whether a simple query has been performed on a different indexed field
        if (null == srcTagsQuery) { // (if srcTags specified, then going to want to use sourceKey as the index)
            for (String s : Arrays.asList(EntityPojo.docQuery_index_, DocumentPojo.url_)) {
                Object selector = confQuery.get(s);
                // "Simple" means either a plain string match or an object containing the $in operator
                if (selector instanceof String) {
                    simpleOtherIndex = true;
                } else if (selector instanceof DBObject) {
                    DBObject selectorDbo = (DBObject) selector;
                    if (selectorDbo.containsField(DbManager.in_)) {
                        simpleOtherIndex = true;
            } //TESTED (both types, plus check complex indexes don't work)
              // ALLOWED: {"entities.index": { "$in": [ "xxx", "yyy"] }, {"entities.index": "xxx" }, ditto for "url"
              // NOT ALLOWED: { "entities.index": { "$ne": "xxx" } }
        //TESTED check ignored if eg entity_index specified

        if (simpleOtherIndex) {
            // Case 2: we have a simple query on an indexed field
            // Just run legacy code

            return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
        } //TESTED
        else if (conf.getLimit() > 0) { // debug
            //Case 3: Ensure we have small sets of sources to search over
            BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery, srcTagsQuery,
            final List<InputSplit> splits = new ArrayList<InputSplit>();

            boolean queryNonTrivial = isQueryNonTrivial(confQuery);
            if (!queryNonTrivial) {
                //Case 3a: query is trivial, so can just create splits directly from the split pre-calcs
                int toProcess = conf.getLimit(); // docs still wanted before the limit is reached
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();

                    // The pre-calculated doc count is trusted; each split is capped at the remaining limit
                    int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                    int toGet = (docCount > toProcess) ? toProcess : docCount;
                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), modQuery,
                                conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                        toProcess -= docCount;
                } //TESTED
            } else {
                // Case 3b: annoying, some extra query terms, gonna need to do it the hard way...
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                DBCollection coll = InfiniteMongoConfigUtil.getCollection(conf.getInputURI());
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();

                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        // Here the matching docs are counted against the collection, bounded by the remaining limit
                        int docsCounted = (int) coll.getCount(modQuery, null, toProcess, 0);
                        int toGet = (docsCounted > toProcess) ? toProcess : docsCounted;
                        if (docsCounted > 0) {
                            splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(),
                                    modQuery, conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                            toProcess -= docsCounted;
                    } //TESTED
            } //TESTED

            return splits;
        } else { // More complex cases:

            if (shardingPolicyNew) {
                // Case 4a: NEW SHARDING SCHEME

                // Always fetch the new sources, eg convert communityId to sourceKeys
                try {
                    splitPrecalculations_newShardScheme(confQuery, srcTagsQuery); // (modifies confQuery if returns true)
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);

                    return calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null);

                    // (ie trivial query => always use chunks, bypass skip/limit test)
                } //TESTED (trivial + non-trivial)
                catch (Exception e) { // Didn't match any sources, no problem
                    return new ArrayList<InputSplit>();
                } //TESTED

            } //TESTED
            else {

                BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery,
                        srcTagsQuery, conf.getMaxDocsPerSplit());

                if (null == collectionOfSplits) {
                    // Case 4b: OLD SHARDING SCHEME can't get a partition by source keys, just back off to old code
                    return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
                } //TESTED (old code)
                else {
                    // (note this changes the setting on the caller's conf object)
                    conf.setMaxDocsPerSplit(2 * conf.getMaxDocsPerSplit());
                    // (because we stop creating splits when the exceed the size)

                    // Case 4c: OLD SHARDING SCHEME, have a source key partition
                    int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits();
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);
                    final List<InputSplit> splits = new ArrayList<InputSplit>();

                    // NOTE(review): this is a second reference to confQuery, not a copy, so the
                    // confQuery.put(...) calls after the loop also change savedQuery - confirm that is intended
                    BasicDBObject savedQuery = confQuery;

                    Iterator<Object> itSplit = collectionOfSplits.iterator();
                    BasicDBList bigSplit = null; // source keys of splits too large for skip/limit, created lazily
                    while (itSplit.hasNext()) {
                        BasicDBObject split = (BasicDBObject) itSplit.next();
                        int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                        if (docCount < nMaxCount) { // small split, will use skip/limit
                            BasicDBObject modQuery = convertQuery(savedQuery,
                            if (null != modQuery) {

                                final int SPLIT_THRESHOLD = 3;
                                // A few cases:
                                // (below the threshold, or with a trivial query, the known doc count is passed down)
                                if ((docCount < (SPLIT_THRESHOLD * conf.getMaxDocsPerSplit()))
                                        || !queryNonTrivial) {
                                    splits.addAll(calculateSplits_phase2(conf, modQuery, false,
                                            shardingPolicyNew, (Integer) docCount));
                                } //TESTED (based on limit, based on query)
                                else {
                                    // My guess at the point at which you might as well as do the full query in the hope you're going
                                    // to save some (empty) splits
                                    splits.addAll(calculateSplits_phase2(conf, modQuery, false,
                                            shardingPolicyNew, null));
                                } //TESTED
                            } //TESTED
                        } else { // large split, combine all these guys into an array of source keys
                            if (null == bigSplit) {
                                bigSplit = new BasicDBList();
                            // (guaranteed to be a single element)
                    } //(end loop over collections)

                    if (null != bigSplit) {

                        // If we have a big left over community then create a set of splits for that - always chunks if query trivial
                        // (a single key is matched directly; several keys are matched with the $in operator)
                        if (1 == bigSplit.size()) {
                            confQuery.put(DocumentPojo.sourceKey_, bigSplit.iterator().next());
                        } else {
                            confQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, bigSplit));
                        splits.addAll(calculateSplits_phase2(conf, confQuery, !queryNonTrivial,
                                shardingPolicyNew, null));
                    } //TESTED: singleton+trivial (sandy), array+trivial (sentiment/enron), array+non-trivial (sentiment/enron, docGeo), singleton+non-trivial (sandy, docGeo)

                    return splits;

                } //TESTED: end if Cases 4a, 4b, 4c

            } //(end if old vs new sharding policy)

        } //(non-debug case)
    } //(content or metadata table are most complex)

From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java

License:Apache License

public static BasicDBList splitPrecalculations_oldShardSchemeOrDebug(BasicDBObject query,
        BasicDBObject srcTagsQuery, int maxCountPerTask) {
    // Get the communityIds from the query
    Collection<ObjectId> communityIds = null;
    try {/*from ww  w . j av a  2s .co  m*/
        BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_);
        communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
        if (null == communityIds) {
            return null;
    } catch (Exception e) {
        return null; // back out

    BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_,
            new BasicDBObject(DbManager.in_, communityIds));
    BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
    keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
    BasicDBObject sortFields = new BasicDBObject(SourcePojo.key_, 1);

    // Get and remove the sourceKey information, incorporate into source query:
    Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);
    if (null != sourceKeyQueryTerm) {
        keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
    } //TESTED
    if (null != srcTagsQuery) {
        keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
    } //TESTED

    DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(sortFields);
    // (note the sort is needed so that the potentially expensive doc query has a sensibly ordered $in clause)
    if (dbc.count() > 5000) {
        // (too many source keys to process, just going to leave well alone... note this means $srctags will fail open)
        return null;
    } else {
        //TreeMap<String, Long> sourceKeys = new TreeMap<String, Long>();
        // Build collections of objects of format { sourceKey: string or [], totalDocs }
        BasicDBList sourceKeyListCollection = new BasicDBList();
        BasicDBList sourceKeyList = null;
        int runningDocs = 0;
        int runningSources = 0;
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            String sourceKey = (String) dbo.get(SourcePojo.key_);
            if (null != sourceKey) {
                long docCount = 0L;
                try {
                    BasicDBObject harvestStatus = (BasicDBObject) dbo.get(SourcePojo.harvest_);
                    if (null != harvestStatus) {
                        docCount = harvestStatus.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                } catch (Exception e) {

                //System.out.println("SOURCE=" + sourceKey + " DOC_COUNT=" + docCount + " RUNNING=" + runningDocs +"," + runningSources + ": " + sourceKeyList);

                if (docCount > maxCountPerTask) { // source is large enough by itself
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKey);
                    collection.put(SourceHarvestStatusPojo.doccount_, docCount);
                    // (leaving running* alone, can keep building that)
                } //TESTED (by eye, system community of demo cluster)
                else if ((runningDocs + docCount) > maxCountPerTask) { // have now got a large enough collection of sources 
                    if (null == sourceKeyList) {
                        sourceKeyList = new BasicDBList();
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKeyList);
                    collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
                    sourceKeyList = null;
                    runningDocs = 0;
                    runningSources = 0;
                } //TESTED (by eye, system community of demo cluster)
                else if (runningSources >= 15) { // have a limit on the number of sources per query, to keep the queries manageable
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKeyList);
                    collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
                    sourceKeyList = null;
                    runningDocs = 0;
                    runningSources = 0;
                } //TESTED (by eye, system community of demo cluster)
                else { // (keep) build(ing) list
                    if (null == sourceKeyList) {
                        sourceKeyList = new BasicDBList();
                    runningDocs += docCount;
                } //TESTED (by eye, system community of demo cluster)
            } //(end if has source key)
        } //(end loop over cursor)

        // Finish off:
        if (null != sourceKeyList) {
            // Create collection
            BasicDBObject collection = new BasicDBObject();
            collection.put(DocumentPojo.sourceKey_, sourceKeyList);
            collection.put(SourceHarvestStatusPojo.doccount_, runningDocs);
        } //TESTED (by eye, system community of demo cluster)

        if (sourceKeyListCollection.isEmpty()) { // query returns empty
            throw new RuntimeException("Communities contain no sources");
        return sourceKeyListCollection;

    } // (end if too many source keys across the communities)

From source file:com.ikanow.infinit.e.data_model.store.MongoDbUtil.java

License:Apache License

public static BasicDBList encodeArray(JsonArray a) {
    BasicDBList dbl = new BasicDBList();
    for (JsonElement el : a) {
        dbl.add(encodeUnknown(el));//from w w  w .  ja v  a 2 s.  c o m
    return dbl;

From source file:com.ikanow.infinit.e.harvest.enrichment.custom.JavaScriptUtils.java

License:Open Source License

public static BasicDBList parseNativeJsObject(Object returnVal, ScriptEngine engine) throws ScriptException {
    try {/*from w ww.j  a va  2 s. c  o  m*/
        engine.put("output", returnVal);

        // Use BasicDBObject directly so I can reduce memory usage by setting the initial capacity depending on the size of the JSON array
        //         BasicDBObject objFactory = new BasicDBObject();
        //         engine.put("objFactory", objFactory);
        BasicDBList listFactory = new BasicDBList();
        engine.put("listFactory", listFactory);
        BasicDBList outList = new BasicDBList();
        engine.put("outList", outList);


        return outList;
    } catch (Exception e) {
        throw new RuntimeException("1 Cannot parse return non-JSON object: " + returnVal.getClass().toString()
                + ":" + returnVal.toString()
                + "; if embedding JAVA, considering using eg \"X = '' + X\" to convert back to native JS strings.");

From source file:com.ikanow.infinit.e.harvest.enrichment.legacy.opencalais.ExtractorOpenCalais.java

License:Open Source License

/**
 * Takes a feed document that already holds some information
 * (such as title, description, etc.), parses its full
 * text, and adds entities, events, and other metadata.
 *
 * @param partialDoc The feedpojo before extraction, with the fulltext field to extract on.
 *        Nothing is returned; after extraction this same feedpojo is expected to carry
 *        the entities, events, and full metadata.
 * @throws ExtractorDocumentLevelException if the service answers with a non-OK HTTP code
 *         or any other error occurs during extraction
 */
// Posts the document's full text to the OpenCalais service and walks the JSON response for entities and relations.
// NOTE(review): this listing has lost lines (several blocks are never closed and some bodies are empty), so the
// comments below describe only the statements that are actually visible - confirm against the complete source.
public void extractEntities(DocumentPojo partialDoc) throws ExtractorDocumentLevelException {
    // Guard for a null document (the guarded action is not visible in this excerpt)
    if (null == partialDoc) {

    try {
        // Guards for missing full text and for text shorter than 32 characters (guarded actions not visible)
        if (null == partialDoc.getFullText()) {
        if (partialDoc.getFullText().length() < 32) { // Else don't waste Extractor call/error logging

        // Build the POST request from the full text and execute it
        PostMethod method = createPostMethod(partialDoc.getFullText());
        int responseCode = client.executeMethod(method);

        // Retry loop: repeats the call while the response is 403 and count < 15
        // (the body of the inner try and any update of count are not visible here - presumably a wait and an increment; TODO confirm)
        if (responseCode == HttpStatus.SC_FORBIDDEN) //INF-1101 forbidden gets thrown when too many concurrent requests occur, try 14 more times
            int count = 1;
            while (count < 15 && responseCode == HttpStatus.SC_FORBIDDEN) {
                try {
                } catch (Exception e) {
                } // carry on...

                responseCode = client.executeMethod(method); //attempt call again

        if (responseCode == HttpStatus.SC_OK) {
            // Decode the response body as UTF-8 and parse it into a JSON tree
            byte[] responseBytes = method.getResponseBody();
            String response = new String(responseBytes, "UTF-8");
            List<EntityPojo> entities = new ArrayList<EntityPojo>();
            List<AssociationPojo> events = new ArrayList<AssociationPojo>();
            ObjectMapper mapper = new ObjectMapper();
            JsonNode root = mapper.readValue(response, JsonNode.class);
            // Two iterators over the root object, advanced in lock-step: child nodes and their field names
            Iterator<JsonNode> iter = root.getElements();
            Iterator<String> iterNames = root.getFieldNames();
            List<JsonNode> eventNodes = new ArrayList<JsonNode>();
            BasicDBList rawEventObjects = null;
            while (iter.hasNext()) {
                String currNodeName = iterNames.next();
                JsonNode currNode = iter.next();
                if (!currNodeName.equals("doc")) //we can assume these are the entities/topics
                    String typeGroup = currNode.get("_typeGroup").getTextValue();
                    //check typegroup to see if it is an entity
                    if (typeGroup.equals("entities")) {
                        try {
                            EntityPojo ep = new EntityPojo();
                            //get what fields we can
                            // (the body of this try is not visible in this excerpt)
                            try {
                            } catch (java.lang.IllegalArgumentException e) {
                            String name = "";
                            JsonNode nameNode = null;
                            // Read the "name" field; on any failure name stays "" and the node is logged at debug level
                            try {
                                nameNode = currNode.get("name");
                                name = nameNode.getTextValue();
                            } catch (Exception ex) {
                                logger.debug("Error parsing name node: " + currNode.toString());
                            // Frequency is the number of entries under "instances"
                            ep.setFrequency((long) currNode.get("instances").size());
                            //attempt to get resolutions if they exist
                            JsonNode resolutionNode = currNode.get("resolutions");
                            if (null != resolutionNode) {
                                //resolution nodes are arrays
                                // Only the first resolution is used: its "id" becomes the single semantic link
                                JsonNode resolutionFirst = resolutionNode.get(0);
                                ep.setSemanticLinks(new ArrayList<String>());
                                ep.getSemanticLinks().add(resolutionFirst.get("id").getTextValue()); //this is a link to an alchemy page
                                //check if we need to create a geo object
                                if (null != resolutionFirst.get("latitude")) {
                                    // Parse latitude/longitude text into a GeoPojo (what is then done with gp is not visible here)
                                    GeoPojo gp = new GeoPojo();
                                    String lat = resolutionFirst.get("latitude").getValueAsText();
                                    String lon = resolutionFirst.get("longitude").getValueAsText();
                                    gp.lat = Double.parseDouble(lat);
                                    gp.lon = Double.parseDouble(lon);
                            } else {
                                ep.setDisambiguatedName(name); // use actual name)
                            // Register the entity under the lower-cased response field name
                            entityNameMap.put(currNodeName.toLowerCase(), ep);
                        } catch (Exception ex) {
                            // (the remaining arguments of this call are not visible in this excerpt)
                            logger.error("Error creating event pojo from OpenCalaisNode: " + ex.getMessage(),
                    // (handling of "relations" nodes is not visible - presumably they are collected into eventNodes; TODO confirm)
                    } else if (typeGroup.equals("relations")) {
            //handle events
            if (bAddRawEventsToMetadata) {
                // For now just re-process these into DB objects since we know that works...
                rawEventObjects = new BasicDBList();
            for (JsonNode eventNode : eventNodes) {
                AssociationPojo event = parseEvent(eventNode);
                //remove useless events (an event is useless if it only has a verb (guessing currently)
                if (null != event) {
                    event = removeUselessEvents(event);
                    // (what happens to a surviving event is not visible in this excerpt)
                    if (null != event) {
                if (bAddRawEventsToMetadata) {
                    // Re-parse the raw event JSON into a DB object and copy it field by field into transformObj
                    BasicDBObject eventDbo = (BasicDBObject) com.mongodb.util.JSON.parse(eventNode.toString());
                    if (null != eventDbo) {
                        BasicDBObject transformObj = new BasicDBObject();
                        for (Map.Entry<String, Object> entries : eventDbo.entrySet()) {
                            if (entries.getValue() instanceof String) {
                                // String values are looked up via findMappedEntityName: on a hit the field gets the
                                // entity's index and the original string is kept under "<key>__hash"; otherwise it is copied as-is
                                String val = (String) entries.getValue();
                                EntityPojo transformVal = findMappedEntityName(val);
                                if (null != transformVal) {
                                    transformObj.put(entries.getKey(), transformVal.getIndex());
                                    transformObj.put(entries.getKey() + "__hash", val);
                                } else {
                                    transformObj.put(entries.getKey(), val);
                            } else {
                                // Non-string values are copied unchanged
                                transformObj.put(entries.getKey(), entries.getValue());

                        // (add to another list, which will get written to metadata)
            // Attach the raw event objects to the document metadata under "OpenCalaisEvents"
            if (bAddRawEventsToMetadata) {
                partialDoc.addToMetadata("OpenCalaisEvents", rawEventObjects.toArray());
            // (the bodies of the entity/association merge branches below are not visible in this excerpt)
            if (null != partialDoc.getEntities()) {
            } else if (null != entities) {
            if (null != partialDoc.getAssociations()) {
            } else if (null != events) {
        } else // Error back from OC, presumably the input doc is malformed/too long
            throw new InfiniteEnums.ExtractorDocumentLevelException(
                    "OpenCalais HTTP error code: " + Integer.toString(responseCode));
    } catch (Exception e) {
        logger.debug("OpenCalais", e);
        // Any failure is rethrown as a document-level extraction error carrying only the original message
        throw new InfiniteEnums.ExtractorDocumentLevelException(e.getMessage());

From source file:com.ikanow.infinit.e.harvest.extraction.document.database.DatabaseHarvester.java

License:Open Source License

public static BasicDBList getComplexArray(String columnName, java.sql.Array a)
        throws IllegalArgumentException, SQLException {
    BasicDBList bsonArray = new BasicDBList();

    Object array = a.getArray();//  ww w.  j  a  v  a2  s.  com
    int length = Array.getLength(array);
    for (int i = 0; i < length; ++i) {
        Object o = Array.get(array, i);
        bsonArray.add(convertJdbcTypes(columnName, o));

    return bsonArray;