List of usage examples for com.mongodb.BasicDBList
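Before the per-file examples, a minimal orientation sketch (not taken from any of the source files below; class and field names are made up): a BasicDBList behaves like a java.util.List and also implements DBObject, so it can be built up element by element and then stored directly as an array-valued field of a BasicDBObject.

import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;

public class BasicDBListSketch {
    public static void main(String[] args) {
        // Build an array value element by element
        BasicDBList tags = new BasicDBList();
        tags.add("alpha");
        tags.add("beta");

        // Store it as an array field of a document
        BasicDBObject doc = new BasicDBObject("name", "example");
        doc.put("tags", tags);

        System.out.println(doc); // { "name" : "example" , "tags" : [ "alpha" , "beta" ] }
    }
}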
From source file:com.ikanow.infinit.e.api.knowledge.federated.SimpleFederatedQueryEngine.java
License:Open Source License
public BasicDBObject createDocFromJson(BasicDBList jsonList, String url, FederatedRequest request,
        SourceFederatedQueryConfigPojo endpointInfo) {
    BasicDBObject doc = null; // (don't create unless needed)
    BasicDBList ents = null;
    StringBuffer entVals = null;
    HashSet<String> entDedup = null;
    if (_testMode) { // In test mode, need to return the JSON even if no entities are specified
        doc = new BasicDBObject();
    }
    if (null != endpointInfo.docConversionMap) {
        for (Map.Entry<String, String> docInfo : endpointInfo.docConversionMap.entrySet()) {
            for (Object jsonObj : jsonList) {
                BasicDBObject json = (BasicDBObject) jsonObj;
                try {
                    String key = docInfo.getKey();
                    // (allow user to not prepend array: if they don't want to)
                    if ((1 == json.size()) && json.containsKey((Object) "array")) {
                        if (!key.startsWith("array:") && !key.startsWith(":array") && !key.startsWith("$:array")
                                && !key.startsWith("::") && !key.startsWith("$::")) {
                            if (key.startsWith(":")) { // jpath
                                key = ":array" + key;
                            } else if (key.startsWith("$:")) { // jpath
                                key = "$:array" + key.substring(1);
                            } else {
                                key = "array:" + key;
                            }
                        }
                    } //TESTED (by hand)
                    if (key.startsWith(":")) { // jpath
                        key = "$" + key;
                    }
                    // NOTE: *not* org.json.JSONArray
                    JSONArray candidateEntities = null;
                    if (key.startsWith("$")) {
                        JSONArray candidateEntities_tmp = JsonPath.read(json.toString(), key.replace(':', '.'));
                        if (null != candidateEntities_tmp) {
                            candidateEntities = new JSONArray();
                            for (Object o : candidateEntities_tmp) {
                                if (o instanceof String) {
                                    candidateEntities.add(o);
                                } else if (o instanceof JSONArray) {
                                    candidateEntities.addAll((JSONArray) o);
                                }
                            } //TESTED (displayUrl vs entities, 3.2)
                        }
                        //DEBUG
                        //System.out.println(candidateEntities);
                    } //(TESTED (permutations above by hand))
                    else {
                        String s = (String) MongoDbUtil.getProperty(json, key.replace(':', '.'));
                        if (null != s) {
                            candidateEntities = new JSONArray();
                            candidateEntities.add(s);
                        }
                    } //TESTED (3.1)
                    if (null != candidateEntities)
                        for (int i = 0; i < candidateEntities.size(); ++i) {
                            Object o = candidateEntities.get(i);
                            if (!(o instanceof String)) {
                                continue;
                            }
                            String s = o.toString();
                            if (null == doc) {
                                doc = new BasicDBObject();
                                //(various fields added below)
                            }
                            if (docInfo.getValue().equalsIgnoreCase(DocumentPojo.displayUrl_)) {
                                doc.put(DocumentPojo.displayUrl_, s);
                            } //TESTED (3.1, 4.*)
                            else { // Entities!
                                if (null == ents) {
                                    ents = new BasicDBList();
                                }
                                String index = s.toLowerCase() + "/" + docInfo.getValue().toLowerCase();
                                if (null == entDedup) {
                                    entDedup = new HashSet<String>();
                                } else if (entDedup.contains(index)) { // Entity deduplication
                                    continue;
                                } //TESTED (3.2)
                                entDedup.add(index);
                                if (null == entVals) {
                                    entVals = new StringBuffer(": ");
                                } else {
                                    entVals.append(", ");
                                }
                                entVals.append(s);
                                String dimension = null;
                                if (null != endpointInfo.typeToDimensionMap) {
                                    try {
                                        dimension = EntityPojo.Dimension
                                                .valueOf(endpointInfo.typeToDimensionMap.get(docInfo.getValue()))
                                                .toString();
                                    } catch (Exception e) {
                                    }
                                }
                                if (null == dimension) {
                                    dimension = EntityPojo.Dimension.What.toString();
                                } //TESTED (by hand)
                                // (alternative to "made up" values would be to go looking in the existing docs/ents?)
                                // (we'll try to avoid that for now...)
                                BasicDBObject ent = new BasicDBObject();
                                ent.put(EntityPojo.disambiguated_name_, s);
                                ent.put(EntityPojo.type_, docInfo.getValue());
                                ent.put(EntityPojo.dimension_, dimension);
                                ent.put(EntityPojo.relevance_, 1.0);
                                ent.put(EntityPojo.doccount_, 1L); // (ie relative to this query)
                                ent.put(EntityPojo.averageFreq_, 1.0);
                                ent.put(EntityPojo.datasetSignificance_, 10.0); // (ie relative to this query)
                                ent.put(EntityPojo.significance_, 10.0); // (ie relative to this query)
                                ent.put(EntityPojo.frequency_, 1.0);
                                ent.put(EntityPojo.index_, index);
                                ent.put(EntityPojo.queryCoverage_, 100.0); // (ie relative to this query)
                                ent.put(EntityPojo.totalfrequency_, 1.0); // (ie relative to this query)
                                ents.add(ent);
                            } //TESTED (3.1, 4.*)
                        }
                } catch (Exception e) {
                    //(do nothing? null or the wrong type)
                    //e.printStackTrace();
                }
            } //end loop over various JSON objects retrieved
        } //(End loop over doc conversion elements)
    } //TESTED (3.*, 4.*)
    if ((null == ents) && !_testMode) { // don't return unless there are any entities
        return null;
    } else if (null != doc) {
        // Insert mandatory fields:
        // (Note the query format is a little bit different, the following fields are converted to arrays:
        //  sourceKey, source, communityId, mediaType)
        doc.put(DocumentPojo._id_, new ObjectId());
        doc.put(DocumentPojo.url_, url);
        doc.put(DocumentPojo.created_, new Date());
        doc.put(DocumentPojo.modified_, new Date());
        doc.put(DocumentPojo.publishedDate_, new Date());
        doc.put(DocumentPojo.sourceKey_, endpointInfo.parentSource.getKey());
        doc.put(DocumentPojo.source_, endpointInfo.parentSource.getTitle());
        doc.put(DocumentPojo.communityId_, new ObjectId(request.communityIdStrs[0]));
        doc.put(DocumentPojo.mediaType_, endpointInfo.parentSource.getMediaType());
        doc.put(DocumentPojo.metadata_, new BasicDBObject("json", jsonList.toArray()));
        if ((null != entVals) && (entVals.length() > 165)) { // (arbitrary length)
            entVals.setLength(165);
            entVals.append("...");
        }
        doc.put(DocumentPojo.title_, new StringBuffer(endpointInfo.titlePrefix).append(": ")
                .append(request.requestParameter).append(entVals).toString());
        doc.put(DocumentPojo.entities_, ents);
        Gson gson = new GsonBuilder().setPrettyPrinting().create();
        JsonParser jp = new JsonParser();
        JsonElement je = jp.parse(jsonList.toString());
        doc.put(DocumentPojo.description_, gson.toJson(je)); // (prettified JSON)
    } //TESTED (3.*, 4.*)
    return doc;
}
From source file:com.ikanow.infinit.e.api.knowledge.processing.AggregationUtils.java
License:Open Source License
public static void loadAggregationResults(ResponsePojo rp, Facets facets, Aggregations aggs,
        AggregationOutputPojo aggOutParams, ScoringUtils scoreStats, AliasLookupTable aliasLookup,
        String[] entityTypeFilterStrings, String[] assocVerbFilterStrings,
        AggregationUtils.GeoContainer extraAliasAggregatedGeo) {
    HashMap<String, List<? extends Object>> moments = null;
    if ((null != facets) && (null != facets.getFacets()))
        for (Map.Entry<String, Facet> facet : facets.getFacets().entrySet()) {
            // Geo
            if (facet.getKey().equals("geo")) {
                TermsFacet geoFacet = (TermsFacet) facet.getValue();
                Set<GeoAggregationPojo> geoCounts = null;
                int nHighestCount = -1;
                int nLowestCount = Integer.MAX_VALUE;
                // If we've got some geotags from the alias masters then start with them:
                if ((null != extraAliasAggregatedGeo) && (null != extraAliasAggregatedGeo.geotags)) {
                    geoCounts = extraAliasAggregatedGeo.geotags;
                    nHighestCount = (int) extraAliasAggregatedGeo.minCount;
                    nLowestCount = (int) extraAliasAggregatedGeo.maxCount;
                } else {
                    geoCounts = new TreeSet<GeoAggregationPojo>();
                }
                for (TermsFacet.Entry geo : geoFacet.getEntries()) {
                    String geohash = FacetUtils.getTerm(geo).substring(2);
                    double[] loc = GeoHashUtils.decode(geohash);
                    GeoAggregationPojo geoObj = new GeoAggregationPojo(loc[0], loc[1]);
                    geoObj.count = geo.getCount();
                    geoObj.type = GeoOntologyMapping.decodeOntologyCode(FacetUtils.getTerm(geo).charAt(0));
                    geoCounts.add(geoObj);
                    // (note this aggregates geo points whose decoded lat/logns are the same, which can result in slightly fewer records than requested)
                    // (note the aggregation writes the aggregated count into geoObj.count)
                    if (geoObj.count > nHighestCount) { // (the counts can be modified by the add command above)
                        nHighestCount = geo.getCount();
                    }
                    if (geoObj.count < nLowestCount) {
                        nLowestCount = geo.getCount();
                    }
                }
                rp.setGeo(geoCounts, nHighestCount, nLowestCount);
            } //(TESTED)
            if (facet.getKey().equals("time")) {
                DateHistogramFacet timeFacet = (DateHistogramFacet) facet.getValue();
                rp.setTimes(timeFacet.getEntries(), QueryHandler.getInterval(aggOutParams.timesInterval, 'm'));
            } //(TESTED)
            if (facet.getKey().equals("events")) {
                TermsFacet eventsFacet = (TermsFacet) facet.getValue();
                rp.setEvents(parseEventAggregationOutput("Event", eventsFacet, scoreStats, aliasLookup,
                        entityTypeFilterStrings, assocVerbFilterStrings));
            }
            if (facet.getKey().equals("facts")) {
                TermsFacet factsFacet = (TermsFacet) facet.getValue();
                rp.setFacts(parseEventAggregationOutput("Fact", factsFacet, scoreStats, aliasLookup,
                        entityTypeFilterStrings, assocVerbFilterStrings));
            } //TESTED x2
            if (facet.getKey().equals("sourceTags")) {
                TermsFacet tagsFacet = (TermsFacet) facet.getValue();
                rp.setSourceMetaTags(tagsFacet.getEntries());
            }
            if (facet.getKey().equals("sourceTypes")) {
                TermsFacet typesFacet = (TermsFacet) facet.getValue();
                rp.setSourceMetaTypes(typesFacet.getEntries());
            }
            if (facet.getKey().equals("sourceKeys")) {
                TermsFacet keysFacet = (TermsFacet) facet.getValue();
                rp.setSources(keysFacet.getEntries());
            } //TESTED x3
            // Moments (basic functionality)
            if (facet.getKey().startsWith("moments.")) {
                DateHistogramFacet momentFacet = (DateHistogramFacet) facet.getValue();
                if (null == moments) {
                    moments = new HashMap<String, List<? extends Object>>();
                }
                moments.put(facet.getKey().substring(8), momentFacet.getEntries());
            } //TESTED
        } //(end loop over generated facets)
    if ((null != aggs) && (null != aggs.asMap()))
        for (Map.Entry<String, Aggregation> agg : aggs.asMap().entrySet()) {
            if (agg.getKey().equals("moments")) {
                if (null == moments) {
                    moments = new HashMap<String, List<? extends Object>>();
                }
                DateHistogram val = (DateHistogram) agg.getValue();
                //TODO (INF-2688): Finalize format
                BasicDBList dbl = new BasicDBList();
                for (DateHistogram.Bucket dateBucket : val.getBuckets()) {
                    if (dateBucket.getKeyAsNumber().longValue() > 0) {
                        BasicDBObject dataBucketDbo = new BasicDBObject();
                        dataBucketDbo.put("time", dateBucket.getKeyAsNumber().longValue());
                        dataBucketDbo.put("count", dateBucket.getDocCount());
                        for (Map.Entry<String, Aggregation> dateAggs : dateBucket.getAggregations().asMap()
                                .entrySet()) {
                            if (dateAggs.getKey().equals("geo")) {
                                BasicDBList dbl_geo = new BasicDBList();
                                MultiBucketsAggregation geoVal = (MultiBucketsAggregation) dateAggs.getValue();
                                long nHighestCount = Long.MIN_VALUE;
                                for (MultiBucketsAggregation.Bucket geoBucket : geoVal.getBuckets()) {
                                    String geohash = geoBucket.getKey().substring(2);
                                    double[] loc = GeoHashUtils.decode(geohash);
                                    GeoAggregationPojo geoObj = new GeoAggregationPojo(loc[0], loc[1]);
                                    BasicDBObject geoDbo = new BasicDBObject(4);
                                    geoDbo.put("lat", geoObj.lat);
                                    geoDbo.put("lon", geoObj.lon);
                                    geoDbo.put("count", geoBucket.getDocCount());
                                    geoDbo.put("type",
                                            GeoOntologyMapping.decodeOntologyCode(geoBucket.getKey().charAt(0)));
                                    dbl_geo.add(geoDbo);
                                    if (geoBucket.getDocCount() > nHighestCount) { // (the counts can be modified by the add command above)
                                        nHighestCount = geoBucket.getDocCount();
                                    }
                                }
                                dataBucketDbo.put("maxGeoCount", nHighestCount);
                                dataBucketDbo.put("geo", dbl_geo);
                            }
                        }
                        dbl.add(dataBucketDbo);
                    }
                }
                moments.put("times", dbl);
            }
        } //(end loop over generated aggregations)
    if ((null != moments) && !moments.isEmpty()) {
        rp.setMoments(moments, QueryHandler.getInterval(aggOutParams.moments.timesInterval, 'm'));
    }
}
From source file:com.ikanow.infinit.e.application.utils.LogstashConfigUtils.java
License:Open Source License
public static BasicDBObject parseLogstashConfig(String configFile, StringBuffer error) {
    BasicDBObject tree = new BasicDBObject();

    // Stage 0: remove escaped "s and 's (for the purpose of the validation):
    // (prevents tricksies with escaped "s and then #s)
    // (http://stackoverflow.com/questions/5082398/regex-to-replace-single-backslashes-excluding-those-followed-by-certain-chars)
    configFile = configFile.replaceAll("(?<!\\\\)(?:((\\\\\\\\)*)\\\\)[\"']", "X");
    //TESTED (by hand - using last 2 fields of success_2_1)

    // Stage 1: remove #s, and anything in quotes (for the purpose of the validation)
    configFile = configFile.replaceAll("(?m)(?:([\"'])(?:(?!\\1).)*\\1)", "VALUE").replaceAll("(?m)(?:#.*$)", "");
    //TESTED (2_1 - including with a # inside the ""s - Event_Date -> Event_#Date)
    //TESTED (2_2 - various combinations of "s nested inside 's) ... yes that is a negative lookahead up there - yikes!

    // Stage 2: get a nested list of objects
    int depth = 0;
    int ifdepth = -1;
    Stack<Integer> ifStack = new Stack<Integer>();
    BasicDBObject inputOrFilter = null;
    Matcher m = _navigateLogstash.matcher(configFile);
    // State:
    String currTopLevelBlockName = null;
    String currSecondLevelBlockName = null;
    BasicDBObject currSecondLevelBlock = null;
    while (m.find()) {
        boolean simpleField = false;

        //DEBUG
        //System.out.println("--DEPTH="+depth + " GROUP=" + m.group() + " IFS" + Arrays.toString(ifStack.toArray()));
        //System.out.println("STATES: " + currTopLevelBlockName + " AND " + currSecondLevelBlockName);

        if (m.group().equals("}")) {
            if (ifdepth == depth) { // closing an if statement
                ifStack.pop();
                if (ifStack.isEmpty()) {
                    ifdepth = -1;
                } else {
                    ifdepth = ifStack.peek();
                }
            } //TESTED (1_1bc, 2_1)
            else { // closing a processing block
                depth--;
                if (depth < 0) { // {} Mismatch
                    error.append("{} Mismatch (})");
                    return null;
                } //TESTED (1_1abc)
            }
        } else { // new attribute!
            String typeName = m.group(1);
            if (null == typeName) { // it's an if statement or a string value
                typeName = m.group(4);
                if (null != typeName) {
                    simpleField = true;
                }
            } else if (typeName.equalsIgnoreCase("else")) { // It's an if statement..
                typeName = null;
            }
            if (null == typeName) { // if statement after all
                // Just keep track of ifs so we can ignore them
                ifStack.push(depth);
                ifdepth = depth;
                // (don't increment depth)
            } //TESTED (1_1bc, 2_1)
            else { // processing block
                String subTypeName = m.group(3);
                if (null != subTypeName) { // eg codec.multiline
                    typeName = typeName + "." + subTypeName;
                } //TESTED (2_1, 2_3)

                if (depth == 0) { // has to be one of input/output/filter)
                    String topLevelType = typeName.toLowerCase();
                    if (topLevelType.equalsIgnoreCase("input") || topLevelType.equalsIgnoreCase("filter")) {
                        if (tree.containsField(topLevelType)) {
                            error.append("Multiple input or filter blocks: " + topLevelType);
                            return null;
                        } //TESTED (1_3ab)
                        else {
                            inputOrFilter = new BasicDBObject();
                            tree.put(topLevelType, inputOrFilter);

                            // Store state:
                            currTopLevelBlockName = topLevelType;
                        } //TESTED (*)
                    } else {
                        if (topLevelType.equalsIgnoreCase("output")) {
                            error.append(
                                    "Not allowed output blocks - these are appended automatically by the logstash harvester");
                        } else {
                            error.append("Unrecognized processing block: " + topLevelType);
                        }
                        return null;
                    } //TESTED (1_4a)
                } else if (depth == 1) { // processing blocks
                    String subElType = typeName.toLowerCase();

                    // Some validation: can't include a type called "filter" anywhere
                    if ((null != currTopLevelBlockName) && currTopLevelBlockName.equals("input")) {
                        if (subElType.equals("filter") || subElType.endsWith(".filter")) {
                            error.append("Not allowed sub-elements of input called 'filter' (1)");
                            return null;
                        }
                    } //TESTED (1_5b)

                    BasicDBList subElements = (BasicDBList) inputOrFilter.get(subElType);
                    if (null == subElements) {
                        subElements = new BasicDBList();
                        inputOrFilter.put(subElType, subElements);
                    }
                    BasicDBObject newEl = new BasicDBObject();
                    subElements.add(newEl);

                    // Store state:
                    currSecondLevelBlockName = subElType;
                    currSecondLevelBlock = newEl;
                } //TESTED (*)
                else if (depth == 2) { // attributes of processing blocks
                    // we'll just store the field names for these and do any simple validation that was too complicated for the regexes
                    String subSubElType = typeName.toLowerCase();

                    // Validation:
                    if (null != currTopLevelBlockName) {
                        // 1] sincedb path
                        if (currTopLevelBlockName.equals("input") && (null != currSecondLevelBlockName)) {
                            // (don't care what the second level block name is - no sincedb allowed)
                            if (subSubElType.equalsIgnoreCase("sincedb_path")) {
                                error.append("Not allowed sincedb_path in input.* block");
                                return null;
                            } //TESTED (1_5a)
                            // 2] no sub-(-sub etc)-elements of input called filter
                            if (subSubElType.equals("filter") || subSubElType.endsWith(".filter")) {
                                error.append("Not allowed sub-elements of input called 'filter' (2)");
                                return null;
                            } //TESTED (1_5c)
                        }
                    }
                    // Store in map:
                    if (null != currSecondLevelBlock) {
                        currSecondLevelBlock.put(subSubElType, new BasicDBObject());
                    }
                }
                // (won't go any deeper than this)
                if (!simpleField) {
                    depth++;
                }
            }
        }
    }
    if (0 != depth) {
        error.append("{} Mismatch ({)");
        return null;
    } //TESTED (1_2a)

    return tree;
}
From source file:com.ikanow.infinit.e.data_model.api.knowledge.DocumentPojoApiMap.java
License:Apache License
public static void mapToApi(BasicDBObject doc) {
    // 1. (doc_index field)
    doc.remove(DocumentPojo.index_);

    // 2. (source title)
    String tmp = doc.getString(DocumentPojo.source_);
    if (null != tmp) {
        BasicDBList array = new BasicDBList();
        array.add(tmp);
        doc.put(DocumentPojo.source_, array);
    }
    // 3. (source key)
    tmp = DocumentPojo.getSourceKey(doc.getString(DocumentPojo.sourceKey_));
    if (null != tmp) {
        BasicDBList array = new BasicDBList();
        array.add(tmp);
        doc.put(DocumentPojo.sourceKey_, array);
    }
    // 4. (media type)
    tmp = doc.getString(DocumentPojo.mediaType_);
    if (null != tmp) {
        BasicDBList array = new BasicDBList();
        array.add(tmp);
        doc.put(DocumentPojo.mediaType_, array);
    }
}
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License:Apache License
/**
 * Checks if the new params MAX_SPLITS and MAX_DOCS_PER_SPLIT are set
 * in the config. If they are it will use those to do splits via limit/skip
 * otherwise it will call the previous chunking splitter in MongoSplitter.
 *
 * @param conf
 * @return
 */
public static List<InputSplit> calculateSplits(InfiniteMongoConfig conf) {
    // First off: What is our sharding scheme?
    boolean shardingPolicyNew = false;
    try {
        BasicDBObject shardQuery = new BasicDBObject("_id", "doc_metadata.metadata");
        BasicDBObject shardInfo = (BasicDBObject) DbManager.getCollection("config", "collections")
                .findOne(shardQuery);
        if (null != shardInfo) {
            BasicDBObject shardInfoKey = (BasicDBObject) shardInfo.get("key");
            if (null != shardInfoKey) {
                shardingPolicyNew = (shardInfoKey.size() > 1);
            }
        }
    } //TESTED (new and old)
    catch (Exception e) {
    } // stick with the old sharding, it's probably going to die soon after though, honestly

    // conf.getQuery returns a new copy of the query, so get once and use everywhere...
    BasicDBObject confQuery = (BasicDBObject) conf.getQuery();
    BasicDBObject srcTagsQuery = (BasicDBObject) conf.getSourceTags();

    String collection = conf.getInputURI().getCollection();
    if (!collection.equals(DbManager.getDocument().getContent().getName())
            && !collection.equals(DbManager.getDocument().getMetadata().getName())) {
        // Case 1: feature table or custom table
        // Just run legacy code
        return calculateSplits_phase2(conf, confQuery, false, false, null);
    } else { // complex cases...
        boolean simpleOtherIndex = false;
        // Check whether a simple query has been performed on a different indexed field
        if (null == srcTagsQuery) { // (if srcTags specified, then going to want to use sourceKey as the index)
            for (String s : Arrays.asList(EntityPojo.docQuery_index_, DocumentPojo.url_)) {
                Object selector = confQuery.get(s);
                if (selector instanceof String) {
                    simpleOtherIndex = true;
                    break;
                } else if (selector instanceof DBObject) {
                    DBObject selectorDbo = (DBObject) selector;
                    if (selectorDbo.containsField(DbManager.in_)) {
                        simpleOtherIndex = true;
                        break;
                    }
                }
            } //TESTED (both types, plus check complex indexes don't work)
            // ALLOWED: {"entities.index": { "$in": [ "xxx", "yyy"] }, {"entities.index": "xxx" }, ditto for "url"
            // NOT ALLOWED: { "entities.index": { "$ne": "xxx" } }
        } //TESTED check ignored if eg entity_index specified

        if (simpleOtherIndex) {
            // Case 2: we have a simple query on an indexed field
            // Just run legacy code
            return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
        } //TESTED
        else if (conf.getLimit() > 0) { // debug
            //Case 3: Ensure we have small sets of sources to search over
            BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery, srcTagsQuery,
                    conf.getMaxDocsPerSplit());
            final List<InputSplit> splits = new ArrayList<InputSplit>();

            boolean queryNonTrivial = isQueryNonTrivial(confQuery);
            if (!queryNonTrivial) {
                //Case 3a: query is trivial, so can just create splits directly from the split pre-calcs
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();
                    int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                    int toGet = (docCount > toProcess) ? toProcess : docCount;
                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), modQuery,
                                conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                        toProcess -= docCount;
                    }
                } //TESTED
            } else {
                // Case 3b: annoying, some extra query terms, gonna need to do it the hard way...
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                DBCollection coll = InfiniteMongoConfigUtil.getCollection(conf.getInputURI());
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();
                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        int docsCounted = (int) coll.getCount(modQuery, null, toProcess, 0);
                        int toGet = (docsCounted > toProcess) ? toProcess : docsCounted;
                        if (docsCounted > 0) {
                            splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(),
                                    modQuery, conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                            toProcess -= docsCounted;
                        }
                    } //TESTED
                }
            } //TESTED
            return splits;
        } else { // More complex cases:
            if (shardingPolicyNew) {
                // Case 4a: NEW SHARDING SCHEME
                // Always fetch the new sources, eg convert communityId to sourceKeys
                try {
                    splitPrecalculations_newShardScheme(confQuery, srcTagsQuery); // (modifies confQuery if returns true)
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);

                    return calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null);
                    // (ie trivial query => always use chunks, bypass skip/limit test)
                } //TESTED (trivial + non-trivial)
                catch (Exception e) { // Didn't match any sources, no problem
                    return new ArrayList<InputSplit>();
                } //TESTED
            } //TESTED
            else {
                BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery,
                        srcTagsQuery, conf.getMaxDocsPerSplit());
                if (null == collectionOfSplits) {
                    // Case 4b: OLD SHARDING SCHEME can't get a partition by source keys, just back off to old code
                    return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
                } //TESTED (old code)
                else {
                    conf.setMaxDocsPerSplit(2 * conf.getMaxDocsPerSplit());
                    // (because we stop creating splits when the exceed the size)

                    // Case 4c: OLD SHARDING SCHEME, have a source key partition
                    int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits();
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);
                    final List<InputSplit> splits = new ArrayList<InputSplit>();

                    BasicDBObject savedQuery = confQuery;

                    Iterator<Object> itSplit = collectionOfSplits.iterator();
                    BasicDBList bigSplit = null;
                    while (itSplit.hasNext()) {
                        BasicDBObject split = (BasicDBObject) itSplit.next();
                        int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                        if (docCount < nMaxCount) { // small split, will use skip/limit
                            BasicDBObject modQuery = convertQuery(savedQuery, split.get(DocumentPojo.sourceKey_));
                            if (null != modQuery) {
                                final int SPLIT_THRESHOLD = 3;
                                // A few cases:
                                if ((docCount < (SPLIT_THRESHOLD * conf.getMaxDocsPerSplit()))
                                        || !queryNonTrivial) {
                                    splits.addAll(calculateSplits_phase2(conf, modQuery, false, shardingPolicyNew,
                                            (Integer) docCount));
                                } //TESTED (based on limit, based on query)
                                else {
                                    // My guess at the point at which you might as well as do the full query in the hope you're going
                                    // to save some (empty) splits
                                    splits.addAll(
                                            calculateSplits_phase2(conf, modQuery, false, shardingPolicyNew, null));
                                } //TESTED
                            } //TESTED
                        } else { // large split, combine all these guys into an array of source keys
                            if (null == bigSplit) {
                                bigSplit = new BasicDBList();
                            }
                            bigSplit.add(split.get(DocumentPojo.sourceKey_)); // (guaranteed to be a single element)
                        }
                    } //(end loop over collections)

                    if (null != bigSplit) {
                        // If we have a big left over community then create a set of splits for that - always chunks if query trivial
                        if (1 == bigSplit.size()) {
                            confQuery.put(DocumentPojo.sourceKey_, bigSplit.iterator().next());
                        } else {
                            confQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, bigSplit));
                        }
                        splits.addAll(calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null));
                    } //TESTED: singleton+trivial (sandy), array+trivial (sentiment/enron), array+non-trivial (sentiment/enron, docGeo), singleton+non-trivial (sandy, docGeo)

                    return splits;
                } //TESTED: end if Cases 4a, 4b, 4c
            } //(end if old vs new sharding policy)
        } //(non-debug case)
    } //(content or metadata table are most complex)
}
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License:Apache License
@SuppressWarnings("unchecked")
public static BasicDBList splitPrecalculations_oldShardSchemeOrDebug(BasicDBObject query,
        BasicDBObject srcTagsQuery, int maxCountPerTask) {
    // Get the communityIds from the query
    Collection<ObjectId> communityIds = null;
    try {
        BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_);
        communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
        if (null == communityIds) {
            return null;
        }
    } catch (Exception e) {
        return null; // back out
    }

    BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_,
            new BasicDBObject(DbManager.in_, communityIds));
    BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
    keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
    BasicDBObject sortFields = new BasicDBObject(SourcePojo.key_, 1);

    // Get and remove the sourceKey information, incorporate into source query:
    Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);
    if (null != sourceKeyQueryTerm) {
        keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
    } //TESTED
    if (null != srcTagsQuery) {
        keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
    } //TESTED

    DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(sortFields);
    // (note the sort is needed so that the potentially expensive doc query has a sensibly ordered $in clause)
    if (dbc.count() > 5000) {
        // (too many source keys to process, just going to leave well alone... note this means $srctags will fail open)
        return null;
    } else {
        //TreeMap<String, Long> sourceKeys = new TreeMap<String, Long>();
        // Build collections of objects of format { sourceKey: string or [], totalDocs }
        BasicDBList sourceKeyListCollection = new BasicDBList();
        BasicDBList sourceKeyList = null;
        int runningDocs = 0;
        int runningSources = 0;
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            String sourceKey = (String) dbo.get(SourcePojo.key_);
            if (null != sourceKey) {
                long docCount = 0L;
                try {
                    BasicDBObject harvestStatus = (BasicDBObject) dbo.get(SourcePojo.harvest_);
                    if (null != harvestStatus) {
                        docCount = harvestStatus.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                    }
                } catch (Exception e) {
                }

                //DEBUG
                //System.out.println("SOURCE=" + sourceKey + " DOC_COUNT=" + docCount + " RUNNING=" + runningDocs +"," + runningSources + ": " + sourceKeyList);

                if (docCount > maxCountPerTask) { // source is large enough by itself
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKey);
                    collection.put(SourceHarvestStatusPojo.doccount_, docCount);
                    sourceKeyListCollection.add(collection);
                    // (leaving running* alone, can keep building that)
                } //TESTED (by eye, system community of demo cluster)
                else if ((runningDocs + docCount) > maxCountPerTask) { // have now got a large enough collection of sources
                    if (null == sourceKeyList) {
                        sourceKeyList = new BasicDBList();
                    }
                    sourceKeyList.add(sourceKey);
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKeyList);
                    collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
                    sourceKeyListCollection.add(collection);
                    sourceKeyList = null;
                    runningDocs = 0;
                    runningSources = 0;
                } //TESTED (by eye, system community of demo cluster)
                else if (runningSources >= 15) { // have a limit on the number of sources per query, to keep the queries manageable
                    sourceKeyList.add(sourceKey);
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKeyList);
                    collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
                    sourceKeyListCollection.add(collection);
                    sourceKeyList = null;
                    runningDocs = 0;
                    runningSources = 0;
                } //TESTED (by eye, system community of demo cluster)
                else { // (keep) build(ing) list
                    if (null == sourceKeyList) {
                        sourceKeyList = new BasicDBList();
                    }
                    sourceKeyList.add(sourceKey);
                    runningDocs += docCount;
                    runningSources++;
                } //TESTED (by eye, system community of demo cluster)
            } //(end if has source key)
        } //(end loop over cursor)

        // Finish off:
        if (null != sourceKeyList) {
            // Create collection
            BasicDBObject collection = new BasicDBObject();
            collection.put(DocumentPojo.sourceKey_, sourceKeyList);
            collection.put(SourceHarvestStatusPojo.doccount_, runningDocs);
            sourceKeyListCollection.add(collection);
        } //TESTED (by eye, system community of demo cluster)

        if (sourceKeyListCollection.isEmpty()) { // query returns empty
            throw new RuntimeException("Communities contain no sources");
        }
        return sourceKeyListCollection;
    } // (end if too many source keys across the communities)
}
From source file:com.ikanow.infinit.e.data_model.store.MongoDbUtil.java
License:Apache License
public static BasicDBList encodeArray(JsonArray a) {
    BasicDBList dbl = new BasicDBList();
    for (JsonElement el : a) {
        dbl.add(encodeUnknown(el));
    }
    return dbl;
}
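A hypothetical call site for the helper above (not part of the original file; the JSON literal and class name are invented, and MongoDbUtil is assumed to be on the classpath), using the same Gson classes the other examples rely on:

import com.google.gson.JsonArray;
import com.google.gson.JsonParser;
import com.mongodb.BasicDBList;

public class EncodeArrayUsage {
    public static void main(String[] args) {
        // Parse a JSON array with Gson, then convert it into a MongoDB array value
        JsonArray arr = new JsonParser().parse("[1, \"two\", {\"three\": 3}]").getAsJsonArray();
        BasicDBList dbl = MongoDbUtil.encodeArray(arr);
        System.out.println(dbl); // expect something like [ 1 , "two" , { "three" : 3 } ]
    }
}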
From source file:com.ikanow.infinit.e.harvest.enrichment.custom.JavaScriptUtils.java
License:Open Source License
public static BasicDBList parseNativeJsObject(Object returnVal, ScriptEngine engine) throws ScriptException {
    try {
        engine.put("output", returnVal);

        // Use BasicDBObject directly so I can reduce memory usage by setting the initial capacity depending on the size of the JSON array
        // BasicDBObject objFactory = new BasicDBObject();
        // engine.put("objFactory", objFactory);
        BasicDBList listFactory = new BasicDBList();
        engine.put("listFactory", listFactory);
        BasicDBList outList = new BasicDBList();
        engine.put("outList", outList);
        engine.eval("s1(output);");
        return outList;
    } catch (Exception e) {
        throw new RuntimeException("1 Cannot parse return non-JSON object: " + returnVal.getClass().toString()
                + ":" + returnVal.toString()
                + "; if embedding JAVA, considering using eg \"X = '' + X\" to convert back to native JS strings.");
    }
}
From source file:com.ikanow.infinit.e.harvest.enrichment.legacy.opencalais.ExtractorOpenCalais.java
License:Open Source License
/**
 * Takes a feed with some of the information stored in it
 * such as title, desc, etc, and needs to parse the full
 * text and add entities, events, and other metadata.
 *
 * @param partialDoc The feedpojo before extraction with fulltext field to extract on
 * @return The feedpojo after extraction with entities, events, and full metadata
 * @throws ExtractorDocumentLevelException
 */
@Override
public void extractEntities(DocumentPojo partialDoc) throws ExtractorDocumentLevelException {
    if (null == partialDoc) {
        return;
    }
    configure(partialDoc.getTempSource());
    num_extraction_requests.incrementAndGet();
    try {
        if (null == partialDoc.getFullText()) {
            return;
        }
        if (partialDoc.getFullText().length() < 32) { // Else don't waste Extractor call/error logging
            return;
        }
        PostMethod method = createPostMethod(partialDoc.getFullText());
        int responseCode = client.executeMethod(method);

        if (responseCode == HttpStatus.SC_FORBIDDEN) //INF-1101 forbidden gets thrown when too many concurrent requests occur, try 14 more times
        {
            int count = 1;
            while (count < 15 && responseCode == HttpStatus.SC_FORBIDDEN) {
                try {
                    Thread.sleep(1800);
                } catch (Exception e) {
                } // carry on...
                responseCode = client.executeMethod(method); //attempt call again
                count++;
            }
            num_extraction_collisions.addAndGet(count);
        }
        if (responseCode == HttpStatus.SC_OK) {
            byte[] responseBytes = method.getResponseBody();
            String response = new String(responseBytes, "UTF-8");
            List<EntityPojo> entities = new ArrayList<EntityPojo>();
            List<AssociationPojo> events = new ArrayList<AssociationPojo>();
            ObjectMapper mapper = new ObjectMapper();
            JsonNode root = mapper.readValue(response, JsonNode.class);
            Iterator<JsonNode> iter = root.getElements();
            Iterator<String> iterNames = root.getFieldNames();
            List<JsonNode> eventNodes = new ArrayList<JsonNode>();
            BasicDBList rawEventObjects = null;
            while (iter.hasNext()) {
                String currNodeName = iterNames.next();
                JsonNode currNode = iter.next();
                if (!currNodeName.equals("doc")) //we can assume these are the entities/topics
                {
                    String typeGroup = currNode.get("_typeGroup").getTextValue();
                    //check typegroup to see if it is an entity
                    if (typeGroup.equals("entities")) {
                        try {
                            EntityPojo ep = new EntityPojo();
                            //get what fields we can
                            ep.setType(currNode.get("_type").getTextValue());
                            try {
                                ep.setDimension(DimensionUtility.getDimensionByType(ep.getType()));
                            } catch (java.lang.IllegalArgumentException e) {
                                ep.setDimension(EntityPojo.Dimension.What);
                            }
                            String name = "";
                            JsonNode nameNode = null;
                            try {
                                nameNode = currNode.get("name");
                                name = nameNode.getTextValue();
                            } catch (Exception ex) {
                                logger.debug("Error parsing name node: " + currNode.toString());
                                continue;
                            }
                            ep.setActual_name(name);
                            ep.setRelevance(Double.parseDouble(currNode.get("relevance").getValueAsText()));
                            ep.setFrequency((long) currNode.get("instances").size());
                            //attempt to get resolutions if they exist
                            JsonNode resolutionNode = currNode.get("resolutions");
                            if (null != resolutionNode) {
                                //resolution nodes are arrays
                                JsonNode resolutionFirst = resolutionNode.get(0);
                                ep.setSemanticLinks(new ArrayList<String>());
                                ep.getSemanticLinks().add(resolutionFirst.get("id").getTextValue()); //this is a link to an alchemy page
                                ep.setDisambiguatedName(resolutionFirst.get("name").getTextValue());
                                //check if we need to create a geo object
                                if (null != resolutionFirst.get("latitude")) {
                                    GeoPojo gp = new GeoPojo();
                                    String lat = resolutionFirst.get("latitude").getValueAsText();
                                    String lon = resolutionFirst.get("longitude").getValueAsText();
                                    gp.lat = Double.parseDouble(lat);
                                    gp.lon = Double.parseDouble(lon);
                                    ep.setGeotag(gp);
                                }
                            } else {
                                ep.setDisambiguatedName(name); // use actual name)
                            }
                            entityNameMap.put(currNodeName.toLowerCase(), ep);
                            entities.add(ep);
                        } catch (Exception ex) {
                            logger.error("Error creating event pojo from OpenCalaisNode: " + ex.getMessage(), ex);
                        }
                    } else if (typeGroup.equals("relations")) {
                        eventNodes.add(currNode);
                    }
                }
            }
            //handle events
            if (bAddRawEventsToMetadata) {
                // For now just re-process these into DB objects since we know that works...
                rawEventObjects = new BasicDBList();
            }
            for (JsonNode eventNode : eventNodes) {
                AssociationPojo event = parseEvent(eventNode);
                //remove useless events (an event is useless if it only has a verb (guessing currently)
                if (null != event) {
                    event = removeUselessEvents(event);
                    if (null != event) {
                        events.add(event);
                    }
                }
                if (bAddRawEventsToMetadata) {
                    BasicDBObject eventDbo = (BasicDBObject) com.mongodb.util.JSON.parse(eventNode.toString());
                    if (null != eventDbo) {
                        BasicDBObject transformObj = new BasicDBObject();
                        for (Map.Entry<String, Object> entries : eventDbo.entrySet()) {
                            if (entries.getValue() instanceof String) {
                                String val = (String) entries.getValue();
                                EntityPojo transformVal = findMappedEntityName(val);
                                if (null != transformVal) {
                                    transformObj.put(entries.getKey(), transformVal.getIndex());
                                    transformObj.put(entries.getKey() + "__hash", val);
                                } else {
                                    transformObj.put(entries.getKey(), val);
                                }
                            } else {
                                transformObj.put(entries.getKey(), entries.getValue());
                            }
                        }
                        // (add to another list, which will get written to metadata)
                        rawEventObjects.add(transformObj);
                    }
                }
            }
            if (bAddRawEventsToMetadata) {
                partialDoc.addToMetadata("OpenCalaisEvents", rawEventObjects.toArray());
            }
            if (null != partialDoc.getEntities()) {
                partialDoc.getEntities().addAll(entities);
                partialDoc.setEntities(partialDoc.getEntities());
            } else if (null != entities) {
                partialDoc.setEntities(entities);
            }
            if (null != partialDoc.getAssociations()) {
                partialDoc.getAssociations().addAll(events);
                partialDoc.setAssociations(partialDoc.getAssociations());
            } else if (null != events) {
                partialDoc.setAssociations(events);
            }
        } else // Error back from OC, presumably the input doc is malformed/too long
        {
            throw new InfiniteEnums.ExtractorDocumentLevelException(
                    "OpenCalais HTTP error code: " + Integer.toString(responseCode));
        }
    } catch (Exception e) {
        //DEBUG
        //e.printStackTrace();
        logger.debug("OpenCalais", e);
        //there was an error, so we return null instead
        throw new InfiniteEnums.ExtractorDocumentLevelException(e.getMessage());
    }
}
From source file:com.ikanow.infinit.e.harvest.extraction.document.database.DatabaseHarvester.java
License:Open Source License
public static BasicDBList getComplexArray(String columnName, java.sql.Array a)
        throws IllegalArgumentException, SQLException {
    BasicDBList bsonArray = new BasicDBList();
    Object array = a.getArray();
    int length = Array.getLength(array);
    for (int i = 0; i < length; ++i) {
        Object o = Array.get(array, i);
        bsonArray.add(convertJdbcTypes(columnName, o));
    }
    a.free();
    return bsonArray;
}
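A hypothetical caller for the helper above (not from the original file; the table and column names, the class name, and the surrounding harvester plumbing are invented for illustration, and DatabaseHarvester is assumed to be on the classpath):

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import com.mongodb.BasicDBList;

public class ComplexArrayUsage {
    public static void printTagArrays(Connection conn) throws Exception {
        try (Statement stmt = conn.createStatement();
                ResultSet rs = stmt.executeQuery("SELECT tags FROM documents")) {
            while (rs.next()) {
                java.sql.Array sqlArray = rs.getArray("tags");
                if (null != sqlArray) {
                    // Convert the SQL array column into a BSON array via the harvester helper above
                    BasicDBList tags = DatabaseHarvester.getComplexArray("tags", sqlArray);
                    System.out.println(tags);
                }
            }
        }
    }
}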