Example usage for com.mongodb BasicDBObject get

Introduction

This page lists usage examples for com.mongodb.BasicDBObject.get, drawn from open-source projects.

Prototype

public Object get(final String key) 

Document

Gets a value from this object. Because the return type is Object, callers typically cast the result to the expected concrete type; a missing key yields null.
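
Since get returns a plain Object, the examples below all cast the result (to Date, BasicDBList, BasicDBObject, etc.) and null-check it. The following minimal, self-contained sketch shows that pattern in isolation; the class name and field names here are illustrative, not taken from the examples:

import java.util.Date;

import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;

public class BasicDBObjectGetSketch {
    public static void main(String[] args) {
        BasicDBObject doc = new BasicDBObject();
        doc.put("title", "Example document");
        doc.put("publishedDate", new Date());
        doc.put("tags", new BasicDBList());

        // get returns Object, so the caller supplies the cast
        Date pubDate = (Date) doc.get("publishedDate");
        BasicDBList tags = (BasicDBList) doc.get("tags");

        // an absent key yields null rather than an exception
        Object description = doc.get("description");
        if (description == null) {
            System.out.println("no description field");
        }

        System.out.println(pubDate + ", " + tags.size() + " tags");
    }
}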

Usage

From source file: com.ikanow.infinit.e.api.knowledge.output.RssOutput.java

License: Open Source License

public String getDocs(ResponsePojo rp) {
    // Create the feed using Rome
    SyndFeed feed = new SyndFeedImpl(); // create the feed
    String feedType = "rss_2.0";

    // Setup a list of feeds
    @SuppressWarnings("unchecked")
    List<BasicDBObject> docs = (List<BasicDBObject>) rp.getData();

    // Set the title of the feed
    feed.setTitle("Infinit.e Knowledge Discovery RSS Feed");
    feed.setDescription("Infinit.e Search Results RSS Feed");
    feed.setLanguage("en-us");
    feed.setPublishedDate(new Date(System.currentTimeMillis()));
    feed.setFeedType(feedType); // set the type of your feed
    feed.setLink("http://www.ikanow.com");

    // Establish the list to contain the feeds
    List<SyndEntry> entries = new ArrayList<SyndEntry>();

    // loop through the result set
    for (BasicDBObject fdbo : docs) {
        SyndEntry entry = new SyndEntryImpl(); // create a feed entry

        if (fdbo.getString("title") != null) {
            entry.setTitle(fdbo.getString("title"));

            Date pubDate = (Date) fdbo.get("publishedDate");
            if (pubDate != null)
                entry.setPublishedDate(pubDate);

            if (fdbo.getString("url") != null)
                entry.setLink(fdbo.getString("url"));

            if (fdbo.getString("description") != null) {
                // Create the content for the entry
                SyndContent content = new SyndContentImpl(); // create the content of your entry
                content.setType("text/plain");
                content.setValue(fdbo.getString("description"));
                entry.setDescription(content);
            }
            entries.add(entry);
        }
    }

    feed.setEntries(entries); // you can add multiple entries in your feed

    SyndFeedOutput output = new SyndFeedOutput();
    String rss = null;

    try {
        rss = output.outputString(feed);
    } catch (FeedException e) {
        e.printStackTrace();
        logger.error("Line: [" + e.getStackTrace()[2].getLineNumber() + "] " + e.getMessage());
    }
    return rss;
}

From source file: com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.java

License: Open Source License

@SuppressWarnings("unchecked")
private void stage1_initialCountingLoop(DBCursor docs, AdvancedQueryPojo.QueryScorePojo scoreParams,
        int toReturn, StatisticsPojo scores, LinkedList<BasicDBObject> standaloneEventsReturn,
        int nCommunities) {
    double s0_nQuerySubsetDocCountInv = 1.0 / (double) _s0_nQuerySubsetDocCount;

    // Some memory management:
    DBCollection dbc = MongoDbManager.getDocument().getMetadata();
    DBDecoderFactory defaultDecoder = dbc.getDBDecoderFactory();

    try {
        SizeReportingBasicBSONDecoder sizeReportingDecoder = new SizeReportingBasicBSONDecoder();
        dbc.setDBDecoderFactory(sizeReportingDecoder);

        long currMemUsage = 0;
        int ndocs = 0;
        long lastBatch = 0L;

        long initialUnusedMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory();
        long initialFreeMemory = Runtime.getRuntime().freeMemory();

        for (DBObject f0 : docs) {
            BasicDBObject f = (BasicDBObject) f0;
            long newMemUsage = sizeReportingDecoder.getSize();
            if ((newMemUsage - currMemUsage) > 0) { // check every batch               
                long now = new Date().getTime();

                //DEBUG
                //logger.warn(ndocs + " : " + (now - lastBatch) + " : " + newMemUsage + " VS " + Runtime.getRuntime().maxMemory() + " UNUSED " + (Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory()) + " FREE " + Runtime.getRuntime().freeMemory());

                // Check vs total memory:
                long runtimeMem = Runtime.getRuntime().maxMemory();
                // note newMemUsage is the input memory ... it gets expanded ~6x by the BSON-ification and is allowed at most 1/4 of memory (hence the factor of 24 below)...
                // Also if we're taking more than 20s for a batch then limp over the limit and exit...
                if (((newMemUsage * 24) > runtimeMem)
                        || (((now - lastBatch) > 20000L) && (ndocs >= toReturn))) {
                    long finalUnusedMemory = Runtime.getRuntime().maxMemory()
                            - Runtime.getRuntime().totalMemory();
                    long finalFreeMemory = Runtime.getRuntime().freeMemory();

                    logger.error("Query truncated memUsage=" + newMemUsage + ", memory=" + runtimeMem
                            + ", docs=" + ndocs + ", totaldocs=" + scores.found + ", init_free_mem="
                            + initialFreeMemory + ", end_free_mem=" + finalFreeMemory + ", init_unused_mem="
                            + initialUnusedMemory + ", end_unused_mem=" + finalUnusedMemory);
                    break;
                } //TESTED
                currMemUsage = newMemUsage;
                lastBatch = now;
            } //TESTED
            ndocs++;

            // Simple handling for standalone events
            if ((null != _s0_standaloneEventAggregator) && !_s0_bNeedToCalcSig) {
                //if _s0_bNeedToCalcSig then do this elsewhere
                ScoringUtils_Associations.addStandaloneEvents(f, 0.0, 0, _s0_standaloneEventAggregator,
                        _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter,
                        _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts);
            } //TESTED

            if (!_s0_bNeedToCalcSig) {
                continue;
            } //TESTED

            if (nCommunities > 1) { // (could have pan-community entities)
                ObjectId communityId = (ObjectId) f.get(DocumentPojo.communityId_);
                if (null != communityId) { // (big problems if this is null, but handle it gracefully anyway!)
                    int retval = _s0_multiCommunityHandler.community_getIdAndInitialize(communityId,
                            _s1_entitiesInDataset);
                    // (returns an int community id but also sets it into the cache, so just use that below)
                    if (Integer.MIN_VALUE == retval) {
                        //this document cannot be viewed from within this set of communities
                        continue;
                    }
                }
            } //TESTED      

            TempDocBucket docBucket = new TempDocBucket();
            docBucket.dbo = f;
            ObjectId id = (ObjectId) f.get(DocumentPojo._id_);

            // If we're going to weight relevance in, or we need the geo temporal decay:
            if ((0 != scoreParams.relWeight) || (null != scoreParams.timeProx)
                    || (null != scoreParams.geoProx)) {
                StatisticsPojo.Score scoreObj = scores.getScore().get(id);
                if (null != scoreObj) {
                    docBucket.explain = scoreObj.explain; // (will normally be null)
                    docBucket.luceneScore = scoreObj.score;
                    if ((null != scoreParams.timeProx) || (null != scoreParams.geoProx)) {
                        if (scoreObj.decay >= 0.0) {
                            docBucket.geoTemporalDecay = scoreObj.decay;
                        }
                        // (see also below for low accuracy geo scoring)
                    }
                } else {
                    docBucket.luceneScore = 1.0;
                }
            } //TESTED
            else if (this._s0_sortingByDate) {
                StatisticsPojo.Score scoreObj = scores.getScore().get(id);
                if (null != scoreObj) {
                    docBucket.nLuceneIndex = scoreObj.nIndex;
                }
            }
            docBucket.manualWeighting = this.getManualScoreWeights(scoreParams, f);

            BasicDBList l = (BasicDBList) (f.get(DocumentPojo.entities_));
            if (null != l) {

                long nEntsInDoc = l.size();
                double dBestGeoScore = 0.0; // (for low accuracy geo only)
                for (Iterator<?> e0 = l.iterator(); e0.hasNext();) {
                    BasicDBObject e = (BasicDBObject) e0.next();
                    BasicDBObject tmpGeotag = null;
                    if (_s3_bLowAccuracyGeo || (null != _s1_dManualGeoDecay_latLonInvdecay)) {
                        // low accuracy geo, need to look for geotag
                        tmpGeotag = (BasicDBObject) e.get(EntityPojo.geotag_);
                    }

                    // Get attributes

                    double freq = -1.0;
                    long ntotaldoccount = -1;
                    String entity_index;
                    Double sentiment = null;
                    try {
                        sentiment = (Double) e.get(EntityPojo.sentiment_);
                        ntotaldoccount = e.getLong(EntityPojo.doccount_);
                        freq = e.getDouble(EntityPojo.frequency_);
                        entity_index = e.getString(EntityPojo.index_);
                        if (null == entity_index) {
                            // Just bypass the entity 
                            e.put(EntityPojo.significance_, 0.0);
                            nEntsInDoc--;
                            continue;
                        }
                    } catch (Exception ex) {
                        try {
                            String sfreq;
                            if (ntotaldoccount < 0) {
                                sfreq = e.getString(EntityPojo.doccount_);
                                ntotaldoccount = Long.valueOf(sfreq);
                            }
                            if (freq < -0.5) {
                                sfreq = e.getString(EntityPojo.frequency_);
                                freq = Long.valueOf(sfreq).doubleValue();
                            }
                            entity_index = e.getString(EntityPojo.index_);
                            if (null == entity_index) {
                                // Just bypass the entity 
                                e.put(EntityPojo.significance_, 0.0);
                                nEntsInDoc--;
                                continue;
                            }
                        } catch (Exception e2) {
                            // Just bypass the entity 
                            e.put(EntityPojo.significance_, 0.0);
                            nEntsInDoc--;
                            continue;
                        }
                    } //TESTED

                    // First loop through is just counting

                    // Retrieve entity (create/initialize if necessary)
                    EntSigHolder shp = _s1_entitiesInDataset.get(entity_index);
                    if (null == shp) {
                        if (ntotaldoccount > (long) _s0_globalDocCount) { // obviously can't have more entities-in-docs than docs... 
                            ntotaldoccount = (long) _s0_globalDocCount;
                        }
                        shp = new EntSigHolder(entity_index, ntotaldoccount, _s0_multiCommunityHandler);

                        // Stage 1a alias handling: set up infrastructure, calculate doc overlap
                        if (null != _s1_aliasLookup) {
                            stage1_initAlias(shp);
                        }
                        if ((null != shp.aliasInfo) && (null == shp.masterAliasSH)) { // this is the discard alias
                            nEntsInDoc--;
                            continue;
                        } //TESTED

                        // Check if entity is in type filter list
                        if (null != _s0_entityTypeFilter) {
                            String entType = null;
                            if (null != shp.aliasInfo) {
                                entType = shp.aliasInfo.getType();
                            } else {
                                entType = e.getString(EntityPojo.type_);
                            }
                            if (_s0_bEntityTypeFilterPositive) {
                                if ((null != entType)
                                        && !_s0_entityTypeFilter.contains(entType.toLowerCase())) {
                                    nEntsInDoc--;
                                    continue;
                                }
                            } else if ((null != entType)
                                    && _s0_entityTypeFilter.contains(entType.toLowerCase())) {
                                //(negative filter)
                                nEntsInDoc--;
                                continue;
                            }

                        } //TESTED (end entity filter)

                        // Geo:
                        if (null != shp.aliasInfo) {
                            if (null != shp.aliasInfo.getGeotag()) { //Geo, overwrite/create tmpGeotag
                                if (_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo
                                        || (null != _s1_dManualGeoDecay_latLonInvdecay)) {
                                    // Always capture alias geo, even if not in low accuracy mode because we add it to the 
                                    // legitimate geo:
                                    if ((_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo)
                                            && (null == _s3_geoBuckets)) {
                                        // Initialize the buckets if this is for aggregation not just decay
                                        _s3_geoBuckets = (LinkedList<EntSigHolder>[]) new LinkedList[_s3_nGEO_BUCKETS];
                                    }

                                    if (null == tmpGeotag) {
                                        tmpGeotag = new BasicDBObject();
                                    }
                                    tmpGeotag.put(GeoPojo.lat_, shp.aliasInfo.getGeotag().lat);
                                    tmpGeotag.put(GeoPojo.lon_, shp.aliasInfo.getGeotag().lon);

                                    if (null != shp.aliasInfo.getOntology_type()) {
                                        e.put(EntityPojo.ontology_type_, shp.aliasInfo.getOntology_type());
                                    }
                                }
                            }
                        } //TESTED (end geo for aggregation or decay)

                        _s1_entitiesInDataset.put(entity_index, shp);
                        // end Stage 1a alias handling
                    } //(end if first time seeing this entity)

                    // Stage 1b alias handling: calculate document counts (taking overlaps into account)
                    if (null != shp.masterAliasSH) {
                        // Counts:
                        shp.masterAliasSH.nTotalDocCount++;
                        // docs including overlaps
                        shp.masterAliasSH.avgFreqOverQuerySubset += freq;

                        // Keep track of overlaps:
                        if (f != shp.masterAliasSH.unusedDbo) {
                            shp.masterAliasSH.unusedDbo = f;
                            // (note this is only used in stage 1, alias.unusedDbo is re-used differently in stage 3/4)
                            shp.masterAliasSH.nDocCountInQuerySubset++;
                            // non-overlapping docs ie < shp.nDocCountInQuerySubset
                        }

                        // Sentiment:
                        shp.masterAliasSH.positiveSentiment += shp.positiveSentiment;
                        shp.masterAliasSH.negativeSentiment += shp.negativeSentiment;
                        if (null != sentiment) {
                            shp.masterAliasSH.nTotalSentimentValues++;
                        }

                    } //TESTED (end if is alias)
                      // end Stage 1b

                    // Pan-community logic (this needs to be before the entity object is updated)
                    if (_s0_multiCommunityHandler.isActive()) {
                        _s0_multiCommunityHandler.community_updateCorrelations(shp, ntotaldoccount,
                                entity_index);
                    } else { // (Once we've started multi-community logic, this is no longer desirable)
                        if ((ntotaldoccount > shp.nTotalDocCount) && (ntotaldoccount <= _s0_globalDocCount)) {
                            shp.nTotalDocCount = ntotaldoccount;
                        }
                        //(note there used to be some cases where we adjusted for dc/tf==0, but the 
                        // underlying issue in the data model that caused this has been fixed, so it's 
                        // now a pathological case that can be ignored)
                    } //(TESTED)

                    // Update counts:
                    _s1_sumFreqInQuerySubset += freq;
                    shp.avgFreqOverQuerySubset += freq;
                    shp.nDocCountInQuerySubset++;
                    shp.decayedDocCountInQuerySubset += docBucket.geoTemporalDecay;
                    // (note this doesn't handle low accuracy geo-decay ... we'll address that via a separate term)

                    TempEntityInDocBucket entBucket = new TempEntityInDocBucket();
                    entBucket.dbo = e;
                    entBucket.freq = freq;
                    entBucket.doc = docBucket;
                    shp.entityInstances.add(entBucket);
                    if (null != tmpGeotag) { // (only needed for low accuracy geo aggregation)

                        if ((_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo) && (null == shp.geotag)) { // (first time for shp only)
                            shp.geotag = tmpGeotag;
                            shp.geotaggedEntity = e; // (ie for onto type, which has been overwritten in the alias case...)
                        }
                        if (null != _s1_dManualGeoDecay_latLonInvdecay) {
                            // Emulate scripted Lucene calculations
                            double minlat = tmpGeotag.getDouble(GeoPojo.lat_);
                            double minlon = tmpGeotag.getDouble(GeoPojo.lon_);
                            double paramlat = _s1_dManualGeoDecay_latLonInvdecay[0];
                            double paramlon = _s1_dManualGeoDecay_latLonInvdecay[1];
                            double gdecay = _s1_dManualGeoDecay_latLonInvdecay[2];
                            char ontCode = GeoOntologyMapping
                                    .encodeOntologyCode(e.getString(EntityPojo.ontology_type_));
                            double dDecay = QueryDecayScript.getGeoDecay(minlat, minlon, paramlat, paramlon,
                                    gdecay, ontCode);
                            if (dDecay > dBestGeoScore) {
                                dBestGeoScore = dDecay;
                            }
                        } //TESTED
                    } //(end if entity has geo and need to process entity geo)

                    if (freq > shp.maxFreq) {
                        shp.maxFreq = freq;
                    }
                    // Sentiment:
                    if ((null != sentiment) && (Math.abs(sentiment) <= 1.1)) { // (actually 1.0)
                        shp.nTotalSentimentValues++;
                        if (sentiment > 0.0) {
                            shp.positiveSentiment += sentiment;
                        } else {
                            shp.negativeSentiment += sentiment;
                        }
                    } else if (null != sentiment) { // corrupt sentiment for some reason?!
                        e.put(EntityPojo.sentiment_, null);
                    }
                    docBucket.docLength += freq;

                } //(end loop over entities)

                docBucket.nLeftToProcess = nEntsInDoc;
                docBucket.nEntsInDoc = (int) nEntsInDoc;

                if (null != this._s1_dManualGeoDecay_latLonInvdecay) { // Low accuracy geo-calculations
                    docBucket.geoTemporalDecay *= dBestGeoScore;
                    docBucket.luceneScore *= dBestGeoScore;
                    _s2_dAvgLowAccuracyGeoDecay += dBestGeoScore * s0_nQuerySubsetDocCountInv;
                } //TESTED            

            } // (end if feed has entities)

            // Handle documents with no entities - can still promote them
            if (0 == docBucket.nLeftToProcess) { // (use this rather than doc length in case all the entities had freq 0)
                _s1_noEntityBuckets.add(docBucket);
            }

        } // (end loop over feeds)
          //TESTED
    } finally {
        dbc.setDBDecoderFactory(defaultDecoder);
    }
}

From source file: com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.java

License: Open Source License

private void stage4_prepareDocsForOutput(AdvancedQueryPojo.QueryScorePojo scoreParams, StatisticsPojo scores,
        long nToClientLimit, LinkedList<BasicDBObject> returnList) {
    // Get the documents
    long nDocs = 0;
    double dBestScore = 0.0;
    double dAvgScore = 0.0;

    double dSigFactor = 100.0 / (_s3_dSigScalingFactor * _s2_dApproxAverageDocumentSig);
    double dRelFactor = 100.0 / (_s3_dLuceneScalingFactor * _s0_avgLuceneScore);

    // Start at the bottom of the list, so don't need to worry about skipping documents, just count out from the bottom
    // The call to stage3_calculateTFTerms with nStart+nToClientLimit handles the rest

    Iterator<TempDocBucket> pqIt = _s3_pqDocs.iterator();
    while (pqIt.hasNext() && (nDocs < nToClientLimit)) {
        TempDocBucket qsf = pqIt.next();
        nDocs++;
        if (!_s0_sortingByDate) {
            dBestScore = qsf.totalScore;
        }
        dAvgScore += dBestScore;

        BasicDBObject f = qsf.dbo;

        // Phase "0" - these are the highest prio events
        boolean bNeedToFilterAndAliasAssoc_event = true;
        boolean bNeedToFilterAndAliasAssoc_fact = true;
        boolean bNeedToFilterAndAliasAssoc_summary = true;
        if (null != _s0_standaloneEventAggregator) {
            ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0,
                    _s0_standaloneEventAggregator, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive,
                    _s0_entityTypeFilter, _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts);
            bNeedToFilterAndAliasAssoc_event = false;
            bNeedToFilterAndAliasAssoc_fact = false;
            bNeedToFilterAndAliasAssoc_summary = false;
        } //TESTED
        if (null != _s0_lowAccuracyAssociationAggregator_events) {
            ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0,
                    _s0_lowAccuracyAssociationAggregator_events, _s0_bEntityTypeFilterPositive,
                    _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, true, false,
                    false);
            bNeedToFilterAndAliasAssoc_event = false;
        } //TESTED                        
        if (null != _s0_lowAccuracyAssociationAggregator_facts) {
            ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0,
                    _s0_lowAccuracyAssociationAggregator_facts, _s0_bEntityTypeFilterPositive,
                    _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, false, false,
                    true);
            bNeedToFilterAndAliasAssoc_fact = false;
        } //TESTED

        try {
            DocumentPojoApiMap.mapToApi(f);
            // Handle deduplication/multi-community code:
            if (null != qsf.dupList) {
                try {
                    ScoringUtils_MultiCommunity.community_combineDuplicateDocs(qsf);
                } catch (Exception e) {
                    // Do nothing, just carry on with minimal damage!
                }
            }

            // Scoring:
            double d = qsf.aggSignificance * dSigFactor;
            if (Double.isNaN(d)) {
                f.put(DocumentPojo.aggregateSignif_, 0.0);
            } else {
                f.put(DocumentPojo.aggregateSignif_, d);
            }
            d = qsf.luceneScore * dRelFactor;
            if (Double.isNaN(d)) {
                f.put(DocumentPojo.queryRelevance_, 0.0);
            } else {
                f.put(DocumentPojo.queryRelevance_, d);
            }
            if (!_s0_sortingByDate) {
                f.put(DocumentPojo.score_, qsf.totalScore);
            }

            BasicDBList l = (BasicDBList) (f.get(DocumentPojo.entities_));

            // Handle update ids vs normal ids:
            ObjectId updateId = (ObjectId) f.get(DocumentPojo.updateId_);
            if (null != updateId) { // swap the 2...
                f.put(DocumentPojo.updateId_, f.get(DocumentPojo._id_));
                f.put(DocumentPojo._id_, updateId);
            }

            // Check if entities enabled            
            if ((null != l) && (!_s0_bGeoEnts && !_s0_bNonGeoEnts)) {
                f.removeField(DocumentPojo.entities_);
                l = null;
            } //TESTED

            // Check if events etc enabled
            if ((!_s0_bEvents && !_s0_bFacts && !_s0_bSummaries)) {
                f.removeField(DocumentPojo.associations_);
            } //TESTED            
            else if (!_s0_bEvents || !_s0_bFacts || !_s0_bSummaries || (null != _s0_assocVerbFilter)) {

                // Keep only specified event_types
                BasicDBList lev = (BasicDBList) (f.get(DocumentPojo.associations_));
                if (null != lev) {
                    for (Iterator<?> e0 = lev.iterator(); e0.hasNext();) {
                        BasicDBObject e = (BasicDBObject) e0.next();

                        // Type filter
                        boolean bNeedToFilterAndAliasAssoc = true;
                        String sEvType = e.getString(AssociationPojo.assoc_type_);
                        boolean bKeep = true;
                        if (null == sEvType) {
                            bKeep = false;
                        } else if (sEvType.equalsIgnoreCase("event")) {
                            if (!_s0_bEvents)
                                bKeep = false;
                            bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_event;
                        } else if (sEvType.equalsIgnoreCase("fact")) {
                            if (!_s0_bFacts)
                                bKeep = false;
                            bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_fact;
                        } else if (sEvType.equalsIgnoreCase("summary")) {
                            if (!_s0_bSummaries)
                                bKeep = false;
                            bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_summary;
                        }
                        if (!bKeep) {
                            e0.remove();
                        } else { // Type matches, now for some more complex logic....

                            if (bNeedToFilterAndAliasAssoc) { // (otherwise done already)

                                bKeep = ScoringUtils_Associations.filterAndAliasAssociation(e, _s1_aliasLookup,
                                        true, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive,
                                        _s0_entityTypeFilter, _s0_assocVerbFilter);
                                if (!bKeep) {
                                    e0.remove();
                                }

                            } //TESTED

                        } //(end output filter logic)

                    } // (end loop over events)   
                } // (end if this doc has events)

            } //TESTED            

            // Check if metadata is enabled
            if (!_s0_bMetadata) {
                f.removeField(DocumentPojo.metadata_);
            } //TESTED

            if (null != l) {

                for (Iterator<?> e0 = l.iterator(); e0.hasNext();) {
                    BasicDBObject e = (BasicDBObject) e0.next();

                    if (!_s0_bNonGeoEnts) { // then must only be getting geo (else wouldn't be in this loop)
                        if (null == e.get(EntityPojo.geotag_)) {
                            e0.remove();
                            continue;
                        }
                    }

                    String entity_index = e.getString(EntityPojo.index_);
                    if (null == entity_index)
                        continue;

                    EntSigHolder shp = (EntSigHolder) _s1_entitiesInDataset.get(entity_index);

                    if (null != shp) {
                        // Stage 4x: alias processing, just overwrite 
                        // (note don't delete "duplicate entities", hard-to-be-globally-consistent
                        //  and will potentially throw data away which might be undesirable)
                        if (null != shp.masterAliasSH) {
                            shp = shp.masterAliasSH; // (already has all the aggregated values used below)
                            if (!entity_index.equals(shp.aliasInfo.getIndex())) {
                                e.put(EntityPojo.index_, shp.aliasInfo.getIndex());
                                e.put(EntityPojo.disambiguated_name_, shp.aliasInfo.getDisambiguatedName());
                                e.put(EntityPojo.type_, shp.aliasInfo.getType());
                                e.put(EntityPojo.dimension_, shp.aliasInfo.getDimension());

                                if (null != shp.aliasInfo.getGeotag()) {
                                    BasicDBObject aliasedGeoTag = new BasicDBObject();
                                    aliasedGeoTag.put(GeoPojo.lat_, shp.aliasInfo.getGeotag().lat);
                                    aliasedGeoTag.put(GeoPojo.lon_, shp.aliasInfo.getGeotag().lon);
                                    e.put(EntityPojo.geotag_, aliasedGeoTag);
                                    if (null != shp.aliasInfo.getOntology_type()) {
                                        e.put(EntityPojo.ontology_type_, shp.aliasInfo.getOntology_type());
                                    }
                                } //TESTED
                            }
                        } //TESTED
                          // end Stage 4x of alias processing                  

                        double dataSig = shp.datasetSignificance;
                        if (Double.isNaN(dataSig)) {
                            e.put(EntityPojo.datasetSignificance_, 0.0);
                        } else {
                            e.put(EntityPojo.datasetSignificance_, dataSig);
                        }
                        e.put(EntityPojo.queryCoverage_, shp.queryCoverage);
                        e.put(EntityPojo.averageFreq_, shp.avgFreqOverQuerySubset);
                        if (shp.nTotalSentimentValues > 0) {
                            e.put(EntityPojo.positiveSentiment_, shp.positiveSentiment);
                            e.put(EntityPojo.negativeSentiment_, shp.negativeSentiment);
                            e.put(EntityPojo.sentimentCount_, shp.nTotalSentimentValues);
                        }
                    } else { // (most likely to occur if the entity is discarded (alias/filter) or is corrupt in some way)
                        e0.remove();
                        continue;
                    }

                } //(end loop over entities)
            } // (end if feed has entities)
              //TESTED

            // Explain if enabled
            if (null != qsf.explain) {
                f.put(DocumentPojo.explain_, qsf.explain);
            }

            // Add to the end of the list (so will come back from API call in natural order, highest first)
            returnList.addFirst(f);
            // (add elements to the front of the list so that the top of the list is ordered by priority)
        } catch (Exception e) {
            // Probably a JSON error, just carry on
            String title = f.getString(DocumentPojo.title_);
            logger.error(title + ": " + e.getMessage());
        }

    } // (end loop over feeds)
      //TESTED

    // Update the scores:
    scores.maxScore = (float) dBestScore;
    if (nDocs > 0) {
        scores.avgScore = (float) dAvgScore / nDocs;
    }
}

From source file: com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.java

License: Open Source License

private void stage4_prepareEntsForOutput(LinkedList<BasicDBObject> entityReturn) {
    if (_s0_nNumEntsReturn > 0) { // (else entities not enabled)

        for (EntSigHolder qsf = _s3_pqEnt.poll(); null != qsf; qsf = _s3_pqEnt.poll()) // (start with lowest ranking)
        {
            BasicDBObject ent = qsf.unusedDbo;
            if (null == ent) {
                int nTries = 0;
                if (null != qsf.entityInstances) { // (should never be null but just to be on the safe side...)
                    for (TempEntityInDocBucket tefb : qsf.entityInstances) {
                        // (Try to find an entity that wasn't promoted ie can now be re-used
                        //  if we can't find one quite quickly then bail out and we'll pay the cost of cloning it)
                        if (!tefb.doc.bPromoted) {
                            ent = tefb.dbo;
                            break;
                        } else if (++nTries > 10) {
                            break;
                        }
                    }
                    if (null == ent) {
                        ent = qsf.entityInstances.get(0).dbo;
                    }
                } else { // (no entityInstances, something alias-related has gone wrong, just skip) 
                    continue;
                }
            } //TESTED
            qsf.entityInstances = null; // (don't need this any more, can be gc'd)

            try {

                if (null != qsf.aliasInfo) {
                    if (!qsf.index.equals(qsf.aliasInfo.getIndex())) {
                        ent.put(EntityPojo.index_, qsf.aliasInfo.getIndex());
                        ent.put(EntityPojo.disambiguated_name_, qsf.aliasInfo.getDisambiguatedName());
                        ent.put(EntityPojo.type_, qsf.aliasInfo.getType());
                        ent.put(EntityPojo.dimension_, qsf.aliasInfo.getDimension());
                        if (null != qsf.aliasInfo.getGeotag()) {
                            BasicDBObject aliasedGeoTag = new BasicDBObject();
                            aliasedGeoTag.put(GeoPojo.lat_, qsf.aliasInfo.getGeotag().lat);
                            aliasedGeoTag.put(GeoPojo.lon_, qsf.aliasInfo.getGeotag().lon);
                            ent.put(EntityPojo.geotag_, aliasedGeoTag);
                            if (null != qsf.aliasInfo.getOntology_type()) {
                                ent.put(EntityPojo.ontology_type_, qsf.aliasInfo.getOntology_type());
                            }
                        } //TESTED
                    }
                } //TESTED

                if (null == ent.get(EntityPojo.datasetSignificance_)) { // Not getting promoted so need to add fields...                  
                    if (Double.isNaN(qsf.datasetSignificance)) {
                        ent.put("datasetSignificance", 0.0);
                    } else {
                        ent.put(EntityPojo.datasetSignificance_, qsf.datasetSignificance);
                    }
                    ent.put(EntityPojo.queryCoverage_, qsf.queryCoverage);
                    ent.put(EntityPojo.averageFreq_, qsf.avgFreqOverQuerySubset);
                    if (qsf.nTotalSentimentValues > 0) {
                        ent.put(EntityPojo.positiveSentiment_, qsf.positiveSentiment);
                        ent.put(EntityPojo.negativeSentiment_, qsf.negativeSentiment);
                        ent.put(EntityPojo.sentimentCount_, qsf.nTotalSentimentValues);
                    }
                } else { // (fields already present, i.e. promoted, so take a shallow copy before modifying)
                    BasicDBObject ent2 = new BasicDBObject();
                    for (Map.Entry<String, Object> kv : ent.entrySet()) {
                        ent2.append(kv.getKey(), kv.getValue());
                    }
                    ent = ent2;
                }
                ent.removeField(EntityPojo.relevance_);
                if (Double.isNaN(qsf.maxDocSig)) {
                    ent.put(EntityPojo.significance_, 0.0);
                } else {
                    ent.put(EntityPojo.significance_, qsf.maxDocSig);
                }
                ent.put(EntityPojo.frequency_, (long) qsf.maxFreq);
                entityReturn.addFirst(ent);
            } catch (Exception e) {
                // Probably a JSON error, just carry on
                String title = ent.getString(EntityPojo.index_);
                logger.error(title + ": " + e.getMessage());
            } //TESTED
        }
    } //TESTED            
}

From source file: com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.java

License: Open Source License

private double getManualScoreWeights(AdvancedQueryPojo.QueryScorePojo scoreParams, BasicDBObject doc) {
    // Highest prio: source key weight
    if (null != scoreParams.sourceWeights) {
        String sourceKey = DocumentPojo.getSourceKey(doc.getString(DocumentPojo.sourceKey_));
        Double dWeight = scoreParams.sourceWeights.get(sourceKey);

        if (null != dWeight) {
            return dWeight;
        }
    }
    // Middle prio: type
    if (null != scoreParams.typeWeights) {
        String mediaType = doc.getString(DocumentPojo.mediaType_);
        Double dWeight = scoreParams.typeWeights.get(mediaType);

        if (null != dWeight) {
            return dWeight;
        }
    }
    // Lowest prio: average of tags
    if (null != scoreParams.tagWeights) {
        double dScore = 0.0;
        int nComps = 0;
        BasicDBList tags = (BasicDBList) doc.get(DocumentPojo.tags_);
        if (null != tags) {
            for (Object tagObj : tags) {
                String tag = (String) tagObj;
                Double dWeight = scoreParams.tagWeights.get(tag);
                if (null != dWeight) {
                    nComps++;
                    dScore += dWeight;
                }
            }
            if (nComps > 0) {
                return dScore / nComps;
            }
        }
    }
    return 1.0;
}

From source file: com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils_Associations.java

License: Open Source License

static void addStandaloneEvents(BasicDBObject doc, double dDocSig, int nPhase,
        StandaloneEventHashAggregator standaloneEventAggregator, boolean bEntTypeFilterPositive,
        boolean bAssocVerbFilterPositive, HashSet<String> entTypeFilter, HashSet<String> assocVerbFilter,
        boolean bEvents, boolean bSummaries, boolean bFacts) {
    if (standaloneEventAggregator.bSimulateAggregation) {
        bSummaries = false;
    }
    String sDocIsoPubDate = null;

    BasicDBList lev = (BasicDBList) (doc.get(DocumentPojo.associations_));
    if (null != lev) {
        for (Iterator<?> e0 = lev.iterator(); e0.hasNext();) {
            BasicDBObject e = (BasicDBObject) e0.next();

            String sEvType = e.getString(AssociationPojo.assoc_type_);
            boolean bIsFact = false;
            boolean bIsSummary = false;
            boolean bKeep = true;
            if (null == sEvType) {
                bKeep = false;
            } else if (sEvType.equalsIgnoreCase("event")) {
                if (!bEvents)
                    bKeep = false;
            } else if (sEvType.equalsIgnoreCase("fact")) {
                if (!bFacts)
                    bKeep = false;
                bIsFact = true;
            } else if (sEvType.equalsIgnoreCase("summary")) {
                if (!bSummaries)
                    bKeep = false;
                bIsSummary = true;
            } //TESTED x4

            // Filter and aliasing logic:
            if (bKeep) {
                boolean bKeep2 = filterAndAliasAssociation(e, standaloneEventAggregator.aliasLookup, true,
                        bEntTypeFilterPositive, bAssocVerbFilterPositive, entTypeFilter, assocVerbFilter);
                if (!bKeep2) {
                    e0.remove();
                    // (remove/rename events based on filters where we can, 
                    //  means we don't have to do it in stage4)
                    bKeep = false;
                }
            } //TESTED

            if (bKeep) {
                String time_start = null;
                String time_end = null; // (normally not needed)

                if (!standaloneEventAggregator.bSimulateAggregation) { //else times are discarded                  
                    // Add time from document
                    time_start = e.getString(AssociationPojo.time_start_);

                    if (null == time_start) {
                        if (null == sDocIsoPubDate) {
                            // Convert doc pub date to ISO (day granularity):
                            Date pubDate = (Date) doc.get(DocumentPojo.publishedDate_);

                            if (null != pubDate) {
                                SimpleDateFormat f2 = new SimpleDateFormat("yyyy-MM-dd");
                                time_start = f2.format(pubDate);
                            }
                        } else {
                            time_start = sDocIsoPubDate; // (so it doesn't get added again below)
                        }
                    } //TESTED               
                    else { // Remove hourly granularity for consistency                  
                        time_start = time_start.replaceAll("T.*$", "");
                        time_end = e.getString(AssociationPojo.time_end_);

                        if (null != time_end) {
                            time_end = time_end.replaceAll("T.*$", "");
                        }
                    } //TESTED (with debug code, eg time_start = "1997-07-16T19:20:30+01:00")
                    if (null != time_start) { // Ensure it has day granularity, to help with aggregation
                        e.put(AssociationPojo.time_start_, time_start);
                        if (null != time_end) {
                            e.put(AssociationPojo.time_end_, time_end);
                        }
                    } //TESTED
                } //(end if normal standalone mode, not aggregation simulation)

                StandaloneEventHashCode evtHolder = new StandaloneEventHashCode(
                        standaloneEventAggregator.bSimulateAggregation, e, bIsSummary, bIsFact);
                BasicDBObject oldEvt = standaloneEventAggregator.store.get(evtHolder);

                if (null == oldEvt) {
                    // Doc count (see below)
                    e.put(AssociationPojo.doccount_, 1);
                    double dAssocSig = dDocSig * dDocSig;

                    // Weight down summaries slightly (80%), and summaries with missing entities a lot (50%)  
                    if (bIsSummary) {
                        String sEntity2 = (String) e.get(AssociationPojo.entity2_);
                        if (null == sEntity2) {
                            dAssocSig *= 0.50;
                        } else {
                            dAssocSig *= 0.80;
                        }
                    }

                    // Running significance count:
                    e.put(AssociationPojo.assoc_sig_, dAssocSig); // (use sum-squared to score up events that occur frequently)
                    if (dAssocSig > standaloneEventAggregator.dMaxSig) {
                        standaloneEventAggregator.dMaxSig = dAssocSig;
                    }

                    standaloneEventAggregator.store.put(evtHolder, e);

                    // Add to list in some sort of very basic order...
                    if (2 == nPhase) { // Put at the back, it's probably really low sig
                        standaloneEventAggregator.tmpList.add(e);
                    } else if (1 == nPhase) { // Put at the front until Phase 0 comes along
                        standaloneEventAggregator.tmpList.addFirst(e);
                        standaloneEventAggregator.nPhase1Events++;
                    } else { // phases 0 and 1 get the higher orderings
                        standaloneEventAggregator.tmpList.addFirst(e);
                        standaloneEventAggregator.nPhase0Events++;
                    }
                } else { // Update doc count
                    long nDocCount = oldEvt.getInt(AssociationPojo.doccount_, 1) + 1;
                    oldEvt.put(AssociationPojo.doccount_, nDocCount);
                    // Running significance count:
                    double dAssocSig = oldEvt.getDouble(AssociationPojo.doccount_) + dDocSig * dDocSig;
                    oldEvt.put(AssociationPojo.assoc_sig_, dAssocSig);
                    if (dAssocSig / nDocCount > standaloneEventAggregator.dMaxSig) {
                        standaloneEventAggregator.dMaxSig = dAssocSig;
                    }

                    if (bIsFact && !standaloneEventAggregator.bSimulateAggregation) {
                        // For facts, also update the time range:
                        String old_time_start = oldEvt.getString(AssociationPojo.time_start_);
                        String old_time_end = oldEvt.getString(AssociationPojo.time_end_);
                        // Just keep this really simple and inefficient:
                        TreeSet<String> timeOrder = new TreeSet<String>();
                        if (null != old_time_start) {
                            timeOrder.add(old_time_start);
                        }
                        if (null != old_time_end) {
                            timeOrder.add(old_time_end);
                        }
                        if (null != time_start) {
                            timeOrder.add(time_start);
                        }
                        if (null != time_end) {
                            timeOrder.add(time_end);
                        }
                        if (timeOrder.size() > 1) {
                            Iterator<String> itStart = timeOrder.iterator();
                            oldEvt.put(AssociationPojo.time_start_, itStart.next());
                            Iterator<String> itEnd = timeOrder.descendingIterator();
                            oldEvt.put(AssociationPojo.time_end_, itEnd.next());
                        }

                    } // end if is fact - treat times different
                }
                //TESTED

            } // (end if keeping this event)
        } // (end loop over events)   
    } // (end if this doc has events)

}

From source file: com.ikanow.infinit.e.api.knowledge.QueryHandler.java

License: Open Source License

private ResponsePojo getSavedQueryInstead(String storedQueryNameOrId, String[] communityIdStrs,
        AdvancedQueryPojo query) {
    ResponsePojo rp = null;
    ObjectId oid = null;
    BasicDBObject jobQuery = null;
    try {
        oid = new ObjectId(storedQueryNameOrId);
        jobQuery = new BasicDBObject(CustomMapReduceJobPojo._id_, oid);
    } catch (Exception e) {
        jobQuery = new BasicDBObject(CustomMapReduceJobPojo.jobtitle_, storedQueryNameOrId);
    }
    CustomMapReduceJobPojo savedJob = CustomMapReduceJobPojo
            .fromDb(DbManager.getCustom().getLookup().findOne(jobQuery), CustomMapReduceJobPojo.class);

    if (null != savedJob) { // Is this even a saved job?
        if (null != savedJob.jarURL) {
            savedJob = null;
        }
    }
    if (null != savedJob) { // Authorization
        boolean auth = false;
        String communityIdStrList = Arrays.toString(communityIdStrs);
        for (ObjectId commId : savedJob.communityIds) {

            if (communityIdStrList.contains(commId.toString())) {
                auth = true;
                break;
            }
        }
        if (!auth) {
            savedJob = null;
        }
        if (null == savedJob) {
            throw new RuntimeException(
                    "Can't find saved query, or is a custom job not a query, or authorization error");
        }
        // OK go get the results of the job
        DBCollection coll = DbManager.getCollection(savedJob.getOutputDatabase(), savedJob.outputCollection);
        BasicDBObject result = (BasicDBObject) coll.findOne(); // (at some point support multiple saved queries)
        if (null == result) {
            throw new RuntimeException("Saved query is empty");
        }
        BasicDBObject apiResultToConvert = (BasicDBObject) result.get("value");
        if (null == apiResultToConvert) {
            throw new RuntimeException("Saved query has invalid format");
        }
        rp = ResponsePojo.fromDb(apiResultToConvert);
    } else if (null != oid) { // Support new user/doc queues
        SharePojo share = SharePojo.fromDb(DbManager.getSocial().getShare().findOne(jobQuery), SharePojo.class);
        if ((null == share) || (null == share.getShare())
                || (!share.getType().equals(DocumentQueueControlPojo.UserQueue)
                        && !share.getType().equals(DocumentQueueControlPojo.SavedQueryQueue))) {
            throw new RuntimeException(
                    "Can't find saved query, or is a custom job not a query, or authorization error");
        } else { // share.share is a  DocumentQueueControlPojo
            DocumentQueueControlPojo queue = DocumentQueueControlPojo.fromApi(share.getShare(),
                    DocumentQueueControlPojo.class);
            BasicDBObject docQuery1 = new BasicDBObject(DocumentPojo._id_,
                    new BasicDBObject(DbManager.in_, queue.getQueueList()));
            BasicDBObject docQuery2 = new BasicDBObject(DocumentPojo.updateId_,
                    new BasicDBObject(DbManager.in_, queue.getQueueList()));
            BasicDBObject docQuery = new BasicDBObject(DbManager.or_, Arrays.asList(docQuery1, docQuery2));
            DBCursor dbc = DbManager.getDocument().getMetadata().find(docQuery).limit(query.score.numAnalyze);
            ScoringUtils scoreStats = new ScoringUtils();
            List<BasicDBObject> docs = null;
            StatisticsPojo stats = new StatisticsPojo();
            stats.setSavedScores(query.output.docs.skip, dbc.count());
            try {
                boolean lockAcquired = true;
                try {
                    lockAcquired = this.acquireConcurrentAccessLock();

                } catch (InterruptedException e) {
                    //(that's fine just carry on)
                    lockAcquired = false;
                }
                if (!lockAcquired) {
                    rp = new ResponsePojo(); // (rp is still null at this point, so create it before setting the response)
                    rp.setResponse(
                            new ResponseObject("Query", false, "Query engine busy, please try again later."));
                    return rp;
                }
                scoreStats.setAliasLookupTable(_aliasLookup);
                docs = scoreStats.calcTFIDFAndFilter(DbManager.getDocument().getMetadata(), dbc, query.score,
                        query.output, stats, false, query.output.docs.skip, query.output.docs.numReturn,
                        communityIdStrs, null, null, null, null, null, null, null, null);
            } finally {
                scoreStats.clearAsMuchMemoryAsPossible();
                this.releaseConcurrentAccessLock();
            }
            rp = new ResponsePojo();
            rp.setResponse(new ResponseObject("Query", true, "Saved Query: " + share.getTitle()));
            rp.setStats(stats);
            if ((null != docs) && (docs.size() > 0)) {
                rp.setData(docs, (BasePojoApiMap<BasicDBObject>) null);
            } else { // (ensure there's always an empty list)
                docs = new ArrayList<BasicDBObject>(0);
                rp.setData(docs, (BasePojoApiMap<BasicDBObject>) null);
            }
        } //end if user or saved query queue
    }
    return rp;

}

From source file: com.ikanow.infinit.e.api.knowledge.SearchHandler.java

License: Open Source License

/**
 * Performs a reverse geolookup, takes a lat/lon and returns a list of nearby
 * locations.
 * 
 * @param latitude
 * @param longitude
 * @return
 */
private List<SearchSuggestPojo> reverseGeoLookup(Double latitude, Double longitude) {
    List<SearchSuggestPojo> locations = null;

    BasicDBList results = runGeoNear(latitude, longitude);

    if (results != null) {
        locations = new ArrayList<SearchSuggestPojo>();
        if (results.size() > 0) {
            for (int i = 0; i < 10 && i < results.size(); i++) {
                BasicDBObject result = (BasicDBObject) results.get(i);
                Double distance = result.getDouble("dis");
                BasicDBObject obj = (BasicDBObject) result.get("obj");
                locations.add(buildLocation(obj, distance));
            }
        }
    }
    return locations;
}

From source file: com.ikanow.infinit.e.api.social.sharing.ShareHandler.java

License: Open Source License

private String getReferenceString(SharePojo share) {
    // FILE:
    if (null == share.getDocumentLocation().get_id()) { // local file based reference
        FileInputStream fin = null;
        Scanner s = null;
        try {
            File f = new File(share.getDocumentLocation().getCollection());
            fin = new FileInputStream(f);
            s = new Scanner(fin, "UTF-8");
            return (s.useDelimiter("\n").next());
        } catch (Exception e) {
            return null;
        } finally {
            try {
                if (null != fin)
                    fin.close();
                if (null != s)
                    s.close();
            } catch (Exception e) {
            } // (probably just never opened)               
        }
    }
    // DB:
    // Carry on, this is a database object
    HashSet<String> shareIdStrs = new HashSet<String>();
    for (ShareCommunityPojo commIds : share.getCommunities()) {
        shareIdStrs.add(commIds.get_id().toString());
    }
    String retVal = null;
    BasicDBObject query = new BasicDBObject(DocumentPojo._id_, share.getDocumentLocation().get_id()); // (same for all artifacts)
    String dbName = share.getDocumentLocation().getDatabase();
    String collectionName = share.getDocumentLocation().getCollection();
    BasicDBObject returnVal = (BasicDBObject) MongoDbManager.getCollection(dbName, collectionName)
            .findOne(query);
    try {
        BasicDBList communities = null;
        boolean bCustomJob = dbName.equals("custommr"); // (a bit different)
        boolean bFoundOverlap = false;
        if (!bCustomJob) {
            ObjectId communityId = (ObjectId) returnVal.get(DocumentPojo.communityId_); // (same for other artifacts)
            bFoundOverlap = shareIdStrs.contains(communityId.toString());
        } else {
            communities = (BasicDBList) returnVal.get("communityIds"); // (shared across multiple json types)
            for (Object commIdObj : communities) {
                ObjectId commId = (ObjectId) commIdObj;
                if (shareIdStrs.contains(commId.toString())) {
                    bFoundOverlap = true;
                    break;
                }
            }
        }
        if (!bFoundOverlap) {
            throw new RuntimeException(""); // (turned into the common message below)
        }
        if (!bCustomJob) { // everything but custom jobs
            Date modifiedTime = returnVal.getDate(DocumentPojo.modified_); // (same for other artifacts)
            if (null != modifiedTime) {
                share.setModified(modifiedTime);
            }
            retVal = returnVal.toString();
        } else { // custom jobs
            String database = returnVal.getString(CustomMapReduceJobPojo.outputDatabase_);
            if (null == database) {
                database = dbName;
            }
            Date modifiedTime = returnVal.getDate(CustomMapReduceJobPojo.lastCompletionTime_);
            if (null != modifiedTime) {
                share.setModified(modifiedTime);
            }
            String collection = returnVal.getString(CustomMapReduceJobPojo.outputCollection_);
            // (return an arbitrary record from the custom job's output collection as the reference)
            BasicDBObject returnVal2 = (BasicDBObject) MongoDbManager.getCollection(database, collection)
                    .findOne();
            retVal = returnVal2.toString();
        }
    } catch (Exception e) {
        throw new RuntimeException("Document not found or permission issue (no overlapping communities)");
    }
    return retVal;
}
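
Note: the community-overlap test above is the permission gate shared by all the DB-backed artifact types. A minimal standalone sketch of the same pattern, handling both storage conventions seen in the method ("communityId"/"communityIds" are hypothetical field names here):

import java.util.Set;

import org.bson.types.ObjectId;

import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;

public class CommunityOverlapSketch {
    // True if the document shares at least one community with the share's community id set.
    // Handles both conventions: a single ObjectId field, or a list of ObjectIds.
    public static boolean hasOverlap(BasicDBObject doc, Set<String> shareCommunityIdStrs) {
        Object single = doc.get("communityId"); // hypothetical single-community field
        if (single instanceof ObjectId) {
            return shareCommunityIdStrs.contains(single.toString());
        }
        Object multi = doc.get("communityIds"); // hypothetical multi-community field
        if (multi instanceof BasicDBList) {
            for (Object commIdObj : (BasicDBList) multi) {
                if (shareCommunityIdStrs.contains(commIdObj.toString())) {
                    return true;
                }
            }
        }
        return false;
    }
}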

From source file:com.ikanow.infinit.e.application.utils.LogstashConfigUtils.java

License:Open Source License

public static String validateLogstashInput(String sourceKey, String config, StringBuffer errorMessage,
        boolean isAdmin) {

    if (null == _props) {
        _props = new PropertiesManager();
        String allowedInputs = _props.getProperty("harvest.logstash.allowed_inputs");

        if ((null == allowedInputs) || (allowedInputs.isEmpty())) {
            allowedInputs = "collectd,drupal_dblog,gelf,gemfire,imap,irc,lumberjack,s3,snmptrap,sqs,syslog,twitter,udp,xmpp,zenoss";
            // currently *not* allowed by default: elasticsearch,eventlog,exec,file,ganglia,generator,graphite,heroku,jmx,log4j,pipe,puppet_facter,rabbitmq,redis,relp,sqlite,stdin,stomp,tcp,unix,varnishlog,websocket,wmi,zeromq
        }
        _allowedInputs.addAll(Arrays.asList(allowedInputs.toLowerCase().split("\\s*,\\s*")));

        String allowedFilters = _props.getProperty("harvest.logstash.allowed_filters");
        if ((null == allowedFilters) || (allowedFilters.isEmpty())) {
            allowedFilters = "advisor,alter,anonymize,checksum,cidr,cipher,clone,collate,csv,date,dns,drop,elapsed,extractnumbers,fingerprint,geoip,gelfify,grep,grok,grokdiscovery,l18n,json,json_encode,kv,metaevent,metrics,multiline,mutate,noop,prune,punct,railsparallelrequest,range,sleep,split,sumnumbers,syslog_pri,throttle,translate,unique,urldecode,useragent,uuid,wms,wmts,xml";
            // currently *not* allowed by default: elasticsearch,ruby,zeromq
        }
        _allowedFilters.addAll(Arrays.asList(allowedFilters.toLowerCase().split("\\s*,\\s*")));
    } //TESTED (3_2a)

    // Configuration validation, phase 1

    errorMessage.append("Validation error:");
    BasicDBObject jsonifiedConfig = parseLogstashConfig(config, errorMessage);
    if (null == jsonifiedConfig) {
        return null;
    }
    errorMessage.setLength(0);

    // Configuration validation, phase 2 - very basic checks on the structure of the object

    Object input = jsonifiedConfig.get("input");
    if ((null == input) || !(input instanceof BasicDBObject)) { // Does input exist?
        errorMessage.append(
                "Invalid input format, should be 'input { INPUT_TYPE { ... } }' (only one INPUT_TYPE) and also contain a filter, no \"s around them. (0)");
        return null;
    } //TESTED (3_1d)
    else { // Check there's only one input type and (unless admin) it's one of the allowed types
        BasicDBObject inputDbo = (BasicDBObject) input;
        if (1 != inputDbo.size()) {
            errorMessage.append(
                    "Invalid input format, should be 'input { INPUT_TYPE { ... } }' (only one INPUT_TYPE) and also contain a filter, no \"s around them. (1)");
            return null;
        } //TESTED
        if (!isAdmin) {
            for (String key : inputDbo.keySet()) {
                if (!_allowedInputs.contains(key.toLowerCase())) {
                    errorMessage.append("Security error, non-admin not allowed input type " + key
                            + ", allowed options: " + _allowedInputs.toString());
                    return null;
                } //TESTED
            }
        } //TESTED (3_1abc)
    }
    Object filter = jsonifiedConfig.get("filter");
    if ((null == filter) || !(filter instanceof BasicDBObject)) { // Does filter exist?
        errorMessage.append(
                "Invalid input format, should be 'input { INPUT_TYPE { ... } }' (only one INPUT_TYPE) and also contain a filter, no \"s around them. (2)");
        return null;
    } //TESTED (3_2d)
    else { // Check (unless admin) that the filter types are all allowed
        if (!isAdmin) {
            BasicDBObject filterDbo = (BasicDBObject) filter;
            for (String key : filterDbo.keySet()) {
                if (!_allowedFilters.contains(key.toLowerCase())) {
                    errorMessage.append("Security error, non-admin not allowed filter type " + key
                            + ", allowed options: " + _allowedFilters.toString());
                    return null;
                } //TESTED
            }
        } //TESTED (3_2abc)
    }

    // Configuration validation, phase 3

    Matcher m = _validationRegexInputReplace.matcher(config);
    if (!m.find()) {
        errorMessage.append(
                "Invalid input format, should be 'input { INPUT_TYPE { ... } }' (only one INPUT_TYPE) and also contain a filter, no \"s around them. (3)");
        return null;
    } //TESTED (see above)
    else { // Extract the input type so that file-based plugins can be handled specially
        String inputType = m.group(2).toLowerCase();

        // If it's a file-based plugin then replace sincedb_path (check that it's not used during the JSON-ification):
        if (inputType.equalsIgnoreCase("file") || inputType.equalsIgnoreCase("s3")) {
            config = _validationRegexInputReplace.matcher(config)
                    .replaceFirst("$1\n      sincedb_path => \"_XXX_DOTSINCEDB_XXX_\"\n");
        } //TESTED

    } //TESTED

    m = _validationRegexNoSourceKey.matcher(config);
    // (this won't prevent malicious changes to sourceKey, but makes clear it's not supposed to be referenced)
    if (m.find()) {
        errorMessage.append(
                "Not allowed to reference sourceKey - this is automatically appended by the logstash harvester");
        return null;
    } //TESTED      

    // Now append the sourceKey at each stage of the pipeline to ensure that nobody can set sourceKey to a different value

    m = _validationRegexAppendFields.matcher(config);
    StringBuffer newConfig = new StringBuffer();
    if (m.find()) {
        m.appendReplacement(newConfig, "add_field => [ \"sourceKey\", \"" + sourceKey + "\"] \n\n" + m.group()
                + " \n if [sourceKey] == \"" + sourceKey + "\" { \n\n ");
    } else {
        errorMessage.append(
                "Invalid input format, should be 'input { INPUT_TYPE { ... } }' (only one INPUT_TYPE) and also contain a filter, no \"s around them. (4)");
        return null;
    }
    m.appendTail(newConfig);
    config = newConfig.toString();
    config = config.replaceAll("}[^}]*$", ""); // (remove the last })
    config += "\n\n mutate { update => [ \"sourceKey\", \"" + sourceKey + "\"] } \n}\n}\n"; // double check the sourceKey hasn't been overwritten and close the if from above
    //TESTED (syntactically correct and does overwrite sourceKey everywhere - success_2_2)

    return config;
}
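
Note: a minimal usage sketch, assuming a hypothetical sourceKey and a config that passes the default whitelists; on success the returned config has the sourceKey add_field injected after the input stage and a final mutate re-asserting it:

    StringBuffer errorMessage = new StringBuffer();
    String config = "input { udp { port => 5000 } }\n" // udp is on the default input whitelist
            + "filter { mutate { add_tag => [ \"test\" ] } }\n"; // mutate is on the default filter whitelist
    String validated = LogstashConfigUtils.validateLogstashInput("my.source.key", config, errorMessage, false);
    if (null == validated) {
        System.err.println(errorMessage.toString()); // reason validation failed
    } else {
        System.out.println(validated); // config with sourceKey injected and re-asserted at the end
    }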