Example usage for com.mongodb BasicDBObject get

Introduction

This page lists usage examples for com.mongodb.BasicDBObject.get, drawn from open-source projects.

Prototype

public Object get(final String key) 

Document

Gets a value from this object. Because the return type is Object, callers typically cast the result to the expected concrete type; a missing key yields null.
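
Since get returns a plain Object, the examples below all cast the result (to Date, BasicDBList, BasicDBObject, etc.) and null-check it. The following minimal, self-contained sketch shows that pattern in isolation; the class name and field names here are illustrative, not taken from the examples:

import java.util.Date;

import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;

public class BasicDBObjectGetSketch {
    public static void main(String[] args) {
        BasicDBObject doc = new BasicDBObject();
        doc.put("title", "Example document");
        doc.put("publishedDate", new Date());
        doc.put("tags", new BasicDBList());

        // get returns Object, so the caller supplies the cast
        Date pubDate = (Date) doc.get("publishedDate");
        BasicDBList tags = (BasicDBList) doc.get("tags");

        // an absent key yields null rather than an exception
        Object description = doc.get("description");
        if (description == null) {
            System.out.println("no description field");
        }

        System.out.println(pubDate + ", " + tags.size() + " tags");
    }
}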

Usage

From source file: com.ikanow.infinit.e.api.knowledge.output.RssOutput.java

License: Open Source License

public String getDocs(ResponsePojo rp) {
    // Create the feed using Rome
    SyndFeed feed = new SyndFeedImpl(); // create the feed
    String feedType = "rss_2.0";

    // Setup a list of feeds
    @SuppressWarnings("unchecked")
    List<BasicDBObject> docs = (List<BasicDBObject>) rp.getData();

    // Set the title of the feed
    feed.setTitle("Infinit.e Knowledge Discovery RSS Feed");
    feed.setDescription("Infinit.e Search Results RSS Feed");
    feed.setLanguage("en-us");
    feed.setPublishedDate(new Date(System.currentTimeMillis()));
    feed.setFeedType(feedType); // set the type of your feed
    feed.setLink("http://www.ikanow.com");

    // Establish the list to contain the feeds
    List<SyndEntry> entries = new ArrayList<SyndEntry>();

    // loop through the result set
    for (BasicDBObject fdbo : docs) {
        SyndEntry entry = new SyndEntryImpl(); // create a feed entry

        if (fdbo.getString("title") != null) {
            entry.setTitle(fdbo.getString("title"));

            Date pubDate = (Date) fdbo.get("publishedDate");
            if (pubDate != null)
                entry.setPublishedDate(pubDate);

            if (fdbo.getString("url") != null)
                entry.setLink(fdbo.getString("url"));

            if (fdbo.getString("description") != null) {
                // Create the content for the entry
                SyndContent content = new SyndContentImpl(); // create the content of your entry
                content.setType("text/plain");
                content.setValue(fdbo.getString("description"));
                entry.setDescription(content);
            }
            entries.add(entry);
        }
    }

    feed.setEntries(entries); // you can add multiple entries in your feed

    SyndFeedOutput output = new SyndFeedOutput();
    String rss = null;

    try {
        rss = output.outputString(feed);
    } catch (FeedException e) {
        e.printStackTrace();
        logger.error("Line: [" + e.getStackTrace()[2].getLineNumber() + "] " + e.getMessage());
    }
    return rss;
}

From source file: com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.java

License: Open Source License

@SuppressWarnings("unchecked")
private void stage1_initialCountingLoop(DBCursor docs, AdvancedQueryPojo.QueryScorePojo scoreParams,
        int toReturn, StatisticsPojo scores, LinkedList<BasicDBObject> standaloneEventsReturn,
        int nCommunities) {
    double s0_nQuerySubsetDocCountInv = 1.0 / (double) _s0_nQuerySubsetDocCount;

    // Some memory management:
    DBCollection dbc = MongoDbManager.getDocument().getMetadata();
    DBDecoderFactory defaultDecoder = dbc.getDBDecoderFactory();

    try {
        SizeReportingBasicBSONDecoder sizeReportingDecoder = new SizeReportingBasicBSONDecoder();
        dbc.setDBDecoderFactory(sizeReportingDecoder);

        long currMemUsage = 0;
        int ndocs = 0;
        long lastBatch = 0L;

        long initialUnusedMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory();
        long initialFreeMemory = Runtime.getRuntime().freeMemory();

        for (DBObject f0 : docs) {
            BasicDBObject f = (BasicDBObject) f0;
            long newMemUsage = sizeReportingDecoder.getSize();
            if ((newMemUsage - currMemUsage) > 0) { // check every batch               
                long now = new Date().getTime();

                //DEBUG
                //logger.warn(ndocs + " : " + (now - lastBatch) + " : " + newMemUsage + " VS " + Runtime.getRuntime().maxMemory() + " UNUSED " + (Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory()) + " FREE " + Runtime.getRuntime().freeMemory());

                // Check vs total memory:
                long runtimeMem = Runtime.getRuntime().maxMemory();
                // note newMemUsage is the input memory ... it gets expanded ~6x by the BSON-ification and is allowed at most 1/4 of memory (hence the factor of 24 below)...
                // Also if we're taking more than 20s for a batch then limp over the limit and exit...
                if (((newMemUsage * 24) > runtimeMem)
                        || (((now - lastBatch) > 20000L) && (ndocs >= toReturn))) {
                    long finalUnusedMemory = Runtime.getRuntime().maxMemory()
                            - Runtime.getRuntime().totalMemory();
                    long finalFreeMemory = Runtime.getRuntime().freeMemory();

                    logger.error("Query truncated memUsage=" + newMemUsage + ", memory=" + runtimeMem
                            + ", docs=" + ndocs + ", totaldocs=" + scores.found + ", init_free_mem="
                            + initialFreeMemory + ", end_free_mem=" + finalFreeMemory + ", init_unused_mem="
                            + initialUnusedMemory + ", end_unused_mem=" + finalUnusedMemory);
                    break;
                } //TESTED
                currMemUsage = newMemUsage;
                lastBatch = now;
            } //TESTED
            ndocs++;

            // Simple handling for standalone events
            if ((null != _s0_standaloneEventAggregator) && !_s0_bNeedToCalcSig) {
                //if _s0_bNeedToCalcSig then do this elsewhere
                ScoringUtils_Associations.addStandaloneEvents(f, 0.0, 0, _s0_standaloneEventAggregator,
                        _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter,
                        _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts);
            } //TESTED

            if (!_s0_bNeedToCalcSig) {
                continue;
            } //TESTED

            if (nCommunities > 1) { // (could have pan-community entities)
                ObjectId communityId = (ObjectId) f.get(DocumentPojo.communityId_);
                if (null != communityId) { // (big problems if this is null, but handle it gracefully anyway!)
                    int retval = _s0_multiCommunityHandler.community_getIdAndInitialize(communityId,
                            _s1_entitiesInDataset);
                    // (returns an int community id but also sets it into the cache, so just use that below)
                    if (Integer.MIN_VALUE == retval) {
                        //this document cannot be viewed from within this set of communities
                        continue;
                    }
                }
            } //TESTED      

            TempDocBucket docBucket = new TempDocBucket();
            docBucket.dbo = f;
            ObjectId id = (ObjectId) f.get(DocumentPojo._id_);

            // If we're going to weight relevance in, or we need the geo temporal decay:
            if ((0 != scoreParams.relWeight) || (null != scoreParams.timeProx)
                    || (null != scoreParams.geoProx)) {
                StatisticsPojo.Score scoreObj = scores.getScore().get(id);
                if (null != scoreObj) {
                    docBucket.explain = scoreObj.explain; // (will normally be null)
                    docBucket.luceneScore = scoreObj.score;
                    if ((null != scoreParams.timeProx) || (null != scoreParams.geoProx)) {
                        if (scoreObj.decay >= 0.0) {
                            docBucket.geoTemporalDecay = scoreObj.decay;
                        }
                        // (see also below for low accuracy geo scoring)
                    }
                } else {
                    docBucket.luceneScore = 1.0;
                }
            } //TESTED
            else if (this._s0_sortingByDate) {
                StatisticsPojo.Score scoreObj = scores.getScore().get(id);
                if (null != scoreObj) {
                    docBucket.nLuceneIndex = scoreObj.nIndex;
                }
            }
            docBucket.manualWeighting = this.getManualScoreWeights(scoreParams, f);

            BasicDBList l = (BasicDBList) (f.get(DocumentPojo.entities_));
            if (null != l) {

                long nEntsInDoc = l.size();
                double dBestGeoScore = 0.0; // (for low accuracy geo only)
                for (Iterator<?> e0 = l.iterator(); e0.hasNext();) {
                    BasicDBObject e = (BasicDBObject) e0.next();
                    BasicDBObject tmpGeotag = null;
                    if (_s3_bLowAccuracyGeo || (null != _s1_dManualGeoDecay_latLonInvdecay)) {
                        // low accuracy geo, need to look for geotag
                        tmpGeotag = (BasicDBObject) e.get(EntityPojo.geotag_);
                    }

                    // Get attributes

                    double freq = -1.0;
                    long ntotaldoccount = -1;
                    String entity_index;
                    Double sentiment = null;
                    try {
                        sentiment = (Double) e.get(EntityPojo.sentiment_);
                        ntotaldoccount = e.getLong(EntityPojo.doccount_);
                        freq = e.getDouble(EntityPojo.frequency_);
                        entity_index = e.getString(EntityPojo.index_);
                        if (null == entity_index) {
                            // Just bypass the entity 
                            e.put(EntityPojo.significance_, 0.0);
                            nEntsInDoc--;
                            continue;
                        }
                    } catch (Exception ex) {
                        try {
                            String sfreq;
                            if (ntotaldoccount < 0) {
                                sfreq = e.getString(EntityPojo.doccount_);
                                ntotaldoccount = Long.valueOf(sfreq);
                            }
                            if (freq < -0.5) {
                                sfreq = e.getString(EntityPojo.frequency_);
                                freq = Long.valueOf(sfreq).doubleValue();
                            }
                            entity_index = e.getString(EntityPojo.index_);
                            if (null == entity_index) {
                                // Just bypass the entity 
                                e.put(EntityPojo.significance_, 0.0);
                                nEntsInDoc--;
                                continue;
                            }
                        } catch (Exception e2) {
                            // Just bypass the entity 
                            e.put(EntityPojo.significance_, 0.0);
                            nEntsInDoc--;
                            continue;
                        }
                    } //TESTED

                    // First loop through is just counting

                    // Retrieve entity (create/initialize if necessary)
                    EntSigHolder shp = _s1_entitiesInDataset.get(entity_index);
                    if (null == shp) {
                        if (ntotaldoccount > (long) _s0_globalDocCount) { // obviously can't have more entities-in-docs than docs... 
                            ntotaldoccount = (long) _s0_globalDocCount;
                        }
                        shp = new EntSigHolder(entity_index, ntotaldoccount, _s0_multiCommunityHandler);

                        // Stage 1a alias handling: set up infrastructure, calculate doc overlap
                        if (null != _s1_aliasLookup) {
                            stage1_initAlias(shp);
                        }
                        if ((null != shp.aliasInfo) && (null == shp.masterAliasSH)) { // this is the discard alias
                            nEntsInDoc--;
                            continue;
                        } //TESTED

                        // Check if entity is in type filter list
                        if (null != _s0_entityTypeFilter) {
                            String entType = null;
                            if (null != shp.aliasInfo) {
                                entType = shp.aliasInfo.getType();
                            } else {
                                entType = e.getString(EntityPojo.type_);
                            }
                            if (_s0_bEntityTypeFilterPositive) {
                                if ((null != entType)
                                        && !_s0_entityTypeFilter.contains(entType.toLowerCase())) {
                                    nEntsInDoc--;
                                    continue;
                                }
                            } else if ((null != entType)
                                    && _s0_entityTypeFilter.contains(entType.toLowerCase())) {
                                //(negative filter)
                                nEntsInDoc--;
                                continue;
                            }

                        } //TESTED (end entity filter)

                        // Geo:
                        if (null != shp.aliasInfo) {
                            if (null != shp.aliasInfo.getGeotag()) { //Geo, overwrite/create tmpGeotag
                                if (_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo
                                        || (null != _s1_dManualGeoDecay_latLonInvdecay)) {
                                    // Always capture alias geo, even if not in low accuracy mode because we add it to the 
                                    // legitimate geo:
                                    if ((_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo)
                                            && (null == _s3_geoBuckets)) {
                                        // Initialize the buckets if this is for aggregation not just decay
                                        _s3_geoBuckets = (LinkedList<EntSigHolder>[]) new LinkedList[_s3_nGEO_BUCKETS];
                                    }

                                    if (null == tmpGeotag) {
                                        tmpGeotag = new BasicDBObject();
                                    }
                                    tmpGeotag.put(GeoPojo.lat_, shp.aliasInfo.getGeotag().lat);
                                    tmpGeotag.put(GeoPojo.lon_, shp.aliasInfo.getGeotag().lon);

                                    if (null != shp.aliasInfo.getOntology_type()) {
                                        e.put(EntityPojo.ontology_type_, shp.aliasInfo.getOntology_type());
                                    }
                                }
                            }
                        } //TESTED (end geo for aggregation or decay)

                        _s1_entitiesInDataset.put(entity_index, shp);
                        // end Stage 1a alias handling
                    } //(end if first time seeing this entity)

                    // Stage 1b alias handling: calculate document counts (taking overlaps into account)
                    if (null != shp.masterAliasSH) {
                        // Counts:
                        shp.masterAliasSH.nTotalDocCount++;
                        // docs including overlaps
                        shp.masterAliasSH.avgFreqOverQuerySubset += freq;

                        // Keep track of overlaps:
                        if (f != shp.masterAliasSH.unusedDbo) {
                            shp.masterAliasSH.unusedDbo = f;
                            // (note this is only used in stage 1, alias.unusedDbo is re-used differently in stage 3/4)
                            shp.masterAliasSH.nDocCountInQuerySubset++;
                            // non-overlapping docs ie < shp.nDocCountInQuerySubset
                        }

                        // Sentiment:
                        shp.masterAliasSH.positiveSentiment += shp.positiveSentiment;
                        shp.masterAliasSH.negativeSentiment += shp.negativeSentiment;
                        if (null != sentiment) {
                            shp.masterAliasSH.nTotalSentimentValues++;
                        }

                    } //TESTED (end if is alias)
                      // end Stage 1b

                    // Pan-community logic (this needs to be before the entity object is updated)
                    if (_s0_multiCommunityHandler.isActive()) {
                        _s0_multiCommunityHandler.community_updateCorrelations(shp, ntotaldoccount,
                                entity_index);
                    } else { // (Once we've started multi-community logic, this is no longer desirable)
                        if ((ntotaldoccount > shp.nTotalDocCount) && (ntotaldoccount <= _s0_globalDocCount)) {
                            shp.nTotalDocCount = ntotaldoccount;
                        }
                        //(note there used to be some cases where we adjusted for dc/tf==0, but the 
                        // underlying issue in the data model that caused this has been fixed, so it's 
                        // now a pathological case that can be ignored)
                    } //(TESTED)

                    // Update counts:
                    _s1_sumFreqInQuerySubset += freq;
                    shp.avgFreqOverQuerySubset += freq;
                    shp.nDocCountInQuerySubset++;
                    shp.decayedDocCountInQuerySubset += docBucket.geoTemporalDecay;
                    // (note this doesn't handle low accuracy geo-decay ... we'll address that via a separate term)

                    TempEntityInDocBucket entBucket = new TempEntityInDocBucket();
                    entBucket.dbo = e;
                    entBucket.freq = freq;
                    entBucket.doc = docBucket;
                    shp.entityInstances.add(entBucket);
                    if (null != tmpGeotag) { // (only needed for low accuracy geo aggregation)

                        if ((_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo) && (null == shp.geotag)) { // (first time for shp only)
                            shp.geotag = tmpGeotag;
                            shp.geotaggedEntity = e; // (ie for onto type, which has been overwritten in the alias case...)
                        }
                        if (null != _s1_dManualGeoDecay_latLonInvdecay) {
                            // Emulate scripted Lucene calculations
                            double minlat = tmpGeotag.getDouble(GeoPojo.lat_);
                            double minlon = tmpGeotag.getDouble(GeoPojo.lon_);
                            double paramlat = _s1_dManualGeoDecay_latLonInvdecay[0];
                            double paramlon = _s1_dManualGeoDecay_latLonInvdecay[1];
                            double gdecay = _s1_dManualGeoDecay_latLonInvdecay[2];
                            char ontCode = GeoOntologyMapping
                                    .encodeOntologyCode(e.getString(EntityPojo.ontology_type_));
                            double dDecay = QueryDecayScript.getGeoDecay(minlat, minlon, paramlat, paramlon,
                                    gdecay, ontCode);
                            if (dDecay > dBestGeoScore) {
                                dBestGeoScore = dDecay;
                            }
                        } //TESTED
                    } //(end if entity has geo and need to process entity geo)

                    if (freq > shp.maxFreq) {
                        shp.maxFreq = freq;
                    }
                    // Sentiment:
                    if ((null != sentiment) && (Math.abs(sentiment) <= 1.1)) { // (actually 1.0)
                        shp.nTotalSentimentValues++;
                        if (sentiment > 0.0) {
                            shp.positiveSentiment += sentiment;
                        } else {
                            shp.negativeSentiment += sentiment;
                        }
                    } else if (null != sentiment) { // corrupt sentiment for some reason?!
                        e.put(EntityPojo.sentiment_, null);
                    }
                    docBucket.docLength += freq;

                } //(end loop over entities)

                docBucket.nLeftToProcess = nEntsInDoc;
                docBucket.nEntsInDoc = (int) nEntsInDoc;

                if (null != this._s1_dManualGeoDecay_latLonInvdecay) { // Low accuracy geo-calculations
                    docBucket.geoTemporalDecay *= dBestGeoScore;
                    docBucket.luceneScore *= dBestGeoScore;
                    _s2_dAvgLowAccuracyGeoDecay += dBestGeoScore * s0_nQuerySubsetDocCountInv;
                } //TESTED            

            } // (end if feed has entities)

            // Handle documents with no entities - can still promote them
            if (0 == docBucket.nLeftToProcess) { // (use this rather than doc length in case all the entities had freq 0)
                _s1_noEntityBuckets.add(docBucket);
            }

        } // (end loop over feeds)
          //TESTED
    } finally {
        dbc.setDBDecoderFactory(defaultDecoder);
    }
}

From source file: com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.java

License: Open Source License

private void stage4_prepareDocsForOutput(AdvancedQueryPojo.QueryScorePojo scoreParams, StatisticsPojo scores,
        long nToClientLimit, LinkedList<BasicDBObject> returnList) {
    // Get the documents
    long nDocs = 0;
    double dBestScore = 0.0;
    double dAvgScore = 0.0;

    double dSigFactor = 100.0 / (_s3_dSigScalingFactor * _s2_dApproxAverageDocumentSig);
    double dRelFactor = 100.0 / (_s3_dLuceneScalingFactor * _s0_avgLuceneScore);

    // Start at the bottom of the list, so don't need to worry about skipping documents, just count out from the bottom
    // The call to stage3_calculateTFTerms with nStart+nToClientLimit handles the rest

    Iterator<TempDocBucket> pqIt = _s3_pqDocs.iterator();
    while (pqIt.hasNext() && (nDocs < nToClientLimit)) {
        TempDocBucket qsf = pqIt.next();
        nDocs++;
        if (!_s0_sortingByDate) {
            dBestScore = qsf.totalScore;
        }
        dAvgScore += dBestScore;

        BasicDBObject f = qsf.dbo;

        // Phase "0" - these are the highest prio events
        boolean bNeedToFilterAndAliasAssoc_event = true;
        boolean bNeedToFilterAndAliasAssoc_fact = true;
        boolean bNeedToFilterAndAliasAssoc_summary = true;
        if (null != _s0_standaloneEventAggregator) {
            ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0,
                    _s0_standaloneEventAggregator, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive,
                    _s0_entityTypeFilter, _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts);
            bNeedToFilterAndAliasAssoc_event = false;
            bNeedToFilterAndAliasAssoc_fact = false;
            bNeedToFilterAndAliasAssoc_summary = false;
        } //TESTED
        if (null != _s0_lowAccuracyAssociationAggregator_events) {
            ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0,
                    _s0_lowAccuracyAssociationAggregator_events, _s0_bEntityTypeFilterPositive,
                    _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, true, false,
                    false);
            bNeedToFilterAndAliasAssoc_event = false;
        } //TESTED                        
        if (null != _s0_lowAccuracyAssociationAggregator_facts) {
            ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0,
                    _s0_lowAccuracyAssociationAggregator_facts, _s0_bEntityTypeFilterPositive,
                    _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, false, false,
                    true);
            bNeedToFilterAndAliasAssoc_fact = false;
        } //TESTED

        try {
            DocumentPojoApiMap.mapToApi(f);
            // Handle deduplication/multi-community code:
            if (null != qsf.dupList) {
                try {
                    ScoringUtils_MultiCommunity.community_combineDuplicateDocs(qsf);
                } catch (Exception e) {
                    // Do nothing, just carry on with minimal damage!
                }
            }

            // Scoring:
            double d = qsf.aggSignificance * dSigFactor;
            if (Double.isNaN(d)) {
                f.put(DocumentPojo.aggregateSignif_, 0.0);
            } else {
                f.put(DocumentPojo.aggregateSignif_, d);
            }
            d = qsf.luceneScore * dRelFactor;
            if (Double.isNaN(d)) {
                f.put(DocumentPojo.queryRelevance_, 0.0);
            } else {
                f.put(DocumentPojo.queryRelevance_, d);
            }
            if (!_s0_sortingByDate) {
                f.put(DocumentPojo.score_, qsf.totalScore);
            }

            BasicDBList l = (BasicDBList) (f.get(DocumentPojo.entities_));

            // Handle update ids vs normal ids:
            ObjectId updateId = (ObjectId) f.get(DocumentPojo.updateId_);
            if (null != updateId) { // swap the 2...
                f.put(DocumentPojo.updateId_, f.get(DocumentPojo._id_));
                f.put(DocumentPojo._id_, updateId);
            }

            // Check if entities enabled            
            if ((null != l) && (!_s0_bGeoEnts && !_s0_bNonGeoEnts)) {
                f.removeField(DocumentPojo.entities_);
                l = null;
            } //TESTED

            // Check if events etc enabled
            if ((!_s0_bEvents && !_s0_bFacts && !_s0_bSummaries)) {
                f.removeField(DocumentPojo.associations_);
            } //TESTED            
            else if (!_s0_bEvents || !_s0_bFacts || !_s0_bSummaries || (null != _s0_assocVerbFilter)) {

                // Keep only specified event_types
                BasicDBList lev = (BasicDBList) (f.get(DocumentPojo.associations_));
                if (null != lev) {
                    for (Iterator<?> e0 = lev.iterator(); e0.hasNext();) {
                        BasicDBObject e = (BasicDBObject) e0.next();

                        // Type filter
                        boolean bNeedToFilterAndAliasAssoc = true;
                        String sEvType = e.getString(AssociationPojo.assoc_type_);
                        boolean bKeep = true;
                        if (null == sEvType) {
                            bKeep = false;
                        } else if (sEvType.equalsIgnoreCase("event")) {
                            if (!_s0_bEvents)
                                bKeep = false;
                            bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_event;
                        } else if (sEvType.equalsIgnoreCase("fact")) {
                            if (!_s0_bFacts)
                                bKeep = false;
                            bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_fact;
                        } else if (sEvType.equalsIgnoreCase("summary")) {
                            if (!_s0_bSummaries)
                                bKeep = false;
                            bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_summary;
                        }
                        if (!bKeep) {
                            e0.remove();
                        } else { // Type matches, now for some more complex logic....

                            if (bNeedToFilterAndAliasAssoc) { // (otherwise done already)

                                bKeep = ScoringUtils_Associations.filterAndAliasAssociation(e, _s1_aliasLookup,
                                        true, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive,
                                        _s0_entityTypeFilter, _s0_assocVerbFilter);
                                if (!bKeep) {
                                    e0.remove();
                                }

                            } //TESTED

                        } //(end output filter logic)

                    } // (end loop over events)   
                } // (end if this doc has events)

            } //TESTED            

            // Check if metadata is enabled
            if (!_s0_bMetadata) {
                f.removeField(DocumentPojo.metadata_);
            } //TESTED

            if (null != l) {

                for (Iterator<?> e0 = l.iterator(); e0.hasNext();) {
                    BasicDBObject e = (BasicDBObject) e0.next();

                    if (!_s0_bNonGeoEnts) { // then must only be getting geo (else wouldn't be in this loop)
                        if (null == e.get(EntityPojo.geotag_)) {
                            e0.remove();
                            continue;
                        }
                    }

                    String entity_index = e.getString(EntityPojo.index_);
                    if (null == entity_index)
                        continue;

                    EntSigHolder shp = (EntSigHolder) _s1_entitiesInDataset.get(entity_index);

                    if (null != shp) {
                        // Stage 4x: alias processing, just overwrite 
                        // (note don't delete "duplicate entities", hard-to-be-globally-consistent
                        //  and will potentially throw data away which might be undesirable)
                        if (null != shp.masterAliasSH) {
                            shp = shp.masterAliasSH; // (already has all the aggregated values used below)
                            if (!entity_index.equals(shp.aliasInfo.getIndex())) {
                                e.put(EntityPojo.index_, shp.aliasInfo.getIndex());
                                e.put(EntityPojo.disambiguated_name_, shp.aliasInfo.getDisambiguatedName());
                                e.put(EntityPojo.type_, shp.aliasInfo.getType());
                                e.put(EntityPojo.dimension_, shp.aliasInfo.getDimension());

                                if (null != shp.aliasInfo.getGeotag()) {
                                    BasicDBObject aliasedGeoTag = new BasicDBObject();
                                    aliasedGeoTag.put(GeoPojo.lat_, shp.aliasInfo.getGeotag().lat);
                                    aliasedGeoTag.put(GeoPojo.lon_, shp.aliasInfo.getGeotag().lon);
                                    e.put(EntityPojo.geotag_, aliasedGeoTag);
                                    if (null != shp.aliasInfo.getOntology_type()) {
                                        e.put(EntityPojo.ontology_type_, shp.aliasInfo.getOntology_type());
                                    }
                                } //TESTED
                            }
                        } //TESTED
                          // end Stage 4x of alias processing                  

                        double dataSig = shp.datasetSignificance;
                        if (Double.isNaN(dataSig)) {
                            e.put(EntityPojo.datasetSignificance_, 0.0);
                        } else {
                            e.put(EntityPojo.datasetSignificance_, dataSig);
                        }
                        e.put(EntityPojo.queryCoverage_, shp.queryCoverage);
                        e.put(EntityPojo.averageFreq_, shp.avgFreqOverQuerySubset);
                        if (shp.nTotalSentimentValues > 0) {
                            e.put(EntityPojo.positiveSentiment_, shp.positiveSentiment);
                            e.put(EntityPojo.negativeSentiment_, shp.negativeSentiment);
                            e.put(EntityPojo.sentimentCount_, shp.nTotalSentimentValues);
                        }
                    } else { // (most likely to occur if the entity is discarded (alias/filter) or is corrupt in some way)
                        e0.remove();
                        continue;
                    }

                } //(end loop over entities)
            } // (end if feed has entities)
              //TESTED

            // Explain if enabled
            if (null != qsf.explain) {
                f.put(DocumentPojo.explain_, qsf.explain);
            }

            // Add to the end of the list (so will come back from API call in natural order, highest first)
            returnList.addFirst(f);
            // (add elements to the front of the list so that the top of the list is ordered by priority)
        } catch (Exception e) {
            // Probably a JSON error, just carry on
            String title = f.getString(DocumentPojo.title_);
            logger.error(title + ": " + e.getMessage());
        }

    } // (end loop over feeds)
      //TESTED

    // Update the scores:
    scores.maxScore = (float) dBestScore;
    if (nDocs > 0) {
        scores.avgScore = (float) dAvgScore / nDocs;
    }
}

From source file: com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.java

License: Open Source License

private void stage4_prepareEntsForOutput(LinkedList<BasicDBObject> entityReturn) {
    if (_s0_nNumEntsReturn > 0) { // (else entities not enabled)

        for (EntSigHolder qsf = _s3_pqEnt.poll(); null != qsf; qsf = _s3_pqEnt.poll()) // (start with lowest ranking)
        {
            BasicDBObject ent = qsf.unusedDbo;
            if (null == ent) {
                int nTries = 0;
                if (null != qsf.entityInstances) { // (should never be null but just to be on the safe side...)
                    for (TempEntityInDocBucket tefb : qsf.entityInstances) {
                        // (Try to find an entity that wasn't promoted ie can now be re-used
                        //  if we can't find one quite quickly then bail out and we'll pay the cost of cloning it)
                        if (!tefb.doc.bPromoted) {
                            ent = tefb.dbo;
                            break;
                        } else if (++nTries > 10) {
                            break;
                        }
                    }
                    if (null == ent) {
                        ent = qsf.entityInstances.get(0).dbo;
                    }
                } else { // (no entityInstances, something alias-related has gone wrong, just skip) 
                    continue;
                }
            } //TESTED
            qsf.entityInstances = null; // (don't need this any more, can be gc'd)

            try {

                if (null != qsf.aliasInfo) {
                    if (!qsf.index.equals(qsf.aliasInfo.getIndex())) {
                        ent.put(EntityPojo.index_, qsf.aliasInfo.getIndex());
                        ent.put(EntityPojo.disambiguated_name_, qsf.aliasInfo.getDisambiguatedName());
                        ent.put(EntityPojo.type_, qsf.aliasInfo.getType());
                        ent.put(EntityPojo.dimension_, qsf.aliasInfo.getDimension());
                        if (null != qsf.aliasInfo.getGeotag()) {
                            BasicDBObject aliasedGeoTag = new BasicDBObject();
                            aliasedGeoTag.put(GeoPojo.lat_, qsf.aliasInfo.getGeotag().lat);
                            aliasedGeoTag.put(GeoPojo.lon_, qsf.aliasInfo.getGeotag().lon);
                            ent.put(EntityPojo.geotag_, aliasedGeoTag);
                            if (null != qsf.aliasInfo.getOntology_type()) {
                                ent.put(EntityPojo.ontology_type_, qsf.aliasInfo.getOntology_type());
                            }
                        } //TESTED
                    }
                } //TESTED

                if (null == ent.get(EntityPojo.datasetSignificance_)) { // Not getting promoted so need to add fields...                  
                    if (Double.isNaN(qsf.datasetSignificance)) {
                        ent.put("datasetSignificance", 0.0);
                    } else {
                        ent.put(EntityPojo.datasetSignificance_, qsf.datasetSignificance);
                    }
                    ent.put(EntityPojo.queryCoverage_, qsf.queryCoverage);
                    ent.put(EntityPojo.averageFreq_, qsf.avgFreqOverQuerySubset);
                    if (qsf.nTotalSentimentValues > 0) {
                        ent.put(EntityPojo.positiveSentiment_, qsf.positiveSentiment);
                        ent.put(EntityPojo.negativeSentiment_, qsf.negativeSentiment);
                        ent.put(EntityPojo.sentimentCount_, qsf.nTotalSentimentValues);
                    }
                } else { // (fields already present, i.e. promoted, so take a shallow copy before modifying)
                    BasicDBObject ent2 = new BasicDBObject();
                    for (Map.Entry<String, Object> kv : ent.entrySet()) {
                        ent2.append(kv.getKey(), kv.getValue());
                    }
                    ent = ent2;
                }
                ent.removeField(EntityPojo.relevance_);
                if (Double.isNaN(qsf.maxDocSig)) {
                    ent.put(EntityPojo.significance_, 0.0);
                } else {
                    ent.put(EntityPojo.significance_, qsf.maxDocSig);
                }
                ent.put(EntityPojo.frequency_, (long) qsf.maxFreq);
                entityReturn.addFirst(ent);
            } catch (Exception e) {
                // Probably a JSON error, just carry on
                String title = ent.getString(EntityPojo.index_);
                logger.error(title + ": " + e.getMessage());
            } //TESTED
        }
    } //TESTED            
}

From source file: com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.java

License: Open Source License

private double getManualScoreWeights(AdvancedQueryPojo.QueryScorePojo scoreParams, BasicDBObject doc) {
    // Highest prio: source key weight
    if (null != scoreParams.sourceWeights) {
        String sourceKey = DocumentPojo.getSourceKey(doc.getString(DocumentPojo.sourceKey_));
        Double dWeight = scoreParams.sourceWeights.get(sourceKey);

        if (null != dWeight) {
            return dWeight;
        }
    }
    // Middle prio: type
    if (null != scoreParams.typeWeights) {
        String mediaType = doc.getString(DocumentPojo.mediaType_);
        Double dWeight = scoreParams.typeWeights.get(mediaType);

        if (null != dWeight) {
            return dWeight;
        }
    }
    // Lowest prio: average of tags
    if (null != scoreParams.tagWeights) {
        double dScore = 0.0;
        int nComps = 0;
        BasicDBList tags = (BasicDBList) doc.get(DocumentPojo.tags_);
        if (null != tags) {
            for (Object tagObj : tags) {
                String tag = (String) tagObj;
                Double dWeight = scoreParams.tagWeights.get(tag);
                if (null != dWeight) {
                    nComps++;
                    dScore += dWeight;
                }
            }
            if (nComps > 0) {
                return dScore / nComps;
            }
        }
    }
    return 1.0;
}

From source file: com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils_Associations.java

License: Open Source License

static void addStandaloneEvents(BasicDBObject doc, double dDocSig, int nPhase,
        StandaloneEventHashAggregator standaloneEventAggregator, boolean bEntTypeFilterPositive,
        boolean bAssocVerbFilterPositive, HashSet<String> entTypeFilter, HashSet<String> assocVerbFilter,
        boolean bEvents, boolean bSummaries, boolean bFacts) {
    if (standaloneEventAggregator.bSimulateAggregation) {
        bSummaries = false;
    }
    String sDocIsoPubDate = null;

    BasicDBList lev = (BasicDBList) (doc.get(DocumentPojo.associations_));
    if (null != lev) {
        for (Iterator<?> e0 = lev.iterator(); e0.hasNext();) {
            BasicDBObject e = (BasicDBObject) e0.next();

            String sEvType = e.getString(AssociationPojo.assoc_type_);
            boolean bIsFact = false;
            boolean bIsSummary = false;
            boolean bKeep = true;
            if (null == sEvType) {
                bKeep = false;
            } else if (sEvType.equalsIgnoreCase("event")) {
                if (!bEvents)
                    bKeep = false;
            } else if (sEvType.equalsIgnoreCase("fact")) {
                if (!bFacts)
                    bKeep = false;
                bIsFact = true;
            } else if (sEvType.equalsIgnoreCase("summary")) {
                if (!bSummaries)
                    bKeep = false;
                bIsSummary = true;
            } //TESTED x4

            // Filter and aliasing logic:
            if (bKeep) {
                boolean bKeep2 = filterAndAliasAssociation(e, standaloneEventAggregator.aliasLookup, true,
                        bEntTypeFilterPositive, bAssocVerbFilterPositive, entTypeFilter, assocVerbFilter);
                if (!bKeep2) {
                    e0.remove();
                    // (remove/rename events based on filters where we can, 
                    //  means we don't have to do it in stage4)
                    bKeep = false;
                }
            } //TESTED

            if (bKeep) {
                String time_start = null;
                String time_end = null; // (normally not needed)

                if (!standaloneEventAggregator.bSimulateAggregation) { //else times are discarded                  
                    // Add time from document
                    time_start = e.getString(AssociationPojo.time_start_);

                    if (null == time_start) {
                        if (null == sDocIsoPubDate) {
                            // Convert doc pub date to ISO (day granularity):
                            Date pubDate = (Date) doc.get(DocumentPojo.publishedDate_);

                            if (null != pubDate) {
                                SimpleDateFormat f2 = new SimpleDateFormat("yyyy-MM-dd");
                                time_start = f2.format(pubDate);
                            }
                        } else {
                            time_start = sDocIsoPubDate; // (so it doesn't get added again below)
                        }
                    } //TESTED               
                    else { // Remove hourly granularity for consistency                  
                        time_start = time_start.replaceAll("T.*$", "");
                        time_end = e.getString(AssociationPojo.time_end_);

                        if (null != time_end) {
                            time_end = time_end.replaceAll("T.*$", "");
                        }
                    } //TESTED (with debug code, eg time_start = "1997-07-16T19:20:30+01:00")
                    if (null != time_start) { // Ensure it has day granularity, to help with aggregation
                        e.put(AssociationPojo.time_start_, time_start);
                        if (null != time_end) {
                            e.put(AssociationPojo.time_end_, time_end);
                        }
                    } //TESTED
                } //(end if normal standalone mode, not aggregation simulation)

                StandaloneEventHashCode evtHolder = new StandaloneEventHashCode(
                        standaloneEventAggregator.bSimulateAggregation, e, bIsSummary, bIsFact);
                BasicDBObject oldEvt = standaloneEventAggregator.store.get(evtHolder);

                if (null == oldEvt) {
                    // Doc count (see below)
                    e.put(AssociationPojo.doccount_, 1);
                    double dAssocSig = dDocSig * dDocSig;

                    // Weight down summaries slightly (80%), and summaries with missing entities a lot (50%)  
                    if (bIsSummary) {
                        String sEntity2 = (String) e.get(AssociationPojo.entity2_);
                        if (null == sEntity2) {
                            dAssocSig *= 0.50;
                        } else {
                            dAssocSig *= 0.80;
                        }
                    }

                    // Running significance count:
                    e.put(AssociationPojo.assoc_sig_, dAssocSig); // (use sum-squared to score up events that occur frequently)
                    if (dAssocSig > standaloneEventAggregator.dMaxSig) {
                        standaloneEventAggregator.dMaxSig = dAssocSig;
                    }

                    standaloneEventAggregator.store.put(evtHolder, e);

                    // Add to list in some sort of very basic order...
                    if (2 == nPhase) { // Put at the back, it's probably really low sig
                        standaloneEventAggregator.tmpList.add(e);
                    } else if (1 == nPhase) { // Put at the front until Phase 0 comes along
                        standaloneEventAggregator.tmpList.addFirst(e);
                        standaloneEventAggregator.nPhase1Events++;
                    } else { // phases 0 and 1 get the higher orderings
                        standaloneEventAggregator.tmpList.addFirst(e);
                        standaloneEventAggregator.nPhase0Events++;
                    }
                } else { // Update doc count
                    long nDocCount = oldEvt.getInt(AssociationPojo.doccount_, 1) + 1;
                    oldEvt.put(AssociationPojo.doccount_, nDocCount);
                    // Running significance count:
                    double dAssocSig = oldEvt.getDouble(AssociationPojo.doccount_) + dDocSig * dDocSig;
                    oldEvt.put(AssociationPojo.assoc_sig_, dAssocSig);
                    if (dAssocSig / nDocCount > standaloneEventAggregator.dMaxSig) {
                        standaloneEventAggregator.dMaxSig = dAssocSig;
                    }

                    if (bIsFact && !standaloneEventAggregator.bSimulateAggregation) {
                        // For facts, also update the time range:
                        String old_time_start = oldEvt.getString(AssociationPojo.time_start_);
                        String old_time_end = oldEvt.getString(AssociationPojo.time_end_);
                        // Just keep this really simple and inefficient:
                        TreeSet<String> timeOrder = new TreeSet<String>();
                        if (null != old_time_start) {
                            timeOrder.add(old_time_start);
                        }
                        if (null != old_time_end) {
                            timeOrder.add(old_time_end);
                        }
                        if (null != time_start) {
                            timeOrder.add(time_start);
                        }
                        if (null != time_end) {
                            timeOrder.add(time_end);
                        }
                        if (timeOrder.size() > 1) {
                            Iterator<String> itStart = timeOrder.iterator();
                            oldEvt.put(AssociationPojo.time_start_, itStart.next());
                            Iterator<String> itEnd = timeOrder.descendingIterator();
                            oldEvt.put(AssociationPojo.time_end_, itEnd.next());
                        }

                    } // end if is fact - treat times different
                }
                //TESTED

            } // (end if keeping this event)
        } // (end loop over events)   
    } // (end if this doc has events)

}

From source file: com.ikanow.infinit.e.api.knowledge.QueryHandler.java

License: Open Source License

private ResponsePojo getSavedQueryInstead(String storedQueryNameOrId, String[] communityIdStrs,
        AdvancedQueryPojo query) {
    ResponsePojo rp = null;
    ObjectId oid = null;
    BasicDBObject jobQuery = null;
    try {
        oid = new ObjectId(storedQueryNameOrId);
        jobQuery = new BasicDBObject(CustomMapReduceJobPojo._id_, oid);
    } catch (Exception e) {
        jobQuery = new BasicDBObject(CustomMapReduceJobPojo.jobtitle_, storedQueryNameOrId);
    }
    CustomMapReduceJobPojo savedJob = CustomMapReduceJobPojo
            .fromDb(DbManager.getCustom().getLookup().findOne(jobQuery), CustomMapReduceJobPojo.class);

    if (null != savedJob) { // Is this even a saved job?
        if (null != savedJob.jarURL) {
            savedJob = null;
        }
    }
    if (null != savedJob) { // Authorization
        boolean auth = false;
        String communityIdStrList = Arrays.toString(communityIdStrs);
        for (ObjectId commId : savedJob.communityIds) {

            if (communityIdStrList.contains(commId.toString())) {
                auth = true;
                break;
            }
        }
        if (!auth) {
            savedJob = null;
        }
        if (null == savedJob) {
            throw new RuntimeException(
                    "Can't find saved query, or is a custom job not a query, or authorization error");
        }
        // OK go get the results of the job
        DBCollection coll = DbManager.getCollection(savedJob.getOutputDatabase(), savedJob.outputCollection);
        BasicDBObject result = (BasicDBObject) coll.findOne(); // (at some point support multiple saved queries)
        if (null == result) {
            throw new RuntimeException("Saved query is empty");
        }
        BasicDBObject apiResultToConvert = (BasicDBObject) result.get("value");
        if (null == apiResultToConvert) {
            throw new RuntimeException("Saved query has invalid format");
        }
        rp = ResponsePojo.fromDb(apiResultToConvert);
    } else if (null != oid) { // Support new user/doc queues
        SharePojo share = SharePojo.fromDb(DbManager.getSocial().getShare().findOne(jobQuery), SharePojo.class);
        if ((null == share) || (null == share.getShare())
                || (!share.getType().equals(DocumentQueueControlPojo.UserQueue)
                        && !share.getType().equals(DocumentQueueControlPojo.SavedQueryQueue))) {
            throw new RuntimeException(
                    "Can't find saved query, or is a custom job not a query, or authorization error");
        } else { // share.share is a  DocumentQueueControlPojo
            DocumentQueueControlPojo queue = DocumentQueueControlPojo.fromApi(share.getShare(),
                    DocumentQueueControlPojo.class);
            BasicDBObject docQuery1 = new BasicDBObject(DocumentPojo._id_,
                    new BasicDBObject(DbManager.in_, queue.getQueueList()));
            BasicDBObject docQuery2 = new BasicDBObject(DocumentPojo.updateId_,
                    new BasicDBObject(DbManager.in_, queue.getQueueList()));
            BasicDBObject docQuery = new BasicDBObject(DbManager.or_, Arrays.asList(docQuery1, docQuery2));
            DBCursor dbc = DbManager.getDocument().getMetadata().find(docQuery).limit(query.score.numAnalyze);
            ScoringUtils scoreStats = new ScoringUtils();
            List<BasicDBObject> docs = null;
            StatisticsPojo stats = new StatisticsPojo();
            stats.setSavedScores(query.output.docs.skip, dbc.count());
            try {
                boolean lockAcquired = true;
                try {
                    lockAcquired = this.acquireConcurrentAccessLock();

                } catch (InterruptedException e) {
                    //(that's fine just carry on)
                    lockAcquired = false;
                }
                if (!lockAcquired) {
                    rp = new ResponsePojo(); // (rp is still null at this point, so create it before setting the response)
                    rp.setResponse(
                            new ResponseObject("Query", false, "Query engine busy, please try again later."));
                    return rp;
                }
                scoreStats.setAliasLookupTable(_aliasLookup);
                docs = scoreStats.calcTFIDFAndFilter(DbManager.getDocument().getMetadata(), dbc, query.score,
                        query.output, stats, false, query.output.docs.skip, query.output.docs.numReturn,
                        communityIdStrs, null, null, null, null, null, null, null, null);
            } finally {
                scoreStats.clearAsMuchMemoryAsPossible();
                this.releaseConcurrentAccessLock();
            }
            rp = new ResponsePojo();
            rp.setResponse(new ResponseObject("Query", true, "Saved Query: " + share.getTitle()));
            rp.setStats(stats);
            if ((null != docs) && (docs.size() > 0)) {
                rp.setData(docs, (BasePojoApiMap<BasicDBObject>) null);
            } else { // (ensure there's always an empty list)
                docs = new ArrayList<BasicDBObject>(0);
                rp.setData(docs, (BasePojoApiMap<BasicDBObject>) null);
            }
        } //end if user or saved query queue
    }
    return rp;

}

From source file: com.ikanow.infinit.e.api.knowledge.SearchHandler.java

License: Open Source License

/**
 * Performs a reverse geolookup, takes a lat/lon and returns a list of nearby
 * locations.
 * 
 * @param latitude
 * @param longitude
 * @return
 */
private List<SearchSuggestPojo> reverseGeoLookup(Double latitude, Double longitude) {
    List<SearchSuggestPojo> locations = null;

    BasicDBList results = runGeoNear(latitude, longitude);

    if (results != null) {
        locations = new ArrayList<SearchSuggestPojo>();
        if (results.size() > 0) {
            for (int i = 0; i < 10 && i < results.size(); i++) {
                BasicDBObject result = (BasicDBObject) results.get(i);
                Double distance = result.getDouble("dis");
                BasicDBObject obj = (BasicDBObject) result.get("obj");
                locations.add(buildLocation(obj, distance));
            }
        }
    }
    return locations;
}

From source file: com.ikanow.infinit.e.api.social.sharing.ShareHandler.java

License: Open Source License

private String getReferenceString(SharePojo share) {
    // FILE:
    if (null == share.getDocumentLocation().get_id()) { // local file based reference
        FileInputStream fin = null;
        Scanner s = null;
        try {
            File f = new File(share.getDocumentLocation().getCollection());
            fin = new FileInputStream(f);
            s = new Scanner(fin, "UTF-8");
            return (s.useDelimiter("\n").next());
        } catch (Exception e) {
            return null;
        } finally {
            try {
                if (null != fin)
                    fin.close();
                if (null != s)
                    s.close();
            } catch (Exception e) {
            } // (probably just never opened)               
        }
    }
    // DB:
    // Carry on, this is a database object
    HashSet<String> shareIdStrs = new HashSet<String>();
    for (ShareCommunityPojo commIds : share.getCommunities()) {
        shareIdStrs.add(commIds.get_id().toString());
    }
    String retVal = null;
    BasicDBObject query = new BasicDBObject(DocumentPojo._id_, share.getDocumentLocation().get_id()); // (same for all artifacts)
    String dbName = share.getDocumentLocation().getDatabase();
    String collectionName = share.getDocumentLocation().getCollection();
    BasicDBObject returnVal = (BasicDBObject) MongoDbManager.getCollection(dbName, collectionName)
            .findOne(query);
    try {
        BasicDBList communities = null;
        boolean bCustomJob = dbName.equals("custommr"); // (a bit different)
        boolean bFoundOverlap = false;
        if (!bCustomJob) {
            ObjectId communityId = (ObjectId) returnVal.get(DocumentPojo.communityId_); // (same for other artifacts)
            bFoundOverlap = shareIdStrs.contains(communityId.toString());
        } else {
            communities = (BasicDBList) returnVal.get("communityIds"); // (shared across multiple json types)
            for (Object commIdObj : communities) {
                ObjectId commId = (ObjectId) commIdObj;
                if (shareIdStrs.contains(commId.toString())) {
                    bFoundOverlap = true;
                    break;
                }
            }
        }
        if (!bFoundOverlap) {
            throw new RuntimeException(""); // (turned into the common message below)
        }
        if (!bCustomJob) { // everything but custom jobs
            Date modifiedTime = returnVal.getDate(DocumentPojo.modified_); // (same for other artifacts)
            if (null != modifiedTime) {
                share.setModified(modifiedTime);
            }
            retVal = returnVal.toString();
        } else { // custom jobs
            String database = returnVal.getString(CustomMapReduceJobPojo.outputDatabase_);
            if (null == database) {
                database = dbName;
            }
            Date modifiedTime = returnVal.getDate(CustomMapReduceJobPojo.lastCompletionTime_);
            if (null != modifiedTime) {
                share.setModified(modifiedTime);
            }
            String collection = returnVal.getString(CustomMapReduceJobPojo.outputCollection_);
            // (return an arbitrary record from the custom job's output collection as the reference)
            BasicDBObject returnVal2 = (BasicDBObject) MongoDbManager.getCollection(database, collection)
                    .findOne();
            retVal = returnVal2.toString();
        }
    } catch (Exception e) {
        throw new RuntimeException("Document not found or permission issue (no overlapping communities)");
    }
    return retVal;
}
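
Note: the community-overlap test above is the permission gate shared by all the DB-backed artifact types. A minimal standalone sketch of the same pattern, handling both storage conventions seen in the method ("communityId"/"communityIds" are hypothetical field names here):

import java.util.Set;

import org.bson.types.ObjectId;

import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;

public class CommunityOverlapSketch {
    // True if the document shares at least one community with the share's community id set.
    // Handles both conventions: a single ObjectId field, or a list of ObjectIds.
    public static boolean hasOverlap(BasicDBObject doc, Set<String> shareCommunityIdStrs) {
        Object single = doc.get("communityId"); // hypothetical single-community field
        if (single instanceof ObjectId) {
            return shareCommunityIdStrs.contains(single.toString());
        }
        Object multi = doc.get("communityIds"); // hypothetical multi-community field
        if (multi instanceof BasicDBList) {
            for (Object commIdObj : (BasicDBList) multi) {
                if (shareCommunityIdStrs.contains(commIdObj.toString())) {
                    return true;
                }
            }
        }
        return false;
    }
}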

From source file:com.ikanow.infinit.e.application.utils.LogstashConfigUtils.java

License:Open Source License

public static String validateLogstashInput(String sourceKey, String config, StringBuffer errorMessage,
        boolean isAdmin) {

    if (null == _props) {
        _props = new PropertiesManager();
        String allowedInputs = _props.getProperty("harvest.logstash.allowed_inputs");

        if ((null == allowedInputs) || (allowedInputs.isEmpty())) {
            allowedInputs = "collectd,drupal_dblog,gelf,gemfire,imap,irc,lumberjack,s3,snmptrap,sqs,syslog,twitter,udp,xmpp,zenoss";
            // currently *not* allowed by default: elasticsearch,eventlog,exec,file,ganglia,generator,graphite,heroku,jmx,log4j,pipe,puppet_facter,rabbitmq,redis,relp,sqlite,stdin,stomp,tcp,unix,varnishlog,websocket,wmi,zeromq
        }
        _allowedInputs.addAll(Arrays.asList(allowedInputs.toLowerCase().split("\\s*,\\s*")));

        String allowedFilters = _props.getProperty("harvest.logstash.allowed_filters");
        if ((null == allowedFilters) || (allowedFilters.isEmpty())) {
            allowedFilters = "advisor,alter,anonymize,checksum,cidr,cipher,clone,collate,csv,date,dns,drop,elapsed,extractnumbers,fingerprint,geoip,gelfify,grep,grok,grokdiscovery,l18n,json,json_encode,kv,metaevent,metrics,multiline,mutate,noop,prune,punct,railsparallelrequest,range,sleep,split,sumnumbers,syslog_pri,throttle,translate,unique,urldecode,useragent,uuid,wms,wmts,xml";
            // currently *not* allowed by default: elasticsearch,ruby,zeromq
        }
        _allowedFilters.addAll(Arrays.asList(allowedFilters.toLowerCase().split("\\s*,\\s*")));
    } //TESTED (3_2a)

    // Configuration validation, phase 1

    errorMessage.append("Validation error:");
    BasicDBObject jsonifiedConfig = parseLogstashConfig(config, errorMessage);
    if (null == jsonifiedConfig) {
        return null;
    }
    errorMessage.setLength(0);

    // Configuration validation, phase 2 - very basic checks on the structure of the object

    Object input = jsonifiedConfig.get("input");
    if ((null == input) || !(input instanceof BasicDBObject)) { // Does input exist?
        errorMessage.append(
                "Invalid input format, should be 'input { INPUT_TYPE { ... } }' (only one INPUT_TYPE) and also contain a filter, no \"s around them. (0)");
        return null;
    } //TESTED (3_1d)
    else { // Check there's only one input type and (unless admin) it's one of the allowed types
        BasicDBObject inputDbo = (BasicDBObject) input;
        if (1 != inputDbo.size()) {
            errorMessage.append(
                    "Invalid input format, should be 'input { INPUT_TYPE { ... } }' (only one INPUT_TYPE) and also contain a filter, no \"s around them. (1)");
            return null;
        } //TESTED
        if (!isAdmin) {
            for (String key : inputDbo.keySet()) {
                if (!_allowedInputs.contains(key.toLowerCase())) {
                    errorMessage.append("Security error, non-admin not allowed input type " + key
                            + ", allowed options: " + _allowedInputs.toString());
                    return null;
                } //TESTED
            }
        } //TESTED (3_1abc)
    }
    Object filter = jsonifiedConfig.get("filter");
    if ((null == filter) || !(filter instanceof BasicDBObject)) { // Does filter exist?
        errorMessage.append(
                "Invalid input format, should be 'input { INPUT_TYPE { ... } }' (only one INPUT_TYPE) and also contain a filter, no \"s around them. (2)");
        return null;
    } //TESTED (3_2d)
    else { // Check (unless admin) that the filter types are all allowed
        if (!isAdmin) {
            BasicDBObject filterDbo = (BasicDBObject) filter;
            for (String key : filterDbo.keySet()) {
                if (!_allowedFilters.contains(key.toLowerCase())) {
                    errorMessage.append("Security error, non-admin not allowed filter type " + key
                            + ", allowed options: " + _allowedFilters.toString());
                    return null;
                } //TESTED
            }
        } //TESTED (3_2abc)
    }

    // Configuration validation, phase 3

    Matcher m = _validationRegexInputReplace.matcher(config);
    if (!m.find()) {
        errorMessage.append(
                "Invalid input format, should be 'input { INPUT_TYPE { ... } }' (only one INPUT_TYPE) and also contain a filter, no \"s around them. (3)");
        return null;
    } //TESTED (see above)
    else { // Extract the input type so that file-based plugins can be handled specially
        String inputType = m.group(2).toLowerCase();

        // If it's a file-based plugin then replace sincedb_path (check that it's not used during the JSON-ification):
        if (inputType.equalsIgnoreCase("file") || inputType.equalsIgnoreCase("s3")) {
            config = _validationRegexInputReplace.matcher(config)
                    .replaceFirst("$1\n      sincedb_path => \"_XXX_DOTSINCEDB_XXX_\"\n");
        } //TESTED

    } //TESTED

    m = _validationRegexNoSourceKey.matcher(config);
    // (this won't prevent malicious changes to sourceKey, but makes clear it's not supposed to be referenced)
    if (m.find()) {
        errorMessage.append(
                "Not allowed to reference sourceKey - this is automatically appended by the logstash harvester");
        return null;
    } //TESTED      

    // Now append the sourceKey at each stage of the pipeline to ensure that nobody can set sourceKey to a different value

    m = _validationRegexAppendFields.matcher(config);
    StringBuffer newConfig = new StringBuffer();
    if (m.find()) {
        m.appendReplacement(newConfig, "add_field => [ \"sourceKey\", \"" + sourceKey + "\"] \n\n" + m.group()
                + " \n if [sourceKey] == \"" + sourceKey + "\" { \n\n ");
    } else {
        errorMessage.append(
                "Invalid input format, should be 'input { INPUT_TYPE { ... } }' (only one INPUT_TYPE) and also contain a filter, no \"s around them. (4)");
        return null;
    }
    m.appendTail(newConfig);
    config = newConfig.toString();
    config = config.replaceAll("}[^}]*$", ""); // (remove the last })
    config += "\n\n mutate { update => [ \"sourceKey\", \"" + sourceKey + "\"] } \n}\n}\n"; // double check the sourceKey hasn't been overwritten and close the if from above
    //TESTED (syntactically correct and does overwrite sourceKey everywhere - success_2_2)

    return config;
}
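
Note: a minimal usage sketch, assuming a hypothetical sourceKey and a config that passes the default whitelists; on success the returned config has the sourceKey add_field injected after the input stage and a final mutate re-asserting it:

    StringBuffer errorMessage = new StringBuffer();
    String config = "input { udp { port => 5000 } }\n" // udp is on the default input whitelist
            + "filter { mutate { add_tag => [ \"test\" ] } }\n"; // mutate is on the default filter whitelist
    String validated = LogstashConfigUtils.validateLogstashInput("my.source.key", config, errorMessage, false);
    if (null == validated) {
        System.err.println(errorMessage.toString()); // reason validation failed
    } else {
        System.out.println(validated); // config with sourceKey injected and re-asserted at the end
    }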