List of usage examples for com.mongodb BasicDBObject getString
public String getString(final String key)
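getString returns the value stored under the given key as a String (typically via toString for non-String values), or null if the key is absent. Before the real-world examples below, here is a minimal, self-contained sketch of that behaviour; the field names are illustrative only, and the two-argument default form comes from the BasicBSONObject superclass in the driver versions these examples target.

import com.mongodb.BasicDBObject;

public class GetStringDemo {
    public static void main(String[] args) {
        // Build a document with a couple of illustrative fields
        BasicDBObject doc = new BasicDBObject("title", "Example document").append("views", 42);

        // Present key: returned as a String
        String title = doc.getString("title");          // "Example document"

        // Missing key: the one-argument form returns null, so guard before use
        String author = doc.getString("author");        // null
        if (author == null) {
            author = "unknown";
        }

        // Two-argument form (inherited from BasicBSONObject) supplies a default instead of null
        String source = doc.getString("source", "n/a"); // "n/a"

        System.out.println(title + " / " + author + " / " + source);
    }
}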
From source file:com.ijuru.ijambo.dao.WordDAO.java
License:Open Source License
/**
 * Gets a random word
 * @param difficulty the difficulty (may be null)
 * @return the word
 */
public Word getRandomWord(Difficulty difficulty) {
    DBCollection words = db.getCollection("words");
    BasicDBObject obj;
    if (difficulty != null) {
        // Get count of words of this difficulty
        BasicDBObject query = new BasicDBObject();
        query.put("difficulty", difficulty.ordinal());
        int count = words.find(query).count();
        // Pick random one
        int randOffset = (int) (Math.random() * count);
        obj = (BasicDBObject) words.find(query).limit(-1).skip(randOffset).next();
    } else {
        int randOffset = (int) (Math.random() * words.find().count());
        obj = (BasicDBObject) words.find().limit(-1).skip(randOffset).next();
    }
    return new Word(obj.getString("word"), obj.getString("meaning"),
            Difficulty.fromInt(obj.getInt("difficulty")));
}
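The example above selects a random document by counting and then skipping, which takes two round trips and can mis-count if the collection changes in between. On MongoDB 3.2+ the pick can instead be delegated to the server with a $sample aggregation stage. The sketch below is an untested alternative, assuming the same legacy DBCollection API, collection name, field names, and Word/Difficulty types as the original (plus the usual java.util and com.mongodb imports).

// Sketch only: assumes MongoDB 3.2+ ($sample) and the legacy driver's DBCollection.aggregate(List) API
public Word getRandomWordViaSample(Difficulty difficulty) {
    DBCollection words = db.getCollection("words");
    List<DBObject> pipeline = new ArrayList<DBObject>();
    if (difficulty != null) {
        // Restrict to the requested difficulty before sampling
        pipeline.add(new BasicDBObject("$match", new BasicDBObject("difficulty", difficulty.ordinal())));
    }
    pipeline.add(new BasicDBObject("$sample", new BasicDBObject("size", 1)));
    AggregationOutput out = words.aggregate(pipeline);
    for (DBObject result : out.results()) {
        BasicDBObject obj = (BasicDBObject) result;
        return new Word(obj.getString("word"), obj.getString("meaning"),
                Difficulty.fromInt(obj.getInt("difficulty")));
    }
    return null; // empty collection (or no words at this difficulty)
}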
From source file:com.ikanow.infinit.e.api.config.source.SourceHandler.java
License:Open Source License
/** * testSource//from w w w . jav a 2s. c o m * @param sourceJson * @param nNumDocsToReturn * @param bReturnFullText * @param userIdStr * @return */ public ResponsePojo testSource(String sourceJson, int nNumDocsToReturn, boolean bReturnFullText, boolean bRealDedup, String userIdStr) { ResponsePojo rp = new ResponsePojo(); try { SourcePojo source = null; SourcePojoSubstitutionApiMap apiMap = new SourcePojoSubstitutionApiMap(new ObjectId(userIdStr)); try { source = ApiManager.mapFromApi(sourceJson, SourcePojo.class, apiMap); source.fillInSourcePipelineFields(); } catch (Exception e) { rp.setResponse(new ResponseObject("Test Source", false, "Error deserializing source (JSON is valid but does not match schema): " + e.getMessage())); return rp; } if (null == source.getKey()) { source.setKey(source.generateSourceKey()); // (a dummy value, not guaranteed to be unique) } if ((null == source.getExtractType()) || !source.getExtractType().equals("Federated")) { String testUrl = source.getRepresentativeUrl(); if (null == testUrl) { rp.setResponse( new ResponseObject("Test Source", false, "Error, source contains no URL to harvest")); return rp; } } if (null == source.getTags()) { source.setTags(new HashSet<String>()); } // This is the only field that you don't normally need to specify in save but will cause // problems if it's not populated in test. ObjectId userId = new ObjectId(userIdStr); // Set owner (overwrite, for security reasons) source.setOwnerId(userId); if (null == source.getCommunityIds()) { source.setCommunityIds(new TreeSet<ObjectId>()); } if (!source.getCommunityIds().isEmpty()) { // need to check that I'm allowed the specified community... if ((1 == source.getCommunityIds().size()) && (userId.equals(source.getCommunityIds().iterator().next()))) { // we're OK only community id is user community } //TESTED else { HashSet<ObjectId> communities = SocialUtils.getUserCommunities(userIdStr); Iterator<ObjectId> it = source.getCommunityIds().iterator(); while (it.hasNext()) { ObjectId src = it.next(); if (!communities.contains(src)) { rp.setResponse(new ResponseObject("Test Source", false, "Authentication error: you don't belong to this community: " + src)); return rp; } //TESTED } } //TESTED } // Always add the userId to the source community Id (so harvesters can tell if they're running in test mode or not...) 
source.addToCommunityIds(userId); // (ie user's personal community, always has same _id - not that it matters) // Check the source's admin status source.setOwnedByAdmin(RESTTools.adminLookup(userId.toString(), false)); if (bRealDedup) { // Want to test update code, so ignore update cycle if (null != source.getRssConfig()) { source.getRssConfig().setUpdateCycle_secs(1); // always update } } HarvestController harvester = new HarvestController(true); if (nNumDocsToReturn > 100) { // (seems reasonable) nNumDocsToReturn = 100; } harvester.setStandaloneMode(nNumDocsToReturn, bRealDedup); List<DocumentPojo> toAdd = new LinkedList<DocumentPojo>(); List<DocumentPojo> toUpdate = new LinkedList<DocumentPojo>(); List<DocumentPojo> toRemove = new LinkedList<DocumentPojo>(); if (null == source.getHarvestStatus()) { source.setHarvestStatus(new SourceHarvestStatusPojo()); } String oldMessage = source.getHarvestStatus().getHarvest_message(); // SPECIAL CASE: FOR FEDERATED QUERIES if ((null != source.getExtractType()) && source.getExtractType().equals("Federated")) { int federatedQueryEnts = 0; SourceFederatedQueryConfigPojo endpoint = null; try { endpoint = source.getProcessingPipeline().get(0).federatedQuery; } catch (Exception e) { } if (null == endpoint) { rp.setResponse( new ResponseObject("Test Source", false, "source error: no federated query specified")); return rp; } AdvancedQueryPojo testQuery = null; String errMessage = "no query specified"; try { testQuery = AdvancedQueryPojo.fromApi(endpoint.testQueryJson, AdvancedQueryPojo.class); } catch (Exception e) { errMessage = e.getMessage(); } if (null == testQuery) { rp.setResponse(new ResponseObject("Test Source", false, "source error: need to specifiy a valid IKANOW query to test federated queries, error: " + errMessage)); return rp; } // OK if we're here then we can test the query SimpleFederatedQueryEngine testFederatedQuery = new SimpleFederatedQueryEngine(); endpoint.parentSource = source; testFederatedQuery.addEndpoint(endpoint); ObjectId queryId = new ObjectId(); String[] communityIdStrs = new String[source.getCommunityIds().size()]; int i = 0; for (ObjectId commId : source.getCommunityIds()) { communityIdStrs[i] = commId.toString(); i++; } testFederatedQuery.setTestMode(true); testFederatedQuery.preQueryActivities(queryId, testQuery, communityIdStrs); StatisticsPojo stats = new StatisticsPojo(); stats.setSavedScores(0, 0); rp.setStats(stats); ArrayList<BasicDBObject> toAddTemp = new ArrayList<BasicDBObject>(1); testFederatedQuery.postQueryActivities(queryId, toAddTemp, rp); for (BasicDBObject docObj : toAddTemp) { DocumentPojo doc = DocumentPojo.fromDb(docObj, DocumentPojo.class); if (bReturnFullText) { doc.setFullText(docObj.getString(DocumentPojo.fullText_)); doc.makeFullTextNonTransient(); } if (null != doc.getEntities()) { federatedQueryEnts += doc.getEntities().size(); } //Metadata workaround: @SuppressWarnings("unchecked") LinkedHashMap<String, Object[]> meta = (LinkedHashMap<String, Object[]>) docObj .get(DocumentPojo.metadata_); if (null != meta) { Object metaJson = meta.get("json"); if (metaJson instanceof Object[]) { // (in this case ... 
non-cached, need to recopy in, I forget why) doc.addToMetadata("json", (Object[]) metaJson); } } toAdd.add(doc); } // (currently can't run harvest source federated query) if (0 == federatedQueryEnts) { // (more fed query exceptions) source.getHarvestStatus().setHarvest_message( "Warning: no entities extracted, probably docConversionMap is wrong?"); } else { source.getHarvestStatus().setHarvest_message(federatedQueryEnts + " entities extracted"); } } //TESTED (END FEDERATED QUERY TEST MODE, WHICH IS A BIT DIFFERENT) else { harvester.harvestSource(source, toAdd, toUpdate, toRemove); } // (don't parrot the old message back - v confusing) if (oldMessage == source.getHarvestStatus().getHarvest_message()) { // (ptr ==) source.getHarvestStatus() .setHarvest_message("(no documents extracted - likely a source or configuration error)"); } //TESTED String message = null; if ((null != source.getHarvestStatus()) && (null != source.getHarvestStatus().getHarvest_message())) { message = source.getHarvestStatus().getHarvest_message(); } else { message = ""; } List<String> errMessagesFromSourceDeser = apiMap.getErrorMessages(); if (null != errMessagesFromSourceDeser) { StringBuffer sbApiMapErr = new StringBuffer("Substitution errors:\n"); for (String err : errMessagesFromSourceDeser) { sbApiMapErr.append(err).append("\n"); } message = message + "\n" + sbApiMapErr.toString(); } //TESTED (by hand) if ((null != source.getHarvestStatus()) && (HarvestEnum.error == source.getHarvestStatus().getHarvest_status())) { rp.setResponse(new ResponseObject("Test Source", false, "source error: " + message)); rp.setData(toAdd, new DocumentPojoApiMap()); } else { if ((null == message) || message.isEmpty()) { message = "no messages from harvester"; } rp.setResponse(new ResponseObject("Test Source", true, "successfully returned " + toAdd.size() + " docs: " + message)); try { // If grabbing full text // Also some logstash/custom specific logic - these aren't docs so just output the entire record boolean isLogstash = (null != source.getExtractType()) && source.getExtractType().equalsIgnoreCase("logstash"); boolean isCustom = (null != source.getExtractType()) && source.getExtractType().equalsIgnoreCase("custom"); List<BasicDBObject> records = null; if (bReturnFullText || isLogstash || isCustom) { for (DocumentPojo doc : toAdd) { if (isLogstash || isCustom) { if (null == records) { records = new ArrayList<BasicDBObject>(toAdd.size()); } BasicDBObject dbo = (BasicDBObject) doc.getMetadata().get("record")[0]; Object test = dbo.get("_id"); if ((null != test) && (test instanceof ObjectId)) { dbo.remove("_id"); // (unless it's a custom _id added from logstash then remove it) } records.add(dbo); } //TESTED else if (bReturnFullText) { doc.makeFullTextNonTransient(); } } } //TESTED if (null != records) { rp.setData(records, (BasePojoApiMap<BasicDBObject>) null); } //TESTED else { rp.setData(toAdd, new DocumentPojoApiMap()); } //TESTED //Test deserialization: rp.toApi(); } catch (Exception e) { //e.printStackTrace(); StringBuffer sb = new StringBuffer(); Globals.populateStackTrace(sb, e); rp.setData( new BasicDBObject("error_message", "Error deserializing documents: " + sb.toString()), null); } } } catch (Exception e) { // If an exception occurs log the error logger.error("Exception Message: " + e.getMessage(), e); rp.setResponse(new ResponseObject("Test Source", false, "Error testing source: " + e.getMessage())); } catch (Error e) { // If an exception occurs log the error logger.error("Exception Message: " + e.getMessage(), e); 
rp.setResponse(new ResponseObject("Test Source", false, "Configuration/Installation error: " + e.getMessage())); } return rp; }
From source file:com.ikanow.infinit.e.api.knowledge.DocumentHandler.java
License:Open Source License
/** * Get information function that returns the user information in the form of a JSON String. * @param isAdmin //from www . j a v a2s. c o m * * @param key the key definition of the user ( example email@email.com ) * @return a JSON string representation of the person information on success */ public ResponsePojo getInfo(String userIdStr, String sourceKey, String idStrOrUrl, boolean bReturnFullText, boolean returnRawData, boolean isAdmin) { ResponsePojo rp = new ResponsePojo(); try { // Set up the query BasicDBObject query = new BasicDBObject(); ObjectId id = null; if (null == sourceKey) { id = new ObjectId(idStrOrUrl); query.put(DocumentPojo._id_, id); } else { query.put(DocumentPojo.sourceKey_, sourceKey); query.put(DocumentPojo.url_, idStrOrUrl); } if (!isAdmin) query.put(DocumentPojo.communityId_, new BasicDBObject(MongoDbManager.in_, SocialUtils.getUserCommunities(userIdStr))); // (use DBObject here because DocumentPojo is pretty big and this call could conceivably have perf implications) BasicDBObject fieldsQ = new BasicDBObject(); if (!bReturnFullText) { fieldsQ.put(DocumentPojo.fullText_, 0); // (XML/JSON have fullText as part of pojo) } BasicDBObject dbo = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(query, fieldsQ); if ((null == dbo) || ((null != dbo.get(DocumentPojo.url_)) && dbo.getString(DocumentPojo.url_).startsWith("?DEL?"))) { if (null != id) { // this might be the update id... query = new BasicDBObject(DocumentPojo.updateId_, id); dbo = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(query, fieldsQ); } } //TESTED (update case, normal case, and intermediate case where both update and original still exist) if (null == dbo) { rp.setResponse(new ResponseObject("Doc Info", true, "Document not found")); return rp; } DocumentPojo dp = DocumentPojo.fromDb(dbo, DocumentPojo.class); if (bReturnFullText) { if (null == dp.getFullText()) { // (Some things like database records might have this stored already) byte[] storageArray = new byte[200000]; DBCollection contentDB = DbManager.getDocument().getContent(); BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, dp.getUrl()); contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, dp.getSourceKey()))); BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1); BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ, fields); if (null != dboContent) { byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_)); ByteArrayInputStream in = new ByteArrayInputStream(compressedData); GZIPInputStream gzip = new GZIPInputStream(in); int nRead = 0; StringBuffer output = new StringBuffer(); while (nRead >= 0) { nRead = gzip.read(storageArray, 0, 200000); if (nRead > 0) { String s = new String(storageArray, 0, nRead, "UTF-8"); output.append(s); } } dp.setFullText(output.toString()); dp.makeFullTextNonTransient(); } } } else if (!returnRawData) { dp.setFullText(null); // (obviously will normally contain full text anyway) } else // if ( returnRawData ) { //check if the harvest type is file, return the file instead //if file is db return the json //get source SourcePojo source = getSourceFromKey(dp.getSourceKey()); if (source.getExtractType().equals("File")) { //get file from harvester String fileURL = dp.getUrl(); if (dp.getSourceUrl() != null) fileURL = dp.getSourceUrl(); byte[] bytes = FileHarvester.getFile(fileURL, source); if (bytes == null) { // Try returning JSON instead String 
json = ApiManager.mapToApi(dp, new DocumentPojoApiMap()); DocumentFileInterface dfp = new DocumentFileInterface(); dfp.bytes = json.getBytes(); dfp.mediaType = "application/json"; rp.setResponse( new ResponseObject("Doc Info", true, "Document bytes returned successfully")); rp.setData(dfp, null); return rp; } else { DocumentFileInterface dfp = new DocumentFileInterface(); dfp.bytes = bytes; dfp.mediaType = getMediaType(fileURL); rp.setResponse( new ResponseObject("Doc Info", true, "Document bytes returned successfully")); rp.setData(dfp, null); return rp; } } else { String json = ApiManager.mapToApi(dp, new DocumentPojoApiMap()); DocumentFileInterface dfp = new DocumentFileInterface(); dfp.bytes = json.getBytes(); dfp.mediaType = "application/json"; rp.setResponse(new ResponseObject("Doc Info", true, "Document bytes returned successfully")); rp.setData(dfp, null); return rp; } } rp.setData(dp, new DocumentPojoApiMap()); rp.setResponse(new ResponseObject("Doc Info", true, "Feed info returned successfully")); } //(end full text vs raw data) catch (Exception e) { // If an exception occurs log the error logger.error("Exception Message: " + e.getMessage(), e); rp.setResponse(new ResponseObject("Doc Info", false, "error returning feed: " + e.getMessage())); } // Return Json String representing the user return rp; }
From source file:com.ikanow.infinit.e.api.knowledge.federated.SimpleFederatedQueryEngine.java
License:Open Source License
@Override public void preQueryActivities(ObjectId queryId, AdvancedQueryPojo query, String[] communityIdStrs) { _scoreStats = null;//from w w w.j a v a 2 s . c om _asyncRequestsPerQuery = null; // 1] Check whether this makes sense to query, get the (sole) entity if so String entityType = null; String entityValue = null; String entityIndex = null; String textToTest = null; if ((null != query.qt) && (query.qt.size() > 0) && (query.qt.size() < 4)) { String logic = query.logic; if (null != logic) { logic = logic.toLowerCase(); } if ((null != logic) && (logic.contains("or") || logic.contains("not"))) { //DEBUG if (_DEBUG) _logger.debug("DEB: preQA1: Logic too complex: " + query.logic); if (_testMode) { throw new RuntimeException("Bad testQueryJson: Logic too complex: " + query.logic); } return; // logic too complex } //TESTED (1.3) for (AdvancedQueryPojo.QueryTermPojo qt : query.qt) { if ((null != qt.entity) || ((null != qt.entityType) && (null != qt.entityValue))) { if (null == entityType) { // we now have == 1 entity if (null != qt.entityValue) { entityValue = qt.entityValue; entityType = qt.entityType; entityIndex = entityValue.toLowerCase() + "/" + entityType.toLowerCase(); } //TESTED (1.5) else { entityIndex = qt.entity.toLowerCase(); int index = qt.entity.lastIndexOf('/'); if (index > 0) { entityValue = qt.entity.substring(0, index); entityType = qt.entity.substring(index + 1).toLowerCase(); } } //TESTED (1.6) } else { // >1 entity, not supported //DEBUG if (_DEBUG) _logger.debug("DEB: preQA2a: >1 entity: " + qt.entity + " / " + entityType + " / " + query.toApi()); if (_testMode) { throw new RuntimeException("Bad testQueryJson: >1 entity: " + qt.entity + " / " + entityType + " / " + query.toApi()); } return; } //TESTED (1.4) } //TESTED else if ((null != qt.etext) && (qt.etext.equals("*"))) { //this is fine provided it's only ANDed together (eg above logic case) } else if (null != qt.etext) { // Only work if it matches one of the regexes if (null == entityType) { textToTest = qt.etext; entityType = "etext"; } else { // >1 entity, not supported //DEBUG if (_DEBUG) _logger.debug("DEB: preQA2b: >1 entity: " + qt.entity + " / " + entityType + " / " + query.toApi()); if (_testMode) { throw new RuntimeException("Bad testQueryJson: >1 entity: " + qt.entity + " / " + entityType + " / " + query.toApi()); } return; } //TESTED (1.4) } else if (null == qt.time) { // temporal //DEBUG if (_DEBUG) _logger.debug("DEB: preQA3: non-entity/date " + query.toApi()); if (_testMode) { throw new RuntimeException("Bad testQueryJson: non-entity/date " + query.toApi()); } return; } //TESTED (1.1) } //(end loop over query terms) } //TESTED (1.*) if (null == entityType) { // Query too complex //DEBUG if (_DEBUG) _logger.debug("DEB: preQA4: query missing entity " + query.toApi()); if (_testMode) { throw new RuntimeException("Bad testQueryJson: query missing entity " + query.toApi()); } return; } //TESTED (1.2) entityType = entityType.toLowerCase(); // 2] If so, query across all the end for (SourceFederatedQueryConfigPojo endpoint : _endpoints) { // Endpoint validation: if (null == endpoint.entityTypes) { if (_testMode) { throw new RuntimeException("No entity types specified"); } else { continue; } } if (null != textToTest) { // This is text, see if you can convert to an entity entityValue = null; //(reset for different endpoints - used in the check to decide whether to continue) for (String entityTypeRegex : endpoint.entityTypes) { if (entityTypeRegex.startsWith("/")) { int regexIndex = 
entityTypeRegex.lastIndexOf('/'); // (guaranteed to be >= 0) try { Pattern regex = Pattern.compile(entityTypeRegex.substring(1, regexIndex)); if (regex.matcher(textToTest).matches()) { entityType = entityTypeRegex.substring(1 + regexIndex); if (entityType.length() > 0) { entityValue = textToTest; entityIndex = entityValue.toLowerCase() + "/" + entityType.toLowerCase(); } } } catch (Exception e) { // if not in test mode, carry on if (_testMode) { throw new RuntimeException(e); } } } } //(end loop over entity regexes) } //TESTED if (null == entityValue) { // None of the regexes matched if (_testMode) { throw new RuntimeException("Text specified, does not match any of the regexes: " + Arrays.toString(endpoint.entityTypes.toArray()) + " ... text = " + textToTest); } continue; } //DEBUG if (_DEBUG) _logger.debug("DEB: preQA5: ENDPOINT: " + Arrays.toString(endpoint.entityTypes.toArray()) + " / " + entityType); if ((null != endpoint.importScript) && !endpoint.importScript.isEmpty()) { if (null == endpoint.scriptlang) { endpoint.scriptlang = "python"; // python ==default } if (endpoint.scriptlang.equalsIgnoreCase("python")) { _pyEngine = new ScriptEngineManager().getEngineByName("python"); if (null == _pyEngine) { _logger.error( "Python not installed - copy jython 2.5+ into /opt/infinite-home/lib/unbundled"); if (_testMode) { throw new RuntimeException( "Python not installed - copy jython 2.5+ into /opt/infinite-home/lib/unbundled"); } } //TESTED (by hand, importScript != null and scriptlang: "python", jython not on classpath) } else if (endpoint.scriptlang.equalsIgnoreCase("external")) { //nothing to do here, just carry on, will handle the external bit later on } else { _logger.error("Python/External is currently the only supported scriptlang"); if (_testMode) { throw new RuntimeException("Python is currently the only supported scriptlang"); } } //TESTED (by hand, importScript != null and scriptlang: "none") } //TESTED if ((null != endpoint.bypassSimpleQueryParsing) && endpoint.bypassSimpleQueryParsing) { throw new RuntimeException("Currently only simple query parsing is supported"); } if ((null != endpoint.entityTypes) && endpoint.entityTypes.contains(entityType)) { // If not using the full source pipeline processing capability (ie always generating 0/1 BasicDBObject cachedDoc = null; String cachedDocUrl = buildScriptUrl(endpoint.parentSource.getKey(), entityIndex); BasicDBObject cachedDoc_expired = null; if (!isComplexSource(endpoint.parentSource)) { // Check if the *doc* (not *API response*) generated from this endpoint/entity has been cached, check expiry if so if (_cacheMode && ((null == endpoint.cacheTime_days) || (endpoint.cacheTime_days >= 0))) { if (_DEBUG) _logger.debug("DEB: preQA6ya: Search Doc Cache: " + cachedDocUrl + " , " + endpoint.cacheTime_days); BasicDBObject cachedDocQuery = new BasicDBObject(DocumentPojo.url_, cachedDocUrl); cachedDocQuery.put(DocumentPojo.sourceKey_, endpoint.parentSource.getKey()); cachedDoc = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(cachedDocQuery); if (null != cachedDoc) { // (quick check if we have a complex source in here) String sourceUrl = cachedDoc.getString(DocumentPojo.sourceUrl_); if (null != sourceUrl) { // switching from complex to simple source - delete the cached docs if (_DEBUG) _logger.debug("DEB: preQA6yb: Clear Search Doc Cache: " + cachedDocUrl + " , " + sourceUrl); cachedDocQuery.remove(DocumentPojo.url_); cachedDocQuery.put(DocumentPojo.sourceUrl_, sourceUrl); 
DbManager.getDocument().getMetadata().remove(cachedDocQuery); cachedDoc = null; } //TESTED (by hand) else if (checkDocCache_isExpired(cachedDoc, endpoint)) { cachedDoc_expired = cachedDoc; cachedDoc = null; } } } //TESTED (by hand) } if (null == _asyncRequestsPerQuery) { // If we've got this far create a list to store the async requests _asyncRequestsPerQuery = new LinkedList<FederatedRequest>(); } if (null != cachedDoc) { // (simple sources only, by construction) // Common params: FederatedRequest requestOverview = new FederatedRequest(); requestOverview.endpointInfo = endpoint; requestOverview.communityIdStrs = communityIdStrs; requestOverview.requestParameter = entityValue; requestOverview.queryIndex = entityIndex; requestOverview.mergeKey = endpoint.parentSource.getKey(); if (_DEBUG) _logger.debug("DEB: preQA6z: Doc Cache: " + cachedDocUrl + " , " + cachedDoc); requestOverview.cachedDoc = cachedDoc; _asyncRequestsPerQuery.add(requestOverview); } //TESTED (by hand) else if (null != endpoint.importScript) { BasicDBObject cachedVal = null; if (_cacheMode) { // (source key not static, plus not sure it's desirable, so for simplicity just don't cache requests in test mode) cachedVal = this.getCache(cachedDocUrl, endpoint); } // Common params: FederatedRequest requestOverview = new FederatedRequest(); requestOverview.endpointInfo = endpoint; requestOverview.communityIdStrs = communityIdStrs; requestOverview.requestParameter = entityValue; requestOverview.queryIndex = entityIndex; requestOverview.mergeKey = endpoint.parentSource.getKey(); requestOverview.cachedDoc_expired = cachedDoc_expired; if (null != cachedVal) { if (checkIfNeedToClearCache(cachedVal, endpoint.parentSource)) { if (_DEBUG) _logger.debug("DEB: preQA6aa: Clear cache: " + cachedDocUrl + " , " + cachedVal); cachedVal = null; } } requestOverview.cachedResult = cachedVal; // will often be null if ((null == cachedVal) || isComplexSource(endpoint.parentSource)) { if (null != cachedVal) { if (_DEBUG) _logger.debug( "DEB: preQA6ab: Complex Src Cache: " + cachedDocUrl + " , " + cachedVal); } if (endpoint.scriptlang.equalsIgnoreCase("external")) { requestOverview.importThread = new FederatedScriptHarvest(); } else { requestOverview.importThread = new FederatedJythonHarvest(); } requestOverview.importThread.queryEngine = this; requestOverview.importThread.request = requestOverview; requestOverview.importThread.start(); } else { if (_DEBUG) _logger.debug("DEB: preQA6a: Cache: " + cachedDocUrl + " , " + cachedVal); } // Launch thread _asyncRequestsPerQuery.add(requestOverview); } //TESTED (by hand) else { if (isComplexSource(endpoint.parentSource)) { //DEBUG if (_DEBUG) _logger.debug("DEB: preQA6ba: Build complex source, num requests = " + endpoint.requests.size()); FederatedRequest requestOverview = new FederatedRequest(); requestOverview.endpointInfo = endpoint; requestOverview.communityIdStrs = communityIdStrs; requestOverview.requestParameter = entityValue; requestOverview.queryIndex = entityIndex; requestOverview.mergeKey = endpoint.parentSource.getKey(); requestOverview.cachedDoc_expired = cachedDoc_expired; requestOverview.importThread = new FederatedSimpleHarvest(); requestOverview.importThread.queryEngine = this; requestOverview.importThread.request = requestOverview; requestOverview.importThread.start(); // Launch thread _asyncRequestsPerQuery.add(requestOverview); } else { // simple source try { for (SourceFederatedQueryConfigPojo.FederatedQueryEndpointUrl request : endpoint.requests) { FederatedRequest requestOverview = 
createSimpleHttpEndpoint_includingCache( entityValue, entityIndex, communityIdStrs, endpoint, request, cachedDoc_expired); //DEBUG if (_DEBUG) _logger.debug("DEB: preQA6bb: Build request: " + request.endPointUrl); _asyncRequestsPerQuery.add(requestOverview); } //(end loop over multiple requests } catch (Exception e) { _logger.error("Unknown error creating federated query for " + endpoint.titlePrefix + ": " + e.getMessage()); if (_testMode) { throw new RuntimeException("Unknown error creating federated query for " + endpoint.titlePrefix + ": " + e.getMessage(), e); } } } //(end if simple not complex) } //(end cached doc vs script vs request mode for queries) } //(end if this request is for this entity type) else { // no entity matches - if in test mode then bomb out with useful error if (_testMode) { throw new RuntimeException("Specified entity: " + entityIndex + " not in set: " + Arrays.toString(endpoint.entityTypes.toArray())); } } } //(end loop over endpoints) }
From source file:com.ikanow.infinit.e.api.knowledge.federated.SimpleFederatedQueryEngine.java
License:Open Source License
public static void simpleDocCache(FederatedRequest request, BasicDBObject doc) {
    if (null != request.cachedDoc_expired) {
        ObjectId updateId = request.cachedDoc_expired.getObjectId(DocumentPojo.updateId_);
        if (null != updateId) {
            doc.put(DocumentPojo.updateId_, updateId);
        } else {
            doc.put(DocumentPojo.updateId_, request.cachedDoc_expired.getObjectId(DocumentPojo._id_));
        }
        BasicDBObject docUpdate = new BasicDBObject(DocumentPojo.url_, doc.getString(DocumentPojo.url_));
        docUpdate.put(DocumentPojo.sourceKey_, doc.getString(DocumentPojo.sourceKey_));
        DbManager.getDocument().getMetadata().remove(docUpdate);
        //DEBUG
        if (_DEBUG)
            _logger.debug("DEB: postQA4a: re-cached ... " + docUpdate.toString() + ": "
                    + doc.getObjectId(DocumentPojo.updateId_));
    } else if (null == request.cachedDoc) { // if no currently cached doc, simply save what we have
        //DEBUG
        if (_DEBUG)
            _logger.debug("DEB: postQA4b: cached ... " + doc);
        DbManager.getDocument().getMetadata().save(doc);
    }
    // (else already have a valid cached doc so nothing to do)
}
From source file:com.ikanow.infinit.e.api.knowledge.federated.SimpleFederatedQueryEngine.java
License:Open Source License
public static boolean checkDocCache_isExpired(BasicDBObject cachedDoc,
        SourceFederatedQueryConfigPojo endpoint) {
    if (null == endpoint.cacheTime_days)
        endpoint.cacheTime_days = DEFAULT_CACHE_TIME_DAYS;
    Date now = new Date();
    long cacheThreshold = cachedDoc.getDate(DocumentPojo.created_, now).getTime()
            + endpoint.cacheTime_days * 3600L * 24L * 1000L;
    if (cacheThreshold < now.getTime()) { // (ie doc-creation-time + cache is earlier than now => time to decache)
        //DEBUG
        if (_DEBUG)
            _logger.debug("DEB: preQA6zz: Cache expired: " + cachedDoc.getString(DocumentPojo.url_) + ": "
                    + new Date(cacheThreshold) + " vs " + now);
        return true;
    } else
        return false;
}
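The expiry test above reduces to one comparison: the document's created date (read with getDate, defaulting to "now" if the field is absent) plus cacheTime_days worth of milliseconds must still be in the future. A standalone illustration of the arithmetic, with made-up values:

// Illustrative values only: 7-day cache, document created 2014-01-01T00:00:00Z
long createdMs = 1388534400000L;
long cacheDays = 7L;
long cacheThresholdMs = createdMs + cacheDays * 3600L * 24L * 1000L; // + 604,800,000 ms
boolean expired = cacheThresholdMs < System.currentTimeMillis();     // true once the week has elapsed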
From source file:com.ikanow.infinit.e.api.knowledge.output.KmlOutput.java
License:Open Source License
/**
 * Public function used to return ResponsePojo object as KML representation
 * @param rp
 * @return
 */
//TODO (INF-1298): Complete this code (see InfiniteMapWidget for examples, though this may want to be different, ie handle documents and aggregations?)
@SuppressWarnings("unused")
public String getDocs(ResponsePojo rp) {
    // Setup a list of feeds
    @SuppressWarnings("unchecked")
    List<BasicDBObject> docs = (List<BasicDBObject>) rp.getData();

    // Setup the Kml object used to generate the kml document
    Kml kml = new Kml();

    // Create the document
    Document document = kml.createAndSetDocument().withName("Infinit.e KML Interface")
            .withDescription("Infinit.e search KML representation");

    // Create the folder to contain the placemarks (allows us to have multiple folders)
    Folder placemarksFolder = document.createAndAddFolder().withName("Documents")
            .withDescription("Placemarks for the document locations in the query");

    // loop through the result set
    for (BasicDBObject fdbo : docs) {
        // start out by checking to see if the title is not null
        if (fdbo.getString("title") != null) {
            // add logic to check for entities or event
            // Add in loop to create all the placemark points
            String description = "";
            if (fdbo.getString("description") != null)
                description = fdbo.getString("description");
            Point placemark = placemarksFolder.createAndAddPlacemark().withName(fdbo.getString("title"))
                    .withOpen(Boolean.TRUE).withDescription(description).createAndSetPoint()
                    .addToCoordinates(-0.126236, 51.500152);
        }
    }

    // Create a string writer to contain the kml string
    StringWriter writer = new StringWriter();
    // marshal the string writer to get a string out to the kml object
    kml.marshal(writer);
    // return the kml to the client
    return writer.toString();
}
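Note the TODO: every placemark is currently pinned to one hard-coded point (roughly central London). If the documents carry entity geotags in the shape used elsewhere on this page ("entities" -> "geotag" -> "lat"/"lon"), the first available geotag could be used instead. The helper below is a hypothetical sketch; the field names are assumptions, not something this source confirms.

// Hypothetical helper: field names "entities", "geotag", "lat", "lon" are assumed, not confirmed by this source
private double[] firstGeotagOrDefault(BasicDBObject fdbo) {
    BasicDBList ents = (BasicDBList) fdbo.get("entities");
    if (ents != null) {
        for (Object o : ents) {
            BasicDBObject geo = (BasicDBObject) ((BasicDBObject) o).get("geotag");
            if (geo != null) {
                // KML coordinates are longitude first, then latitude
                return new double[] { geo.getDouble("lon"), geo.getDouble("lat") };
            }
        }
    }
    return new double[] { -0.126236, 51.500152 }; // fall back to the original hard-coded point
}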
From source file:com.ikanow.infinit.e.api.knowledge.output.RssOutput.java
License:Open Source License
public String getDocs(ResponsePojo rp) {
    // Create the feed using Rome
    SyndFeed feed = new SyndFeedImpl(); // create the feed
    String feedType = "rss_2.0";

    // Setup a list of feeds
    @SuppressWarnings("unchecked")
    List<BasicDBObject> docs = (List<BasicDBObject>) rp.getData();

    // Set the title of the feed
    feed.setTitle("Infinit.e Knowledge Discovery RSS Feed");
    feed.setDescription("Infinit.e Search Results RSS Feed");
    feed.setLanguage("en-us");
    feed.setPublishedDate(new Date(System.currentTimeMillis()));
    feed.setFeedType(feedType); // set the type of your feed
    feed.setLink("http://www.ikanow.com");

    // Establish the list to contain the feeds
    List<SyndEntry> entries = new ArrayList<SyndEntry>();

    // loop through the result set
    for (BasicDBObject fdbo : docs) {
        SyndEntry entry = new SyndEntryImpl(); // create a feed entry
        if (fdbo.getString("title") != null) {
            entry.setTitle(fdbo.getString("title"));
            Date pubDate = (Date) fdbo.get("publishedDate");
            if (pubDate != null)
                entry.setPublishedDate(pubDate);
            if (fdbo.getString("url") != null)
                entry.setLink(fdbo.getString("url"));
            if (fdbo.getString("description") != null) {
                // Create the content for the entry
                SyndContent content = new SyndContentImpl(); // create the content of your entry
                content.setType("text/plain");
                content.setValue(fdbo.getString("description"));
                entry.setDescription(content);
            }
            entries.add(entry);
        }
    }
    feed.setEntries(entries); // you can add multiple entries in your feed

    SyndFeedOutput output = new SyndFeedOutput();
    String rss = null;
    try {
        rss = output.outputString(feed);
    } catch (FeedException e) {
        e.printStackTrace();
        logger.error("Line: [" + e.getStackTrace()[2].getLineNumber() + "] " + e.getMessage());
    }
    return rss;
}
From source file:com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.java
License:Open Source License
@SuppressWarnings("unchecked") private void stage1_initialCountingLoop(DBCursor docs, AdvancedQueryPojo.QueryScorePojo scoreParams, int toReturn, StatisticsPojo scores, LinkedList<BasicDBObject> standaloneEventsReturn, int nCommunities) { double s0_nQuerySubsetDocCountInv = 1.0 / (double) _s0_nQuerySubsetDocCount; // Some memory management: DBCollection dbc = MongoDbManager.getDocument().getMetadata(); DBDecoderFactory defaultDecoder = dbc.getDBDecoderFactory(); try {/*www .j a va2 s . c om*/ SizeReportingBasicBSONDecoder sizeReportingDecoder = new SizeReportingBasicBSONDecoder(); dbc.setDBDecoderFactory(sizeReportingDecoder); long currMemUsage = 0; int ndocs = 0; long lastBatch = 0L; long initialUnusedMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory(); long initialFreeMemory = Runtime.getRuntime().freeMemory(); for (DBObject f0 : docs) { BasicDBObject f = (BasicDBObject) f0; long newMemUsage = sizeReportingDecoder.getSize(); if ((newMemUsage - currMemUsage) > 0) { // check every batch long now = new Date().getTime(); //DEBUG //logger.warn(ndocs + " : " + (now - lastBatch) + " : " + newMemUsage + " VS " + Runtime.getRuntime().maxMemory() + " UNUSED " + (Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory()) + " FREE " + Runtime.getRuntime().freeMemory()); // Check vs total memory: long runtimeMem = Runtime.getRuntime().maxMemory(); // note newMemUsage is the input memory ... gets expanded ~6x by the BSON-ification, allowed at most 1/4rd of memory... // Also if we're taking more than 20s for a batch then limp over the limit and exit... if (((newMemUsage * 24) > runtimeMem) || (((now - lastBatch) > 20000L) && (ndocs >= toReturn))) { long finalUnusedMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory(); long finalFreeMemory = Runtime.getRuntime().freeMemory(); logger.error("Query truncated memUsage=" + newMemUsage + ", memory=" + runtimeMem + ", docs=" + ndocs + ", totaldocs=" + scores.found + ", init_free_mem=" + initialFreeMemory + ", end_free_mem=" + finalFreeMemory + ", init_unused_mem=" + initialUnusedMemory + ", end_unused_mem=" + finalUnusedMemory); break; } //TESTED currMemUsage = newMemUsage; lastBatch = now; } //TESTED ndocs++; // Simple handling for standalone events if ((null != _s0_standaloneEventAggregator) && !_s0_bNeedToCalcSig) { //if _s0_bNeedToCalcSig then do this elsewhere ScoringUtils_Associations.addStandaloneEvents(f, 0.0, 0, _s0_standaloneEventAggregator, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts); } //TESTED if (!_s0_bNeedToCalcSig) { continue; } //TESTED if (nCommunities > 1) { // (could have pan-community entities) ObjectId communityId = (ObjectId) f.get(DocumentPojo.communityId_); if (null != communityId) { // (have big problems if so, but anyway!) 
int retval = _s0_multiCommunityHandler.community_getIdAndInitialize(communityId, _s1_entitiesInDataset); // (returns an int community id but also sets it into the cache, so just use that below) if (Integer.MIN_VALUE == retval) { //this document cannot be viewed from within this set of communities continue; } } } //TESTED TempDocBucket docBucket = new TempDocBucket(); docBucket.dbo = f; ObjectId id = (ObjectId) f.get(DocumentPojo._id_); // If we're going to weight relevance in, or we need the geo temporal decay: if ((0 != scoreParams.relWeight) || (null != scoreParams.timeProx) || (null != scoreParams.geoProx)) { StatisticsPojo.Score scoreObj = scores.getScore().get(id); if (null != scoreObj) { docBucket.explain = scoreObj.explain; // (will normally be null) docBucket.luceneScore = scoreObj.score; if ((null != scoreParams.timeProx) || (null != scoreParams.geoProx)) { if (scoreObj.decay >= 0.0) { docBucket.geoTemporalDecay = scoreObj.decay; } // (see also below for low accuracy geo scoring) } } else { docBucket.luceneScore = 1.0; } } //TESTED else if (this._s0_sortingByDate) { StatisticsPojo.Score scoreObj = scores.getScore().get(id); if (null != scoreObj) { docBucket.nLuceneIndex = scoreObj.nIndex; } } docBucket.manualWeighting = this.getManualScoreWeights(scoreParams, f); BasicDBList l = (BasicDBList) (f.get(DocumentPojo.entities_)); if (null != l) { long nEntsInDoc = l.size(); double dBestGeoScore = 0.0; // (for low accuracy geo only) for (Iterator<?> e0 = l.iterator(); e0.hasNext();) { BasicDBObject e = (BasicDBObject) e0.next(); BasicDBObject tmpGeotag = null; if (_s3_bLowAccuracyGeo || (null != _s1_dManualGeoDecay_latLonInvdecay)) { // low accuracy geo, need to look for geotag tmpGeotag = (BasicDBObject) e.get(EntityPojo.geotag_); } // Get attributes double freq = -1.0; long ntotaldoccount = -1; String entity_index; Double sentiment = null; try { sentiment = (Double) e.get(EntityPojo.sentiment_); ntotaldoccount = e.getLong(EntityPojo.doccount_); freq = e.getDouble(EntityPojo.frequency_); entity_index = e.getString(EntityPojo.index_); if (null == entity_index) { // Just bypass the entity e.put(EntityPojo.significance_, 0.0); nEntsInDoc--; continue; } } catch (Exception ex) { try { String sfreq; if (ntotaldoccount < 0) { sfreq = e.getString(EntityPojo.doccount_); ntotaldoccount = Long.valueOf(sfreq); } if (freq < -0.5) { sfreq = e.getString(EntityPojo.frequency_); freq = Long.valueOf(sfreq).doubleValue(); } entity_index = e.getString(EntityPojo.index_); if (null == entity_index) { // Just bypass the entity e.put(EntityPojo.significance_, 0.0); nEntsInDoc--; continue; } } catch (Exception e2) { // Just bypass the entity e.put(EntityPojo.significance_, 0.0); nEntsInDoc--; continue; } } //TESTED // First loop through is just counting // Retrieve entity (create/initialzie if necessary) EntSigHolder shp = _s1_entitiesInDataset.get(entity_index); if (null == shp) { if (ntotaldoccount > (long) _s0_globalDocCount) { // obviously can't have more entities-in-dos than docs... 
ntotaldoccount = (long) _s0_globalDocCount; } shp = new EntSigHolder(entity_index, ntotaldoccount, _s0_multiCommunityHandler); // Stage 1a alias handling: set up infrastructure, calculate doc overlap if (null != _s1_aliasLookup) { stage1_initAlias(shp); } if ((null != shp.aliasInfo) && (null == shp.masterAliasSH)) { // this is the discard alias nEntsInDoc--; continue; } //TESTED // Check if entity is in type filter list if (null != _s0_entityTypeFilter) { String entType = null; if (null != shp.aliasInfo) { entType = shp.aliasInfo.getType(); } else { entType = e.getString(EntityPojo.type_); } if (_s0_bEntityTypeFilterPositive) { if ((null != entType) && !_s0_entityTypeFilter.contains(entType.toLowerCase())) { nEntsInDoc--; continue; } } else if ((null != entType) && _s0_entityTypeFilter.contains(entType.toLowerCase())) { //(negative filter) nEntsInDoc--; continue; } } //TESTED (end entity filter) // Geo: if (null != shp.aliasInfo) { if (null != shp.aliasInfo.getGeotag()) { //Geo, overwrite/create tmpGeotag if (_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo || (null != _s1_dManualGeoDecay_latLonInvdecay)) { // Always capture alias geo, even if not in low accuracy mode because we add it to the // legitimate geo: if ((_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo) && (null == _s3_geoBuckets)) { // Initialize the buckets if this is for aggregation not just decay _s3_geoBuckets = (LinkedList<EntSigHolder>[]) new LinkedList[_s3_nGEO_BUCKETS]; } if (null == tmpGeotag) { tmpGeotag = new BasicDBObject(); } tmpGeotag.put(GeoPojo.lat_, shp.aliasInfo.getGeotag().lat); tmpGeotag.put(GeoPojo.lon_, shp.aliasInfo.getGeotag().lon); if (null != shp.aliasInfo.getOntology_type()) { e.put(EntityPojo.ontology_type_, shp.aliasInfo.getOntology_type()); } } } } //TESTED (end geo for aggregation or decay) _s1_entitiesInDataset.put(entity_index, shp); // end Stage 1a alias handling } //(end if is alias) // Stage 1b alias handling: calculate document counts (taking overlaps into account) if (null != shp.masterAliasSH) { // Counts: shp.masterAliasSH.nTotalDocCount++; // docs including overlaps shp.masterAliasSH.avgFreqOverQuerySubset += freq; // Keep track of overlaps: if (f != shp.masterAliasSH.unusedDbo) { shp.masterAliasSH.unusedDbo = f; // (note this is only used in stage 1, alias.unusedDbo is re-used differently in stage 3/4) shp.masterAliasSH.nDocCountInQuerySubset++; // non-overlapping docs ie < shp.nDocCountInQuerySubset } // Sentiment: shp.masterAliasSH.positiveSentiment += shp.positiveSentiment; shp.masterAliasSH.negativeSentiment += shp.negativeSentiment; if (null != sentiment) { shp.masterAliasSH.nTotalSentimentValues++; } } //TESTED (end if is alias) // end Stage 1b // Pan-community logic (this needs to be before the entity object is updated) if (_s0_multiCommunityHandler.isActive()) { _s0_multiCommunityHandler.community_updateCorrelations(shp, ntotaldoccount, entity_index); } else { // (Once we've started multi-community logic, this is no longer desirable) if ((ntotaldoccount > shp.nTotalDocCount) && (ntotaldoccount <= _s0_globalDocCount)) { shp.nTotalDocCount = ntotaldoccount; } //(note there used to be some cases where we adjusted for dc/tf==0, but the // underlying issue in the data model that caused this has been fixed, so it's // now a pathological case that can be ignored) } //(TESTED) // Update counts: _s1_sumFreqInQuerySubset += freq; shp.avgFreqOverQuerySubset += freq; shp.nDocCountInQuerySubset++; shp.decayedDocCountInQuerySubset += docBucket.geoTemporalDecay; // (note this doesn't handle low accuracy 
geo-decay ... we'll address that via a separate term) TempEntityInDocBucket entBucket = new TempEntityInDocBucket(); entBucket.dbo = e; entBucket.freq = freq; entBucket.doc = docBucket; shp.entityInstances.add(entBucket); if (null != tmpGeotag) { // (only needed for low accuracy geo aggregation) if ((_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo) && (null == shp.geotag)) { // (first time for shp only) shp.geotag = tmpGeotag; shp.geotaggedEntity = e; // (ie for onto type, which has been overwritten in the alias case...) } if (null != _s1_dManualGeoDecay_latLonInvdecay) { // Emulate scripted Lucene calculations double minlat = tmpGeotag.getDouble(GeoPojo.lat_); double minlon = tmpGeotag.getDouble(GeoPojo.lon_); double paramlat = _s1_dManualGeoDecay_latLonInvdecay[0]; double paramlon = _s1_dManualGeoDecay_latLonInvdecay[1]; double gdecay = _s1_dManualGeoDecay_latLonInvdecay[2]; char ontCode = GeoOntologyMapping .encodeOntologyCode(e.getString(EntityPojo.ontology_type_)); double dDecay = QueryDecayScript.getGeoDecay(minlat, minlon, paramlat, paramlon, gdecay, ontCode); if (dDecay > dBestGeoScore) { dBestGeoScore = dDecay; } } //TESTED } //(end if entity has geo and need to process entity geo) if (freq > shp.maxFreq) { shp.maxFreq = freq; } // Sentiment: if ((null != sentiment) && (Math.abs(sentiment) <= 1.1)) { // (actually 1.0) shp.nTotalSentimentValues++; if (sentiment > 0.0) { shp.positiveSentiment += sentiment; } else { shp.negativeSentiment += sentiment; } } else if (null != sentiment) { // corrupt sentiment for some reason?! e.put(EntityPojo.sentiment_, null); } docBucket.docLength += freq; } //(end loop over entities) docBucket.nLeftToProcess = nEntsInDoc; docBucket.nEntsInDoc = (int) nEntsInDoc; if (null != this._s1_dManualGeoDecay_latLonInvdecay) { // Low accuracy geo-calculations docBucket.geoTemporalDecay *= dBestGeoScore; docBucket.luceneScore *= dBestGeoScore; _s2_dAvgLowAccuracyGeoDecay += dBestGeoScore * s0_nQuerySubsetDocCountInv; } //TESTED } // (end if feed has entities) // Handle documents with no entities - can still promote them if (0 == docBucket.nLeftToProcess) { // (use this rather than doc length in case all the entities had freq 0) _s1_noEntityBuckets.add(docBucket); } } // (end loop over feeds) //TESTED } finally { dbc.setDBDecoderFactory(defaultDecoder); } }
From source file:com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.java
License:Open Source License
private void stage4_prepareDocsForOutput(AdvancedQueryPojo.QueryScorePojo scoreParams, StatisticsPojo scores, long nToClientLimit, LinkedList<BasicDBObject> returnList) { // Get the documents long nDocs = 0; double dBestScore = 0.0; double dAvgScore = 0.0; double dSigFactor = 100.0 / (_s3_dSigScalingFactor * _s2_dApproxAverageDocumentSig); double dRelFactor = 100.0 / (_s3_dLuceneScalingFactor * _s0_avgLuceneScore); // Start at the bottom of the list, so don't need to worry about skipping documents, just count out from the bottom // The call to stage3_calculateTFTerms with nStart+nToClientLimit handles the rest Iterator<TempDocBucket> pqIt = _s3_pqDocs.iterator(); while (pqIt.hasNext() && (nDocs < nToClientLimit)) { TempDocBucket qsf = pqIt.next(); nDocs++;/*from w w w . j a v a2 s . c o m*/ if (!_s0_sortingByDate) { dBestScore = qsf.totalScore; } dAvgScore += dBestScore; BasicDBObject f = qsf.dbo; // Phase "0" - these are the highest prio events boolean bNeedToFilterAndAliasAssoc_event = true; boolean bNeedToFilterAndAliasAssoc_fact = true; boolean bNeedToFilterAndAliasAssoc_summary = true; if (null != _s0_standaloneEventAggregator) { ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0, _s0_standaloneEventAggregator, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts); bNeedToFilterAndAliasAssoc_event = false; bNeedToFilterAndAliasAssoc_fact = false; bNeedToFilterAndAliasAssoc_summary = false; } //TESTED if (null != _s0_lowAccuracyAssociationAggregator_events) { ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0, _s0_lowAccuracyAssociationAggregator_events, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, true, false, false); bNeedToFilterAndAliasAssoc_event = false; } //TESTED if (null != _s0_lowAccuracyAssociationAggregator_facts) { ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0, _s0_lowAccuracyAssociationAggregator_facts, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, false, false, true); bNeedToFilterAndAliasAssoc_fact = false; } //TESTED try { DocumentPojoApiMap.mapToApi(f); // Handle deduplication/multi-community code: if (null != qsf.dupList) { try { ScoringUtils_MultiCommunity.community_combineDuplicateDocs(qsf); } catch (Exception e) { // Do nothing, just carry on with minimal damage! } } // Scoring: double d = qsf.aggSignificance * dSigFactor; if (Double.isNaN(d)) { f.put(DocumentPojo.aggregateSignif_, 0.0); } else { f.put(DocumentPojo.aggregateSignif_, d); } d = qsf.luceneScore * dRelFactor; if (Double.isNaN(d)) { f.put(DocumentPojo.queryRelevance_, 0.0); } else { f.put(DocumentPojo.queryRelevance_, d); } if (!_s0_sortingByDate) { f.put(DocumentPojo.score_, qsf.totalScore); } BasicDBList l = (BasicDBList) (f.get(DocumentPojo.entities_)); // Handle update ids vs normal ids: ObjectId updateId = (ObjectId) f.get(DocumentPojo.updateId_); if (null != updateId) { // swap the 2... 
f.put(DocumentPojo.updateId_, f.get(DocumentPojo._id_)); f.put(DocumentPojo._id_, updateId); } // Check if entities enabled if ((null != l) && (!_s0_bGeoEnts && !_s0_bNonGeoEnts)) { f.removeField(DocumentPojo.entities_); l = null; } //TESTED // Check if events etc enabled if ((!_s0_bEvents && !_s0_bFacts && !_s0_bSummaries)) { f.removeField(DocumentPojo.associations_); } //TESTED else if (!_s0_bEvents || !_s0_bFacts || !_s0_bSummaries || (null != _s0_assocVerbFilter)) { // Keep only specified event_types BasicDBList lev = (BasicDBList) (f.get(DocumentPojo.associations_)); if (null != lev) { for (Iterator<?> e0 = lev.iterator(); e0.hasNext();) { BasicDBObject e = (BasicDBObject) e0.next(); // Type filter boolean bNeedToFilterAndAliasAssoc = true; String sEvType = e.getString(AssociationPojo.assoc_type_); boolean bKeep = true; if (null == sEvType) { bKeep = false; } else if (sEvType.equalsIgnoreCase("event")) { if (!_s0_bEvents) bKeep = false; bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_event; } else if (sEvType.equalsIgnoreCase("fact")) { if (!_s0_bFacts) bKeep = false; bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_fact; } else if (sEvType.equalsIgnoreCase("summary")) { if (!_s0_bSummaries) bKeep = false; bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_summary; } if (!bKeep) { e0.remove(); } else { // Type matches, now for some more complex logic.... if (bNeedToFilterAndAliasAssoc) { // (otherwise done already) bKeep = ScoringUtils_Associations.filterAndAliasAssociation(e, _s1_aliasLookup, true, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter); if (!bKeep) { e0.remove(); } } //TESTED } //(end output filter logic) } // (end loop over events) } // (end if this doc has events) } //TESTED // Check if metadata is enabled if (!_s0_bMetadata) { f.removeField(DocumentPojo.metadata_); } //TESTED if (null != l) { for (Iterator<?> e0 = l.iterator(); e0.hasNext();) { BasicDBObject e = (BasicDBObject) e0.next(); if (!_s0_bNonGeoEnts) { // then must only be getting geo (else wouldn't be in this loop) if (null == e.get(EntityPojo.geotag_)) { e0.remove(); continue; } } String entity_index = e.getString(EntityPojo.index_); if (null == entity_index) continue; EntSigHolder shp = (EntSigHolder) _s1_entitiesInDataset.get(entity_index); if (null != shp) { // Stage 4x: alias processing, just overwrite // (note don't delete "duplicate entities", hard-to-be-globally-consistent // and will potentially throw data away which might be undesirable) if (null != shp.masterAliasSH) { shp = shp.masterAliasSH; // (already has all the aggregated values used below) if (!entity_index.equals(shp.aliasInfo.getIndex())) { e.put(EntityPojo.index_, shp.aliasInfo.getIndex()); e.put(EntityPojo.disambiguated_name_, shp.aliasInfo.getDisambiguatedName()); e.put(EntityPojo.type_, shp.aliasInfo.getType()); e.put(EntityPojo.dimension_, shp.aliasInfo.getDimension()); if (null != shp.aliasInfo.getGeotag()) { BasicDBObject aliasedGeoTag = new BasicDBObject(); aliasedGeoTag.put(GeoPojo.lat_, shp.aliasInfo.getGeotag().lat); aliasedGeoTag.put(GeoPojo.lon_, shp.aliasInfo.getGeotag().lon); e.put(EntityPojo.geotag_, aliasedGeoTag); if (null != shp.aliasInfo.getOntology_type()) { e.put(EntityPojo.ontology_type_, shp.aliasInfo.getOntology_type()); } } //TESTED } } //TESTED // end Stage 4x of alias processing double dataSig = shp.datasetSignificance; if (Double.isNaN(dataSig)) { e.put(EntityPojo.datasetSignificance_, 0.0); } else { 
e.put(EntityPojo.datasetSignificance_, dataSig); } e.put(EntityPojo.queryCoverage_, shp.queryCoverage); e.put(EntityPojo.averageFreq_, shp.avgFreqOverQuerySubset); if (shp.nTotalSentimentValues > 0) { e.put(EntityPojo.positiveSentiment_, shp.positiveSentiment); e.put(EntityPojo.negativeSentiment_, shp.negativeSentiment); e.put(EntityPojo.sentimentCount_, shp.nTotalSentimentValues); } } else { // (most likely to occur if the entity is discarded (alias/filter) or is corrupt in some way) e0.remove(); continue; } } //(end loop over entities) } // (end if feed has entities) //TESTED // Explain if enabled if (null != qsf.explain) { f.put(DocumentPojo.explain_, qsf.explain); } // Add to the end of the list (so will come back from API call in natural order, highest first) returnList.addFirst(f); // (add elements to the front of the list so that the top of the list is ordered by priority) } catch (Exception e) { // Probably a JSON error, just carry on String title = f.getString(DocumentPojo.title_); logger.error(title + ": " + e.getMessage()); } } // (end loop over feeds) //TESTED // Update the scores: scores.maxScore = (float) dBestScore; if (nDocs > 0) { scores.avgScore = (float) dAvgScore / nDocs; } }