List of usage examples for com.mongodb BasicDBObject getString
public String getString(final String key)
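getString returns the value stored under the given key as a String (typically via toString for non-String values), or null if the key is absent. Before the real-world examples below, here is a minimal, self-contained sketch of that behaviour; the field names are illustrative only, and the two-argument default form comes from the BasicBSONObject superclass in the driver versions these examples target.

import com.mongodb.BasicDBObject;

public class GetStringDemo {
    public static void main(String[] args) {
        // Build a document with a couple of illustrative fields
        BasicDBObject doc = new BasicDBObject("title", "Example document").append("views", 42);

        // Present key: returned as a String
        String title = doc.getString("title");          // "Example document"

        // Missing key: the one-argument form returns null, so guard before use
        String author = doc.getString("author");        // null
        if (author == null) {
            author = "unknown";
        }

        // Two-argument form (inherited from BasicBSONObject) supplies a default instead of null
        String source = doc.getString("source", "n/a"); // "n/a"

        System.out.println(title + " / " + author + " / " + source);
    }
}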
From source file:com.ijuru.ijambo.dao.WordDAO.java
License:Open Source License
/**
 * Gets a random word
 * @param difficulty the difficulty (may be null)
 * @return the word
 */
public Word getRandomWord(Difficulty difficulty) {
    DBCollection words = db.getCollection("words");
    BasicDBObject obj;
    if (difficulty != null) {
        // Get count of words of this difficulty
        BasicDBObject query = new BasicDBObject();
        query.put("difficulty", difficulty.ordinal());
        int count = words.find(query).count();
        // Pick random one
        int randOffset = (int) (Math.random() * count);
        obj = (BasicDBObject) words.find(query).limit(-1).skip(randOffset).next();
    } else {
        int randOffset = (int) (Math.random() * words.find().count());
        obj = (BasicDBObject) words.find().limit(-1).skip(randOffset).next();
    }
    return new Word(obj.getString("word"), obj.getString("meaning"),
            Difficulty.fromInt(obj.getInt("difficulty")));
}
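The example above selects a random document by counting and then skipping, which takes two round trips and can mis-count if the collection changes in between. On MongoDB 3.2+ the pick can instead be delegated to the server with a $sample aggregation stage. The sketch below is an untested alternative, assuming the same legacy DBCollection API, collection name, field names, and Word/Difficulty types as the original (plus the usual java.util and com.mongodb imports).

// Sketch only: assumes MongoDB 3.2+ ($sample) and the legacy driver's DBCollection.aggregate(List) API
public Word getRandomWordViaSample(Difficulty difficulty) {
    DBCollection words = db.getCollection("words");
    List<DBObject> pipeline = new ArrayList<DBObject>();
    if (difficulty != null) {
        // Restrict to the requested difficulty before sampling
        pipeline.add(new BasicDBObject("$match", new BasicDBObject("difficulty", difficulty.ordinal())));
    }
    pipeline.add(new BasicDBObject("$sample", new BasicDBObject("size", 1)));
    AggregationOutput out = words.aggregate(pipeline);
    for (DBObject result : out.results()) {
        BasicDBObject obj = (BasicDBObject) result;
        return new Word(obj.getString("word"), obj.getString("meaning"),
                Difficulty.fromInt(obj.getInt("difficulty")));
    }
    return null; // empty collection (or no words at this difficulty)
}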
From source file:com.ikanow.infinit.e.api.config.source.SourceHandler.java
License:Open Source License
/** * testSource//from w w w . jav a 2s. c o m * @param sourceJson * @param nNumDocsToReturn * @param bReturnFullText * @param userIdStr * @return */ public ResponsePojo testSource(String sourceJson, int nNumDocsToReturn, boolean bReturnFullText, boolean bRealDedup, String userIdStr) { ResponsePojo rp = new ResponsePojo(); try { SourcePojo source = null; SourcePojoSubstitutionApiMap apiMap = new SourcePojoSubstitutionApiMap(new ObjectId(userIdStr)); try { source = ApiManager.mapFromApi(sourceJson, SourcePojo.class, apiMap); source.fillInSourcePipelineFields(); } catch (Exception e) { rp.setResponse(new ResponseObject("Test Source", false, "Error deserializing source (JSON is valid but does not match schema): " + e.getMessage())); return rp; } if (null == source.getKey()) { source.setKey(source.generateSourceKey()); // (a dummy value, not guaranteed to be unique) } if ((null == source.getExtractType()) || !source.getExtractType().equals("Federated")) { String testUrl = source.getRepresentativeUrl(); if (null == testUrl) { rp.setResponse( new ResponseObject("Test Source", false, "Error, source contains no URL to harvest")); return rp; } } if (null == source.getTags()) { source.setTags(new HashSet<String>()); } // This is the only field that you don't normally need to specify in save but will cause // problems if it's not populated in test. ObjectId userId = new ObjectId(userIdStr); // Set owner (overwrite, for security reasons) source.setOwnerId(userId); if (null == source.getCommunityIds()) { source.setCommunityIds(new TreeSet<ObjectId>()); } if (!source.getCommunityIds().isEmpty()) { // need to check that I'm allowed the specified community... if ((1 == source.getCommunityIds().size()) && (userId.equals(source.getCommunityIds().iterator().next()))) { // we're OK only community id is user community } //TESTED else { HashSet<ObjectId> communities = SocialUtils.getUserCommunities(userIdStr); Iterator<ObjectId> it = source.getCommunityIds().iterator(); while (it.hasNext()) { ObjectId src = it.next(); if (!communities.contains(src)) { rp.setResponse(new ResponseObject("Test Source", false, "Authentication error: you don't belong to this community: " + src)); return rp; } //TESTED } } //TESTED } // Always add the userId to the source community Id (so harvesters can tell if they're running in test mode or not...) 
source.addToCommunityIds(userId); // (ie user's personal community, always has same _id - not that it matters) // Check the source's admin status source.setOwnedByAdmin(RESTTools.adminLookup(userId.toString(), false)); if (bRealDedup) { // Want to test update code, so ignore update cycle if (null != source.getRssConfig()) { source.getRssConfig().setUpdateCycle_secs(1); // always update } } HarvestController harvester = new HarvestController(true); if (nNumDocsToReturn > 100) { // (seems reasonable) nNumDocsToReturn = 100; } harvester.setStandaloneMode(nNumDocsToReturn, bRealDedup); List<DocumentPojo> toAdd = new LinkedList<DocumentPojo>(); List<DocumentPojo> toUpdate = new LinkedList<DocumentPojo>(); List<DocumentPojo> toRemove = new LinkedList<DocumentPojo>(); if (null == source.getHarvestStatus()) { source.setHarvestStatus(new SourceHarvestStatusPojo()); } String oldMessage = source.getHarvestStatus().getHarvest_message(); // SPECIAL CASE: FOR FEDERATED QUERIES if ((null != source.getExtractType()) && source.getExtractType().equals("Federated")) { int federatedQueryEnts = 0; SourceFederatedQueryConfigPojo endpoint = null; try { endpoint = source.getProcessingPipeline().get(0).federatedQuery; } catch (Exception e) { } if (null == endpoint) { rp.setResponse( new ResponseObject("Test Source", false, "source error: no federated query specified")); return rp; } AdvancedQueryPojo testQuery = null; String errMessage = "no query specified"; try { testQuery = AdvancedQueryPojo.fromApi(endpoint.testQueryJson, AdvancedQueryPojo.class); } catch (Exception e) { errMessage = e.getMessage(); } if (null == testQuery) { rp.setResponse(new ResponseObject("Test Source", false, "source error: need to specifiy a valid IKANOW query to test federated queries, error: " + errMessage)); return rp; } // OK if we're here then we can test the query SimpleFederatedQueryEngine testFederatedQuery = new SimpleFederatedQueryEngine(); endpoint.parentSource = source; testFederatedQuery.addEndpoint(endpoint); ObjectId queryId = new ObjectId(); String[] communityIdStrs = new String[source.getCommunityIds().size()]; int i = 0; for (ObjectId commId : source.getCommunityIds()) { communityIdStrs[i] = commId.toString(); i++; } testFederatedQuery.setTestMode(true); testFederatedQuery.preQueryActivities(queryId, testQuery, communityIdStrs); StatisticsPojo stats = new StatisticsPojo(); stats.setSavedScores(0, 0); rp.setStats(stats); ArrayList<BasicDBObject> toAddTemp = new ArrayList<BasicDBObject>(1); testFederatedQuery.postQueryActivities(queryId, toAddTemp, rp); for (BasicDBObject docObj : toAddTemp) { DocumentPojo doc = DocumentPojo.fromDb(docObj, DocumentPojo.class); if (bReturnFullText) { doc.setFullText(docObj.getString(DocumentPojo.fullText_)); doc.makeFullTextNonTransient(); } if (null != doc.getEntities()) { federatedQueryEnts += doc.getEntities().size(); } //Metadata workaround: @SuppressWarnings("unchecked") LinkedHashMap<String, Object[]> meta = (LinkedHashMap<String, Object[]>) docObj .get(DocumentPojo.metadata_); if (null != meta) { Object metaJson = meta.get("json"); if (metaJson instanceof Object[]) { // (in this case ... 
non-cached, need to recopy in, I forget why) doc.addToMetadata("json", (Object[]) metaJson); } } toAdd.add(doc); } // (currently can't run harvest source federated query) if (0 == federatedQueryEnts) { // (more fed query exceptions) source.getHarvestStatus().setHarvest_message( "Warning: no entities extracted, probably docConversionMap is wrong?"); } else { source.getHarvestStatus().setHarvest_message(federatedQueryEnts + " entities extracted"); } } //TESTED (END FEDERATED QUERY TEST MODE, WHICH IS A BIT DIFFERENT) else { harvester.harvestSource(source, toAdd, toUpdate, toRemove); } // (don't parrot the old message back - v confusing) if (oldMessage == source.getHarvestStatus().getHarvest_message()) { // (ptr ==) source.getHarvestStatus() .setHarvest_message("(no documents extracted - likely a source or configuration error)"); } //TESTED String message = null; if ((null != source.getHarvestStatus()) && (null != source.getHarvestStatus().getHarvest_message())) { message = source.getHarvestStatus().getHarvest_message(); } else { message = ""; } List<String> errMessagesFromSourceDeser = apiMap.getErrorMessages(); if (null != errMessagesFromSourceDeser) { StringBuffer sbApiMapErr = new StringBuffer("Substitution errors:\n"); for (String err : errMessagesFromSourceDeser) { sbApiMapErr.append(err).append("\n"); } message = message + "\n" + sbApiMapErr.toString(); } //TESTED (by hand) if ((null != source.getHarvestStatus()) && (HarvestEnum.error == source.getHarvestStatus().getHarvest_status())) { rp.setResponse(new ResponseObject("Test Source", false, "source error: " + message)); rp.setData(toAdd, new DocumentPojoApiMap()); } else { if ((null == message) || message.isEmpty()) { message = "no messages from harvester"; } rp.setResponse(new ResponseObject("Test Source", true, "successfully returned " + toAdd.size() + " docs: " + message)); try { // If grabbing full text // Also some logstash/custom specific logic - these aren't docs so just output the entire record boolean isLogstash = (null != source.getExtractType()) && source.getExtractType().equalsIgnoreCase("logstash"); boolean isCustom = (null != source.getExtractType()) && source.getExtractType().equalsIgnoreCase("custom"); List<BasicDBObject> records = null; if (bReturnFullText || isLogstash || isCustom) { for (DocumentPojo doc : toAdd) { if (isLogstash || isCustom) { if (null == records) { records = new ArrayList<BasicDBObject>(toAdd.size()); } BasicDBObject dbo = (BasicDBObject) doc.getMetadata().get("record")[0]; Object test = dbo.get("_id"); if ((null != test) && (test instanceof ObjectId)) { dbo.remove("_id"); // (unless it's a custom _id added from logstash then remove it) } records.add(dbo); } //TESTED else if (bReturnFullText) { doc.makeFullTextNonTransient(); } } } //TESTED if (null != records) { rp.setData(records, (BasePojoApiMap<BasicDBObject>) null); } //TESTED else { rp.setData(toAdd, new DocumentPojoApiMap()); } //TESTED //Test deserialization: rp.toApi(); } catch (Exception e) { //e.printStackTrace(); StringBuffer sb = new StringBuffer(); Globals.populateStackTrace(sb, e); rp.setData( new BasicDBObject("error_message", "Error deserializing documents: " + sb.toString()), null); } } } catch (Exception e) { // If an exception occurs log the error logger.error("Exception Message: " + e.getMessage(), e); rp.setResponse(new ResponseObject("Test Source", false, "Error testing source: " + e.getMessage())); } catch (Error e) { // If an exception occurs log the error logger.error("Exception Message: " + e.getMessage(), e); 
rp.setResponse(new ResponseObject("Test Source", false, "Configuration/Installation error: " + e.getMessage())); } return rp; }
From source file:com.ikanow.infinit.e.api.knowledge.DocumentHandler.java
License:Open Source License
/** * Get information function that returns the user information in the form of a JSON String. * @param isAdmin //from www . j a v a2s. c o m * * @param key the key definition of the user ( example email@email.com ) * @return a JSON string representation of the person information on success */ public ResponsePojo getInfo(String userIdStr, String sourceKey, String idStrOrUrl, boolean bReturnFullText, boolean returnRawData, boolean isAdmin) { ResponsePojo rp = new ResponsePojo(); try { // Set up the query BasicDBObject query = new BasicDBObject(); ObjectId id = null; if (null == sourceKey) { id = new ObjectId(idStrOrUrl); query.put(DocumentPojo._id_, id); } else { query.put(DocumentPojo.sourceKey_, sourceKey); query.put(DocumentPojo.url_, idStrOrUrl); } if (!isAdmin) query.put(DocumentPojo.communityId_, new BasicDBObject(MongoDbManager.in_, SocialUtils.getUserCommunities(userIdStr))); // (use DBObject here because DocumentPojo is pretty big and this call could conceivably have perf implications) BasicDBObject fieldsQ = new BasicDBObject(); if (!bReturnFullText) { fieldsQ.put(DocumentPojo.fullText_, 0); // (XML/JSON have fullText as part of pojo) } BasicDBObject dbo = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(query, fieldsQ); if ((null == dbo) || ((null != dbo.get(DocumentPojo.url_)) && dbo.getString(DocumentPojo.url_).startsWith("?DEL?"))) { if (null != id) { // this might be the update id... query = new BasicDBObject(DocumentPojo.updateId_, id); dbo = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(query, fieldsQ); } } //TESTED (update case, normal case, and intermediate case where both update and original still exist) if (null == dbo) { rp.setResponse(new ResponseObject("Doc Info", true, "Document not found")); return rp; } DocumentPojo dp = DocumentPojo.fromDb(dbo, DocumentPojo.class); if (bReturnFullText) { if (null == dp.getFullText()) { // (Some things like database records might have this stored already) byte[] storageArray = new byte[200000]; DBCollection contentDB = DbManager.getDocument().getContent(); BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, dp.getUrl()); contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, dp.getSourceKey()))); BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1); BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ, fields); if (null != dboContent) { byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_)); ByteArrayInputStream in = new ByteArrayInputStream(compressedData); GZIPInputStream gzip = new GZIPInputStream(in); int nRead = 0; StringBuffer output = new StringBuffer(); while (nRead >= 0) { nRead = gzip.read(storageArray, 0, 200000); if (nRead > 0) { String s = new String(storageArray, 0, nRead, "UTF-8"); output.append(s); } } dp.setFullText(output.toString()); dp.makeFullTextNonTransient(); } } } else if (!returnRawData) { dp.setFullText(null); // (obviously will normally contain full text anyway) } else // if ( returnRawData ) { //check if the harvest type is file, return the file instead //if file is db return the json //get source SourcePojo source = getSourceFromKey(dp.getSourceKey()); if (source.getExtractType().equals("File")) { //get file from harvester String fileURL = dp.getUrl(); if (dp.getSourceUrl() != null) fileURL = dp.getSourceUrl(); byte[] bytes = FileHarvester.getFile(fileURL, source); if (bytes == null) { // Try returning JSON instead String 
json = ApiManager.mapToApi(dp, new DocumentPojoApiMap()); DocumentFileInterface dfp = new DocumentFileInterface(); dfp.bytes = json.getBytes(); dfp.mediaType = "application/json"; rp.setResponse( new ResponseObject("Doc Info", true, "Document bytes returned successfully")); rp.setData(dfp, null); return rp; } else { DocumentFileInterface dfp = new DocumentFileInterface(); dfp.bytes = bytes; dfp.mediaType = getMediaType(fileURL); rp.setResponse( new ResponseObject("Doc Info", true, "Document bytes returned successfully")); rp.setData(dfp, null); return rp; } } else { String json = ApiManager.mapToApi(dp, new DocumentPojoApiMap()); DocumentFileInterface dfp = new DocumentFileInterface(); dfp.bytes = json.getBytes(); dfp.mediaType = "application/json"; rp.setResponse(new ResponseObject("Doc Info", true, "Document bytes returned successfully")); rp.setData(dfp, null); return rp; } } rp.setData(dp, new DocumentPojoApiMap()); rp.setResponse(new ResponseObject("Doc Info", true, "Feed info returned successfully")); } //(end full text vs raw data) catch (Exception e) { // If an exception occurs log the error logger.error("Exception Message: " + e.getMessage(), e); rp.setResponse(new ResponseObject("Doc Info", false, "error returning feed: " + e.getMessage())); } // Return Json String representing the user return rp; }
From source file:com.ikanow.infinit.e.api.knowledge.federated.SimpleFederatedQueryEngine.java
License:Open Source License
@Override public void preQueryActivities(ObjectId queryId, AdvancedQueryPojo query, String[] communityIdStrs) { _scoreStats = null;//from w w w.j a v a 2 s . c om _asyncRequestsPerQuery = null; // 1] Check whether this makes sense to query, get the (sole) entity if so String entityType = null; String entityValue = null; String entityIndex = null; String textToTest = null; if ((null != query.qt) && (query.qt.size() > 0) && (query.qt.size() < 4)) { String logic = query.logic; if (null != logic) { logic = logic.toLowerCase(); } if ((null != logic) && (logic.contains("or") || logic.contains("not"))) { //DEBUG if (_DEBUG) _logger.debug("DEB: preQA1: Logic too complex: " + query.logic); if (_testMode) { throw new RuntimeException("Bad testQueryJson: Logic too complex: " + query.logic); } return; // logic too complex } //TESTED (1.3) for (AdvancedQueryPojo.QueryTermPojo qt : query.qt) { if ((null != qt.entity) || ((null != qt.entityType) && (null != qt.entityValue))) { if (null == entityType) { // we now have == 1 entity if (null != qt.entityValue) { entityValue = qt.entityValue; entityType = qt.entityType; entityIndex = entityValue.toLowerCase() + "/" + entityType.toLowerCase(); } //TESTED (1.5) else { entityIndex = qt.entity.toLowerCase(); int index = qt.entity.lastIndexOf('/'); if (index > 0) { entityValue = qt.entity.substring(0, index); entityType = qt.entity.substring(index + 1).toLowerCase(); } } //TESTED (1.6) } else { // >1 entity, not supported //DEBUG if (_DEBUG) _logger.debug("DEB: preQA2a: >1 entity: " + qt.entity + " / " + entityType + " / " + query.toApi()); if (_testMode) { throw new RuntimeException("Bad testQueryJson: >1 entity: " + qt.entity + " / " + entityType + " / " + query.toApi()); } return; } //TESTED (1.4) } //TESTED else if ((null != qt.etext) && (qt.etext.equals("*"))) { //this is fine provided it's only ANDed together (eg above logic case) } else if (null != qt.etext) { // Only work if it matches one of the regexes if (null == entityType) { textToTest = qt.etext; entityType = "etext"; } else { // >1 entity, not supported //DEBUG if (_DEBUG) _logger.debug("DEB: preQA2b: >1 entity: " + qt.entity + " / " + entityType + " / " + query.toApi()); if (_testMode) { throw new RuntimeException("Bad testQueryJson: >1 entity: " + qt.entity + " / " + entityType + " / " + query.toApi()); } return; } //TESTED (1.4) } else if (null == qt.time) { // temporal //DEBUG if (_DEBUG) _logger.debug("DEB: preQA3: non-entity/date " + query.toApi()); if (_testMode) { throw new RuntimeException("Bad testQueryJson: non-entity/date " + query.toApi()); } return; } //TESTED (1.1) } //(end loop over query terms) } //TESTED (1.*) if (null == entityType) { // Query too complex //DEBUG if (_DEBUG) _logger.debug("DEB: preQA4: query missing entity " + query.toApi()); if (_testMode) { throw new RuntimeException("Bad testQueryJson: query missing entity " + query.toApi()); } return; } //TESTED (1.2) entityType = entityType.toLowerCase(); // 2] If so, query across all the end for (SourceFederatedQueryConfigPojo endpoint : _endpoints) { // Endpoint validation: if (null == endpoint.entityTypes) { if (_testMode) { throw new RuntimeException("No entity types specified"); } else { continue; } } if (null != textToTest) { // This is text, see if you can convert to an entity entityValue = null; //(reset for different endpoints - used in the check to decide whether to continue) for (String entityTypeRegex : endpoint.entityTypes) { if (entityTypeRegex.startsWith("/")) { int regexIndex = 
entityTypeRegex.lastIndexOf('/'); // (guaranteed to be >= 0) try { Pattern regex = Pattern.compile(entityTypeRegex.substring(1, regexIndex)); if (regex.matcher(textToTest).matches()) { entityType = entityTypeRegex.substring(1 + regexIndex); if (entityType.length() > 0) { entityValue = textToTest; entityIndex = entityValue.toLowerCase() + "/" + entityType.toLowerCase(); } } } catch (Exception e) { // if not in test mode, carry on if (_testMode) { throw new RuntimeException(e); } } } } //(end loop over entity regexes) } //TESTED if (null == entityValue) { // None of the regexes matched if (_testMode) { throw new RuntimeException("Text specified, does not match any of the regexes: " + Arrays.toString(endpoint.entityTypes.toArray()) + " ... text = " + textToTest); } continue; } //DEBUG if (_DEBUG) _logger.debug("DEB: preQA5: ENDPOINT: " + Arrays.toString(endpoint.entityTypes.toArray()) + " / " + entityType); if ((null != endpoint.importScript) && !endpoint.importScript.isEmpty()) { if (null == endpoint.scriptlang) { endpoint.scriptlang = "python"; // python ==default } if (endpoint.scriptlang.equalsIgnoreCase("python")) { _pyEngine = new ScriptEngineManager().getEngineByName("python"); if (null == _pyEngine) { _logger.error( "Python not installed - copy jython 2.5+ into /opt/infinite-home/lib/unbundled"); if (_testMode) { throw new RuntimeException( "Python not installed - copy jython 2.5+ into /opt/infinite-home/lib/unbundled"); } } //TESTED (by hand, importScript != null and scriptlang: "python", jython not on classpath) } else if (endpoint.scriptlang.equalsIgnoreCase("external")) { //nothing to do here, just carry on, will handle the external bit later on } else { _logger.error("Python/External is currently the only supported scriptlang"); if (_testMode) { throw new RuntimeException("Python is currently the only supported scriptlang"); } } //TESTED (by hand, importScript != null and scriptlang: "none") } //TESTED if ((null != endpoint.bypassSimpleQueryParsing) && endpoint.bypassSimpleQueryParsing) { throw new RuntimeException("Currently only simple query parsing is supported"); } if ((null != endpoint.entityTypes) && endpoint.entityTypes.contains(entityType)) { // If not using the full source pipeline processing capability (ie always generating 0/1 BasicDBObject cachedDoc = null; String cachedDocUrl = buildScriptUrl(endpoint.parentSource.getKey(), entityIndex); BasicDBObject cachedDoc_expired = null; if (!isComplexSource(endpoint.parentSource)) { // Check if the *doc* (not *API response*) generated from this endpoint/entity has been cached, check expiry if so if (_cacheMode && ((null == endpoint.cacheTime_days) || (endpoint.cacheTime_days >= 0))) { if (_DEBUG) _logger.debug("DEB: preQA6ya: Search Doc Cache: " + cachedDocUrl + " , " + endpoint.cacheTime_days); BasicDBObject cachedDocQuery = new BasicDBObject(DocumentPojo.url_, cachedDocUrl); cachedDocQuery.put(DocumentPojo.sourceKey_, endpoint.parentSource.getKey()); cachedDoc = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(cachedDocQuery); if (null != cachedDoc) { // (quick check if we have a complex source in here) String sourceUrl = cachedDoc.getString(DocumentPojo.sourceUrl_); if (null != sourceUrl) { // switching from complex to simple source - delete the cached docs if (_DEBUG) _logger.debug("DEB: preQA6yb: Clear Search Doc Cache: " + cachedDocUrl + " , " + sourceUrl); cachedDocQuery.remove(DocumentPojo.url_); cachedDocQuery.put(DocumentPojo.sourceUrl_, sourceUrl); 
DbManager.getDocument().getMetadata().remove(cachedDocQuery); cachedDoc = null; } //TESTED (by hand) else if (checkDocCache_isExpired(cachedDoc, endpoint)) { cachedDoc_expired = cachedDoc; cachedDoc = null; } } } //TESTED (by hand) } if (null == _asyncRequestsPerQuery) { // If we've got this far create a list to store the async requests _asyncRequestsPerQuery = new LinkedList<FederatedRequest>(); } if (null != cachedDoc) { // (simple sources only, by construction) // Common params: FederatedRequest requestOverview = new FederatedRequest(); requestOverview.endpointInfo = endpoint; requestOverview.communityIdStrs = communityIdStrs; requestOverview.requestParameter = entityValue; requestOverview.queryIndex = entityIndex; requestOverview.mergeKey = endpoint.parentSource.getKey(); if (_DEBUG) _logger.debug("DEB: preQA6z: Doc Cache: " + cachedDocUrl + " , " + cachedDoc); requestOverview.cachedDoc = cachedDoc; _asyncRequestsPerQuery.add(requestOverview); } //TESTED (by hand) else if (null != endpoint.importScript) { BasicDBObject cachedVal = null; if (_cacheMode) { // (source key not static, plus not sure it's desirable, so for simplicity just don't cache requests in test mode) cachedVal = this.getCache(cachedDocUrl, endpoint); } // Common params: FederatedRequest requestOverview = new FederatedRequest(); requestOverview.endpointInfo = endpoint; requestOverview.communityIdStrs = communityIdStrs; requestOverview.requestParameter = entityValue; requestOverview.queryIndex = entityIndex; requestOverview.mergeKey = endpoint.parentSource.getKey(); requestOverview.cachedDoc_expired = cachedDoc_expired; if (null != cachedVal) { if (checkIfNeedToClearCache(cachedVal, endpoint.parentSource)) { if (_DEBUG) _logger.debug("DEB: preQA6aa: Clear cache: " + cachedDocUrl + " , " + cachedVal); cachedVal = null; } } requestOverview.cachedResult = cachedVal; // will often be null if ((null == cachedVal) || isComplexSource(endpoint.parentSource)) { if (null != cachedVal) { if (_DEBUG) _logger.debug( "DEB: preQA6ab: Complex Src Cache: " + cachedDocUrl + " , " + cachedVal); } if (endpoint.scriptlang.equalsIgnoreCase("external")) { requestOverview.importThread = new FederatedScriptHarvest(); } else { requestOverview.importThread = new FederatedJythonHarvest(); } requestOverview.importThread.queryEngine = this; requestOverview.importThread.request = requestOverview; requestOverview.importThread.start(); } else { if (_DEBUG) _logger.debug("DEB: preQA6a: Cache: " + cachedDocUrl + " , " + cachedVal); } // Launch thread _asyncRequestsPerQuery.add(requestOverview); } //TESTED (by hand) else { if (isComplexSource(endpoint.parentSource)) { //DEBUG if (_DEBUG) _logger.debug("DEB: preQA6ba: Build complex source, num requests = " + endpoint.requests.size()); FederatedRequest requestOverview = new FederatedRequest(); requestOverview.endpointInfo = endpoint; requestOverview.communityIdStrs = communityIdStrs; requestOverview.requestParameter = entityValue; requestOverview.queryIndex = entityIndex; requestOverview.mergeKey = endpoint.parentSource.getKey(); requestOverview.cachedDoc_expired = cachedDoc_expired; requestOverview.importThread = new FederatedSimpleHarvest(); requestOverview.importThread.queryEngine = this; requestOverview.importThread.request = requestOverview; requestOverview.importThread.start(); // Launch thread _asyncRequestsPerQuery.add(requestOverview); } else { // simple source try { for (SourceFederatedQueryConfigPojo.FederatedQueryEndpointUrl request : endpoint.requests) { FederatedRequest requestOverview = 
createSimpleHttpEndpoint_includingCache( entityValue, entityIndex, communityIdStrs, endpoint, request, cachedDoc_expired); //DEBUG if (_DEBUG) _logger.debug("DEB: preQA6bb: Build request: " + request.endPointUrl); _asyncRequestsPerQuery.add(requestOverview); } //(end loop over multiple requests } catch (Exception e) { _logger.error("Unknown error creating federated query for " + endpoint.titlePrefix + ": " + e.getMessage()); if (_testMode) { throw new RuntimeException("Unknown error creating federated query for " + endpoint.titlePrefix + ": " + e.getMessage(), e); } } } //(end if simple not complex) } //(end cached doc vs script vs request mode for queries) } //(end if this request is for this entity type) else { // no entity matches - if in test mode then bomb out with useful error if (_testMode) { throw new RuntimeException("Specified entity: " + entityIndex + " not in set: " + Arrays.toString(endpoint.entityTypes.toArray())); } } } //(end loop over endpoints) }
From source file:com.ikanow.infinit.e.api.knowledge.federated.SimpleFederatedQueryEngine.java
License:Open Source License
public static void simpleDocCache(FederatedRequest request, BasicDBObject doc) {
    if (null != request.cachedDoc_expired) {
        ObjectId updateId = request.cachedDoc_expired.getObjectId(DocumentPojo.updateId_);
        if (null != updateId) {
            doc.put(DocumentPojo.updateId_, updateId);
        } else {
            doc.put(DocumentPojo.updateId_, request.cachedDoc_expired.getObjectId(DocumentPojo._id_));
        }
        BasicDBObject docUpdate = new BasicDBObject(DocumentPojo.url_, doc.getString(DocumentPojo.url_));
        docUpdate.put(DocumentPojo.sourceKey_, doc.getString(DocumentPojo.sourceKey_));
        DbManager.getDocument().getMetadata().remove(docUpdate);
        //DEBUG
        if (_DEBUG)
            _logger.debug("DEB: postQA4a: re-cached ... " + docUpdate.toString() + ": "
                    + doc.getObjectId(DocumentPojo.updateId_));
    } else if (null == request.cachedDoc) { // if no currently cached doc, simply save what we have
        //DEBUG
        if (_DEBUG)
            _logger.debug("DEB: postQA4b: cached ... " + doc);
        DbManager.getDocument().getMetadata().save(doc);
    }
    // (else already have a valid cached doc so nothing to do)
}
From source file:com.ikanow.infinit.e.api.knowledge.federated.SimpleFederatedQueryEngine.java
License:Open Source License
public static boolean checkDocCache_isExpired(BasicDBObject cachedDoc,
        SourceFederatedQueryConfigPojo endpoint) {
    if (null == endpoint.cacheTime_days)
        endpoint.cacheTime_days = DEFAULT_CACHE_TIME_DAYS;
    Date now = new Date();
    long cacheThreshold = cachedDoc.getDate(DocumentPojo.created_, now).getTime()
            + endpoint.cacheTime_days * 3600L * 24L * 1000L;
    if (cacheThreshold < now.getTime()) { // (ie doc-creation-time + cache is earlier than now => time to decache)
        //DEBUG
        if (_DEBUG)
            _logger.debug("DEB: preQA6zz: Cache expired: " + cachedDoc.getString(DocumentPojo.url_) + ": "
                    + new Date(cacheThreshold) + " vs " + now);
        return true;
    } else
        return false;
}
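The expiry test above reduces to one comparison: the document's created date (read with getDate, defaulting to "now" if the field is absent) plus cacheTime_days worth of milliseconds must still be in the future. A standalone illustration of the arithmetic, with made-up values:

// Illustrative values only: 7-day cache, document created 2014-01-01T00:00:00Z
long createdMs = 1388534400000L;
long cacheDays = 7L;
long cacheThresholdMs = createdMs + cacheDays * 3600L * 24L * 1000L; // + 604,800,000 ms
boolean expired = cacheThresholdMs < System.currentTimeMillis();     // true once the week has elapsed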
From source file:com.ikanow.infinit.e.api.knowledge.output.KmlOutput.java
License:Open Source License
/**
 * Public function used to return ResponsePojo object as KML representation
 * @param rp
 * @return
 */
//TODO (INF-1298): Complete this code (see InfiniteMapWidget for examples, though this may want to be different, ie handle documents and aggregations?)
@SuppressWarnings("unused")
public String getDocs(ResponsePojo rp) {
    // Setup a list of feeds
    @SuppressWarnings("unchecked")
    List<BasicDBObject> docs = (List<BasicDBObject>) rp.getData();

    // Setup the Kml object used to generate the kml document
    Kml kml = new Kml();

    // Create the document
    Document document = kml.createAndSetDocument().withName("Infinit.e KML Interface")
            .withDescription("Infinit.e search KML representation");

    // Create the folder to contain the placemarks (allows us to have multiple folders)
    Folder placemarksFolder = document.createAndAddFolder().withName("Documents")
            .withDescription("Placemarks for the document locations in the query");

    // loop through the result set
    for (BasicDBObject fdbo : docs) {
        // start out by checking to see if the title is not null
        if (fdbo.getString("title") != null) {
            // add logic to check for entities or event
            // Add in loop to create all the placemark points
            String description = "";
            if (fdbo.getString("description") != null)
                description = fdbo.getString("description");
            Point placemark = placemarksFolder.createAndAddPlacemark().withName(fdbo.getString("title"))
                    .withOpen(Boolean.TRUE).withDescription(description).createAndSetPoint()
                    .addToCoordinates(-0.126236, 51.500152);
        }
    }

    // Create a string writer to contain the kml string
    StringWriter writer = new StringWriter();
    // marshal the string writer to get a string out to the kml object
    kml.marshal(writer);
    // return the kml to the client
    return writer.toString();
}
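Note the TODO: every placemark is currently pinned to one hard-coded point (roughly central London). If the documents carry entity geotags in the shape used elsewhere on this page ("entities" -> "geotag" -> "lat"/"lon"), the first available geotag could be used instead. The helper below is a hypothetical sketch; the field names are assumptions, not something this source confirms.

// Hypothetical helper: field names "entities", "geotag", "lat", "lon" are assumed, not confirmed by this source
private double[] firstGeotagOrDefault(BasicDBObject fdbo) {
    BasicDBList ents = (BasicDBList) fdbo.get("entities");
    if (ents != null) {
        for (Object o : ents) {
            BasicDBObject geo = (BasicDBObject) ((BasicDBObject) o).get("geotag");
            if (geo != null) {
                // KML coordinates are longitude first, then latitude
                return new double[] { geo.getDouble("lon"), geo.getDouble("lat") };
            }
        }
    }
    return new double[] { -0.126236, 51.500152 }; // fall back to the original hard-coded point
}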
From source file:com.ikanow.infinit.e.api.knowledge.output.RssOutput.java
License:Open Source License
public String getDocs(ResponsePojo rp) {
    // Create the feed using Rome
    SyndFeed feed = new SyndFeedImpl(); // create the feed
    String feedType = "rss_2.0";

    // Setup a list of feeds
    @SuppressWarnings("unchecked")
    List<BasicDBObject> docs = (List<BasicDBObject>) rp.getData();

    // Set the title of the feed
    feed.setTitle("Infinit.e Knowledge Discovery RSS Feed");
    feed.setDescription("Infinit.e Search Results RSS Feed");
    feed.setLanguage("en-us");
    feed.setPublishedDate(new Date(System.currentTimeMillis()));
    feed.setFeedType(feedType); // set the type of your feed
    feed.setLink("http://www.ikanow.com");

    // Establish the list to contain the feeds
    List<SyndEntry> entries = new ArrayList<SyndEntry>();

    // loop through the result set
    for (BasicDBObject fdbo : docs) {
        SyndEntry entry = new SyndEntryImpl(); // create a feed entry
        if (fdbo.getString("title") != null) {
            entry.setTitle(fdbo.getString("title"));
            Date pubDate = (Date) fdbo.get("publishedDate");
            if (pubDate != null)
                entry.setPublishedDate(pubDate);
            if (fdbo.getString("url") != null)
                entry.setLink(fdbo.getString("url"));
            if (fdbo.getString("description") != null) {
                // Create the content for the entry
                SyndContent content = new SyndContentImpl(); // create the content of your entry
                content.setType("text/plain");
                content.setValue(fdbo.getString("description"));
                entry.setDescription(content);
            }
            entries.add(entry);
        }
    }
    feed.setEntries(entries); // you can add multiple entries in your feed

    SyndFeedOutput output = new SyndFeedOutput();
    String rss = null;
    try {
        rss = output.outputString(feed);
    } catch (FeedException e) {
        e.printStackTrace();
        logger.error("Line: [" + e.getStackTrace()[2].getLineNumber() + "] " + e.getMessage());
    }
    return rss;
}
From source file:com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.java
License:Open Source License
@SuppressWarnings("unchecked") private void stage1_initialCountingLoop(DBCursor docs, AdvancedQueryPojo.QueryScorePojo scoreParams, int toReturn, StatisticsPojo scores, LinkedList<BasicDBObject> standaloneEventsReturn, int nCommunities) { double s0_nQuerySubsetDocCountInv = 1.0 / (double) _s0_nQuerySubsetDocCount; // Some memory management: DBCollection dbc = MongoDbManager.getDocument().getMetadata(); DBDecoderFactory defaultDecoder = dbc.getDBDecoderFactory(); try {/*www .j a va2 s . c om*/ SizeReportingBasicBSONDecoder sizeReportingDecoder = new SizeReportingBasicBSONDecoder(); dbc.setDBDecoderFactory(sizeReportingDecoder); long currMemUsage = 0; int ndocs = 0; long lastBatch = 0L; long initialUnusedMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory(); long initialFreeMemory = Runtime.getRuntime().freeMemory(); for (DBObject f0 : docs) { BasicDBObject f = (BasicDBObject) f0; long newMemUsage = sizeReportingDecoder.getSize(); if ((newMemUsage - currMemUsage) > 0) { // check every batch long now = new Date().getTime(); //DEBUG //logger.warn(ndocs + " : " + (now - lastBatch) + " : " + newMemUsage + " VS " + Runtime.getRuntime().maxMemory() + " UNUSED " + (Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory()) + " FREE " + Runtime.getRuntime().freeMemory()); // Check vs total memory: long runtimeMem = Runtime.getRuntime().maxMemory(); // note newMemUsage is the input memory ... gets expanded ~6x by the BSON-ification, allowed at most 1/4rd of memory... // Also if we're taking more than 20s for a batch then limp over the limit and exit... if (((newMemUsage * 24) > runtimeMem) || (((now - lastBatch) > 20000L) && (ndocs >= toReturn))) { long finalUnusedMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory(); long finalFreeMemory = Runtime.getRuntime().freeMemory(); logger.error("Query truncated memUsage=" + newMemUsage + ", memory=" + runtimeMem + ", docs=" + ndocs + ", totaldocs=" + scores.found + ", init_free_mem=" + initialFreeMemory + ", end_free_mem=" + finalFreeMemory + ", init_unused_mem=" + initialUnusedMemory + ", end_unused_mem=" + finalUnusedMemory); break; } //TESTED currMemUsage = newMemUsage; lastBatch = now; } //TESTED ndocs++; // Simple handling for standalone events if ((null != _s0_standaloneEventAggregator) && !_s0_bNeedToCalcSig) { //if _s0_bNeedToCalcSig then do this elsewhere ScoringUtils_Associations.addStandaloneEvents(f, 0.0, 0, _s0_standaloneEventAggregator, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts); } //TESTED if (!_s0_bNeedToCalcSig) { continue; } //TESTED if (nCommunities > 1) { // (could have pan-community entities) ObjectId communityId = (ObjectId) f.get(DocumentPojo.communityId_); if (null != communityId) { // (have big problems if so, but anyway!) 
int retval = _s0_multiCommunityHandler.community_getIdAndInitialize(communityId, _s1_entitiesInDataset); // (returns an int community id but also sets it into the cache, so just use that below) if (Integer.MIN_VALUE == retval) { //this document cannot be viewed from within this set of communities continue; } } } //TESTED TempDocBucket docBucket = new TempDocBucket(); docBucket.dbo = f; ObjectId id = (ObjectId) f.get(DocumentPojo._id_); // If we're going to weight relevance in, or we need the geo temporal decay: if ((0 != scoreParams.relWeight) || (null != scoreParams.timeProx) || (null != scoreParams.geoProx)) { StatisticsPojo.Score scoreObj = scores.getScore().get(id); if (null != scoreObj) { docBucket.explain = scoreObj.explain; // (will normally be null) docBucket.luceneScore = scoreObj.score; if ((null != scoreParams.timeProx) || (null != scoreParams.geoProx)) { if (scoreObj.decay >= 0.0) { docBucket.geoTemporalDecay = scoreObj.decay; } // (see also below for low accuracy geo scoring) } } else { docBucket.luceneScore = 1.0; } } //TESTED else if (this._s0_sortingByDate) { StatisticsPojo.Score scoreObj = scores.getScore().get(id); if (null != scoreObj) { docBucket.nLuceneIndex = scoreObj.nIndex; } } docBucket.manualWeighting = this.getManualScoreWeights(scoreParams, f); BasicDBList l = (BasicDBList) (f.get(DocumentPojo.entities_)); if (null != l) { long nEntsInDoc = l.size(); double dBestGeoScore = 0.0; // (for low accuracy geo only) for (Iterator<?> e0 = l.iterator(); e0.hasNext();) { BasicDBObject e = (BasicDBObject) e0.next(); BasicDBObject tmpGeotag = null; if (_s3_bLowAccuracyGeo || (null != _s1_dManualGeoDecay_latLonInvdecay)) { // low accuracy geo, need to look for geotag tmpGeotag = (BasicDBObject) e.get(EntityPojo.geotag_); } // Get attributes double freq = -1.0; long ntotaldoccount = -1; String entity_index; Double sentiment = null; try { sentiment = (Double) e.get(EntityPojo.sentiment_); ntotaldoccount = e.getLong(EntityPojo.doccount_); freq = e.getDouble(EntityPojo.frequency_); entity_index = e.getString(EntityPojo.index_); if (null == entity_index) { // Just bypass the entity e.put(EntityPojo.significance_, 0.0); nEntsInDoc--; continue; } } catch (Exception ex) { try { String sfreq; if (ntotaldoccount < 0) { sfreq = e.getString(EntityPojo.doccount_); ntotaldoccount = Long.valueOf(sfreq); } if (freq < -0.5) { sfreq = e.getString(EntityPojo.frequency_); freq = Long.valueOf(sfreq).doubleValue(); } entity_index = e.getString(EntityPojo.index_); if (null == entity_index) { // Just bypass the entity e.put(EntityPojo.significance_, 0.0); nEntsInDoc--; continue; } } catch (Exception e2) { // Just bypass the entity e.put(EntityPojo.significance_, 0.0); nEntsInDoc--; continue; } } //TESTED // First loop through is just counting // Retrieve entity (create/initialzie if necessary) EntSigHolder shp = _s1_entitiesInDataset.get(entity_index); if (null == shp) { if (ntotaldoccount > (long) _s0_globalDocCount) { // obviously can't have more entities-in-dos than docs... 
ntotaldoccount = (long) _s0_globalDocCount; } shp = new EntSigHolder(entity_index, ntotaldoccount, _s0_multiCommunityHandler); // Stage 1a alias handling: set up infrastructure, calculate doc overlap if (null != _s1_aliasLookup) { stage1_initAlias(shp); } if ((null != shp.aliasInfo) && (null == shp.masterAliasSH)) { // this is the discard alias nEntsInDoc--; continue; } //TESTED // Check if entity is in type filter list if (null != _s0_entityTypeFilter) { String entType = null; if (null != shp.aliasInfo) { entType = shp.aliasInfo.getType(); } else { entType = e.getString(EntityPojo.type_); } if (_s0_bEntityTypeFilterPositive) { if ((null != entType) && !_s0_entityTypeFilter.contains(entType.toLowerCase())) { nEntsInDoc--; continue; } } else if ((null != entType) && _s0_entityTypeFilter.contains(entType.toLowerCase())) { //(negative filter) nEntsInDoc--; continue; } } //TESTED (end entity filter) // Geo: if (null != shp.aliasInfo) { if (null != shp.aliasInfo.getGeotag()) { //Geo, overwrite/create tmpGeotag if (_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo || (null != _s1_dManualGeoDecay_latLonInvdecay)) { // Always capture alias geo, even if not in low accuracy mode because we add it to the // legitimate geo: if ((_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo) && (null == _s3_geoBuckets)) { // Initialize the buckets if this is for aggregation not just decay _s3_geoBuckets = (LinkedList<EntSigHolder>[]) new LinkedList[_s3_nGEO_BUCKETS]; } if (null == tmpGeotag) { tmpGeotag = new BasicDBObject(); } tmpGeotag.put(GeoPojo.lat_, shp.aliasInfo.getGeotag().lat); tmpGeotag.put(GeoPojo.lon_, shp.aliasInfo.getGeotag().lon); if (null != shp.aliasInfo.getOntology_type()) { e.put(EntityPojo.ontology_type_, shp.aliasInfo.getOntology_type()); } } } } //TESTED (end geo for aggregation or decay) _s1_entitiesInDataset.put(entity_index, shp); // end Stage 1a alias handling } //(end if is alias) // Stage 1b alias handling: calculate document counts (taking overlaps into account) if (null != shp.masterAliasSH) { // Counts: shp.masterAliasSH.nTotalDocCount++; // docs including overlaps shp.masterAliasSH.avgFreqOverQuerySubset += freq; // Keep track of overlaps: if (f != shp.masterAliasSH.unusedDbo) { shp.masterAliasSH.unusedDbo = f; // (note this is only used in stage 1, alias.unusedDbo is re-used differently in stage 3/4) shp.masterAliasSH.nDocCountInQuerySubset++; // non-overlapping docs ie < shp.nDocCountInQuerySubset } // Sentiment: shp.masterAliasSH.positiveSentiment += shp.positiveSentiment; shp.masterAliasSH.negativeSentiment += shp.negativeSentiment; if (null != sentiment) { shp.masterAliasSH.nTotalSentimentValues++; } } //TESTED (end if is alias) // end Stage 1b // Pan-community logic (this needs to be before the entity object is updated) if (_s0_multiCommunityHandler.isActive()) { _s0_multiCommunityHandler.community_updateCorrelations(shp, ntotaldoccount, entity_index); } else { // (Once we've started multi-community logic, this is no longer desirable) if ((ntotaldoccount > shp.nTotalDocCount) && (ntotaldoccount <= _s0_globalDocCount)) { shp.nTotalDocCount = ntotaldoccount; } //(note there used to be some cases where we adjusted for dc/tf==0, but the // underlying issue in the data model that caused this has been fixed, so it's // now a pathological case that can be ignored) } //(TESTED) // Update counts: _s1_sumFreqInQuerySubset += freq; shp.avgFreqOverQuerySubset += freq; shp.nDocCountInQuerySubset++; shp.decayedDocCountInQuerySubset += docBucket.geoTemporalDecay; // (note this doesn't handle low accuracy 
geo-decay ... we'll address that via a separate term) TempEntityInDocBucket entBucket = new TempEntityInDocBucket(); entBucket.dbo = e; entBucket.freq = freq; entBucket.doc = docBucket; shp.entityInstances.add(entBucket); if (null != tmpGeotag) { // (only needed for low accuracy geo aggregation) if ((_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo) && (null == shp.geotag)) { // (first time for shp only) shp.geotag = tmpGeotag; shp.geotaggedEntity = e; // (ie for onto type, which has been overwritten in the alias case...) } if (null != _s1_dManualGeoDecay_latLonInvdecay) { // Emulate scripted Lucene calculations double minlat = tmpGeotag.getDouble(GeoPojo.lat_); double minlon = tmpGeotag.getDouble(GeoPojo.lon_); double paramlat = _s1_dManualGeoDecay_latLonInvdecay[0]; double paramlon = _s1_dManualGeoDecay_latLonInvdecay[1]; double gdecay = _s1_dManualGeoDecay_latLonInvdecay[2]; char ontCode = GeoOntologyMapping .encodeOntologyCode(e.getString(EntityPojo.ontology_type_)); double dDecay = QueryDecayScript.getGeoDecay(minlat, minlon, paramlat, paramlon, gdecay, ontCode); if (dDecay > dBestGeoScore) { dBestGeoScore = dDecay; } } //TESTED } //(end if entity has geo and need to process entity geo) if (freq > shp.maxFreq) { shp.maxFreq = freq; } // Sentiment: if ((null != sentiment) && (Math.abs(sentiment) <= 1.1)) { // (actually 1.0) shp.nTotalSentimentValues++; if (sentiment > 0.0) { shp.positiveSentiment += sentiment; } else { shp.negativeSentiment += sentiment; } } else if (null != sentiment) { // corrupt sentiment for some reason?! e.put(EntityPojo.sentiment_, null); } docBucket.docLength += freq; } //(end loop over entities) docBucket.nLeftToProcess = nEntsInDoc; docBucket.nEntsInDoc = (int) nEntsInDoc; if (null != this._s1_dManualGeoDecay_latLonInvdecay) { // Low accuracy geo-calculations docBucket.geoTemporalDecay *= dBestGeoScore; docBucket.luceneScore *= dBestGeoScore; _s2_dAvgLowAccuracyGeoDecay += dBestGeoScore * s0_nQuerySubsetDocCountInv; } //TESTED } // (end if feed has entities) // Handle documents with no entities - can still promote them if (0 == docBucket.nLeftToProcess) { // (use this rather than doc length in case all the entities had freq 0) _s1_noEntityBuckets.add(docBucket); } } // (end loop over feeds) //TESTED } finally { dbc.setDBDecoderFactory(defaultDecoder); } }
From source file:com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.java
License:Open Source License
private void stage4_prepareDocsForOutput(AdvancedQueryPojo.QueryScorePojo scoreParams, StatisticsPojo scores, long nToClientLimit, LinkedList<BasicDBObject> returnList) { // Get the documents long nDocs = 0; double dBestScore = 0.0; double dAvgScore = 0.0; double dSigFactor = 100.0 / (_s3_dSigScalingFactor * _s2_dApproxAverageDocumentSig); double dRelFactor = 100.0 / (_s3_dLuceneScalingFactor * _s0_avgLuceneScore); // Start at the bottom of the list, so don't need to worry about skipping documents, just count out from the bottom // The call to stage3_calculateTFTerms with nStart+nToClientLimit handles the rest Iterator<TempDocBucket> pqIt = _s3_pqDocs.iterator(); while (pqIt.hasNext() && (nDocs < nToClientLimit)) { TempDocBucket qsf = pqIt.next(); nDocs++;/*from w w w . j a v a2 s . c o m*/ if (!_s0_sortingByDate) { dBestScore = qsf.totalScore; } dAvgScore += dBestScore; BasicDBObject f = qsf.dbo; // Phase "0" - these are the highest prio events boolean bNeedToFilterAndAliasAssoc_event = true; boolean bNeedToFilterAndAliasAssoc_fact = true; boolean bNeedToFilterAndAliasAssoc_summary = true; if (null != _s0_standaloneEventAggregator) { ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0, _s0_standaloneEventAggregator, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts); bNeedToFilterAndAliasAssoc_event = false; bNeedToFilterAndAliasAssoc_fact = false; bNeedToFilterAndAliasAssoc_summary = false; } //TESTED if (null != _s0_lowAccuracyAssociationAggregator_events) { ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0, _s0_lowAccuracyAssociationAggregator_events, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, true, false, false); bNeedToFilterAndAliasAssoc_event = false; } //TESTED if (null != _s0_lowAccuracyAssociationAggregator_facts) { ScoringUtils_Associations.addStandaloneEvents(qsf.dbo, qsf.aggSignificance, 0, _s0_lowAccuracyAssociationAggregator_facts, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter, false, false, true); bNeedToFilterAndAliasAssoc_fact = false; } //TESTED try { DocumentPojoApiMap.mapToApi(f); // Handle deduplication/multi-community code: if (null != qsf.dupList) { try { ScoringUtils_MultiCommunity.community_combineDuplicateDocs(qsf); } catch (Exception e) { // Do nothing, just carry on with minimal damage! } } // Scoring: double d = qsf.aggSignificance * dSigFactor; if (Double.isNaN(d)) { f.put(DocumentPojo.aggregateSignif_, 0.0); } else { f.put(DocumentPojo.aggregateSignif_, d); } d = qsf.luceneScore * dRelFactor; if (Double.isNaN(d)) { f.put(DocumentPojo.queryRelevance_, 0.0); } else { f.put(DocumentPojo.queryRelevance_, d); } if (!_s0_sortingByDate) { f.put(DocumentPojo.score_, qsf.totalScore); } BasicDBList l = (BasicDBList) (f.get(DocumentPojo.entities_)); // Handle update ids vs normal ids: ObjectId updateId = (ObjectId) f.get(DocumentPojo.updateId_); if (null != updateId) { // swap the 2... 
f.put(DocumentPojo.updateId_, f.get(DocumentPojo._id_)); f.put(DocumentPojo._id_, updateId); } // Check if entities enabled if ((null != l) && (!_s0_bGeoEnts && !_s0_bNonGeoEnts)) { f.removeField(DocumentPojo.entities_); l = null; } //TESTED // Check if events etc enabled if ((!_s0_bEvents && !_s0_bFacts && !_s0_bSummaries)) { f.removeField(DocumentPojo.associations_); } //TESTED else if (!_s0_bEvents || !_s0_bFacts || !_s0_bSummaries || (null != _s0_assocVerbFilter)) { // Keep only specified event_types BasicDBList lev = (BasicDBList) (f.get(DocumentPojo.associations_)); if (null != lev) { for (Iterator<?> e0 = lev.iterator(); e0.hasNext();) { BasicDBObject e = (BasicDBObject) e0.next(); // Type filter boolean bNeedToFilterAndAliasAssoc = true; String sEvType = e.getString(AssociationPojo.assoc_type_); boolean bKeep = true; if (null == sEvType) { bKeep = false; } else if (sEvType.equalsIgnoreCase("event")) { if (!_s0_bEvents) bKeep = false; bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_event; } else if (sEvType.equalsIgnoreCase("fact")) { if (!_s0_bFacts) bKeep = false; bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_fact; } else if (sEvType.equalsIgnoreCase("summary")) { if (!_s0_bSummaries) bKeep = false; bNeedToFilterAndAliasAssoc = bNeedToFilterAndAliasAssoc_summary; } if (!bKeep) { e0.remove(); } else { // Type matches, now for some more complex logic.... if (bNeedToFilterAndAliasAssoc) { // (otherwise done already) bKeep = ScoringUtils_Associations.filterAndAliasAssociation(e, _s1_aliasLookup, true, _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter, _s0_assocVerbFilter); if (!bKeep) { e0.remove(); } } //TESTED } //(end output filter logic) } // (end loop over events) } // (end if this doc has events) } //TESTED // Check if metadata is enabled if (!_s0_bMetadata) { f.removeField(DocumentPojo.metadata_); } //TESTED if (null != l) { for (Iterator<?> e0 = l.iterator(); e0.hasNext();) { BasicDBObject e = (BasicDBObject) e0.next(); if (!_s0_bNonGeoEnts) { // then must only be getting geo (else wouldn't be in this loop) if (null == e.get(EntityPojo.geotag_)) { e0.remove(); continue; } } String entity_index = e.getString(EntityPojo.index_); if (null == entity_index) continue; EntSigHolder shp = (EntSigHolder) _s1_entitiesInDataset.get(entity_index); if (null != shp) { // Stage 4x: alias processing, just overwrite // (note don't delete "duplicate entities", hard-to-be-globally-consistent // and will potentially throw data away which might be undesirable) if (null != shp.masterAliasSH) { shp = shp.masterAliasSH; // (already has all the aggregated values used below) if (!entity_index.equals(shp.aliasInfo.getIndex())) { e.put(EntityPojo.index_, shp.aliasInfo.getIndex()); e.put(EntityPojo.disambiguated_name_, shp.aliasInfo.getDisambiguatedName()); e.put(EntityPojo.type_, shp.aliasInfo.getType()); e.put(EntityPojo.dimension_, shp.aliasInfo.getDimension()); if (null != shp.aliasInfo.getGeotag()) { BasicDBObject aliasedGeoTag = new BasicDBObject(); aliasedGeoTag.put(GeoPojo.lat_, shp.aliasInfo.getGeotag().lat); aliasedGeoTag.put(GeoPojo.lon_, shp.aliasInfo.getGeotag().lon); e.put(EntityPojo.geotag_, aliasedGeoTag); if (null != shp.aliasInfo.getOntology_type()) { e.put(EntityPojo.ontology_type_, shp.aliasInfo.getOntology_type()); } } //TESTED } } //TESTED // end Stage 4x of alias processing double dataSig = shp.datasetSignificance; if (Double.isNaN(dataSig)) { e.put(EntityPojo.datasetSignificance_, 0.0); } else { 
e.put(EntityPojo.datasetSignificance_, dataSig); } e.put(EntityPojo.queryCoverage_, shp.queryCoverage); e.put(EntityPojo.averageFreq_, shp.avgFreqOverQuerySubset); if (shp.nTotalSentimentValues > 0) { e.put(EntityPojo.positiveSentiment_, shp.positiveSentiment); e.put(EntityPojo.negativeSentiment_, shp.negativeSentiment); e.put(EntityPojo.sentimentCount_, shp.nTotalSentimentValues); } } else { // (most likely to occur if the entity is discarded (alias/filter) or is corrupt in some way) e0.remove(); continue; } } //(end loop over entities) } // (end if feed has entities) //TESTED // Explain if enabled if (null != qsf.explain) { f.put(DocumentPojo.explain_, qsf.explain); } // Add to the end of the list (so will come back from API call in natural order, highest first) returnList.addFirst(f); // (add elements to the front of the list so that the top of the list is ordered by priority) } catch (Exception e) { // Probably a JSON error, just carry on String title = f.getString(DocumentPojo.title_); logger.error(title + ": " + e.getMessage()); } } // (end loop over feeds) //TESTED // Update the scores: scores.maxScore = (float) dBestScore; if (nDocs > 0) { scores.avgScore = (float) dAvgScore / nDocs; } }