Java tutorial
/******************************************************************************* * Copyright 2012, The Infinit.e Open Source Project. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. ******************************************************************************/ package com.ikanow.infinit.e.api.knowledge.federated; import java.io.IOException; import java.net.URI; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Date; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.concurrent.Future; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.script.ScriptEngine; import javax.script.ScriptEngineManager; import net.minidev.json.JSONArray; import org.bson.types.ObjectId; import com.google.common.collect.Iterators; import com.google.common.collect.PeekingIterator; import com.google.gson.Gson; import com.google.gson.GsonBuilder; import com.google.gson.JsonElement; import com.google.gson.JsonParser; import com.ikanow.infinit.e.api.knowledge.QueryHandler.ISimpleFederatedQueryEngine; import com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils; import com.ikanow.infinit.e.data_model.api.ResponsePojo; import com.ikanow.infinit.e.data_model.api.knowledge.AdvancedQueryPojo; import com.ikanow.infinit.e.data_model.interfaces.query.IQueryExtension; 
import com.ikanow.infinit.e.data_model.store.DbManager; import com.ikanow.infinit.e.data_model.store.MongoDbManager; import com.ikanow.infinit.e.data_model.store.MongoDbUtil; import com.ikanow.infinit.e.data_model.store.config.source.SimpleFederatedCache; import com.ikanow.infinit.e.data_model.store.config.source.SourceFederatedQueryConfigPojo; import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo; import com.ikanow.infinit.e.data_model.store.document.AssociationPojo; import com.ikanow.infinit.e.data_model.store.document.DocumentPojo; import com.ikanow.infinit.e.data_model.store.document.EntityPojo; import com.ikanow.infinit.e.data_model.utils.IkanowSecurityManager; import com.ikanow.infinit.e.harvest.HarvestController; import com.ikanow.infinit.e.harvest.HarvestControllerPipeline; import com.ikanow.infinit.e.harvest.extraction.text.externalscript.TextExtractorExternalScript; import com.jayway.jsonpath.JsonPath; import com.mongodb.BasicDBList; import com.mongodb.BasicDBObject; import com.mongodb.DBCollection; import com.mongodb.DBCursor; import com.mongodb.DBObject; import com.ning.http.client.AsyncHttpClient; import com.ning.http.client.AsyncHttpClient.BoundRequestBuilder; import com.ning.http.client.Response; import org.apache.commons.io.IOUtils; import org.apache.log4j.Logger; public class SimpleFederatedQueryEngine implements IQueryExtension, ISimpleFederatedQueryEngine { //DEBUG static private boolean _DEBUG = false; //static private boolean _DEBUG = true; static Logger _logger = Logger.getLogger(SimpleFederatedQueryEngine.class); private ScoringUtils _scoreStats = null; public void registerScoringEngine(ScoringUtils scoreStats) { _scoreStats = scoreStats; } private boolean _cacheMode = true; private boolean _testMode = false; public void setTestMode(boolean testMode) { _testMode = testMode; _cacheMode = !_testMode; // (don't cache in some places if in test mode) } public static class FederatedRequest { // Endpoint version public 
SourceFederatedQueryConfigPojo.FederatedQueryEndpointUrl subRequest; public Future<Response> responseFuture; public AsyncHttpClient asyncClient; public String mergeKey; // Script import version public FederatedHarvest importThread; public AdvancedQueryPojo fullQuery; public Throwable errorMessage; public BasicDBObject scriptResult = null; public List<DocumentPojo> complexSourceProcResults = null; // Joint public SourceFederatedQueryConfigPojo endpointInfo; public BasicDBObject cachedResult = null; public BasicDBObject cachedDoc = null; public BasicDBObject cachedDoc_expired = null; public String requestParameter; public String[] communityIdStrs; public String queryIndex; } public void addEndpoint(SourceFederatedQueryConfigPojo newEndpoint) { if (null == _endpoints) { _endpoints = new LinkedList<SourceFederatedQueryConfigPojo>(); } _endpoints.add(newEndpoint); } LinkedList<FederatedRequest> _asyncRequestsPerQuery = null; LinkedList<SourceFederatedQueryConfigPojo> _endpoints = null; IkanowSecurityManager _scriptingSecurityManager = new IkanowSecurityManager(); private ScriptEngine _pyEngine = null; @Override public void preQueryActivities(ObjectId queryId, AdvancedQueryPojo query, String[] communityIdStrs) { _scoreStats = null; _asyncRequestsPerQuery = null; // 1] Check whether this makes sense to query, get the (sole) entity if so String entityType = null; String entityValue = null; String entityIndex = null; String textToTest = null; if ((null != query.qt) && (query.qt.size() > 0) && (query.qt.size() < 4)) { String logic = query.logic; if (null != logic) { logic = logic.toLowerCase(); } if ((null != logic) && (logic.contains("or") || logic.contains("not"))) { //DEBUG if (_DEBUG) _logger.debug("DEB: preQA1: Logic too complex: " + query.logic); if (_testMode) { throw new RuntimeException("Bad testQueryJson: Logic too complex: " + query.logic); } return; // logic too complex } //TESTED (1.3) for (AdvancedQueryPojo.QueryTermPojo qt : query.qt) { if ((null != qt.entity) 
|| ((null != qt.entityType) && (null != qt.entityValue))) { if (null == entityType) { // we now have == 1 entity if (null != qt.entityValue) { entityValue = qt.entityValue; entityType = qt.entityType; entityIndex = entityValue.toLowerCase() + "/" + entityType.toLowerCase(); } //TESTED (1.5) else { entityIndex = qt.entity.toLowerCase(); int index = qt.entity.lastIndexOf('/'); if (index > 0) { entityValue = qt.entity.substring(0, index); entityType = qt.entity.substring(index + 1).toLowerCase(); } } //TESTED (1.6) } else { // >1 entity, not supported //DEBUG if (_DEBUG) _logger.debug("DEB: preQA2a: >1 entity: " + qt.entity + " / " + entityType + " / " + query.toApi()); if (_testMode) { throw new RuntimeException("Bad testQueryJson: >1 entity: " + qt.entity + " / " + entityType + " / " + query.toApi()); } return; } //TESTED (1.4) } //TESTED else if ((null != qt.etext) && (qt.etext.equals("*"))) { //this is fine provided it's only ANDed together (eg above logic case) } else if (null != qt.etext) { // Only work if it matches one of the regexes if (null == entityType) { textToTest = qt.etext; entityType = "etext"; } else { // >1 entity, not supported //DEBUG if (_DEBUG) _logger.debug("DEB: preQA2b: >1 entity: " + qt.entity + " / " + entityType + " / " + query.toApi()); if (_testMode) { throw new RuntimeException("Bad testQueryJson: >1 entity: " + qt.entity + " / " + entityType + " / " + query.toApi()); } return; } //TESTED (1.4) } else if (null == qt.time) { // temporal //DEBUG if (_DEBUG) _logger.debug("DEB: preQA3: non-entity/date " + query.toApi()); if (_testMode) { throw new RuntimeException("Bad testQueryJson: non-entity/date " + query.toApi()); } return; } //TESTED (1.1) } //(end loop over query terms) } //TESTED (1.*) if (null == entityType) { // Query too complex //DEBUG if (_DEBUG) _logger.debug("DEB: preQA4: query missing entity " + query.toApi()); if (_testMode) { throw new RuntimeException("Bad testQueryJson: query missing entity " + query.toApi()); } return; 
} //TESTED (1.2) entityType = entityType.toLowerCase(); // 2] If so, query across all the end for (SourceFederatedQueryConfigPojo endpoint : _endpoints) { // Endpoint validation: if (null == endpoint.entityTypes) { if (_testMode) { throw new RuntimeException("No entity types specified"); } else { continue; } } if (null != textToTest) { // This is text, see if you can convert to an entity entityValue = null; //(reset for different endpoints - used in the check to decide whether to continue) for (String entityTypeRegex : endpoint.entityTypes) { if (entityTypeRegex.startsWith("/")) { int regexIndex = entityTypeRegex.lastIndexOf('/'); // (guaranteed to be >= 0) try { Pattern regex = Pattern.compile(entityTypeRegex.substring(1, regexIndex)); if (regex.matcher(textToTest).matches()) { entityType = entityTypeRegex.substring(1 + regexIndex); if (entityType.length() > 0) { entityValue = textToTest; entityIndex = entityValue.toLowerCase() + "/" + entityType.toLowerCase(); } } } catch (Exception e) { // if not in test mode, carry on if (_testMode) { throw new RuntimeException(e); } } } } //(end loop over entity regexes) } //TESTED if (null == entityValue) { // None of the regexes matched if (_testMode) { throw new RuntimeException("Text specified, does not match any of the regexes: " + Arrays.toString(endpoint.entityTypes.toArray()) + " ... 
text = " + textToTest); } continue; } //DEBUG if (_DEBUG) _logger.debug("DEB: preQA5: ENDPOINT: " + Arrays.toString(endpoint.entityTypes.toArray()) + " / " + entityType); if ((null != endpoint.importScript) && !endpoint.importScript.isEmpty()) { if (null == endpoint.scriptlang) { endpoint.scriptlang = "python"; // python ==default } if (endpoint.scriptlang.equalsIgnoreCase("python")) { _pyEngine = new ScriptEngineManager().getEngineByName("python"); if (null == _pyEngine) { _logger.error( "Python not installed - copy jython 2.5+ into /opt/infinite-home/lib/unbundled"); if (_testMode) { throw new RuntimeException( "Python not installed - copy jython 2.5+ into /opt/infinite-home/lib/unbundled"); } } //TESTED (by hand, importScript != null and scriptlang: "python", jython not on classpath) } else if (endpoint.scriptlang.equalsIgnoreCase("external")) { //nothing to do here, just carry on, will handle the external bit later on } else { _logger.error("Python/External is currently the only supported scriptlang"); if (_testMode) { throw new RuntimeException("Python is currently the only supported scriptlang"); } } //TESTED (by hand, importScript != null and scriptlang: "none") } //TESTED if ((null != endpoint.bypassSimpleQueryParsing) && endpoint.bypassSimpleQueryParsing) { throw new RuntimeException("Currently only simple query parsing is supported"); } if ((null != endpoint.entityTypes) && endpoint.entityTypes.contains(entityType)) { // If not using the full source pipeline processing capability (ie always generating 0/1 BasicDBObject cachedDoc = null; String cachedDocUrl = buildScriptUrl(endpoint.parentSource.getKey(), entityIndex); BasicDBObject cachedDoc_expired = null; if (!isComplexSource(endpoint.parentSource)) { // Check if the *doc* (not *API response*) generated from this endpoint/entity has been cached, check expiry if so if (_cacheMode && ((null == endpoint.cacheTime_days) || (endpoint.cacheTime_days >= 0))) { if (_DEBUG) _logger.debug("DEB: preQA6ya: Search 
Doc Cache: " + cachedDocUrl + " , " + endpoint.cacheTime_days); BasicDBObject cachedDocQuery = new BasicDBObject(DocumentPojo.url_, cachedDocUrl); cachedDocQuery.put(DocumentPojo.sourceKey_, endpoint.parentSource.getKey()); cachedDoc = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(cachedDocQuery); if (null != cachedDoc) { // (quick check if we have a complex source in here) String sourceUrl = cachedDoc.getString(DocumentPojo.sourceUrl_); if (null != sourceUrl) { // switching from complex to simple source - delete the cached docs if (_DEBUG) _logger.debug("DEB: preQA6yb: Clear Search Doc Cache: " + cachedDocUrl + " , " + sourceUrl); cachedDocQuery.remove(DocumentPojo.url_); cachedDocQuery.put(DocumentPojo.sourceUrl_, sourceUrl); DbManager.getDocument().getMetadata().remove(cachedDocQuery); cachedDoc = null; } //TESTED (by hand) else if (checkDocCache_isExpired(cachedDoc, endpoint)) { cachedDoc_expired = cachedDoc; cachedDoc = null; } } } //TESTED (by hand) } if (null == _asyncRequestsPerQuery) { // If we've got this far create a list to store the async requests _asyncRequestsPerQuery = new LinkedList<FederatedRequest>(); } if (null != cachedDoc) { // (simple sources only, by construction) // Common params: FederatedRequest requestOverview = new FederatedRequest(); requestOverview.endpointInfo = endpoint; requestOverview.communityIdStrs = communityIdStrs; requestOverview.requestParameter = entityValue; requestOverview.queryIndex = entityIndex; requestOverview.mergeKey = endpoint.parentSource.getKey(); if (_DEBUG) _logger.debug("DEB: preQA6z: Doc Cache: " + cachedDocUrl + " , " + cachedDoc); requestOverview.cachedDoc = cachedDoc; _asyncRequestsPerQuery.add(requestOverview); } //TESTED (by hand) else if (null != endpoint.importScript) { BasicDBObject cachedVal = null; if (_cacheMode) { // (source key not static, plus not sure it's desirable, so for simplicity just don't cache requests in test mode) cachedVal = this.getCache(cachedDocUrl, endpoint); } // 
Common params: FederatedRequest requestOverview = new FederatedRequest(); requestOverview.endpointInfo = endpoint; requestOverview.communityIdStrs = communityIdStrs; requestOverview.requestParameter = entityValue; requestOverview.queryIndex = entityIndex; requestOverview.mergeKey = endpoint.parentSource.getKey(); requestOverview.cachedDoc_expired = cachedDoc_expired; if (null != cachedVal) { if (checkIfNeedToClearCache(cachedVal, endpoint.parentSource)) { if (_DEBUG) _logger.debug("DEB: preQA6aa: Clear cache: " + cachedDocUrl + " , " + cachedVal); cachedVal = null; } } requestOverview.cachedResult = cachedVal; // will often be null if ((null == cachedVal) || isComplexSource(endpoint.parentSource)) { if (null != cachedVal) { if (_DEBUG) _logger.debug( "DEB: preQA6ab: Complex Src Cache: " + cachedDocUrl + " , " + cachedVal); } if (endpoint.scriptlang.equalsIgnoreCase("external")) { requestOverview.importThread = new FederatedScriptHarvest(); } else { requestOverview.importThread = new FederatedJythonHarvest(); } requestOverview.importThread.queryEngine = this; requestOverview.importThread.request = requestOverview; requestOverview.importThread.start(); } else { if (_DEBUG) _logger.debug("DEB: preQA6a: Cache: " + cachedDocUrl + " , " + cachedVal); } // Launch thread _asyncRequestsPerQuery.add(requestOverview); } //TESTED (by hand) else { if (isComplexSource(endpoint.parentSource)) { //DEBUG if (_DEBUG) _logger.debug("DEB: preQA6ba: Build complex source, num requests = " + endpoint.requests.size()); FederatedRequest requestOverview = new FederatedRequest(); requestOverview.endpointInfo = endpoint; requestOverview.communityIdStrs = communityIdStrs; requestOverview.requestParameter = entityValue; requestOverview.queryIndex = entityIndex; requestOverview.mergeKey = endpoint.parentSource.getKey(); requestOverview.cachedDoc_expired = cachedDoc_expired; requestOverview.importThread = new FederatedSimpleHarvest(); requestOverview.importThread.queryEngine = this; 
requestOverview.importThread.request = requestOverview; requestOverview.importThread.start(); // Launch thread _asyncRequestsPerQuery.add(requestOverview); } else { // simple source try { for (SourceFederatedQueryConfigPojo.FederatedQueryEndpointUrl request : endpoint.requests) { FederatedRequest requestOverview = createSimpleHttpEndpoint_includingCache( entityValue, entityIndex, communityIdStrs, endpoint, request, cachedDoc_expired); //DEBUG if (_DEBUG) _logger.debug("DEB: preQA6bb: Build request: " + request.endPointUrl); _asyncRequestsPerQuery.add(requestOverview); } //(end loop over multiple requests } catch (Exception e) { _logger.error("Unknown error creating federated query for " + endpoint.titlePrefix + ": " + e.getMessage()); if (_testMode) { throw new RuntimeException("Unknown error creating federated query for " + endpoint.titlePrefix + ": " + e.getMessage(), e); } } } //(end if simple not complex) } //(end cached doc vs script vs request mode for queries) } //(end if this request is for this entity type) else { // no entity matches - if in test mode then bomb out with useful error if (_testMode) { throw new RuntimeException("Specified entity: " + entityIndex + " not in set: " + Arrays.toString(endpoint.entityTypes.toArray())); } } } //(end loop over endpoints) } @Override public void postQueryActivities(ObjectId queryId, List<BasicDBObject> docs, ResponsePojo response) { boolean grabbedScores = false; double aggregateSignif = 100.0; double queryRelevance = 100.0; double score = 100.0; if (null != _asyncRequestsPerQuery) { int added = 0; BasicDBList bsonArray = new BasicDBList(); PeekingIterator<FederatedRequest> it = Iterators.peekingIterator(_asyncRequestsPerQuery.iterator()); while (it.hasNext()) { // loop state: BasicDBObject[] docOrDocs = new BasicDBObject[1]; docOrDocs[0] = null; FederatedRequest request = it.next(); boolean isComplexSource = isComplexSource(request.endpointInfo.parentSource); if (null == request.cachedDoc) { // no cached doc, 
simple source processing (OR ANY COMPLEX CASE BY CONSTRUCTION) try { if ((null == request.cachedResult) || isComplexSource) { // no cached api response, or complex if (null != request.importThread) { // 1) wait for the thread to finish if (null == request.endpointInfo.queryTimeout_secs) { request.endpointInfo.queryTimeout_secs = 300; } for (int timer = 0; timer < request.endpointInfo.queryTimeout_secs; timer++) { try { request.importThread.join(1000L); if (!request.importThread.isAlive()) { break; } } //TESTED (by hand) catch (Exception e) { //(carry on) } } if (request.importThread.isAlive()) { request.errorMessage = new RuntimeException("Script timed out"); } //TESTED (by hand) // 2) Get the results if (null != request.errorMessage) { if (_testMode) { throw new RuntimeException(request.errorMessage); } } else if (isComplexSource) { //DEBUG if (_DEBUG) _logger.debug("DEB: postQA0: " + request.complexSourceProcResults.size()); handleComplexDocCaching(request, _cacheMode, _scoreStats); // Get a list of docs docOrDocs = ((BasicDBList) DocumentPojo .listToDb(request.complexSourceProcResults, DocumentPojo.listType())) .toArray(new BasicDBObject[0]); // (_API_ caching is exactly the same between cache and non-cache cases) // (note that if null != complexSourceProcResults then follows that null != scriptResult) String url = buildScriptUrl(request.mergeKey, request.queryIndex); if (!(request.importThread instanceof FederatedSimpleHarvest) && _cacheMode) { // (don't cache python federated queries in test mode) // (simple harvest caching is done separately) this.cacheApiResponse(url, request.scriptResult, request.endpointInfo); } } //TESTED (by hand - single and multiple doc mode) else if (null == request.scriptResult) { if (_testMode) { throw new RuntimeException("Script mode: no cached result found from: " + request.requestParameter); } } else { // (_API_ caching is exactly the same between cache and non-cache cases) String url = buildScriptUrl(request.mergeKey, 
request.queryIndex); if (_cacheMode) { // (don't cache python federated queries in test mode) this.cacheApiResponse(url, request.scriptResult, request.endpointInfo); } bsonArray.add(request.scriptResult); } } // end script mode else { // HTTP mode (also: must be simple source builder) Response endpointResponse = request.responseFuture.get(); request.asyncClient.close(); request.asyncClient = null; String jsonStr = endpointResponse.getResponseBody(); String url = endpointResponse.getUri().toURL().toString(); Object bsonUnknownType = com.mongodb.util.JSON.parse(jsonStr); BasicDBObject bson = null; if (bsonUnknownType instanceof BasicDBObject) { bson = (BasicDBObject) bsonUnknownType; } else if (bsonUnknownType instanceof BasicDBList) { bson = new BasicDBObject(SimpleFederatedCache.array_, bsonUnknownType); } else if (bsonUnknownType instanceof String) { bson = new BasicDBObject(SimpleFederatedCache.value_, bsonUnknownType); } //DEBUG if (_DEBUG) _logger.debug("DEB: postQA1: " + url + ": " + jsonStr); if (null != bson) { MongoDbUtil.enforceTypeNamingPolicy(bson, 0); this.cacheApiResponse(url, bson, request.endpointInfo); bsonArray.add(bson); } } //(end script vs request method) } //TESTED (3.1, 4.2) else { // (just used cached value) //DEBUG if (_DEBUG) _logger.debug("DEB: postQA2: " + request.cachedResult.toString()); bsonArray.add( (BasicDBObject) request.cachedResult.get(SimpleFederatedCache.cachedJson_)); } //TESTED (4.1, 4.3) } catch (Exception e) { //DEBUG if (null == request.subRequest) { _logger.error("Error with script: " + e.getMessage()); if (_testMode) { throw new RuntimeException("Error with script: " + e.getMessage(), e); } } else { _logger.error("Error with " + request.subRequest.endPointUrl + ": " + e.getMessage()); if (_testMode) { throw new RuntimeException( "Error with " + request.subRequest.endPointUrl + ": " + e.getMessage(), e); } } } if (null == docOrDocs[0]) { // (this next bit of logic can only occur in simple source cases by construction, 
phew) if (!it.hasNext() || (request.mergeKey != it.peek().mergeKey)) { // deliberate ptr arithmetic String url = buildScriptUrl(request.mergeKey, request.queryIndex); //DEBUG if (_DEBUG) _logger.debug("DEB: postQA3: " + url + ": " + bsonArray); docOrDocs[0] = createDocFromJson(bsonArray, url, request, request.endpointInfo); } } } // (end if no cached doc) else { // cached doc, bypass lots of processing because no merging and doc already built (simple source processing) docOrDocs[0] = request.cachedDoc; } //TESTED (by hand) if (null != docOrDocs[0]) for (BasicDBObject doc : docOrDocs) { // Cache the document unless already cached (or caching disabled) if ((null == request.cachedDoc) && _cacheMode && !isComplexSource && ((null == request.endpointInfo.cacheTime_days) || (request.endpointInfo.cacheTime_days >= 0))) { simpleDocCache(request, doc); } //TESTED (by hand, 3 cases: cached not expired, cached expired first time, cached expired multiple times) if (!grabbedScores) { if (!docs.isEmpty()) { BasicDBObject topDoc = docs.get(0); aggregateSignif = topDoc.getDouble(DocumentPojo.aggregateSignif_, aggregateSignif); queryRelevance = topDoc.getDouble(DocumentPojo.queryRelevance_, queryRelevance); score = topDoc.getDouble(DocumentPojo.score_, score); grabbedScores = true; // OK would also like to grab the original matching entity, if it exists if (!isComplexSource) { BasicDBList ents = (BasicDBList) topDoc.get(DocumentPojo.entities_); if (null != ents) { for (Object entObj : ents) { BasicDBObject ent = (BasicDBObject) entObj; String entIndex = ent.getString(EntityPojo.index_, ""); if (entIndex.equals(request.queryIndex)) { ents = (BasicDBList) doc.get(DocumentPojo.entities_); if (null != ents) { ents.add(ent); } break; } } } //TESTED (by hand) } } } doc.put(DocumentPojo.aggregateSignif_, aggregateSignif); doc.put(DocumentPojo.queryRelevance_, queryRelevance); doc.put(DocumentPojo.score_, score); // Swap id and updateId, everything's been cached now: // Handle update ids vs 
normal ids: ObjectId updateId = (ObjectId) doc.get(DocumentPojo.updateId_); if (null != updateId) { // swap the 2... doc.put(DocumentPojo.updateId_, doc.get(DocumentPojo._id_)); doc.put(DocumentPojo._id_, updateId); } //TESTED (by hand) // If we're returning to a query then we'll adjust the doc format (some of the atomic fields become arrays) if (!_testMode) { convertDocToQueryFormat(doc, request.communityIdStrs); } //TESTED (by hand) docs.add(0, doc); added++; //(doc auto reset at top of loop) //(end if built a doc from the last request/set of requests) } //TESTED (3.1) } //(end loop over federated requests) if (null != response.getStats()) { response.getStats().found += added; } //TESTED (by hand) } } ///////////////////////////////////////////////// // UTILITY public FederatedRequest createSimpleHttpEndpoint_includingCache(String entityValue, String entityIndex, String[] communityIdStrs, SourceFederatedQueryConfigPojo endpoint, SourceFederatedQueryConfigPojo.FederatedQueryEndpointUrl request, BasicDBObject cachedDoc_expired) throws IOException { AsyncHttpClient asyncHttpClient = new AsyncHttpClient(); BoundRequestBuilder asyncRequest = null; String postContent = null; if (null != request.httpFields) { postContent = request.httpFields.get("Content"); if (null == postContent) postContent = request.httpFields.get("content"); } //TESTED (by hand, "http://httpbin.org/post", "httpFields": { "Content": "test" } if (null == postContent) { asyncRequest = asyncHttpClient.prepareGet(request.endPointUrl.replace("$1", entityValue)); } else { asyncRequest = asyncHttpClient.preparePost(request.endPointUrl.replace("$1", entityValue)); } //TESTED (by hand, "http://httpbin.org/post", "httpFields": { "Content": "test" } if (null != request.urlParams) { for (Map.Entry<String, String> keyValue : request.urlParams.entrySet()) { asyncRequest = asyncRequest.addQueryParameter(keyValue.getKey(), keyValue.getValue().replace("$1", entityValue)); } } //TESTED (1.5, 1.6, 3.*, 4.*) if (null 
!= request.httpFields) { for (Map.Entry<String, String> keyValue : request.httpFields.entrySet()) { if (!keyValue.getKey().equalsIgnoreCase("content")) { asyncRequest = asyncRequest.addHeader(keyValue.getKey(), keyValue.getValue().replace("$1", entityValue)); } } } //TESTED (by hand, "http://httpbin.org/cookies", "httpFields": { "Cookie": "mycookie=test" } if (null != postContent) { asyncRequest = asyncRequest.setBody(postContent.replace("$1", entityValue)); } //TESTED (by hand, "http://httpbin.org/post", "httpFields": { "Content": "$1" } // Common params: FederatedRequest requestOverview = new FederatedRequest(); requestOverview.endpointInfo = endpoint; requestOverview.communityIdStrs = communityIdStrs; requestOverview.requestParameter = entityValue; requestOverview.asyncClient = asyncHttpClient; requestOverview.queryIndex = entityIndex; requestOverview.mergeKey = endpoint.parentSource.getKey(); requestOverview.subRequest = request; // Now check out the cache: URI rawUri = asyncRequest.build().getRawURI(); String url = rawUri.toString(); if ((null == endpoint.parentSource.getOwnedByAdmin()) || !endpoint.parentSource.getOwnedByAdmin()) { int port = rawUri.getPort(); if ((80 != port) && (443 != port) && (-1 != port)) { _logger.error("Only admin can make requests on non-standard ports: " + url + ": " + port); if (_testMode) { asyncHttpClient.close(); throw new RuntimeException( "Only admin can make requests on non-standard ports: " + url + ": " + port); } } } //TESTED (by hand) BasicDBObject cachedVal = this.getCache(url, endpoint); if (null != cachedVal) { if (checkIfNeedToClearCache(cachedVal, endpoint.parentSource)) { //DEBUG if (_DEBUG) _logger.debug("DEB: pre:CSHEb: Clear cache: " + url + " , " + cachedVal); cachedVal = null; } } requestOverview.cachedResult = cachedVal; requestOverview.cachedDoc_expired = cachedDoc_expired; if (null == cachedVal) { requestOverview.responseFuture = asyncRequest.execute(); } else { //DEBUG if (_DEBUG) _logger.debug("DEB: 
pre:CSHEb: Cache: " + url + " , " + cachedVal); requestOverview.responseFuture = null; asyncHttpClient.close(); } return requestOverview; } public BasicDBObject createDocFromJson(BasicDBList jsonList, String url, FederatedRequest request, SourceFederatedQueryConfigPojo endpointInfo) { BasicDBObject doc = null; // (don't create unless needed) BasicDBList ents = null; StringBuffer entVals = null; HashSet<String> entDedup = null; if (_testMode) { // In test mode, need to return the JSON even if no entities are specified doc = new BasicDBObject(); } if (null != endpointInfo.docConversionMap) { for (Map.Entry<String, String> docInfo : endpointInfo.docConversionMap.entrySet()) { for (Object jsonObj : jsonList) { BasicDBObject json = (BasicDBObject) jsonObj; try { String key = docInfo.getKey(); // (allow user to not prepend array: if they don't want to) if ((1 == json.size()) && json.containsKey((Object) "array")) { if (!key.startsWith("array:") && !key.startsWith(":array") && !key.startsWith("$:array") && !key.startsWith("::") && !key.startsWith("$::")) { if (key.startsWith(":")) { // jpath key = ":array" + key; } else if (key.startsWith("$:")) { // jpath key = "$:array" + key.substring(1); } else { key = "array:" + key; } } } //TESTED (by hand) if (key.startsWith(":")) { // jpath key = "$" + key; } // NOTE: *not* org.json.JSONArray JSONArray candidateEntities = null; if (key.startsWith("$")) { JSONArray candidateEntities_tmp = JsonPath.read(json.toString(), key.replace(':', '.')); if (null != candidateEntities_tmp) { candidateEntities = new JSONArray(); for (Object o : candidateEntities_tmp) { if (o instanceof String) { candidateEntities.add(o); } else if (o instanceof JSONArray) { candidateEntities.addAll((JSONArray) o); } } //TESTED (displayUrl vs entities, 3.2) } //DEBUG //System.out.println(candidateEntities); } //(TESTED (permutations above by hand)) else { String s = (String) MongoDbUtil.getProperty(json, key.replace(':', '.')); if (null != s) { candidateEntities 
= new JSONArray(); candidateEntities.add(s); } } //TESTED (3.1) if (null != candidateEntities) for (int i = 0; i < candidateEntities.size(); ++i) { Object o = candidateEntities.get(i); if (!(o instanceof String)) { continue; } String s = o.toString(); if (null == doc) { doc = new BasicDBObject(); //(various fields added below) } if (docInfo.getValue().equalsIgnoreCase(DocumentPojo.displayUrl_)) { doc.put(DocumentPojo.displayUrl_, s); } //TESTED (3.1, 4.*) else { // Entities! if (null == ents) { ents = new BasicDBList(); } String index = s.toLowerCase() + "/" + docInfo.getValue().toLowerCase(); if (null == entDedup) { entDedup = new HashSet<String>(); } else if (entDedup.contains(index)) { // Entity deduplication continue; } //TESTED (3.2) entDedup.add(index); if (null == entVals) { entVals = new StringBuffer(": "); } else { entVals.append(", "); } entVals.append(s); String dimension = null; if (null != endpointInfo.typeToDimensionMap) { try { dimension = EntityPojo.Dimension .valueOf( endpointInfo.typeToDimensionMap.get(docInfo.getValue())) .toString(); } catch (Exception e) { } } if (null == dimension) { dimension = EntityPojo.Dimension.What.toString(); } //TESTED (by hand) // (alternative to "made up" values would be to go looking in the existing docs/ents?) // (we'll try to avoid that for now...) 
BasicDBObject ent = new BasicDBObject(); ent.put(EntityPojo.disambiguated_name_, s); ent.put(EntityPojo.type_, docInfo.getValue()); ent.put(EntityPojo.dimension_, dimension); ent.put(EntityPojo.relevance_, 1.0); ent.put(EntityPojo.doccount_, 1L); // (ie relative to this query) ent.put(EntityPojo.averageFreq_, 1.0); ent.put(EntityPojo.datasetSignificance_, 10.0); // (ie relative to this query) ent.put(EntityPojo.significance_, 10.0); // (ie relative to this query) ent.put(EntityPojo.frequency_, 1.0); ent.put(EntityPojo.index_, index); ent.put(EntityPojo.queryCoverage_, 100.0); // (ie relative to this query) ent.put(EntityPojo.totalfrequency_, 1.0); // (ie relative to this query) ents.add(ent); } //TESTED (3.1, 4.*) } } catch (Exception e) { //(do nothing? null or the wrong type) //e.printStackTrace(); } } //end loop over various JSON objects retrieved } //(End loop over doc conversion elements) } //TESTED (3.*, 4.*) if ((null == ents) && !_testMode) { // don't return unless there are any entities return null; } else if (null != doc) { // Insert mandatory fields: // (Note the query format is a little bit different, the following fields are converted to arrays: // sourceKey, source, communityId, mediaType) doc.put(DocumentPojo._id_, new ObjectId()); doc.put(DocumentPojo.url_, url); doc.put(DocumentPojo.created_, new Date()); doc.put(DocumentPojo.modified_, new Date()); doc.put(DocumentPojo.publishedDate_, new Date()); doc.put(DocumentPojo.sourceKey_, endpointInfo.parentSource.getKey()); doc.put(DocumentPojo.source_, endpointInfo.parentSource.getTitle()); doc.put(DocumentPojo.communityId_, new ObjectId(request.communityIdStrs[0])); doc.put(DocumentPojo.mediaType_, endpointInfo.parentSource.getMediaType()); doc.put(DocumentPojo.metadata_, new BasicDBObject("json", jsonList.toArray())); if ((null != entVals) && (entVals.length() > 165)) { // (arbitrary length) entVals.setLength(165); entVals.append("..."); } doc.put(DocumentPojo.title_, new 
StringBuffer(endpointInfo.titlePrefix).append(": ") .append(request.requestParameter).append(entVals).toString()); doc.put(DocumentPojo.entities_, ents); Gson gson = new GsonBuilder().setPrettyPrinting().create(); JsonParser jp = new JsonParser(); JsonElement je = jp.parse(jsonList.toString()); doc.put(DocumentPojo.description_, gson.toJson(je)); // (prettified JSON) } //TESTED (3.*, 4.*) return doc; } // Convert some of the doc fields to private static BasicDBObject convertDocToQueryFormat(BasicDBObject doc, String[] communityIdStrs) { doc.put(DocumentPojo.sourceKey_, Arrays.asList(doc.get(DocumentPojo.sourceKey_))); // (don't need to remove #N/#NN - fed queries can't have them) doc.put(DocumentPojo.source_, Arrays.asList(doc.get(DocumentPojo.source_))); doc.put(DocumentPojo.mediaType_, Arrays.asList(doc.get(DocumentPojo.mediaType_))); doc.put(DocumentPojo.communityId_, communityIdStrs); return doc; }//TESTED (by hand) private static String buildScriptUrl(String mergeKey, String entityIndex) { String url = new StringBuilder().append("inf://federated/").append(mergeKey).append("/").append(entityIndex) .toString(); return url; }//TESTED (by hand) ///////////////////////////////////////////////// // CACHE UTILITIES public static boolean TEST_MODE_ONLY = false; public static DBCollection getCacheCollection() { if (TEST_MODE_ONLY) { return MongoDbManager.getCollection("test", "fed_query_cache"); } else { return MongoDbManager.getIngest().getFederatedCache(); } } private static boolean _staticInitializer = false; private static long _lastChecked = -1L; private BasicDBObject getCache(String url, SourceFederatedQueryConfigPojo endpoint) { if ((null != endpoint.cacheTime_days) && (endpoint.cacheTime_days <= 0)) { // cache disabled return null; } DBCollection endpointCacheCollection = getCacheCollection(); if (!_staticInitializer) { _staticInitializer = true; endpointCacheCollection.ensureIndex(new BasicDBObject(SimpleFederatedCache.expiryDate_, 1)); } BasicDBObject 
cacheObj = (BasicDBObject) endpointCacheCollection .findOne(new BasicDBObject(SimpleFederatedCache._id_, url)); if (null == cacheObj) { return null; } // else found something, means there's stuff in the DB // so check it's not too big: Date now = new Date(); if ((-1 == _lastChecked) || (now.getTime() > (_lastChecked + (600L * 1000L)))) { // (only check every 10 minutes) if (endpointCacheCollection.count() > SimpleFederatedCache.QUERY_FEDERATION_CACHE_CLEANSE_SIZE) { _lastChecked = now.getTime(); // Remove everything with expiry date older than now endpointCacheCollection.remove(new BasicDBObject(SimpleFederatedCache.expiryDate_, new BasicDBObject(DbManager.lt_, new Date()))); } } //TESTED (4.3) Date expiryDate = cacheObj.getDate(SimpleFederatedCache.expiryDate_, now); if (now.getTime() < expiryDate.getTime()) { return cacheObj; } else { return null; } //TESTED (4.2) }//TESTED (4.*) private static final int DEFAULT_CACHE_TIME_DAYS = 5; private void cacheApiResponse(String url, BasicDBObject toCacheJson, SourceFederatedQueryConfigPojo endpoint) { int cacheTime_days = DEFAULT_CACHE_TIME_DAYS; if (null != endpoint.cacheTime_days) { cacheTime_days = endpoint.cacheTime_days; } if (cacheTime_days <= 0) { // Disable _request_ cache (to disable all caching include doc caching use -1) return; } DBCollection endpointCacheCollection = getCacheCollection(); BasicDBObject toCacheObj = new BasicDBObject(SimpleFederatedCache._id_, url); toCacheObj.put(SimpleFederatedCache.cachedJson_, toCacheJson); toCacheObj.put(SimpleFederatedCache.expiryDate_, new Date(new Date().getTime() + cacheTime_days * 3600L * 24L * 1000L)); toCacheObj.put(SimpleFederatedCache.created_, new Date()); endpointCacheCollection.save(toCacheObj); }//TESTED (3.1, 4.*) // Document level caching, although it effectively serves as a mostly redundant request cache, // It's actually used to allow users to save federated query documents in their buckets public static void simpleDocCache(FederatedRequest request, 
BasicDBObject doc) { if (null != request.cachedDoc_expired) { ObjectId updateId = request.cachedDoc_expired.getObjectId(DocumentPojo.updateId_); if (null != updateId) { doc.put(DocumentPojo.updateId_, updateId); } else { doc.put(DocumentPojo.updateId_, request.cachedDoc_expired.getObjectId(DocumentPojo._id_)); } BasicDBObject docUpdate = new BasicDBObject(DocumentPojo.url_, doc.getString(DocumentPojo.url_)); docUpdate.put(DocumentPojo.sourceKey_, doc.getString(DocumentPojo.sourceKey_)); DbManager.getDocument().getMetadata().remove(docUpdate); //DEBUG if (_DEBUG) _logger.debug("DEB: postQA4a: re-cached ... " + docUpdate.toString() + ": " + doc.getObjectId(DocumentPojo.updateId_)); } else if (null == request.cachedDoc) { // if no currently cached doc, simply save what we have //DEBUG if (_DEBUG) _logger.debug("DEB: postQA4b: cached ... " + doc); DbManager.getDocument().getMetadata().save(doc); } // (else already have a valid cached doc so nothing to do) } public static boolean checkDocCache_isExpired(BasicDBObject cachedDoc, SourceFederatedQueryConfigPojo endpoint) { if (null == endpoint.cacheTime_days) endpoint.cacheTime_days = DEFAULT_CACHE_TIME_DAYS; Date now = new Date(); long cacheThreshold = cachedDoc.getDate(DocumentPojo.created_, now).getTime() + endpoint.cacheTime_days * 3600L * 24L * 1000L; if (cacheThreshold < now.getTime()) // (ie doc-creation-time + cache is earlier than now => time to decache) { //DEBUG if (_DEBUG) _logger.debug("DEB: preQA6zz: Cache expired: " + cachedDoc.getString(DocumentPojo.url_) + ": " + new Date(cacheThreshold) + " vs " + now); return true; } else return false; }//TESTED private static BasicDBObject getCachedApiResponse(String scriptResult) { Object parsedScriptResult; BasicDBObject outResult; try { parsedScriptResult = com.mongodb.util.JSON.parse(scriptResult); } catch (Exception e) { throw new RuntimeException("Error deserializing " + scriptResult + ": " + e.getMessage(), e); } if (parsedScriptResult instanceof BasicDBObject) { 
outResult = (BasicDBObject) parsedScriptResult; } else if (parsedScriptResult instanceof BasicDBList) { outResult = new BasicDBObject(SimpleFederatedCache.array_, parsedScriptResult); } else if (parsedScriptResult instanceof String) { outResult = new BasicDBObject(SimpleFederatedCache.value_, parsedScriptResult); } else { throw new RuntimeException("Error deserializing " + scriptResult + ": " + parsedScriptResult); } try { MongoDbUtil.enforceTypeNamingPolicy(outResult, 0); } catch (Exception ee) { throw new RuntimeException("Error deserializing " + scriptResult + ": " + ee.getMessage(), ee); } return outResult; }//TESTED (c/p from tested code though I deleted the test cases) private static boolean isComplexSource(SourcePojo src) { return (null != src.getProcessingPipeline()) && (src.getProcessingPipeline().size() > 1); } private static boolean checkIfNeedToClearCache(BasicDBObject cachedVal, SourcePojo src) { BasicDBObject cachedJson = (BasicDBObject) cachedVal.get(SimpleFederatedCache.cachedJson_); if (null == cachedJson) { return true; // (corrupt cache) } boolean isComplexSrc = isComplexSource(src); if (isComplexSrc) { // check API response Date createdDate = (Date) cachedVal.get(SimpleFederatedCache.created_); if (null == createdDate) { return true; // (needs date for doc caching code so just remove from cache and re-create) } } if ((1 == cachedJson.size()) && (cachedJson.get(SimpleFederatedCache.__infinite__value_) instanceof String)) { // ie if complex source, return false - no need to clear cache return !isComplexSrc; } else { // opposite return isComplexSrc; } }//TESTED (by hand - all 4 combos) /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// // (my own threading) public abstract class FederatedHarvest extends Thread { public FederatedRequest request; public SimpleFederatedQueryEngine queryEngine; } public class FederatedJythonHarvest extends 
FederatedHarvest { public void run() { String scriptResult = null; LinkedList<String> debugLog = queryEngine._testMode ? new LinkedList<String>() : null; try { if (null != request.cachedResult) { // we have a cached value but are in complex mode so can't immediately BasicDBObject cachedJson = (BasicDBObject) request.cachedResult .get(SimpleFederatedCache.cachedJson_); scriptResult = (String) cachedJson.get(SimpleFederatedCache.__infinite__value_); } else { scriptResult = queryEngine.performImportPythonScript(request.endpointInfo.importScript, request.requestParameter, request.fullQuery, request.endpointInfo.parentSource.getOwnedByAdmin(), debugLog); } } catch (Exception e) { request.errorMessage = e; return; } try { if (isComplexSource(request.endpointInfo.parentSource)) { DocumentPojo doc = new DocumentPojo(); // Fields that are specific to the federated query type: doc.setFullText(scriptResult); doc.setUrl(buildScriptUrl(request.mergeKey, request.queryIndex)); // (always cache complex source pipeline results like this) request.scriptResult = new BasicDBObject(SimpleFederatedCache.__infinite__value_, scriptResult); if ((null != debugLog) && !debugLog.isEmpty()) { request.scriptResult.put("$logs", debugLog); } handleComplexDocProcessing(doc, request, _cacheMode); } else { // simple case: // In this simple case, the output has to be JSON request.scriptResult = getCachedApiResponse(scriptResult); if ((null != debugLog) && !debugLog.isEmpty()) { request.scriptResult.put("$logs", debugLog); } } //(END simple vs complex post processing case) } catch (Exception e) { request.errorMessage = e; return; } } }//TESTED (by hand) public class FederatedSimpleHarvest extends FederatedHarvest { public void run() { // 1) Kick off all the requests asynchronously BasicDBList bsonArray = new BasicDBList(); StringBuffer sb = new StringBuffer(); try { LinkedList<FederatedRequest> asyncRequestsPerQuery = new LinkedList<FederatedRequest>(); for 
(SourceFederatedQueryConfigPojo.FederatedQueryEndpointUrl httpRequest : request.endpointInfo.requests) { FederatedRequest requestOverview = createSimpleHttpEndpoint_includingCache( request.requestParameter, request.queryIndex, request.communityIdStrs, request.endpointInfo, httpRequest, null); if (_DEBUG) _logger.debug("DEB: FederatedSimpleHarvest: Build request: " + httpRequest.endPointUrl); asyncRequestsPerQuery.add(requestOverview); } //(end loop over multiple requests //TESTED (by hand) // 2) Combine the results (waiting asyncronously for the results) - see the mergeKey code equivalent for simple queries while (!asyncRequestsPerQuery.isEmpty()) { Iterator<FederatedRequest> it = asyncRequestsPerQuery.iterator(); while (it.hasNext()) { FederatedRequest requestOverview = it.next(); if (null != requestOverview.cachedResult) { // cached result, note must be in format cachedJson.__infinite__value (else will have been discarded) BasicDBObject jsonCache = (BasicDBObject) requestOverview.cachedResult .get(SimpleFederatedCache.cachedJson_); if (null != jsonCache) { String s = (String) jsonCache.get(SimpleFederatedCache.__infinite__value_); if (null != s) { //DEBUG if (_DEBUG) _logger.debug("DEB: FederatedSimpleHarvest: found cached element: " + requestOverview.cachedResult); bsonArray.add(s); sb.append(s).append("\n\n"); it.remove(); continue; } } } //TESTED (by hand) //IF HERE THEN CACHE DIDN'T EXIST OR FAILED (see continue above) if (requestOverview.responseFuture.isDone()) { it.remove(); Response endpointResponse = requestOverview.responseFuture.get(); requestOverview.asyncClient.close(); requestOverview.asyncClient = null; String jsonStr = endpointResponse.getResponseBody(); String url = endpointResponse.getUri().toURL().toString(); //DEBUG if (_DEBUG) _logger.debug( "DEB: FederatedSimpleHarvest: found new element: " + url + " = " + jsonStr); BasicDBObject bson = new BasicDBObject(SimpleFederatedCache.__infinite__value_, jsonStr); cacheApiResponse(url, bson, 
request.endpointInfo); bsonArray.add(jsonStr); sb.append(jsonStr).append("\n\n"); } //TESTED (by hand) } if (!asyncRequestsPerQuery.isEmpty()) { // wait 100ms to stop thrashing try { Thread.sleep(100); } catch (Exception e) { } } } //TESTED (by hand) } catch (Exception e) { request.errorMessage = e; return; } // 3) Run the document processing DocumentPojo doc = new DocumentPojo(); // Fields that are specific to the federated query type: doc.setFullText(sb.toString()); if (bsonArray.size() > 1) { doc.addToMetadata("__FEDERATED_REPLIES__", bsonArray.toArray()); } doc.setUrl(buildScriptUrl(request.mergeKey, request.queryIndex)); // (always cache complex source pipeline results like this) handleComplexDocProcessing(doc, request, _cacheMode); } } public class FederatedScriptHarvest extends FederatedHarvest { public void run() { String scriptResult = null; try { if (null != request.cachedResult) { // we have a cached value but are in complex mode so can't immediately BasicDBObject cachedJson = (BasicDBObject) request.cachedResult .get(SimpleFederatedCache.cachedJson_); scriptResult = (String) cachedJson.get(SimpleFederatedCache.__infinite__value_); } else { // Use the TextExtractorExternalScript function: TextExtractorExternalScript extractor = new TextExtractorExternalScript(); LinkedHashMap<String, String> dummyExtractorOptions = new LinkedHashMap<String, String>(); String[] args = request.endpointInfo.importScript.split("\\s+"); int i = 0; for (String arg : args) { arg = arg.replace("$1", request.requestParameter); if (0 == i) { dummyExtractorOptions.put("script", arg); } else { dummyExtractorOptions.put("arg" + i, arg); } i++; } if (null != request.endpointInfo.queryTimeout_secs) { dummyExtractorOptions.put("timeout", Long.toString(request.endpointInfo.queryTimeout_secs.longValue() * 1000L)); //(convert to ms then to string) } SourcePojo dummySrc = new SourcePojo(); dummySrc.setOwnerId(request.endpointInfo.parentSource.getOwnerId()); 
dummySrc.setExtractorOptions(dummyExtractorOptions); dummySrc.setCommunityIds(request.endpointInfo.parentSource.getCommunityIds()); DocumentPojo dummyDoc = new DocumentPojo(); dummyDoc.setTempSource(dummySrc); extractor.extractText(dummyDoc); scriptResult = dummyDoc.getFullText(); if (null == scriptResult) { request.errorMessage = new RuntimeException( "Unknown problem, script didn't return text: " + dummyExtractorOptions.toString()); return; } } } catch (Exception e) { request.errorMessage = e; return; } try { if (isComplexSource(request.endpointInfo.parentSource)) { DocumentPojo doc = new DocumentPojo(); // Fields that are specific to the federated query type: doc.setFullText(scriptResult); doc.setUrl(buildScriptUrl(request.mergeKey, request.queryIndex)); // (always cache complex source pipeline results like this) request.scriptResult = new BasicDBObject(SimpleFederatedCache.__infinite__value_, scriptResult); handleComplexDocProcessing(doc, request, _cacheMode); } else { // simple case: // In this simple case, the output has to be JSON request.scriptResult = getCachedApiResponse(scriptResult); } //(END simple vs complex post processing case) } catch (Exception e) { request.errorMessage = e; return; } } } /////////////////////////////////////////////////////////////////////////////// private String performImportPythonScript(String importScript, String entityValue, AdvancedQueryPojo query, Boolean isSrcAdmin, LinkedList<String> debugLog) { String modCode = importScript; try { // Create a return value String fnName = "var" + new ObjectId(); // Pull all the imports, evaluate them outside the security manager Pattern importRegex = Pattern.compile("^\\s*(?:import|from)\\s[^\n]+", Pattern.MULTILINE); Matcher m = importRegex.matcher(importScript); StringBuffer sbImports = new StringBuffer(); while (m.find()) { sbImports.append(m.group()).append('\n'); } if (null != query) { _pyEngine.put("_query", query.toApi()); // full query } if (null != entityValue) { 
_pyEngine.put("_entityValue", entityValue); // allow either entityValue } _pyEngine.eval(sbImports.toString()); // Logging function if (null != debugLog) { String logName = "log" + new ObjectId(); _pyEngine.put(logName, debugLog); _pyEngine.eval("def ikanow_log(logmsg):\n " + logName + ".add(logmsg)\n\n"); } else { _pyEngine.eval("def ikanow_log(logmsg):\n pass\n\n"); } // Enable SSL everywhere (https://wiki.python.org/jython/NewSocketModule#SSLSupport) //http://tech.pedersen-live.com/2010/10/trusting-all-certificates-in-jython/ //didn't work: http://jython.xhaus.com/installing-an-all-trusting-security-provider-on-java-and-jython/ _pyEngine.eval(IOUtils .toString(SimpleFederatedQueryEngine.class.getResourceAsStream("JythonTrustManager.py"))); // Now run the script modCode = m.replaceAll(""); modCode = modCode.replaceAll("\n([^\n]+)$", "\n" + fnName + " = " + "$1"); if ((null == isSrcAdmin) || !isSrcAdmin) { _scriptingSecurityManager.eval(_pyEngine, modCode); } else { _pyEngine.eval(modCode); } // Get return value Object result = _pyEngine.get(fnName.toString()); //DEBUG if (_DEBUG) _logger.debug("DEB: T1: Return val from script: " + result); if (null == result) { throw new RuntimeException( "Null return from script - final line needs to evaluate expression to return, eg 'response.read()' or 'varname'"); } //TESTED (by hand) return result.toString(); } catch (Exception e) { throw new RuntimeException(e); } catch (Error ee) { throw new RuntimeException(ee); } }//TESTED /////////////////////////////////////////////////////////////////////////////// public static void handleComplexDocProcessing(DocumentPojo doc, FederatedRequest request, boolean cacheMode) { List<DocumentPojo> docWrapper = null; if (null != request.cachedResult) { // API result was cached, so docs might be Date createdDate = request.cachedResult.getDate(SimpleFederatedCache.created_); if ((null != request.endpointInfo.parentSource.getModified()) && 
(request.endpointInfo.parentSource.getModified().getTime() > createdDate.getTime())) { //DEBUG if (_DEBUG) _logger.debug("DEB: HCDP0: cache out of date, src=" + request.endpointInfo.parentSource.getModified() + " cache=" + createdDate); request.cachedResult = null; // (clear cache) } if (null != request.cachedResult) { // see if we can get the docs from the DB instead of from the harvester BasicDBObject query = new BasicDBObject(DocumentPojo.sourceKey_, request.endpointInfo.parentSource.getKey()); query.put(DocumentPojo.sourceUrl_, doc.getUrl()); DBCursor dbc = DbManager.getDocument().getMetadata().find(query); if (dbc.hasNext()) { docWrapper = DocumentPojo.listFromDb(dbc, DocumentPojo.listType()); if (null == docWrapper) { // (shouldn't ever happen) request.cachedResult = null; } else { //DEBUG if (_DEBUG) _logger.debug("DEB: HCDP2: cache, found docs=" + docWrapper.size()); } } else { // last chance to clear cache //DEBUG if (_DEBUG) _logger.debug("DEB: HCDP3: empty cache from " + query); request.cachedResult = null; } } } //TESTED (by hand) // Need to set this 1) to avoid the harvest controller from deduplicating and 2) so i can retrieve those docs if (null == docWrapper) { // Go ahead and harvest these docs doc.setSourceUrl(doc.getUrl()); doc.setId(new ObjectId()); doc.setSourceKey(request.endpointInfo.parentSource.getKey()); doc.setSource(request.endpointInfo.parentSource.getTitle()); doc.setMediaType(request.endpointInfo.parentSource.getMediaType()); doc.setTitle(request.endpointInfo.titlePrefix); Date d = new Date(); doc.setCreated(d); doc.setModified(d); docWrapper = new ArrayList<DocumentPojo>(1); ArrayList<DocumentPojo> dummyDocs = new ArrayList<DocumentPojo>(0); docWrapper.add(doc); try { HarvestController hc = new HarvestController(); HarvestControllerPipeline hcp = new HarvestControllerPipeline(); hcp.extractSource_preProcessingPipeline(request.endpointInfo.parentSource, hc); hcp.enrichSource_processingPipeline(request.endpointInfo.parentSource, 
docWrapper, dummyDocs, dummyDocs); //DEBUG if (_DEBUG) _logger.debug("DEB: HCDP4: created " + docWrapper.size() + " doc(s)"); } catch (IOException e) { throw new RuntimeException("Complex source processing", e); } } request.complexSourceProcResults = docWrapper; } public static void handleComplexDocCaching(FederatedRequest request, boolean cacheMode, ScoringUtils scoreStats) { List<DocumentPojo> docWrapper = request.complexSourceProcResults; //In non-test mode .. Go through the list of docs and work out what the deal is with caching, ie remove docs + add update ids //Also go through and set default scores for any entities that haven't been scored based on existing docs BasicDBObject query = new BasicDBObject(DocumentPojo.sourceKey_, request.endpointInfo.parentSource.getKey()); BasicDBObject fields = new BasicDBObject(DocumentPojo.updateId_, 1); // (ie _id and updateId only) String srcUrl = null; for (DocumentPojo outDoc : docWrapper) { if (null == srcUrl) { srcUrl = outDoc.getSourceUrl(); } // Always make the text non-transient, so gets stored outDoc.makeFullTextNonTransient(); if (null == outDoc.getId()) { outDoc.setId(new ObjectId()); } if (cacheMode && (null == request.cachedResult)) { // (if result not previously cached) // Step 1: deduplication query.put(DocumentPojo.url_, outDoc.getUrl()); BasicDBObject outVal = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(query, fields); if (null != outVal) { //DEBUG if (_DEBUG) _logger.debug("DEB: HCDC1: update cache from : " + outVal + " for " + outDoc.getUrl()); // Use updateId if it exists, otherwise _id ObjectId updateId = outVal.getObjectId(DocumentPojo.updateId_); if (null == updateId) { updateId = outVal.getObjectId(DocumentPojo._id_); } outDoc.setUpdateId(updateId); } } //TESTED (by hand - single and multiple docs mode) // Step 2: add fake scores to all the entities that didn't get scores from the aggregation manager if (null != outDoc.getEntities()) for (EntityPojo ent : outDoc.getEntities()) { 
boolean fakeStats = true; if (null != scoreStats) { if (scoreStats.fillInEntityStatistics(ent)) { fakeStats = false; } } if (fakeStats) { ent.setDoccount(1L); ent.setTotalfrequency(1L); ent.setDatasetSignificance(10.0); ent.setSignificance(10.0); ent.setQueryCoverage(100.0); } //DEBUG if (_DEBUG) _logger.debug( "DEB: HCDC2: entity: " + ent.getIndex() + " , sig=" + ent.getDatasetSignificance()); } //TESTED if (null != outDoc.getAssociations()) for (AssociationPojo assoc : outDoc.getAssociations()) { assoc.setAssoc_sig(10.0); assoc.setDoccount(1L); } } //TESTED (by hand - overlapping and non-overlapping case) if (cacheMode && (null == request.cachedResult)) { // (if result not previously cached) //Remove old docs now we have new ones DbManager.getDocument().getMetadata().remove(query); // remove everything with this specific URL (ie simple source) query.remove(DocumentPojo.url_); query.put(DocumentPojo.sourceUrl_, srcUrl); DbManager.getDocument().getMetadata().remove(query); // remove everything with this specific _source_ URL (ie docs generated from this URL) // Now cache all the existing docs: @SuppressWarnings("unchecked") ArrayList<Object> tmpDocList = (ArrayList<Object>) DocumentPojo.listToDb(docWrapper, DocumentPojo.listType()); DbManager.getDocument().getMetadata().insert(tmpDocList.toArray(new BasicDBObject[0])); //DEBUG if (_DEBUG) _logger.debug("DEB: HCDC3: remove/insert cache: " + query.toString()); } //TESTED (by hand - single and multiple docs) }//TESTED /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// // TEST UTILITIES // Check 1.x test results public void test_CheckIfQueryLaunched(String testName, boolean shouldWork) { FederatedRequest request = null; if (null != this._asyncRequestsPerQuery) { if (!this._asyncRequestsPerQuery.isEmpty()) { request = 
this._asyncRequestsPerQuery.peek();
                // (the test harness expects at most one outstanding request per query)
                if (this._asyncRequestsPerQuery.size() > 1) {
                    System.out.println("*** " + testName + ": + too many async requests");
                    System.exit(-1);
                }
            }
        }
        if (shouldWork) {
            // A request should have been queued
            if (null == request) {
                System.out.println("*** " + testName + ": Expected request, got none");
                System.exit(-1);
            }
        }
        else {
            // If anything was queued it must not have been launched (no response future), and it
            // must be backed by either a cached result or a cached doc
            if (null != request) {
                if (null != request.responseFuture) {
                    System.out.println("*** " + testName + ": incorrectly added " + request.requestParameter);
                    System.exit(-1);
                }
                else if ((null == request.cachedResult) && (null == request.cachedDoc)) {
                    System.out.println("*** " + testName + ": no request, not cached " + request.cachedResult);
                    System.exit(-1);
                }
            }
        }
    }//TESTED (by hand - who tests the tester?!)

    // Test utility: checks that the query produced the expected federated document (first element of
    // docs) and that the API response was cached. Prints a "*** <testName>: ..." message and exits
    // the JVM with status -1 on the first failed check.
    // Note: removes the _id, created, modified and publishedDate fields from the first doc in place.
    public void test_CheckIfDocAdded(String testName, List<BasicDBObject> docs) {

        // 1) Did a doc get added?
        // (exactly two entries are expected in the list)

        if (2 != docs.size()) {
            System.out.println("*** " + testName + ": didn't add doc? " + docs.size());
            System.exit(-1);
        }

        // 2) Was it the right doc?

        // (strip the fields that are generated per run - a new ObjectId and the current dates -
        //  so that the rest can be compared against a fixed string)
        BasicDBObject doc = docs.get(0);
        doc.remove(DocumentPojo._id_);
        doc.remove(DocumentPojo.created_);
        doc.remove(DocumentPojo.modified_);
        doc.remove(DocumentPojo.publishedDate_);

        // NOTE(review): this is compared character-for-character against doc.toString(), so the
        // whitespace inside the literal matters - confirm it matches the driver/Gson output exactly.
        String docToCheck = "{ \"displayUrl\" : \"http://test3_1\" , \"url\" : \"inf://federated/fakeendpoint.123/test3_1/testentityin\" , \"sourceKey\" : [ \"fakeendpoint.123\"] , \"source\" : [ \"fakeendpoint\"] , \"communityId\" : [ \"4c927585d591d31d7b37097a\"] , \"mediaType\" : [ \"Report\"] , \"metadata\" : { \"json\" : [ { \"test\" : { \"field\" : [ \"test3_1\" , \"test3_1\"] , \"field2\" : \"http://test3_1\"}}]} , \"title\" : \"fake endpoint: : test3_1: test3_1\" , \"entities\" : [ { \"disambiguated_name\" : \"test3_1\" , \"type\" : \"TestEntityOut\" , \"dimension\" : \"What\" , \"relevance\" : 1.0 , \"doccount\" : 1 , \"averageFreq\" : 1.0 , \"datasetSignificance\" : 10.0 , \"significance\" : 10.0 , \"frequency\" : 1.0 , \"index\" : \"test3_1/testentityout\" , \"queryCoverage\" : 100.0 , \"totalfrequency\" : 1.0}] , \"description\" : \"[\\n {\\n \\\"test\\\": {\\n \\\"field\\\": [\\n \\\"test3_1\\\",\\n \\\"test3_1\\\"\\n ],\\n \\\"field2\\\": \\\"http://test3_1\\\"\\n }\\n }\\n]\" , \"aggregateSignif\" : 115.0 , \"queryRelevance\" : 105.0 , \"score\" : 110.0}";

        if (!docToCheck.equals(doc.toString())) {
            System.out.println(
                    "*** " + testName + ": document incorrect:\n" + docToCheck + "\nVS\n" + doc.toString());
            System.exit(-1);
        }

        // 3) Did the doc get cached?

        DBCollection endpointCacheCollection = getCacheCollection();

        // (the cache is keyed by the endpoint URL that was actually requested)
        String hardwiredCacheId = "http://localhost:8186/test/test3_1/?param1=test3_1";
        BasicDBObject cachedVal = (BasicDBObject) endpointCacheCollection
                .findOne(new BasicDBObject(SimpleFederatedCache._id_, hardwiredCacheId));

        if (null == cachedVal) {
            System.out.println(
                    "*** " + testName + ": no cache for: " + doc.get(DocumentPojo.url_) + " / " + hardwiredCacheId);
            System.exit(-1);
        }
        else {
            // Fails if there is no expiry date, or if it is more than 10 seconds in the past
            Date expiryDate = cachedVal.getDate(SimpleFederatedCache.expiryDate_, null);
            if ((null == expiryDate) || ((new Date().getTime() - expiryDate.getTime()) > 10 * 1000L)) {
                System.out.println("*** " + testName + ": expiry date for: " + doc.get(DocumentPojo.url_) + ": "
                        + expiryDate + " vs now= " + new Date());
                System.exit(-1);
            }
            BasicDBObject cachedJson = (BasicDBObject) cachedVal.get(SimpleFederatedCache.cachedJson_);
            // The first element of metadata.json is read either from an Object[] or, if that cast
            // fails, from a Collection
            Object docVal = null;
            try {
                Object[] docVals = (Object[]) ((BasicDBObject) doc.get(DocumentPojo.metadata_)).get("json");
                docVal = docVals[0];
            }
            catch (Exception e) {
                @SuppressWarnings("rawtypes")
                Collection docVals = (Collection) ((BasicDBObject) doc.get(DocumentPojo.metadata_)).get("json");
                docVal = docVals.iterator().next();

            }
            // The cached JSON must equal the first metadata.json element of the returned doc
            if ((null == cachedJson) || !cachedJson.equals(docVal)) {
                System.out.println("*** " + testName + ": cache: " + doc.get(DocumentPojo.url_)
                        + ": cached val incorrect:\n" + docVal + "\nVS\n" + cachedJson);
                System.exit(-1);
            }
        }
    }//TESTED (by hand)

    ///////////////////////////////////////////////////////////////////////////////

    // (general test utils)

    // Reset state
// ready for next test

    // Waits for every outstanding request to finish (closing its HTTP client), forgets them all,
    // and optionally empties the API cache collection.
    public void test_queryClear(boolean clearCacheAlso) {
        final boolean haveRequests = (null != this._asyncRequestsPerQuery);
        if (haveRequests) {
            for (FederatedRequest pendingRequest : this._asyncRequestsPerQuery) {
                try {
                    pendingRequest.responseFuture.get();
                    if (null != pendingRequest.asyncClient) {
                        pendingRequest.asyncClient.close();
                    }
                }
                catch (Exception e) {
                    // (best effort - move on to the next request)
                }
            }
            this._asyncRequestsPerQuery.clear();
        }
        if (!clearCacheAlso) {
            return;
        }
        getCacheCollection().remove(new BasicDBObject());
    }//TESTED (by hand - who tests the tester?!)

    // Rewrites every entry of the API cache so that it expired an hour ago.
    public void test_cacheExpire() {
        final DBCollection cacheCollection = getCacheCollection();
        for (DBObject rawEntry : cacheCollection.find()) {
            final BasicDBObject entry = (BasicDBObject) rawEntry;
            final Date anHourAgo = new Date(new Date().getTime() - 3600L * 1000L); // (ie expired an hour ago)
            entry.put(SimpleFederatedCache.expiryDate_, anHourAgo);
            cacheCollection.save(entry);
        }
    }

    // Switches the engine to the test cache collection, then optionally removes the documents of
    // the given source key from the document store and/or empties the (test) API cache.
    public void test_cacheClear(boolean clearApiCache, boolean clearDocCache, String key) {
        // (DO NOT COMMENT THIS OUT)
        SimpleFederatedQueryEngine.TEST_MODE_ONLY = true;

        if (clearDocCache) {
            // Clear doc_metadata table before running
            final BasicDBObject docsOfSource = new BasicDBObject(DocumentPojo.sourceKey_, key);
            MongoDbManager.getDocument().getMetadata().remove(docsOfSource);
        }
        if (clearApiCache) {
            // (note: this only deletes the test collection because of TEST_MODE_ONLY code)
            final DBCollection testCache = SimpleFederatedQueryEngine.getCacheCollection();
            testCache.remove(new BasicDBObject()); // (clear test collection for testing)
        }
    }

    // Optionally fills the API cache with one more already-expired entry than the cleanse size
    // (and backdates the last size check by just over 10 minutes), then verifies the number of
    // entries: at least the cleanse size if shouldBeFull, otherwise exactly one.
    // Prints a "*** <testName>: ..." message and exits the JVM with status -1 on failure.
    public void test_cacheFill(String testName, boolean fill, boolean shouldBeFull) {
        final DBCollection cacheCollection = getCacheCollection();
        if (fill) {
            for (long i = 0; i < (1 + SimpleFederatedCache.QUERY_FEDERATION_CACHE_CLEANSE_SIZE); ++i) {
                final SimpleFederatedCache expiredEntry = new SimpleFederatedCache();
                expiredEntry.expiryDate = new Date(new Date().getTime() - 3600L * 1000L); // (ie expired an hour ago)
                expiredEntry._id = testName + "_" + i;
                expiredEntry.cachedJson = new BasicDBObject();
                cacheCollection.save(expiredEntry.toDb());
            }
            _lastChecked = new Date().getTime() - 602L * 1000L;
        }

        final long count = cacheCollection.count();
        if (shouldBeFull && (count < SimpleFederatedCache.QUERY_FEDERATION_CACHE_CLEANSE_SIZE)) {
            System.out.println("*** " + testName + ": cache should just contain many elements, not: " + count);
            System.exit(-1);
        }
        if (!shouldBeFull && (1 != count)) {
            System.out.println("*** " + testName + ": cache should just contain one element, not: " + count);
            System.exit(-1);
        }
    }
}