Usage examples for com.mongodb.DBCollection.getDBDecoderFactory()
public synchronized DBDecoderFactory getDBDecoderFactory()
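Both examples below follow the same save/override/restore pattern: capture the factory currently installed on the collection via getDBDecoderFactory(), swap in a stateful custom decoder for the duration of one query, and restore the saved factory in a finally block, since the decoder factory is collection-wide state shared by every user of that DBCollection. A minimal sketch of the pattern, assuming the legacy com.mongodb driver (the database and collection names are illustrative):

import com.mongodb.DBCollection;
import com.mongodb.DBDecoderFactory;
import com.mongodb.DBObject;
import com.mongodb.DefaultDBDecoder;
import com.mongodb.MongoClient;

public class DecoderFactorySwapExample {
    public static void main(String[] args) {
        MongoClient client = new MongoClient(); // assumes a local mongod
        DBCollection coll = client.getDB("test").getCollection("docs");

        // Save the factory currently in effect so it can be restored later.
        DBDecoderFactory savedFactory = coll.getDBDecoderFactory();
        try {
            // Install a replacement factory; a real caller would supply a
            // stateful decoder (size-tracking, CSV-generating, ...) as the
            // examples below do. DefaultDBDecoder.FACTORY is just a stand-in.
            coll.setDBDecoderFactory(DefaultDBDecoder.FACTORY);
            for (DBObject doc : coll.find()) {
                // ... every document here was decoded by the installed factory ...
            }
        } finally {
            // Restore the saved factory: leaving a custom decoder installed
            // would silently affect unrelated callers of this collection.
            coll.setDBDecoderFactory(savedFactory);
        }
        client.close();
    }
}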
From source file: com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.java
License: Open Source License
@SuppressWarnings("unchecked")
private void stage1_initialCountingLoop(DBCursor docs, AdvancedQueryPojo.QueryScorePojo scoreParams,
        int toReturn, StatisticsPojo scores, LinkedList<BasicDBObject> standaloneEventsReturn, int nCommunities) {
    double s0_nQuerySubsetDocCountInv = 1.0 / (double) _s0_nQuerySubsetDocCount;

    // Some memory management:
    DBCollection dbc = MongoDbManager.getDocument().getMetadata();
    DBDecoderFactory defaultDecoder = dbc.getDBDecoderFactory();
    try {
        SizeReportingBasicBSONDecoder sizeReportingDecoder = new SizeReportingBasicBSONDecoder();
        dbc.setDBDecoderFactory(sizeReportingDecoder);
        long currMemUsage = 0;
        int ndocs = 0;
        long lastBatch = 0L;
        long initialUnusedMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory();
        long initialFreeMemory = Runtime.getRuntime().freeMemory();
        for (DBObject f0 : docs) {
            BasicDBObject f = (BasicDBObject) f0;
            long newMemUsage = sizeReportingDecoder.getSize();
            if ((newMemUsage - currMemUsage) > 0) { // check every batch
                long now = new Date().getTime();
                //DEBUG
                //logger.warn(ndocs + " : " + (now - lastBatch) + " : " + newMemUsage + " VS " + Runtime.getRuntime().maxMemory() + " UNUSED " + (Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory()) + " FREE " + Runtime.getRuntime().freeMemory());

                // Check vs total memory:
                long runtimeMem = Runtime.getRuntime().maxMemory();
                // note newMemUsage is the input memory ... gets expanded ~6x by the BSON-ification, allowed at most 1/4 of memory...
                // Also if we're taking more than 20s for a batch then limp over the limit and exit...
                if (((newMemUsage * 24) > runtimeMem)
                        || (((now - lastBatch) > 20000L) && (ndocs >= toReturn))) {
                    long finalUnusedMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory();
                    long finalFreeMemory = Runtime.getRuntime().freeMemory();
                    logger.error("Query truncated memUsage=" + newMemUsage + ", memory=" + runtimeMem
                            + ", docs=" + ndocs + ", totaldocs=" + scores.found
                            + ", init_free_mem=" + initialFreeMemory + ", end_free_mem=" + finalFreeMemory
                            + ", init_unused_mem=" + initialUnusedMemory + ", end_unused_mem=" + finalUnusedMemory);
                    break;
                } //TESTED
                currMemUsage = newMemUsage;
                lastBatch = now;
            } //TESTED
            ndocs++;

            // Simple handling for standalone events
            if ((null != _s0_standaloneEventAggregator) && !_s0_bNeedToCalcSig) {
                //if _s0_bNeedToCalcSig then do this elsewhere
                ScoringUtils_Associations.addStandaloneEvents(f, 0.0, 0, _s0_standaloneEventAggregator,
                        _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter,
                        _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts);
            } //TESTED

            if (!_s0_bNeedToCalcSig) {
                continue;
            } //TESTED

            if (nCommunities > 1) { // (could have pan-community entities)
                ObjectId communityId = (ObjectId) f.get(DocumentPojo.communityId_);
                if (null != communityId) { // (have big problems if so, but anyway!)
                    int retval = _s0_multiCommunityHandler.community_getIdAndInitialize(communityId,
                            _s1_entitiesInDataset);
                    // (returns an int community id but also sets it into the cache, so just use that below)
                    if (Integer.MIN_VALUE == retval) {
                        //this document cannot be viewed from within this set of communities
                        continue;
                    }
                }
            } //TESTED

            TempDocBucket docBucket = new TempDocBucket();
            docBucket.dbo = f;
            ObjectId id = (ObjectId) f.get(DocumentPojo._id_);

            // If we're going to weight relevance in, or we need the geo temporal decay:
            if ((0 != scoreParams.relWeight) || (null != scoreParams.timeProx) || (null != scoreParams.geoProx)) {
                StatisticsPojo.Score scoreObj = scores.getScore().get(id);
                if (null != scoreObj) {
                    docBucket.explain = scoreObj.explain; // (will normally be null)
                    docBucket.luceneScore = scoreObj.score;
                    if ((null != scoreParams.timeProx) || (null != scoreParams.geoProx)) {
                        if (scoreObj.decay >= 0.0) {
                            docBucket.geoTemporalDecay = scoreObj.decay;
                        }
                        // (see also below for low accuracy geo scoring)
                    }
                } else {
                    docBucket.luceneScore = 1.0;
                }
            } //TESTED
            else if (this._s0_sortingByDate) {
                StatisticsPojo.Score scoreObj = scores.getScore().get(id);
                if (null != scoreObj) {
                    docBucket.nLuceneIndex = scoreObj.nIndex;
                }
            }
            docBucket.manualWeighting = this.getManualScoreWeights(scoreParams, f);

            BasicDBList l = (BasicDBList) (f.get(DocumentPojo.entities_));
            if (null != l) {
                long nEntsInDoc = l.size();
                double dBestGeoScore = 0.0; // (for low accuracy geo only)
                for (Iterator<?> e0 = l.iterator(); e0.hasNext();) {
                    BasicDBObject e = (BasicDBObject) e0.next();

                    BasicDBObject tmpGeotag = null;
                    if (_s3_bLowAccuracyGeo || (null != _s1_dManualGeoDecay_latLonInvdecay)) {
                        // low accuracy geo, need to look for geotag
                        tmpGeotag = (BasicDBObject) e.get(EntityPojo.geotag_);
                    }

                    // Get attributes
                    double freq = -1.0;
                    long ntotaldoccount = -1;
                    String entity_index;
                    Double sentiment = null;
                    try {
                        sentiment = (Double) e.get(EntityPojo.sentiment_);
                        ntotaldoccount = e.getLong(EntityPojo.doccount_);
                        freq = e.getDouble(EntityPojo.frequency_);
                        entity_index = e.getString(EntityPojo.index_);
                        if (null == entity_index) {
                            // Just bypass the entity
                            e.put(EntityPojo.significance_, 0.0);
                            nEntsInDoc--;
                            continue;
                        }
                    } catch (Exception ex) {
                        try {
                            String sfreq;
                            if (ntotaldoccount < 0) {
                                sfreq = e.getString(EntityPojo.doccount_);
                                ntotaldoccount = Long.valueOf(sfreq);
                            }
                            if (freq < -0.5) {
                                sfreq = e.getString(EntityPojo.frequency_);
                                freq = Long.valueOf(sfreq).doubleValue();
                            }
                            entity_index = e.getString(EntityPojo.index_);
                            if (null == entity_index) {
                                // Just bypass the entity
                                e.put(EntityPojo.significance_, 0.0);
                                nEntsInDoc--;
                                continue;
                            }
                        } catch (Exception e2) {
                            // Just bypass the entity
                            e.put(EntityPojo.significance_, 0.0);
                            nEntsInDoc--;
                            continue;
                        }
                    } //TESTED

                    // First loop through is just counting

                    // Retrieve entity (create/initialize if necessary)
                    EntSigHolder shp = _s1_entitiesInDataset.get(entity_index);
                    if (null == shp) {
                        if (ntotaldoccount > (long) _s0_globalDocCount) {
                            // obviously can't have more entities-in-docs than docs...
                            ntotaldoccount = (long) _s0_globalDocCount;
                        }
                        shp = new EntSigHolder(entity_index, ntotaldoccount, _s0_multiCommunityHandler);

                        // Stage 1a alias handling: set up infrastructure, calculate doc overlap
                        if (null != _s1_aliasLookup) {
                            stage1_initAlias(shp);
                        }
                        if ((null != shp.aliasInfo) && (null == shp.masterAliasSH)) {
                            // this is the discard alias
                            nEntsInDoc--;
                            continue;
                        } //TESTED

                        // Check if entity is in type filter list
                        if (null != _s0_entityTypeFilter) {
                            String entType = null;
                            if (null != shp.aliasInfo) {
                                entType = shp.aliasInfo.getType();
                            } else {
                                entType = e.getString(EntityPojo.type_);
                            }
                            if (_s0_bEntityTypeFilterPositive) {
                                if ((null != entType) && !_s0_entityTypeFilter.contains(entType.toLowerCase())) {
                                    nEntsInDoc--;
                                    continue;
                                }
                            } else if ((null != entType) && _s0_entityTypeFilter.contains(entType.toLowerCase())) {
                                //(negative filter)
                                nEntsInDoc--;
                                continue;
                            }
                        } //TESTED (end entity filter)

                        // Geo:
                        if (null != shp.aliasInfo) {
                            if (null != shp.aliasInfo.getGeotag()) { //Geo, overwrite/create tmpGeotag
                                if (_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo
                                        || (null != _s1_dManualGeoDecay_latLonInvdecay)) {
                                    // Always capture alias geo, even if not in low accuracy mode because we add it to the
                                    // legitimate geo:
                                    if ((_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo) && (null == _s3_geoBuckets)) {
                                        // Initialize the buckets if this is for aggregation not just decay
                                        _s3_geoBuckets = (LinkedList<EntSigHolder>[]) new LinkedList[_s3_nGEO_BUCKETS];
                                    }
                                    if (null == tmpGeotag) {
                                        tmpGeotag = new BasicDBObject();
                                    }
                                    tmpGeotag.put(GeoPojo.lat_, shp.aliasInfo.getGeotag().lat);
                                    tmpGeotag.put(GeoPojo.lon_, shp.aliasInfo.getGeotag().lon);
                                    if (null != shp.aliasInfo.getOntology_type()) {
                                        e.put(EntityPojo.ontology_type_, shp.aliasInfo.getOntology_type());
                                    }
                                }
                            }
                        } //TESTED (end geo for aggregation or decay)

                        _s1_entitiesInDataset.put(entity_index, shp);
                        // end Stage 1a alias handling
                    } //(end if is alias)

                    // Stage 1b alias handling: calculate document counts (taking overlaps into account)
                    if (null != shp.masterAliasSH) {
                        // Counts:
                        shp.masterAliasSH.nTotalDocCount++; // docs including overlaps
                        shp.masterAliasSH.avgFreqOverQuerySubset += freq;
                        // Keep track of overlaps:
                        if (f != shp.masterAliasSH.unusedDbo) {
                            shp.masterAliasSH.unusedDbo = f;
                            // (note this is only used in stage 1, alias.unusedDbo is re-used differently in stage 3/4)
                            shp.masterAliasSH.nDocCountInQuerySubset++; // non-overlapping docs ie < shp.nDocCountInQuerySubset
                        }
                        // Sentiment:
                        shp.masterAliasSH.positiveSentiment += shp.positiveSentiment;
                        shp.masterAliasSH.negativeSentiment += shp.negativeSentiment;
                        if (null != sentiment) {
                            shp.masterAliasSH.nTotalSentimentValues++;
                        }
                    } //TESTED (end if is alias)
                    // end Stage 1b

                    // Pan-community logic (this needs to be before the entity object is updated)
                    if (_s0_multiCommunityHandler.isActive()) {
                        _s0_multiCommunityHandler.community_updateCorrelations(shp, ntotaldoccount, entity_index);
                    } else { // (Once we've started multi-community logic, this is no longer desirable)
                        if ((ntotaldoccount > shp.nTotalDocCount) && (ntotaldoccount <= _s0_globalDocCount)) {
                            shp.nTotalDocCount = ntotaldoccount;
                        }
                        //(note there used to be some cases where we adjusted for dc/tf==0, but the
                        // underlying issue in the data model that caused this has been fixed, so it's
                        // now a pathological case that can be ignored)
                    } //(TESTED)

                    // Update counts:
                    _s1_sumFreqInQuerySubset += freq;
                    shp.avgFreqOverQuerySubset += freq;
                    shp.nDocCountInQuerySubset++;
                    shp.decayedDocCountInQuerySubset += docBucket.geoTemporalDecay;
                    // (note this doesn't handle low accuracy geo-decay ... we'll address that via a separate term)

                    TempEntityInDocBucket entBucket = new TempEntityInDocBucket();
                    entBucket.dbo = e;
                    entBucket.freq = freq;
                    entBucket.doc = docBucket;
                    shp.entityInstances.add(entBucket);

                    if (null != tmpGeotag) { // (only needed for low accuracy geo aggregation)
                        if ((_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo) && (null == shp.geotag)) {
                            // (first time for shp only)
                            shp.geotag = tmpGeotag;
                            shp.geotaggedEntity = e; // (ie for onto type, which has been overwritten in the alias case...)
                        }
                        if (null != _s1_dManualGeoDecay_latLonInvdecay) {
                            // Emulate scripted Lucene calculations
                            double minlat = tmpGeotag.getDouble(GeoPojo.lat_);
                            double minlon = tmpGeotag.getDouble(GeoPojo.lon_);
                            double paramlat = _s1_dManualGeoDecay_latLonInvdecay[0];
                            double paramlon = _s1_dManualGeoDecay_latLonInvdecay[1];
                            double gdecay = _s1_dManualGeoDecay_latLonInvdecay[2];
                            char ontCode = GeoOntologyMapping.encodeOntologyCode(e.getString(EntityPojo.ontology_type_));
                            double dDecay = QueryDecayScript.getGeoDecay(minlat, minlon, paramlat, paramlon,
                                    gdecay, ontCode);
                            if (dDecay > dBestGeoScore) {
                                dBestGeoScore = dDecay;
                            }
                        } //TESTED
                    } //(end if entity has geo and need to process entity geo)

                    if (freq > shp.maxFreq) {
                        shp.maxFreq = freq;
                    }
                    // Sentiment:
                    if ((null != sentiment) && (Math.abs(sentiment) <= 1.1)) { // (actually 1.0)
                        shp.nTotalSentimentValues++;
                        if (sentiment > 0.0) {
                            shp.positiveSentiment += sentiment;
                        } else {
                            shp.negativeSentiment += sentiment;
                        }
                    } else if (null != sentiment) { // corrupt sentiment for some reason?!
                        e.put(EntityPojo.sentiment_, null);
                    }
                    docBucket.docLength += freq;
                } //(end loop over entities)

                docBucket.nLeftToProcess = nEntsInDoc;
                docBucket.nEntsInDoc = (int) nEntsInDoc;

                if (null != this._s1_dManualGeoDecay_latLonInvdecay) { // Low accuracy geo-calculations
                    docBucket.geoTemporalDecay *= dBestGeoScore;
                    docBucket.luceneScore *= dBestGeoScore;
                    _s2_dAvgLowAccuracyGeoDecay += dBestGeoScore * s0_nQuerySubsetDocCountInv;
                } //TESTED
            } // (end if feed has entities)

            // Handle documents with no entities - can still promote them
            if (0 == docBucket.nLeftToProcess) {
                // (use this rather than doc length in case all the entities had freq 0)
                _s1_noEntityBuckets.add(docBucket);
            }
        } // (end loop over feeds)
        //TESTED
    } finally {
        dbc.setDBDecoderFactory(defaultDecoder);
    }
}
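In the example above, the sizeReportingDecoder instance is passed directly to setDBDecoderFactory() and then polled via getSize() inside the cursor loop, which implies that SizeReportingBasicBSONDecoder is both a decoder and its own DBDecoderFactory, accumulating a running byte count as documents are decoded. That Infinit.e class is not shown here; a hypothetical reconstruction of the idea, assuming the legacy driver's DefaultDBDecoder and Apache commons-io's CountingInputStream, might look like this:

import java.io.IOException;
import java.io.InputStream;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.io.input.CountingInputStream;

import com.mongodb.DBCollection;
import com.mongodb.DBDecoder;
import com.mongodb.DBDecoderFactory;
import com.mongodb.DBObject;
import com.mongodb.DefaultDBDecoder;

// Hypothetical stand-in for SizeReportingBasicBSONDecoder: counts the raw BSON
// bytes it decodes, and acts as its own factory so the driver keeps feeding
// documents through this one stateful instance.
public class SizeCountingDecoder extends DefaultDBDecoder implements DBDecoderFactory {
    private final AtomicLong bytesDecoded = new AtomicLong();

    @Override
    public DBObject decode(byte[] bytes, DBCollection collection) {
        bytesDecoded.addAndGet(bytes.length);
        return super.decode(bytes, collection);
    }

    @Override
    public DBObject decode(InputStream in, DBCollection collection) throws IOException {
        // Wrap the stream so we know how many bytes this document consumed.
        CountingInputStream counting = new CountingInputStream(in);
        DBObject result = super.decode(counting, collection);
        bytesDecoded.addAndGet(counting.getByteCount());
        return result;
    }

    @Override
    public DBDecoder create() {
        return this; // the factory hands back the same size-accumulating decoder
    }

    public long getSize() {
        return bytesDecoded.get();
    }
}

Whatever the real implementation looks like, the key design point is that create() returns the same stateful instance, which is why the size read inside the cursor loop grows monotonically across the whole query.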
From source file: com.ikanow.infinit.e.processing.custom.utils.CustomApiUtils.java
License: Apache License
public static void getJobResults(ResponsePojo rp, CustomMapReduceJobPojo cmr, int limit, String fields,
        String findStr, String sortStr, boolean bCsv) {
    BasicDBObject queryDbo = null;
    if (null != findStr) {
        queryDbo = (BasicDBObject) com.mongodb.util.JSON.parse(findStr);
    } else {
        queryDbo = new BasicDBObject();
    } //TOTEST

    BasicDBObject fieldsDbo = new BasicDBObject();
    if (null != fields) {
        fieldsDbo = (BasicDBObject) com.mongodb.util.JSON.parse("{" + fields + "}");
    }

    //return the results:

    // Need to handle sorting...
    BasicDBObject sort = null;
    if (null != sortStr) { //override
        sort = (BasicDBObject) com.mongodb.util.JSON.parse(sortStr);
    } else { //defaults
        String sortField = "_id";
        int sortDir = 1;
        BasicDBObject postProcObject = (BasicDBObject) com.mongodb.util.JSON.parse(
                InfiniteHadoopUtils.getQueryOrProcessing(cmr.query, InfiniteHadoopUtils.QuerySpec.POSTPROC));
        if (postProcObject != null) {
            sortField = postProcObject.getString("sortField", "_id");
            sortDir = postProcObject.getInt("sortDirection", 1);
        } //TESTED (post proc and no post proc)
        sort = new BasicDBObject(sortField, sortDir);
    } //TOTEST

    // Case 1: DB
    rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", true,
            "Map reduce job completed at: " + cmr.lastCompletionTime));
    if ((null == cmr.exportToHdfs) || !cmr.exportToHdfs) {
        DBCursor resultCursor = null;
        DBCollection coll = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollection);
        DBDecoderFactory defaultDecoder = coll.getDBDecoderFactory();
        CsvGeneratingBsonDecoder csvDecoder = null;
        SizeReportingBasicBSONDecoder sizeDecoder = null;
        CustomMapReduceResultPojo cmrr = new CustomMapReduceResultPojo();
        try {
            if (bCsv) {
                coll.setDBDecoderFactory((csvDecoder = new CsvGeneratingBsonDecoder()));
            } else {
                coll.setDBDecoderFactory((sizeDecoder = new SizeReportingBasicBSONDecoder()));
            }
            if (limit > 0) {
                resultCursor = coll.find(queryDbo, fieldsDbo).sort(sort).limit(limit);
            } else {
                resultCursor = coll.find(queryDbo, fieldsDbo).sort(sort);
            }
            LinkedList<BasicDBObject> list = null;
            if (!bCsv) {
                list = new LinkedList<BasicDBObject>();
            }
            final int MAX_SIZE_CSV = 80 * 1024 * 1024; //(80MB)
            final int MAX_SIZE_JSON = 80 * 1024 * 1024; //(80MB)
            while (resultCursor.hasNext()) {
                BasicDBObject x = (BasicDBObject) resultCursor.next();
                if (!bCsv) {
                    list.add(x);
                }
                if (null != csvDecoder) {
                    if (csvDecoder.getCsv().length() > MAX_SIZE_CSV) {
                        break;
                    }
                } else if (null != sizeDecoder) {
                    if (sizeDecoder.getSize() > MAX_SIZE_JSON) {
                        break;
                    }
                }
            }
            cmrr.results = list;
        } finally {
            coll.setDBDecoderFactory(defaultDecoder);
        }
        cmrr.lastCompletionTime = cmr.lastCompletionTime;
        if (null != csvDecoder) {
            StringBuffer header = new StringBuffer();
            for (String field : csvDecoder.getOrderedFields()) {
                if (0 != header.length()) {
                    header.append(',');
                }
                header.append('"');
                header.append(field.replace("\"", "\\\""));
                header.append("\"");
            }
            header.append('\n');
            header.append(csvDecoder.getCsv().toString());
            cmrr.results = header.toString();
        }
        rp.setData(cmrr);
    } //TESTED
    else { // Case 2: HDFS
        if ((null != cmr.outputKey) && (null != cmr.outputValue)
                && cmr.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text")
                && cmr.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) {
            // special case, text file
            try {
                rp.setData(HadoopUtils.getBsonFromTextFiles(cmr, limit, fields),
                        (BasePojoApiMap<BasicDBList>) null);
            } catch (Exception e) {
                rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", false,
                        "Files don't appear to be in text file format, did you run the job before changing the output to Text/Text?"));
            }
        } //TESTED
        else { // sequence file
            try {
                rp.setData(HadoopUtils.getBsonFromSequenceFile(cmr, limit, fields),
                        (BasePojoApiMap<BasicDBList>) null);
            } catch (Exception e) {
                rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", false,
                        "Files don't appear to be in sequence file format, did you run the job with Text/Text?"));
            }
        } //TESTED
    } //TESTED
}
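The CSV branch above builds the header row by hand: every field name reported by the decoder is double-quoted, embedded quotes are backslash-escaped, and the accumulated CSV body is appended afterwards. The same escaping rule, extracted as a standalone helper for clarity (the class and method names are hypothetical):

import java.util.List;

public class CsvHeaderHelper {
    // Mirrors the header construction in getJobResults: quote every field name
    // and backslash-escape any double quotes embedded in it.
    static String buildCsvHeader(List<String> orderedFields) {
        StringBuilder header = new StringBuilder();
        for (String field : orderedFields) {
            if (header.length() > 0) {
                header.append(',');
            }
            header.append('"').append(field.replace("\"", "\\\"")).append('"');
        }
        return header.append('\n').toString();
    }
}

A StringBuilder replaces the original's StringBuffer since no synchronization is needed during single-threaded assembly. Note also that backslash-escaping is nonstandard CSV (RFC 4180 doubles embedded quotes instead), so strict CSV consumers may expect the doubled-quote convention.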