List of usage examples for com.mongodb DBCursor count
public int count()
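In the legacy 2.x mongo-java-driver used throughout these examples, DBCursor.count() runs a server-side count for the cursor's query and returns the total number of matching documents; any skip()/limit() applied to the cursor is ignored (DBCursor.size() is the variant that does apply them). A minimal sketch, assuming a 2.x driver on the classpath and a mongod reachable on localhost; the test database, docs collection, and status field are made up for illustration:

import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.MongoClient;

public class DBCursorCountExample {
    public static void main(String[] args) throws Exception {
        MongoClient mongo = new MongoClient("localhost", 27017); // (assumed local mongod)
        DB db = mongo.getDB("test");                  // hypothetical database
        DBCollection coll = db.getCollection("docs"); // hypothetical collection

        // count() reports all matches, even though the cursor is capped at 10 documents
        DBCursor cursor = coll.find(new BasicDBObject("status", "active")).limit(10);
        int totalMatches = cursor.count(); // ignores limit(10)
        int returned = cursor.size();      // applies skip/limit, so at most 10
        System.out.println(totalMatches + " documents match; " + returned + " will be iterated");

        cursor.close();
        mongo.close();
    }
}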
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License:Apache License
@SuppressWarnings("unchecked") public static boolean splitPrecalculations_newShardScheme(BasicDBObject query, BasicDBObject srcTagsQuery) { // Get the communityIds from the query Collection<ObjectId> communityIds = null; try {/*from w ww . j a va 2s .c o m*/ BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_); communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_); if (null == communityIds) { return false; } } catch (Exception e) { //DEBUG //e.printStackTrace(); return false; // back out } BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_, new BasicDBObject(DbManager.in_, communityIds)); BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1); keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1); keyFields.put(SourcePojo.highestDistributionFactorStored_, 1); // Get and remove the sourceKey information, incorporate into source query, // so it's nice and simple by the time it gets to the actual query Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_); if (null != srcTagsQuery) { // Simpler case: src tags specified, so going to get a list of all the sources regardless if (null != sourceKeyQueryTerm) { keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm); } keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_)); } //TESTED (including $all to test that "$srctags":{"$all": ["tagtest","db"]} matches on tags: ["tagtest","db", "tagtest2" ] else if (null != sourceKeyQueryTerm) { boolean sourceKeyQueryComplex = false; if (sourceKeyQueryTerm instanceof BasicDBObject) { BasicDBObject sourceKeyQueryTermDbo = (BasicDBObject) sourceKeyQueryTerm; if (sourceKeyQueryTermDbo.size() <= 2) { // every term must be lt/lte/gt/gte for (String sourceKeyQueryTermEl : sourceKeyQueryTermDbo.keySet()) { if (!sourceKeyQueryTermEl.equals(DbManager.in_) && !sourceKeyQueryTermEl.equals(DbManager.lt_) && !sourceKeyQueryTermEl.equals(DbManager.lte_) && !sourceKeyQueryTermEl.equals(DbManager.gt_) && !sourceKeyQueryTermEl.equals(DbManager.gte_)) { sourceKeyQueryComplex = true; break; } //TESTED (eg ne) else if (sourceKeyQueryTermEl.equals(DbManager.in_) && (1 != sourceKeyQueryTermDbo.size())) { sourceKeyQueryComplex = true; break; } //TESTED ((lt,in)) } } //TESTED: (in, (gte,lt), ne) else { sourceKeyQueryComplex = true; } //TESTED ({ "sourceKey": { "$in": ["test"], "$gt": "alex", "$lte":"test" } }) } else if (sourceKeyQueryTerm instanceof java.util.regex.Pattern) { // probably a sourceKeyQueryComplex = true; } //TESTED ($regex) if (sourceKeyQueryComplex) { keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm); // ie we'll simplify it below } else { return false; // already have a perfectly good source key specification } } //TESTED (See combinations above) DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(keyFields); int count = dbc.count(); if (count > 5000) { // (too many source keys to process, just going to leave well alone... 
note will mean $srctags will fail open) return false; } else { ArrayList<String> sources = new ArrayList<String>(count); while (dbc.hasNext()) { BasicDBObject dbo = (BasicDBObject) dbc.next(); String sourceKey = (String) dbo.get(SourcePojo.key_); Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_); sources.addAll(SourcePojo.getDistributedKeys(sourceKey, distributionFactor)); } if (sources.isEmpty()) { throw new RuntimeException(); // will just return no splits at all, no problem } //TESTED if (1 == sources.size()) { query.put(DocumentPojo.sourceKey_, sources.get(0)); } //TESTED else { query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sources)); } //TESTED return true; } }
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java
License:Apache License
@SuppressWarnings("unchecked") public static BasicDBList splitPrecalculations_oldShardSchemeOrDebug(BasicDBObject query, BasicDBObject srcTagsQuery, int maxCountPerTask) { // Get the communityIds from the query Collection<ObjectId> communityIds = null; try {// w w w . jav a2s. com BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_); communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_); if (null == communityIds) { return null; } } catch (Exception e) { return null; // back out } BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_, new BasicDBObject(DbManager.in_, communityIds)); BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1); keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1); BasicDBObject sortFields = new BasicDBObject(SourcePojo.key_, 1); // Get and remove the sourceKey information, incorporate into source query: Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_); if (null != sourceKeyQueryTerm) { keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm); } //TESTED if (null != srcTagsQuery) { keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_)); } //TESTED DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(sortFields); // (note the sort is needed so that the potentially expensive doc query has a sensibly ordered $in clause) if (dbc.count() > 5000) { // (too many source keys to process, just going to leave well alone... note this means $srctags will fail open) return null; } else { //TreeMap<String, Long> sourceKeys = new TreeMap<String, Long>(); // Build collections of objects of format { sourceKey: string or [], totalDocs } BasicDBList sourceKeyListCollection = new BasicDBList(); BasicDBList sourceKeyList = null; int runningDocs = 0; int runningSources = 0; while (dbc.hasNext()) { BasicDBObject dbo = (BasicDBObject) dbc.next(); String sourceKey = (String) dbo.get(SourcePojo.key_); if (null != sourceKey) { long docCount = 0L; try { BasicDBObject harvestStatus = (BasicDBObject) dbo.get(SourcePojo.harvest_); if (null != harvestStatus) { docCount = harvestStatus.getLong(SourceHarvestStatusPojo.doccount_, 0L); } } catch (Exception e) { } //DEBUG //System.out.println("SOURCE=" + sourceKey + " DOC_COUNT=" + docCount + " RUNNING=" + runningDocs +"," + runningSources + ": " + sourceKeyList); if (docCount > maxCountPerTask) { // source is large enough by itself // Create collection BasicDBObject collection = new BasicDBObject(); collection.put(DocumentPojo.sourceKey_, sourceKey); collection.put(SourceHarvestStatusPojo.doccount_, docCount); sourceKeyListCollection.add(collection); // (leaving running* alone, can keep building that) } //TESTED (by eye, system community of demo cluster) else if ((runningDocs + docCount) > maxCountPerTask) { // have now got a large enough collection of sources if (null == sourceKeyList) { sourceKeyList = new BasicDBList(); } sourceKeyList.add(sourceKey); // Create collection BasicDBObject collection = new BasicDBObject(); collection.put(DocumentPojo.sourceKey_, sourceKeyList); collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount); sourceKeyListCollection.add(collection); sourceKeyList = null; runningDocs = 0; runningSources = 0; } //TESTED (by eye, system community of demo cluster) else if (runningSources >= 15) { // have a limit on the number of sources per query, to keep the queries manageable sourceKeyList.add(sourceKey); // Create collection BasicDBObject collection = new BasicDBObject(); 
collection.put(DocumentPojo.sourceKey_, sourceKeyList); collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount); sourceKeyListCollection.add(collection); sourceKeyList = null; runningDocs = 0; runningSources = 0; } //TESTED (by eye, system community of demo cluster) else { // (keep) build(ing) list if (null == sourceKeyList) { sourceKeyList = new BasicDBList(); } sourceKeyList.add(sourceKey); runningDocs += docCount; runningSources++; } //TESTED (by eye, system community of demo cluster) } //(end if has source key) } //(end loop over cursor) // Finish off: if (null != sourceKeyList) { // Create collection BasicDBObject collection = new BasicDBObject(); collection.put(DocumentPojo.sourceKey_, sourceKeyList); collection.put(SourceHarvestStatusPojo.doccount_, runningDocs); sourceKeyListCollection.add(collection); } //TESTED (by eye, system community of demo cluster) if (sourceKeyListCollection.isEmpty()) { // query returns empty throw new RuntimeException("Communities contain no sources"); } return sourceKeyListCollection; } // (end if too many source keys across the communities) }
From source file:com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi.AlchemyEntityGeoCleanser.java
License:Open Source License
public boolean cleanseGeoInDocu(DocumentPojo doc) {
    boolean bChangedAnything = false;
    Map<String, Candidate> dubiousLocations = new HashMap<String, Candidate>();
    Set<String> otherRegions = new HashSet<String>();
    Set<String> otherCountries = new HashSet<String>();
    Set<String> otherCountriesOrRegionsReferenced = new HashSet<String>();

    //Debug
    if (_nDebugLevel >= 2) {
        System.out.println("+++++++ Doc: " + doc.getTitle() + " / " + doc.getId() + " / " + doc.getEntities().size());
    }

    // 1] First off, let's find anything location-based and also determine if it's bad or not
    if (null != doc.getEntities())
        for (EntityPojo ent : doc.getEntities()) {
            boolean bStrongCandidate = false;
            // People: decompose names
            if (EntityPojo.Dimension.Where == ent.getDimension()) {
                // So locations get disambiguated to one of:
                // "<city-etc>, <region-or-country>", or "<region-or-country>"
                // though can also just be left as they are.
                String sActualName = ent.getActual_name().toLowerCase();
                if (!ent.getDisambiguatedName().toLowerCase().equals(sActualName)) {
                    // It's been disambiguated
                    //Debug
                    if (_nDebugLevel >= 3) {
                        System.out.println("disambiguous candidate: " + ent.getDisambiguatedName() + " VS "
                                + ent.getActual_name() + " ("
                                + ((null != ent.getSemanticLinks()) ? ent.getSemanticLinks().size() : 0) + ")");
                    }
                    // OK next step, is it a disambiguation to a US town?
                    String splitMe[] = ent.getDisambiguatedName().split(", ");
                    if (2 == splitMe.length) {
                        String stateOrCountry = splitMe[1];
                        Matcher m = _statesRegex.matcher(stateOrCountry);
                        if (m.find()) {
                            // This is a US disambiguation - high risk case
                            // Short cut if state is already directly mentioned?
                            stateOrCountry = stateOrCountry.toLowerCase();
                            if (!otherRegions.contains(stateOrCountry)) {
                                // See list below - no need to go any further
                                // OK next step - is it a possible ambiguity:
                                ArrayList<BasicDBObject> x = new ArrayList<BasicDBObject>();
                                BasicDBObject inner0_0 = new BasicDBObject(MongoDbManager.not_, Pattern.compile("US"));
                                BasicDBObject inner1_0 = new BasicDBObject("country_code", inner0_0);
                                x.add(inner1_0);
                                BasicDBObject inner0_1 = new BasicDBObject(MongoDbManager.gte_, 400000);
                                BasicDBObject inner1_1 = new BasicDBObject("population", inner0_1);
                                x.add(inner1_1);
                                BasicDBObject dbo = new BasicDBObject();
                                dbo.append("search_field", sActualName);
                                dbo.append(MongoDbManager.or_, x);
                                DBCursor dbc = _georefDB.find(dbo);
                                if (dbc.size() >= 1) { // Problems!
                                    //Create list of candidates
                                    Type listType = new TypeToken<LinkedList<GeoFeaturePojo>>() {}.getType();
                                    LinkedList<GeoFeaturePojo> grpl = new Gson().fromJson(dbc.toArray().toString(), listType);
                                    //Debug
                                    if (_nDebugLevel >= 2) {
                                        System.out.println("\tERROR CANDIDATE: " + ent.getDisambiguatedName() + " VS "
                                                + ent.getActual_name() + " (" + dbc.count() + ")");
                                        if (_nDebugLevel >= 3) {
                                            for (GeoFeaturePojo grp : grpl) {
                                                System.out.println("\t\tCandidate:" + grp.getCity() + " / "
                                                        + grp.getRegion() + " / " + grp.getCountry());
                                            }
                                        }
                                    }
                                    Candidate candidate = new Candidate(ent, grpl, stateOrCountry);
                                    dubiousLocations.put(ent.getIndex(), candidate);
                                    bStrongCandidate = true;
                                } // if strong candidate
                            } //TESTED ("reston, virginia" after "virginia/stateorcounty" mention)
                            // (end if can't shortcut past all this)
                        } // end if a US town
                    } // end if in the format "A, B"
                } // if weak candidate
                //TESTED

                if (!bStrongCandidate) { // Obv can't count on a disambiguous candidate:
                    String type = ent.getType().toLowerCase();
                    if (type.equals("stateorcounty")) {
                        String disName = ent.getDisambiguatedName().toLowerCase();
                        if (_abbrStateRegex.matcher(disName).matches()) {
                            otherRegions.add(getStateFromAbbr(disName));
                        } else {
                            otherRegions.add(ent.getDisambiguatedName().toLowerCase());
                        }
                        otherCountriesOrRegionsReferenced.add("united states");
                    } //TESTED: "mich./stateorcounty"
                    else if (type.equals("country")) {
                        String disName = ent.getDisambiguatedName().toLowerCase();
                        // Translation of known badly transcribed countries:
                        // (England->UK)
                        if (disName.equals("england")) {
                            otherCountries.add("united kingdom");
                        } //TESTED
                        else {
                            otherCountries.add(ent.getDisambiguatedName().toLowerCase());
                        }
                    } else if (type.equals("region")) {
                        otherRegions.add(ent.getDisambiguatedName().toLowerCase());
                    } else if (type.equals("city")) {
                        String splitMe[] = ent.getDisambiguatedName().split(",\\s*");
                        if (2 == splitMe.length) {
                            otherCountriesOrRegionsReferenced.add(splitMe[1].toLowerCase());
                            if (this._statesRegex.matcher(splitMe[1]).find()) {
                                otherCountriesOrRegionsReferenced.add("united states");
                            } //TESTED: "lexingon, kentucky/city"
                        }
                    }
                } //TESTED: just above clauses
            } // if location
        } // (end loop over entities)

    // Debug:
    if ((_nDebugLevel >= 3) && (!dubiousLocations.isEmpty())) {
        for (String s : otherRegions) {
            System.out.println("Strong region: " + s);
        }
        for (String s : otherCountries) {
            System.out.println("Strong countries: " + s);
        }
        for (String s : otherCountriesOrRegionsReferenced) {
            System.out.println("Weak regionscountries: " + s);
        }
    }

    // 2] The requirements and algorithm are discussed in
    // http://ikanow.jira.com/wiki/display/INF/Beta...+improving+AlchemyAPI+extraction+%28geo%29
    // Canonical cases:
    // Darfur -> Darfur, MN even though Sudan and sometimes Darfur, Sudan are present
    // Shanghai -> Shanghai, WV even though China is mentioned (and not WV)
    // Manchester -> Manchester village, NY (not Manchester, UK)
    // Philadelphia -> Philadelphia (village), NY (though NY is mentioned and not PA)
    // We're generating the following order
    // 10] Sitting tenant with strong direct
    // 15] Large city with strong direct
    // 20] Region with direct
    // 30] Large city with strong indirect
    // 40] Sitting tenant with strong indirect
    // 50] Region with indirect
    // 60] Another foreign possibility with strong direct
    // 70] Large city with weak direct
    // 72] Large city with weak indirect
    // 75] Large city with no reference
    // 78] Another foreign possibility with strong indirect (>100K population - ie not insignificant)
    // 80] Sitting tenant with any weak (US) direct or indirect
    // 90] Another foreign possibility with strong indirect
    // 100] Another foreign possibility with weak direct
    // 110] Another foreign possibility with weak indirect
    // 120] Region with no reference, if there is only 1
    // 130] Sitting tenant with none of the above (ie default)
    // 140] Anything else!
    for (Map.Entry<String, Candidate> pair : dubiousLocations.entrySet()) {
        EntityPojo ent = pair.getValue().entity;
        Candidate candidate = pair.getValue();

        // 2.1] Let's analyse the "sitting tenant"
        int nPrio = 130;
        GeoFeaturePojo currLeader = null;
        int nCase = 0; // (just for debugging, 0=st, 1=large city, 2=region, 3=other)
        if (otherRegions.contains(candidate.state)) {
            // Strong direct ref, winner!
            nPrio = 10; // winner!
        } //TESTED: "san antonio, texas/city" vs "texas"
        else if (otherCountriesOrRegionsReferenced.contains(candidate.state)) {
            // Indirect ref
            nPrio = 40; // good, but beatable...
        } //TESTED: "philadelphia (village), new york/city"
        else if (otherCountries.contains("united states")) {
            // Weak direct ref
            nPrio = 80; // better than nothing...
        } //TESTED: "apache, oklahoma/city"
        else if (otherCountriesOrRegionsReferenced.contains("united states")) {
            // Weak indirect ref
            nPrio = 80; // better than nothing...
        } //TESTED: "washington, d.c." have DC as stateorcounty, but US in countries list

        // Special case: we don't like "village":
        if ((80 != nPrio) && ent.getDisambiguatedName().contains("village")
                && !ent.getActual_name().contains("village")) {
            nPrio = 80;
        } //TESTED: "Downvoted: Philadelphia (village), New York from Philadelphia"

        // Debug
        if (_nDebugLevel >= 2) {
            System.out.println(pair.getKey() + " SittingTenantScore=" + nPrio);
        }

        // Alternatives
        if (nPrio > 10) {
            LinkedList<GeoFeaturePojo> geos = pair.getValue().candidates;
            for (GeoFeaturePojo geo : geos) {
                int nAltPrio = 140;
                int nAltCase = -1;
                String city = (null != geo.getCity()) ? geo.getCity().toLowerCase() : null;
                String region = (null != geo.getRegion()) ? geo.getRegion().toLowerCase() : null;
                String country = (null != geo.getCountry()) ? geo.getCountry().toLowerCase() : null;

                // 2.2] CASE 1: I'm a city with pop > 1M (best score 15)
                // 15] Large city with strong direct
                // 30] Large city with strong indirect
                // 70] Large city with weak direct
                // 72] Large city with weak indirect
                // 75] Large city with no reference
                if ((null != city) && (geo.getPopulation() >= 400000) && (nPrio > 15)) {
                    nAltCase = 1;
                    if ((null != region) && (otherRegions.contains(region))) {
                        nAltPrio = 15; // strong direct
                    } //TESTED: "dallas / Texas / United States = 15"
                    else if ((null != region) && (otherCountriesOrRegionsReferenced.contains(region))) {
                        nAltPrio = 30; // strong indirect
                    } //TESTED: "sacramento / California / United State"
                    else if ((null != country) && (otherCountries.contains(country))) {
                        nAltPrio = 70; // weak direct
                    } //TESTED: "berlin, germany", with "germany" directly mentioned
                    else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) {
                        nAltPrio = 72; // weak indirect
                    } //TESTED: "los angeles / California / United States = 72"
                    else {
                        nAltPrio = 75; // just for being big!
                    } //TESTED: "barcelona, spain"
                }
                // 2.3] CASE 2: I'm a region (best score=20, can beat current score)
                // 20] Region with direct
                // 50] Region with indirect
                // 120] Region with no reference, if there is only 1
                else if ((null == city) && (nPrio > 20)) {
                    nAltCase = 2;
                    if ((null != country) && (otherCountries.contains(country))) {
                        nAltPrio = 20; // strong direct
                    } //TESTED: (region) "Berlin, Germany" with "Germany" mentioned
                    else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) {
                        nAltPrio = 50; // strong indirect
                    } //(haven't seen, but we'll live)
                    else {
                        nAltPrio = 120; // (just for being there)
                    } //TESTED: "null / Portland / Jamaica = 120", also "Shanghai / China"
                }
                // 2.4] CASE 3: I'm any foreign possibility (best score=60)
                // 60] Another foreign possibility with strong direct
                // 78] Another foreign possibility with strong indirect (>100K population - ie not insignificant)
                // 90] Another foreign possibility with strong indirect
                // 100] Another foreign possibility with weak direct
                // 110] Another foreign possibility with weak indirect
                else if (nPrio > 60) {
                    nAltCase = 3;
                    if ((null != region) && (otherRegions.contains(region))) {
                        nAltPrio = 60; // strong direct
                        // Double check we're not falling into the trap below:
                        if (!geo.getCountry_code().equals("US")) {
                            Matcher m = this._statesRegex.matcher(geo.getRegion());
                            if (m.matches()) {
                                // non US state matching against (probably) US state, disregard)
                                nAltPrio = 140;
                            }
                        } //TESTED (same clause as below)
                    } //TESTED: lol "philadelphia / Maryland / Liberia = 60" (before above extra clause)
                    if (nAltPrio > 60) { // (may need to re-run test)
                        if ((null != country) && (otherCountries.contains(country))) {
                            if (geo.getPopulation() < 100000) {
                                nAltPrio = 90; // strong indirect
                            } //TESTED: "washington / Villa Clara / Cuba"
                            else {
                                nAltPrio = 78; // strong indirect, with boost!
                            } //TESTED: "geneva, Geneve, Switzerland", pop 180K
                        } else if ((null != region) && (otherCountriesOrRegionsReferenced.contains(region))) {
                            nAltPrio = 100; // weak direct
                        } //TESTED: "lincoln / Lincolnshire / United Kingdom = 100"
                        else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) {
                            nAltPrio = 110; // weak indirect
                        } //(haven't seen, but we'll live)
                    }
                }

                // Debug:
                if ((_nDebugLevel >= 2) && (nAltPrio < 140)) {
                    System.out.println("----Alternative: " + geo.getCity() + " / " + geo.getRegion() + " / "
                            + geo.getCountry() + " score=" + nAltPrio);
                }

                // Outcome of results:
                if (nAltPrio < nPrio) {
                    currLeader = geo;
                    nPrio = nAltPrio;
                    nCase = nAltCase;
                }
            } // end loop over alternatives

            if (null != currLeader) { // Need to change
                if (1 == nCase) {
                    this._nMovedToLargeCity++;
                    //(Cities are lower case in georef DB for some reason)
                    String city = WordUtils.capitalize(currLeader.getCity());
                    if (currLeader.getCountry_code().equals("US")) {
                        // Special case: is this just the original?
                        String region = currLeader.getRegion();
                        if (region.equals("District of Columbia")) { // Special special case
                            region = "D.C.";
                        }
                        String sCandidate = city + ", " + region;
                        if (!sCandidate.equals(ent.getDisambiguatedName())) {
                            ent.setDisambiguatedName(sCandidate);
                            ent.setIndex(ent.getDisambiguatedName() + "/city");
                            ent.setSemanticLinks(null);
                            bChangedAnything = true;
                        } //TESTED (lots, eg "Philadelphia (village), New York" -> "Philadelphia, PA"; Wash, Ill. -> Wash DC)
                        else {
                            this._nMovedToLargeCity--;
                            _nStayedWithOriginal++;
                        } //TESTED ("Washington DC", "San Juan, Puerto Rico")
                    } //TESTED (see above)
                    else {
                        ent.setDisambiguatedName(city + ", " + currLeader.getCountry());
                        ent.setIndex(ent.getDisambiguatedName() + "/city");
                        ent.setSemanticLinks(null);
                        bChangedAnything = true;
                    } //TESTED: "london, california/city to London, United Kingdom"
                } else if (2 == nCase) {
                    this._nMovedToRegion++;
                    ent.setDisambiguatedName(currLeader.getRegion() + ", " + currLeader.getCountry());
                    ent.setIndex(ent.getDisambiguatedName() + "/region");
                    ent.setSemanticLinks(null);
                    bChangedAnything = true;
                } //TESTED: "Moved madrid, new york/city to Madrid, Spain" (treats Madrid as region, like Berlin see above)
                else {
                    //(Cities are lower case in georef DB for some reason)
                    String city = WordUtils.capitalize(currLeader.getCity());
                    this._nMovedToForeignCity++;
                    ent.setDisambiguatedName(city + ", " + currLeader.getCountry());
                    ent.setIndex(ent.getDisambiguatedName() + "/city");
                    ent.setSemanticLinks(null);
                    bChangedAnything = true;
                } //TESTED: "Moved geneva, new york/city to Geneva, Switzerland"
                if ((_nDebugLevel >= 1) && (null == ent.getSemanticLinks())) {
                    System.out.println("++++ Moved " + pair.getKey() + " to " + ent.getDisambiguatedName());
                }
            } else {
                _nStayedWithOriginal++;
            }
        } // (if sitting tenant not holder)
    } // (end loop over candidates)

    if ((_nDebugLevel >= 1) && bChangedAnything) {
        System.out.println("\t(((Doc: " + doc.getTitle() + " / " + doc.getId() + " / " + doc.getUrl() + ")))");
    }
    return bChangedAnything;
}
From source file:com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated.java
License:Open Source License
public boolean needsUpdated_Url(Date modifiedDate, String url, SourcePojo source) {
    // Performance shortcut:
    if (!_bCalculatedMostRecentlyModifiedFile) {
        _bCalculatedMostRecentlyModifiedFile = true;
        // Get date of most recently modified file:
        try {
            if ((null != source.getHarvestStatus())
                    && (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) {
                BasicDBObject mostRecentQuery = new BasicDBObject(DocumentPojo.sourceKey_,
                        source.getDistributedKeyQueryTerm());
                if (null != source.getDistributionFactor()) { // if distributed, then apply extra term
                    if ((null != source.getHarvestStatus())
                            && (null != source.getHarvestStatus().getDistributedLastCompletedCycle())) {
                        Date d = source.getHarvestStatus().getDistributedLastCompletedCycle();
                        mostRecentQuery.put(DocumentPojo._id_, new BasicDBObject(DbManager.lte_, new ObjectId(d)));
                    }
                } //TESTED
                BasicDBObject mostRecentSort = new BasicDBObject(DocumentPojo._id_, -1);
                BasicDBObject mostRecentFields = new BasicDBObject(DocumentPojo.modified_, 1);
                if (null != source.getDistributionFactor()) { // (need the created date also
                    mostRecentFields.put(DocumentPojo.created_, 1);
                }
                DBCursor mostRecentDocs = MongoDbManager.getDocument().getMetadata()
                        .find(mostRecentQuery, mostRecentFields).sort(mostRecentSort).limit(1);
                if (mostRecentDocs.hasNext()) {
                    BasicDBObject mostRecentDocDbo = (BasicDBObject) mostRecentDocs.next();
                    _mostRecentlyModifiedFile = (Date) mostRecentDocDbo.get(DocumentPojo.modified_);
                    _mostRecentlyModifiedDocId = (ObjectId) mostRecentDocDbo.get(DocumentPojo._id_);
                } //TESTED (found docs)
                //DEBUG
                //if (null != _mostRecentlyModifiedDocId)
                //    System.out.println("DEDUP: " + mostRecentQuery + ": RESULTS IN " + new Date(_mostRecentlyModifiedDocId.getTime()));
            } //(success mode)
        } catch (Exception e) {
        } // If anything goes wrong will just check all files (slower)
    } //TESTED

    if (null != _mostRecentlyModifiedFile) { // Use short cut...
        long nMostRecentlyModifiedTime = _mostRecentlyModifiedFile.getTime() / 1000L;
        long nFileTime = modifiedDate.getTime() / 1000L;
        if (nFileTime <= nMostRecentlyModifiedTime) {
            return false;
        }
    } //TESTED
    if (null == url) { // use this call with url==null to just check the modified file...
        return true;
    }

    // No short cut, go the long way round:
    DBCollection collection = DbManager.getDocument().getMetadata();
    boolean ret = true;
    BasicDBObject query = new BasicDBObject();
    query.put(DocumentPojo.url_, url);
    query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm());
    BasicDBObject fields = new BasicDBObject(DocumentPojo.modified_, 1);
    DBCursor dbc = collection.find(query, fields).limit(2); // (will normally return 0 or 1)
    boolean foundMatch = dbc.hasNext();
    if (!foundMatch) { // if there is no record, return true
        ret = true;
    } else {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        Date oldModified = (Date) dbo.get(DocumentPojo.modified_);
        if ((modifiedDate.getTime() / 1000) != (oldModified.getTime() / 1000)) { // times don't match
            if (!dbc.hasNext()) { // 1 matching doc, different modified times so update
                ret = true;
            } //TESTED
            else {
                // Not sure about this case, multiple docs, are any of them the same? (Shouldn't ever occur)
                // (slightly slow but should be OK because not going to happen very often)
                int nCount = dbc.count();
                query.put(DocumentPojo.modified_, modifiedDate);
                ret = !(collection.find(query).limit(1).count() == nCount);
            } //TOTEST (shouldn't ever occur)
        } else {
            // Doc has same modified time so don't update
            ret = false;
        } //TESTED
    }
    return ret;
}
From source file:com.ikanow.infinit.e.harvest.extraction.document.file.InternalInfiniteFile.java
License:Open Source License
@Override
public InfiniteFile[] listFiles(Date optionalFilterDate, int maxDocsPerCycle) {
    if (_isDirectory) {
        if (_isShare) { // must be a zip file
            ArrayList<InfiniteFile> zipFiles = new ArrayList<InfiniteFile>();
            @SuppressWarnings("unchecked")
            Enumeration<net.sf.jazzlib.ZipEntry> entries = _zipView.entries();
            while (entries.hasMoreElements()) {
                net.sf.jazzlib.ZipEntry zipInfo = entries.nextElement();
                InternalInfiniteFile newFile = new InternalInfiniteFile(this, zipInfo.getName());
                zipFiles.add(newFile);
            }
            return zipFiles.toArray(new InfiniteFile[zipFiles.size()]);
        } //TESTED (3.2)
        else if (_isCustom) { // create some virtual directories eg at most 10K per "virtual directory"
            String outputDatabase = _resultObj.getString(CustomMapReduceJobPojo.outputDatabase_);
            String outputCollection = _resultObj.getString(CustomMapReduceJobPojo.outputCollection_);
            if (null == outputDatabase) {
                outputDatabase = "custommr";
            }
            DBCollection outColl = null;
            DBCursor dbc = null;
            if ((null == _virtualDirStartLimit) && (null == _virtualDirEndLimit)) { // Actual directory
                DBCollection chunks = MongoDbManager.getCollection("config", "chunks");
                StringBuffer ns = new StringBuffer(outputDatabase).append(".").append(outputCollection);
                dbc = chunks.find(new BasicDBObject("ns", ns.toString()));
                int splits = dbc.count();
                if (splits < 2) { // Nothing to do (unsharded or 1 chunk)
                    dbc.close();
                    outColl = MongoDbManager.getCollection(outputDatabase, outputCollection);
                    dbc = outColl.find();
                } //TESTED (4.2)
                else { // Create one virtual dir per split
                    InfiniteFile[] virtualDirs = new InfiniteFile[splits];
                    int added = 0;
                    for (DBObject splitObj : dbc) {
                        BasicDBObject minObj = (BasicDBObject) splitObj.get("min");
                        BasicDBObject maxObj = (BasicDBObject) splitObj.get("max");
                        ObjectId minId = null;
                        try {
                            minId = (ObjectId) minObj.get("_id");
                        } catch (Exception e) {
                        } // min key..
                        ObjectId maxId = null;
                        try {
                            maxId = (ObjectId) maxObj.get("_id");
                        } catch (Exception e) {
                        } // max key..
                        // Handle current case where custom jobs are all dumped in with the wrong _id type
                        if ((null != minId) || (null != maxId)) {
                            if ((null != maxId) && (null != optionalFilterDate)) { // (also used on the files below)
                                if (maxId.getTime() < optionalFilterDate.getTime()) {
                                    // (the "getTime()"s can overlap across chunks so we have to use minId
                                    // and accept that we'll often deserialize 1+ extra chunk every harvest)
                                    continue;
                                }
                            } //TESTED (by hand)
                            InternalInfiniteFile split = new InternalInfiniteFile(this, minId, maxId);
                            virtualDirs[added] = split;
                            added++;
                        } //TESTED (5.2.2, 6.2.2) (chunk skipping by hand)
                    }
                    dbc.close();
                    return virtualDirs;
                } //TESTED (5.2.2, 6.2.2)
            } //TESTED
            else { // Virtual directory
                BasicDBObject query = new BasicDBObject();
                if (null != _virtualDirStartLimit) {
                    if (null != optionalFilterDate) {
                        ObjectId altStartId = new ObjectId((int) (optionalFilterDate.getTime() / 1000L), 0, 0);
                        //(zero out the inc/machine ids so this query is independent to calling service)
                        if (altStartId.compareTo(_virtualDirStartLimit) > 0) { // (altStartId > _virtualDirStartLimit)
                            query.put(MongoDbManager.gte_, altStartId);
                        } else {
                            query.put(MongoDbManager.gte_, _virtualDirStartLimit);
                        }
                    } //TESTED (by hand)
                    else { // normal case
                        query.put(MongoDbManager.gte_, _virtualDirStartLimit);
                    }
                } else if (null != optionalFilterDate) {
                    // (first chunk so always overwrite with optionalFilter date if applicable)
                    ObjectId altStartId = new ObjectId((int) (optionalFilterDate.getTime() / 1000L), 0, 0);
                    query.put(MongoDbManager.gte_, altStartId);
                } //TESTED (by hand)
                if (null != _virtualDirEndLimit) {
                    query.put(MongoDbManager.lt_, _virtualDirEndLimit);
                }
                outColl = MongoDbManager.getCollection(outputDatabase, outputCollection);
                dbc = outColl.find(new BasicDBObject("_id", query)).limit(1 + maxDocsPerCycle);
            } //TESTED (6.2.2) (doc skipping by hand)

            if (null != outColl) { // has files, create the actual file objects
                //DEBUG
                //System.out.println("CHUNK: GOT " + dbc.count());
                int docCount = dbc.count();
                if (docCount > 1 + maxDocsPerCycle) {
                    docCount = 1 + maxDocsPerCycle; // (we're limiting it here anyway)
                }
                InfiniteFile[] docs = new InfiniteFile[docCount];
                int added = 0;
                for (DBObject docObj : dbc) {
                    // (if didn't use a query then apply internal filter date by hand)
                    if ((null == _virtualDirStartLimit) && (null == _virtualDirEndLimit) && (null != optionalFilterDate)) {
                        ObjectId docId = (ObjectId) docObj.get("_id");
                        if (optionalFilterDate.getTime() > docId.getTime()) {
                            continue;
                        }
                    } //TESTED
                    if (added >= maxDocsPerCycle) {
                        // (we've reached our limit so put the remaining docs in a new directory, will only be used if it has to)
                        docs[added] = new InternalInfiniteFile(this, (ObjectId) docObj.get("_id"), _virtualDirEndLimit);
                        break;
                    } else {
                        InternalInfiniteFile doc = new InternalInfiniteFile(this, (BasicDBObject) docObj);
                        docs[added] = doc;
                    } //TESTED (both cases)
                    added++;
                }
                dbc.close();
                return docs;
            } //TESTED (4.2)
        }
    } else { // can just return myself
        InfiniteFile[] retVal = new InfiniteFile[1];
        retVal[0] = this;
        return retVal;
    } //TESTED (1.2, 2.2)
    return null;
}
From source file:com.ikanow.infinit.e.utility.MongoAssociationFeatureTxfer.java
License:Apache License
private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) {
    ElasticSearchManager elasticManager = null;

    // Initialize the DB:
    DBCollection eventFeatureDB = DbManager.getFeature().getAssociation();

    // Initialize the ES (create the index if it doesn't already):
    // 1. Set-up the entity feature index
    ElasticSearchManager.setDefaultClusterName("infinite-aws");
    // (delete the index)
    //elasticManager = ElasticSearchManager.getIndex("association_index");
    //elasticManager.deleteMe();
    // Create the index if necessary
    String sMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(),
            AssociationFeaturePojoIndexMap.Mapping.class);
    Builder localSettings = ImmutableSettings.settingsBuilder();
    localSettings.put("number_of_shards", 1).put("number_of_replicas", 0);
    localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
    localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");
    elasticManager = ElasticSearchManager.createIndex("association_index", null, false, null, sMapping, localSettings);
    // Get the index (necessary if already created)
    if (null == elasticManager) {
        elasticManager = ElasticSearchManager.getIndex("association_index");
    }

    // Now query the DB:
    DBCursor dbc = null;
    dbc = eventFeatureDB.find(query);
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        }
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
        }
    }
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
        System.out.println("Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...
            return;
        }
    }

    List<AssociationFeaturePojo> events = new LinkedList<AssociationFeaturePojo>();
    int nSynced = 0;

    // Loop over array and invoke the cleansing function for each one
    while (dbc.hasNext()) {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        AssociationFeaturePojo evt = AssociationFeaturePojo.fromDb(dbo, AssociationFeaturePojo.class);

        // If this table has just been rebuilt from the document then the indexes are all wrong ...
        // recalculate and save
        if ('#' == evt.getIndex().charAt(0)) {
            AssociationPojo singleEvt = new AssociationPojo();
            singleEvt.setEntity1_index(evt.getEntity1_index());
            singleEvt.setEntity2_index(evt.getEntity2_index());
            singleEvt.setVerb_category(evt.getVerb_category());
            singleEvt.setGeo_index(evt.getGeo_index());
            evt.setIndex(AssociationAggregationUtils.getEventFeatureIndex(singleEvt));
            eventFeatureDB.update(new BasicDBObject("_id", dbo.get("_id")),
                    new BasicDBObject(MongoDbManager.set_,
                            new BasicDBObject(AssociationFeaturePojo.index_, evt.getIndex())),
                    false, true);
            // (has to be a multi-update even though it's unique because it's sharded on index)
        }

        // Handle groups (system group is: "4c927585d591d31d7b37097a")
        if (null == evt.getCommunityId()) {
            evt.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));
        }
        // Bulk add prep
        events.add(evt);
        nSynced++;

        if (events.size() > 1000) {
            elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(events, AssociationFeaturePojo.listType(),
                    new AssociationFeaturePojoIndexMap()), "_id", null, true);
            events.clear();
        }
    } // End loop over entities

    // write whatever's left
    elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(events, AssociationFeaturePojo.listType(),
            new AssociationFeaturePojoIndexMap()), "_id", null, true);

    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");
    }
}
From source file:com.ikanow.infinit.e.utility.MongoAssociationFeatureTxfer.java
License:Apache License
private void doDelete(BasicDBObject query, int nLimit) {
    try {
        // Initialize the DB:
        DBCollection eventFeatureDB = DbManager.getFeature().getAssociation();

        DBCursor cur = eventFeatureDB.find(query).limit(nLimit);
        // (this internally works in batches of 1000; just get _id)
        System.out.println("Found " + cur.count() + " records to delete");
        if (nLimit > 0) {
            System.out.println("(limited to " + nLimit + " records)");
        }
        ArrayList<AssociationFeaturePojo> events = new ArrayList<AssociationFeaturePojo>();
        LinkedList<String> eventIds = new LinkedList<String>();
        while (cur.hasNext()) {
            AssociationFeaturePojo event = AssociationFeaturePojo.fromDb(cur.next(), AssociationFeaturePojo.class);
            events.add(event);
            eventIds.add(new StringBuffer(event.getIndex()).append(":").append(event.getCommunityId()).toString());
            eventFeatureDB.remove(new BasicDBObject("index", event.getIndex()));
        }
        ElasticSearchManager elasticManager = ElasticSearchManager.getIndex("association_index");
        elasticManager.bulkDeleteDocuments(eventIds);
    } catch (NumberFormatException e) {
        e.printStackTrace();
    } catch (MongoException e) {
        e.printStackTrace();
    }
}
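A detail worth noting in the example above: cur.count() reports the total number of documents matching the query even though the cursor was built with .limit(nLimit), which is why the code prints the limit as a separate message. In the 2.x driver, DBCursor.size() is the call that takes skip/limit into account. A small sketch contrasting the two (the test database, events collection, and type field are hypothetical):

import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.MongoClient;

public class CountVsSize {
    public static void main(String[] args) throws Exception {
        DBCollection coll = new MongoClient("localhost", 27017)
                .getDB("test").getCollection("events"); // hypothetical collection

        DBCursor cur = coll.find(new BasicDBObject("type", "feature")).limit(100);
        System.out.println("Found " + cur.count() + " records to delete"); // all matches
        System.out.println("(will process " + cur.size() + " this run)");  // capped at 100
    }
}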
From source file:com.ikanow.infinit.e.utility.MongoDocumentTxfer.java
License:Apache License
private void doTransfer(BasicDBObject query, int nSkip, int nLimit, boolean bAggregate, BasicDBObject chunk)
        throws IOException {
    PropertiesManager pm = new PropertiesManager();
    int nMaxContentSize_bytes = pm.getMaxContentSize();

    // Initialize the DB:
    DBCollection docsDB = DbManager.getDocument().getMetadata();
    DBCollection contentDB = DbManager.getDocument().getContent();
    DBCollection sourcesDB = DbManager.getIngest().getSource();
    ElasticSearchManager.setDefaultClusterName("infinite-aws");

    // 1. Get the documents from the DB (combining data + metadata and refreshing source meta)
    // (Ignore soft-deleted records:)
    if (null == query) {
        query = new BasicDBObject();
    }
    Object sourceKeyQueryTerm = query.remove(DocumentPojo.sourceKey_);
    if (null != sourceKeyQueryTerm) {
        if (query.toString().contains(new StringBuffer("\"").append(DocumentPojo.sourceKey_).append('"').toString())) {
            throw new RuntimeException("Can't specify sourceKey as part of complex query term: " + query.toString());
        } //TESTED (by hand, "{ \"sourceKey\": \"x\", \"$or\": [ { \"sourceKey\": \"x\" } ] }")

        if (sourceKeyQueryTerm instanceof String) {
            query.put(DocumentPojo.sourceKey_, SourcePojo.getDistributedKeyQueryTerm((String) sourceKeyQueryTerm));
        } //TESTED (by hand, "{\"sourceKey\": \"feeds.arstechnica.com.arstechnica.index.11.2.\" }")
        else if (sourceKeyQueryTerm instanceof DBObject) {
            // find all the _sources_ matching this term, and convert to a big list including distribution
            BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1);
            fields.put(SourcePojo.highestDistributionFactorStored_, 1);
            DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.key_, sourceKeyQueryTerm), fields);
            LinkedList<String> sourceKeys = new LinkedList<String>();
            for (DBObject dbo : dbc) {
                String key = (String) dbo.get(SourcePojo.key_);
                Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
                Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key, distributionFactor);
                sourceKeys.addAll(sourceKeysForSource);
            }
            query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));
        } //TESTED (by hand, "{\"sourceKey\": { \"$gt\": \"dev.ikanow\" } }")
        else {
            throw new RuntimeException("Can't specify sourceKey as part of complex query term");
        } //(actually not possible, just included here for mathematical completeness...)
    } else {
        if (query.toString().contains(new StringBuffer("\"").append(DocumentPojo.sourceKey_).append('"').toString())) {
            throw new RuntimeException("Can't specify sourceKey as part of complex query term");
        } //TESTED (by hand, "{ \"$or\": [ { \"sourceKey\": \"x\" } ] }")

        // Optimize communityId into sourceKeys...
        if (null != query.get(DocumentPojo.communityId_)) {
            try {
                ObjectId commId = query.getObjectId(DocumentPojo.communityId_);
                BasicDBObject fields = new BasicDBObject(SourcePojo.key_, 1);
                fields.put(SourcePojo.highestDistributionFactorStored_, 1);
                DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.communityIds_, commId), fields);
                LinkedList<String> sourceKeys = new LinkedList<String>();
                int added = 0;
                for (DBObject dbo : dbc) {
                    String key = (String) dbo.get(SourcePojo.key_);
                    Integer distributionFactor = (Integer) dbo.get(SourcePojo.highestDistributionFactorStored_);
                    Collection<String> sourceKeysForSource = SourcePojo.getDistributedKeys(key, distributionFactor);
                    sourceKeys.addAll(sourceKeysForSource);
                    added += sourceKeysForSource.size();
                }
                query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));
                System.out.println("(Optimized simple community query to " + added + " source key(s))");
            } catch (Exception e) {
                //DEBUG
                //e.printStackTrace();
                System.out.println("(Can't optimize complex community query: " + e.getMessage());
            }
        } //TESTED (by hand - including distributed source version)
    }

    // Ignored delete objects
    Object urlQuery = query.get(DocumentPojo.url_);
    if (null == urlQuery) {
        query.put(DocumentPojo.url_, Pattern.compile("^[^?]")); // (ie nothing starting with ?)
    } //TESTED
    else if (urlQuery instanceof BasicDBObject) {
        ((BasicDBObject) urlQuery).append("$regex", "^[^?]");
    } //TESTED
    //DEBUG
    //System.out.println("COMBINED QUERY= " + query.toString());

    // If aggregating, kick off the background aggregation thread
    if (bAggregate) {
        EntityBackgroundAggregationManager.startThread();
        AssociationBackgroundAggregationManager.startThread();
    }

    //Debug:
    DBCursor dbc = null;
    dbc = docsDB.find(query);
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        }
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
        }
    }
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
        System.out.println("Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...
            return;
        }
    }

    byte[] storageArray = new byte[200000];
    int nSynced = 0;
    LinkedList<DocumentPojo> docsToTransfer = new LinkedList<DocumentPojo>();
    Map<ObjectId, LinkedList<DocumentPojo>> communityList = null;
    ObjectId currCommunityId = null;
    while (dbc.hasNext()) {
        BasicDBObject dbo = (BasicDBObject) dbc.next();
        DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);
        String sDocIndex = doc.getIndex();
        if (null == sDocIndex) {
            sDocIndex = "document_index";
        }
        if ((null != _deletedIndex) && !_deletedIndex.contains(sDocIndex)) {
            _deletedIndex.add(sDocIndex);
            rebuildIndex(sDocIndex);
            try { // (Just in case the index requires some time to sort itself out)
                Thread.sleep(1000);
            } catch (InterruptedException e) {
            }
        }

        //Debug:
        //System.out.println("Getting content..." + feed.getTitle() + " / " + feed.getUrl());

        // Get the content:
        if ((0 != nMaxContentSize_bytes)
                && StoreAndIndexManager.docHasExternalContent(doc.getUrl(), doc.getSourceUrl())) {
            BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
            contentQ.put(CompressedFullTextPojo.sourceKey_,
                    new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
            BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
            fields.put(CompressedFullTextPojo.sourceKey_, 1);
            DBCursor dbcGzip = contentDB.find(contentQ, fields);
            while (dbcGzip.hasNext()) {
                BasicDBObject dboContent = (BasicDBObject) dbcGzip.next();
                if (!dboContent.containsField(CompressedFullTextPojo.sourceKey_)) {
                    // If this has another version then ignore this one...
                    if (dbc.hasNext()) {
                        continue;
                    } //TESTED (by hand)
                }
                byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_));
                ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
                GZIPInputStream gzip = new GZIPInputStream(in);
                int nRead = 0;
                StringBuffer output = new StringBuffer();
                while (nRead >= 0) {
                    nRead = gzip.read(storageArray, 0, 200000);
                    if (nRead > 0) {
                        String s = new String(storageArray, 0, nRead, "UTF-8");
                        output.append(s);
                    }
                }
                doc.setFullText(output.toString());
            }
        }
        // (else document has full text already)

        // Get tags, if necessary:
        // Always overwrite tags - one of the reasons we might choose to migrate
        // Also may need source in order to support source index filtering
        SourcePojo src = _sourceCache.get(doc.getSourceKey());
        if (null == src) {
            //TODO (INF-2265): handle search index settings in pipeline mode... (also didn't seem to work?)
            BasicDBObject srcDbo = (BasicDBObject) sourcesDB.findOne(new BasicDBObject(SourcePojo.key_, doc.getSourceKey()));
            if (null != srcDbo) {
                src = SourcePojo.fromDb(srcDbo, SourcePojo.class);
                if (null != src.getProcessingPipeline()) {
                    try {
                        // Set the index settings
                        HarvestController hc = new HarvestController();
                        HarvestControllerPipeline hcPipe = new HarvestControllerPipeline();
                        hcPipe.extractSource_preProcessingPipeline(src, hc);
                    } catch (Exception e) {
                        //DEBUG
                        e.printStackTrace();
                    }
                } //TESTED (by hand)
                _sourceCache.put(doc.getSourceKey(), src);
            }
        }
        doc.setTempSource(src); // (needed for source index filtering)
        if (null != src) {
            if (null != src.getTags()) {
                Set<String> tagsTidied = new TreeSet<String>();
                for (String s : src.getTags()) {
                    String ss = s.trim().toLowerCase();
                    tagsTidied.add(ss);
                }
                // May also want to write this back to the DB:
                //TODO (INF-2223): Handle append tags or not in the pipeline...
                if ((null == src.getAppendTagsToDocs()) || src.getAppendTagsToDocs()) {
                    if ((null == doc.getTags()) || (doc.getTags().size() < tagsTidied.size())) {
                        BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_, doc.getRawSourceKey());
                        // (ie including the # if there is one)
                        updateQuery.put(DocumentPojo._id_, doc.getId());
                        docsDB.update(updateQuery, new BasicDBObject(DbManager.addToSet_,
                                new BasicDBObject(DocumentPojo.tags_, new BasicDBObject(DbManager.each_, tagsTidied))));
                    }
                    doc.setTags(tagsTidied); // (just copy ptr across)
                }
            }
        }

        // 2. Update the index with the new document
        // (Optionally also update entity and assoc features)
        if (bAggregate) {
            if (null == currCommunityId) {
                currCommunityId = doc.getCommunityId();
            } else if (!currCommunityId.equals(doc.getCommunityId())) {
                LinkedList<DocumentPojo> perCommunityDocList = null;
                if (null == communityList) { // (very first time we see > 1 community)
                    communityList = new TreeMap<ObjectId, LinkedList<DocumentPojo>>();
                    perCommunityDocList = new LinkedList<DocumentPojo>();
                    perCommunityDocList.addAll(docsToTransfer);
                    //(NOT including doc, this hasn't been added to docsToTransfer yet)
                    communityList.put(currCommunityId, perCommunityDocList);
                }
                currCommunityId = doc.getCommunityId();
                perCommunityDocList = communityList.get(currCommunityId);
                if (null == perCommunityDocList) {
                    perCommunityDocList = new LinkedList<DocumentPojo>();
                    communityList.put(currCommunityId, perCommunityDocList);
                }
                perCommunityDocList.add(doc);
            }
        } //TESTED

        nSynced++;
        docsToTransfer.add(doc);
        if (0 == (nSynced % 10000)) {
            StoreAndIndexManager manager = new StoreAndIndexManager();
            if (bAggregate) {
                // Loop over communities and aggregate each one then store the modified entities/assocs
                doAggregation(communityList, docsToTransfer);
                communityList = null; // (in case the next 10,000 docs are all in the same community!)
                currCommunityId = null;
            } //TOTEST
            manager.addToSearch(docsToTransfer);
            docsToTransfer.clear();
            System.out.println("(Synced " + nSynced + " records)");
        }
    } // (End loop over docs)

    // Sync remaining docs
    if (!docsToTransfer.isEmpty()) {
        if (bAggregate) {
            // Loop over communities and aggregate each one then store the modified entities/assocs
            doAggregation(communityList, docsToTransfer);
        }
        StoreAndIndexManager manager = new StoreAndIndexManager();
        manager.addToSearch(docsToTransfer);
    }
    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");
    }
    if (bAggregate) {
        System.out.println("Completed. You can hit CTRL+C at any time.");
        System.out.println("By default it will keep running for 5 minutes while the background aggregation runs to update the documents' entities.");
        try {
            Thread.sleep(300000);
        } catch (InterruptedException e) {
        }
        // Turn off so we can exit
        EntityBackgroundAggregationManager.stopThreadAndWait();
        AssociationBackgroundAggregationManager.stopThreadAndWait();
    }
}
From source file:com.ikanow.infinit.e.utility.MongoDocumentTxfer.java
License:Apache License
private void doDelete(BasicDBObject query, int nLimit) {
    try {
        // Get the documents to delete
        BasicDBObject queryFields = new BasicDBObject(DocumentPojo.sourceKey_, 1);
        queryFields.put(DocumentPojo.sourceUrl_, 1);
        queryFields.put(DocumentPojo.url_, 1);
        queryFields.put(DocumentPojo.communityId_, 1);
        queryFields.put(DocumentPojo.index_, 1);

        DBCursor cur = DbManager.getDocument().getMetadata().find(query, queryFields).limit(nLimit);
        // (this internally works in batches of 1000)
        System.out.println("Found " + cur.count() + " records to delete");
        if (nLimit > 0) {
            System.out.println("(limited to " + nLimit + " records)");
        }
        List<DocumentPojo> docs = DocumentPojo.listFromDb(cur, DocumentPojo.listType());

        // Keep track of number of docs per community getting deleted
        Map<ObjectId, Integer> communityMap = new HashMap<ObjectId, Integer>();
        Map<String, Integer> sourceKeyMap = new HashMap<String, Integer>();
        for (DocumentPojo doc : docs) {
            if (null != doc.getSourceKey()) { // (can only happen by error, still)
                ObjectId community = doc.getCommunityId();
                Integer count = communityMap.get(community);
                communityMap.put(community, (count == null ? 1 : count + 1));
                String sourceKey = doc.getSourceKey();
                Integer count2 = sourceKeyMap.get(sourceKey);
                sourceKeyMap.put(sourceKey, (count2 == null ? 1 : count2 + 1));
            }
        }
        StoreAndIndexManager dataStore = new StoreAndIndexManager();
        dataStore.removeFromDatastore_byURL(docs, null);
        AggregationManager.updateEntitiesFromDeletedDocuments(dataStore.getUUID());
        dataStore.removeSoftDeletedDocuments();
        AggregationManager.updateDocEntitiesFromDeletedDocuments(dataStore.getUUID());

        // Actually update the DB counts:
        for (Map.Entry<ObjectId, Integer> communityInfo : communityMap.entrySet()) {
            System.out.println("Removed " + communityInfo.getValue() + " records from community " + communityInfo.getKey());
            DbManager.getDocument().getCounts().update(new BasicDBObject("_id", communityInfo.getKey()),
                    new BasicDBObject("$inc", new BasicDBObject("doccount", -communityInfo.getValue())));
        }
        for (Map.Entry<String, Integer> sourceInfo : sourceKeyMap.entrySet()) {
            System.out.println("Removed " + sourceInfo.getValue() + " records from source " + sourceInfo.getKey());
            DbManager.getIngest().getSource().update(new BasicDBObject("key", sourceInfo.getKey()),
                    new BasicDBObject("$inc", new BasicDBObject("harvest.doccount", -sourceInfo.getValue())));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:com.ikanow.infinit.e.utility.MongoEntityFeatureTxfer.java
License:Apache License
private void doTransfer(BasicDBObject query, int nSkip, int nLimit, BasicDBObject chunk) {
    ElasticSearchManager elasticManager = null;

    // Initialize the DB:
    DBCollection entityFeatureDB = DbManager.getFeature().getEntity();

    // Initialize the ES (create the index if it doesn't already):
    // 1. Set-up the entity feature index
    String indexName = "entity_index";
    ElasticSearchManager.setDefaultClusterName("infinite-aws");
    // (delete the index)
    //elasticManager = ElasticSearchManager.getIndex(indexName);
    //elasticManager.deleteMe();
    // Create the index if necessary
    String sMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(), EntityFeaturePojoIndexMap.Mapping.class);
    Builder localSettings = ImmutableSettings.settingsBuilder();
    localSettings.put("number_of_shards", 1).put("number_of_replicas", 0);
    localSettings.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
    localSettings.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");
    elasticManager = ElasticSearchManager.createIndex(indexName, null, false, null, sMapping, localSettings);
    // Get the index (necessary if already created)
    if (null == elasticManager) {
        elasticManager = ElasticSearchManager.getIndex(indexName);
    }

    // Now query the DB:
    DBCursor dbc = null;
    dbc = entityFeatureDB.find(query);
    if (null != chunk) {
        if (chunk.containsField(DbManager.min_)) {
            dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
        }
        if (chunk.containsField(DbManager.max_)) {
            dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
        }
    }
    dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
    if (null == chunk) {
        int nCount = dbc.count() - nSkip;
        if (nCount < 0)
            nCount = 0;
        System.out.println("Found " + nCount + " records to sync, process first " + (0 == nLimit ? nCount : nLimit));
        if (0 == nCount) { // Nothing to do...
            return;
        }
    }

    int nSynced = 0;
    List<EntityFeaturePojo> entities = new ArrayList<EntityFeaturePojo>();
    while (dbc.hasNext()) {
        EntityFeaturePojo feature = EntityFeaturePojo.fromDb(dbc.next(), EntityFeaturePojo.class);

        if (null != feature.getAlias()) { // (some corrupt gazetteer entry)
            // Handle groups (system group is: "4c927585d591d31d7b37097a")
            // if there is no community id, add system group (something is wrong if this happens?)
            if (null == feature.getCommunityId()) {
                feature.setCommunityId(new ObjectId("4c927585d591d31d7b37097a"));
            }
        }
        entities.add(feature);
        nSynced++;

        // Add the entities
        if (entities.size() > 1000) {
            elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(),
                    new EntityFeaturePojoIndexMap()), "_id", null, true);
            // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community)
            entities = new ArrayList<EntityFeaturePojo>();
        }
    }
    // write whatever's left
    elasticManager.bulkAddDocuments(IndexManager.mapListToIndex(entities, EntityFeaturePojo.listType(),
            new EntityFeaturePojoIndexMap()), "_id", null, true);
    // (note EntityFeaturePojoIndexMap creates an "_id" field of the format index:community)

    if (null != chunk) {
        System.out.println("Found " + nSynced + " records to sync in chunk");
    }
}