List of usage examples for com.mongodb BasicDBObject append
@Override public BasicDBObject append(final String key, final Object val)
From source file:com.ikanow.infinit.e.data_model.store.MongoDbUtil.java
License:Apache License
/**
 * Converts a Gson {@code JsonObject} into a MongoDB {@code BasicDBObject}.
 *
 * Every member of the JSON object becomes a field of the result with the same
 * key; the member's value is converted by {@code encodeUnknown}.
 *
 * @param o the JSON object to convert
 * @return a new {@code BasicDBObject} holding one field per member of {@code o}
 */
public static BasicDBObject encode(JsonObject o) {
    final BasicDBObject encoded = new BasicDBObject();
    for (Map.Entry<String, JsonElement> member : o.entrySet()) {
        final String fieldName = member.getKey();
        final Object fieldValue = encodeUnknown(member.getValue());
        encoded.append(fieldName, fieldValue);
    }
    return encoded;
}
From source file:com.ikanow.infinit.e.harvest.enrichment.custom.GeoReference.java
License:Open Source License
/** * getQuery/* ww w . j a v a 2 s . c o m*/ * @param g * @param hasGeoindex * @param attempt * @return */ //TODO (INF-1864): running this in non-strict mode can cripple the DB since search field might not //be set ... at least need to cache such queries (almost always the US every time!).... private static BasicDBObject getQuery(Boolean hasGeoindex, int attempt) { BasicDBObject query = new BasicDBObject(); // SearchField String searchField = (geoInfo.getSearch_field() != null) ? geoInfo.getSearch_field().toLowerCase() : null; // Cities are all lower case in the georeference collection, set toLowerCase here String city = (geoInfo.getCity() != null) ? geoInfo.getCity().toLowerCase() : null; // Use WordUtils.capitalize to set first char of region and country words to Upper Case String region = (geoInfo.getRegion() != null) ? WordUtils.capitalize(geoInfo.getRegion()) : null; String country = (geoInfo.getCountry() != null) ? WordUtils.capitalize(geoInfo.getCountry()) : null; String countryCode = geoInfo.getCountry_code(); // If the only field sent was the search_field if ((searchField != null) && (city == null) && (region == null) && (country == null) && (countryCode == null)) { query.put("search_field", searchField); } // Otherwise... else { switch (attempt) { case 1: // Set the searchField if it is null if (searchField == null && city != null) searchField = city.toLowerCase(); if (searchField == null && region != null) searchField = region.toLowerCase(); if (searchField == null && country != null) searchField = country.toLowerCase(); // if (searchField != null) query.put("search_field", searchField); if (city != null) query.put("city", city); if (region != null) query.put("region", region); if (country != null) query.put("country", country); if (null == searchField) { // only country code specified... 
query.put("city", new BasicDBObject(DbManager.exists_, false)); query.put("region", new BasicDBObject(DbManager.exists_, false)); } if (countryCode != null) query.put("country_code", countryCode); break; case 2: if (city != null) { query.put("search_field", city.toLowerCase()); query.put("city", city); } else if (region != null) { query.put("search_field", region.toLowerCase()); query.put("region", region); } else { query.put("search_field", country.toLowerCase()); } if (country != null) query.put("country", country); if (countryCode != null) query.put("country_code", countryCode); break; case 3: if (searchField == null && region != null) searchField = region.toLowerCase(); if (searchField == null && country != null) searchField = country.toLowerCase(); if (searchField != null) query.put("search_field", searchField); if (region != null) query.put("region", region); if (country != null) query.put("country", country); if (countryCode != null) query.put("country_code", countryCode); break; default: if (country != null) query.put("search_field", country.toLowerCase()); if (country != null) query.put("country", country); if (countryCode != null) query.put("country_code", countryCode); break; } } if (query.isEmpty()) { return null; } // Only return records with GeoIndex objects if (hasGeoindex) { BasicDBObject ne = new BasicDBObject(); ne.append(DbManager.exists_, true); query.put("geoindex", ne); } return query; }
From source file:com.ikanow.infinit.e.harvest.enrichment.custom.GeoReference.java
License:Open Source License
/** * getNearestCities/* w w w. j a va2 s . c o m*/ * Get n-cities near a lat/lon pair, results returned ordered by distance from * the lat/lon pair * @param lat * @param lon * @param nMaxReturns * @return List<GeoReferencePojo> */ public static List<GeoFeaturePojo> getNearestCities(DBCollection geoDb, String lat, String lon, int nMaxReturns) { try { // Create Double[] from lat, lon Double[] d = new Double[] { Double.parseDouble(lat), Double.parseDouble(lon) }; // Build query object to return the shell equivalent of: // db.georeference.find({geoindex : {$near : [lat.lon]}}) BasicDBObject query = new BasicDBObject(); BasicDBObject near = new BasicDBObject(); near.append("$near", d); query.put("geoindex", near); // Perform query DBCursor result = geoDb.find(query).limit(nMaxReturns); // Convert results to List<GeoReferencePojo> List<GeoFeaturePojo> gpl = GeoFeaturePojo.listFromDb(result, new TypeToken<ArrayList<GeoFeaturePojo>>() { }); return gpl; } catch (Exception e) { return null; } }
From source file:com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi.AlchemyEntityGeoCleanser.java
License:Open Source License
public void doProcessing(int nSkip, BasicDBObject userQuery, int nLimit, boolean bAlterDB) throws NumberFormatException, UnknownHostException, MongoException { // Initialization (regexes and stuff) this.initialize(); // Launch MongoDB query BasicDBObject query = userQuery;/*from w ww . j a v a2 s . c o m*/ if (null == query) { new BasicDBObject(); } // Just get the entity list out to save a few CPU cycles BasicDBObject outFields = new BasicDBObject(); outFields.append(DocumentPojo.entities_, 1); outFields.append(DocumentPojo.url_, 1); // (help with debugging) outFields.append(DocumentPojo.title_, 1); // (help with debugging) DBCursor dbc = null; if (nLimit > 0) { dbc = _docsDB.find(query, outFields).limit(nLimit).skip(nSkip); } else { // Everything! dbc = _docsDB.find(query, outFields).skip(nSkip); } // Create POJO array of documents (definitely not the most efficient, but // will make integration with the harvester easier) List<DocumentPojo> docus = DocumentPojo.listFromDb(dbc, DocumentPojo.listType()); // Loop over array and invoke the cleansing function for each one for (DocumentPojo docu : docus) { if (this.cleanseGeoInDocu(docu)) { this._nDocusModified++; if (bAlterDB) { BasicDBObject inner0 = new BasicDBObject(DocumentPojo.entities_, (DBObject) com.mongodb.util.JSON.parse(new Gson().toJson(docu.getEntities()))); BasicDBObject inner1 = new BasicDBObject(MongoDbManager.set_, inner0); // Overwrite the existing entities list with the new one _docsDB.update(new BasicDBObject(DocumentPojo._id_, docu.getId()), inner1, false, true); // (need the multi-update in case _id isn't the shard key - documentation claims this is not necessary but 2.4.6/shell still enforces it) } //TESTED } this._nDocusProcessed++; } }
From source file:com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi.AlchemyEntityGeoCleanser.java
License:Open Source License
public boolean cleanseGeoInDocu(DocumentPojo doc) { boolean bChangedAnything = false; Map<String, Candidate> dubiousLocations = new HashMap<String, Candidate>(); Set<String> otherRegions = new HashSet<String>(); Set<String> otherCountries = new HashSet<String>(); Set<String> otherCountriesOrRegionsReferenced = new HashSet<String>(); //Debug/*from ww w .j a v a2 s.c om*/ if (_nDebugLevel >= 2) { System.out.println( "+++++++ Doc: " + doc.getTitle() + " / " + doc.getId() + " / " + doc.getEntities().size()); } // 1] First off, let's find anything location-based and also determine if it's bad or not if (null != doc.getEntities()) for (EntityPojo ent : doc.getEntities()) { boolean bStrongCandidate = false; // People: decompose names if (EntityPojo.Dimension.Where == ent.getDimension()) { // So locations get disambiguated to one of: // "<city-etc>, <region-or-country>", or "<region-or-country>" // though can also just be left as they are. String sActualName = ent.getActual_name().toLowerCase(); if (!ent.getDisambiguatedName().toLowerCase().equals(sActualName)) { // It's been disambiguated //Debug if (_nDebugLevel >= 3) { System.out.println("disambiguous candidate: " + ent.getDisambiguatedName() + " VS " + ent.getActual_name() + " (" + ((null != ent.getSemanticLinks()) ? ent.getSemanticLinks().size() : 0) + ")"); } // OK next step, is it a disambiguation to a US town? String splitMe[] = ent.getDisambiguatedName().split(", "); if (2 == splitMe.length) { String stateOrCountry = splitMe[1]; Matcher m = _statesRegex.matcher(stateOrCountry); if (m.find()) { // This is a US disambiguation - high risk case // Short cut if state is already directly mentioned? 
stateOrCountry = stateOrCountry.toLowerCase(); if (!otherRegions.contains(stateOrCountry)) { // See list below - no need to go any further // OK next step - is it a possible ambiguity: ArrayList<BasicDBObject> x = new ArrayList<BasicDBObject>(); BasicDBObject inner0_0 = new BasicDBObject(MongoDbManager.not_, Pattern.compile("US")); BasicDBObject inner1_0 = new BasicDBObject("country_code", inner0_0); x.add(inner1_0); BasicDBObject inner0_1 = new BasicDBObject(MongoDbManager.gte_, 400000); BasicDBObject inner1_1 = new BasicDBObject("population", inner0_1); x.add(inner1_1); BasicDBObject dbo = new BasicDBObject(); dbo.append("search_field", sActualName); dbo.append(MongoDbManager.or_, x); DBCursor dbc = _georefDB.find(dbo); if (dbc.size() >= 1) { // Problems! //Create list of candidates Type listType = new TypeToken<LinkedList<GeoFeaturePojo>>() { }.getType(); LinkedList<GeoFeaturePojo> grpl = new Gson() .fromJson(dbc.toArray().toString(), listType); //Debug if (_nDebugLevel >= 2) { System.out.println("\tERROR CANDIDATE: " + ent.getDisambiguatedName() + " VS " + ent.getActual_name() + " (" + dbc.count() + ")"); if (_nDebugLevel >= 3) { for (GeoFeaturePojo grp : grpl) { System.out.println("\t\tCandidate:" + grp.getCity() + " / " + grp.getRegion() + " / " + grp.getCountry()); } } } Candidate candidate = new Candidate(ent, grpl, stateOrCountry); dubiousLocations.put(ent.getIndex(), candidate); bStrongCandidate = true; } // if strong candidate } //TESTED ("reston, virginia" after "virginia/stateorcounty" mention) // (end if can't shortcut past all this) } // end if a US town } // end if in the format "A, B" } // if weak candidate //TESTED if (!bStrongCandidate) { // Obv can't count on a disambiguous candidate: String type = ent.getType().toLowerCase(); if (type.equals("stateorcounty")) { String disName = ent.getDisambiguatedName().toLowerCase(); if (_abbrStateRegex.matcher(disName).matches()) { otherRegions.add(getStateFromAbbr(disName)); } else { 
otherRegions.add(ent.getDisambiguatedName().toLowerCase()); } otherCountriesOrRegionsReferenced.add("united states"); } //TESTED: "mich./stateorcounty" else if (type.equals("country")) { String disName = ent.getDisambiguatedName().toLowerCase(); // Translation of known badly transcribed countries: // (England->UK) if (disName.equals("england")) { otherCountries.add("united kingdom"); } //TESTED else { otherCountries.add(ent.getDisambiguatedName().toLowerCase()); } } else if (type.equals("region")) { otherRegions.add(ent.getDisambiguatedName().toLowerCase()); } else if (type.equals("city")) { String splitMe[] = ent.getDisambiguatedName().split(",\\s*"); if (2 == splitMe.length) { otherCountriesOrRegionsReferenced.add(splitMe[1].toLowerCase()); if (this._statesRegex.matcher(splitMe[1]).find()) { otherCountriesOrRegionsReferenced.add("united states"); } //TESTED: "lexingon, kentucky/city" } } } //TESTED: just above clauses } // if location } // (end loop over entities) // Debug: if ((_nDebugLevel >= 3) && (!dubiousLocations.isEmpty())) { for (String s : otherRegions) { System.out.println("Strong region: " + s); } for (String s : otherCountries) { System.out.println("Strong countries: " + s); } for (String s : otherCountriesOrRegionsReferenced) { System.out.println("Weak regionscountries: " + s); } } // 2] The requirements and algorithm are discussed in // http://ikanow.jira.com/wiki/display/INF/Beta...+improving+AlchemyAPI+extraction+%28geo%29 // Canonical cases: // Darfur -> Darfur, MN even though Sudan and sometimes Darfur, Sudan are present // Shanghai -> Shanghai, WV even though China is mentioned (and not WV) // Manchester -> Manchester village, NY (not Manchester, UK) // Philadelphia -> Philadelphia (village), NY (though NY is mentioned and not PA) // We're generating the following order // 10] Sitting tenant with strong direct // 15] Large city with strong direct // 20] Region with direct // 30] Large city with strong indirect // 40] Sitting tenant with strong 
indirect // 50] Region with indirect // 60] Another foreign possibility with strong direct // 70] Large city with weak direct // 72] Large city with weak indirect // 75] Large city with no reference // 78] Another foreign possibility with strong indirect (>100K population - ie not insignificant) // 80] Sitting tenant with any weak (US) direct or indirect // 90] Another foreign possibility with strong indirect // 100] Another foreign possibility with weak direct // 110] Another foreign possibility with weak indirect // 120] Region with no reference, if there is only 1 // 130] Sitting tenant with none of the above (ie default) // 140] Anything else! for (Map.Entry<String, Candidate> pair : dubiousLocations.entrySet()) { EntityPojo ent = pair.getValue().entity; Candidate candidate = pair.getValue(); // 2.1] Let's analyse the "sitting tenant" int nPrio = 130; GeoFeaturePojo currLeader = null; int nCase = 0; // (just for debugging, 0=st, 1=large city, 2=region, 3=other) if (otherRegions.contains(candidate.state)) { // Strong direct ref, winner! nPrio = 10; // winner! } //TESTED: "san antonio, texas/city" vs "texas" else if (otherCountriesOrRegionsReferenced.contains(candidate.state)) { // Indirect ref nPrio = 40; // good, but beatable... } //TESTED: "philadelphia (village), new york/city" else if (otherCountries.contains("united states")) { // Weak direct ref nPrio = 80; // better than nothing... } //TESTED: "apache, oklahoma/city" else if (otherCountriesOrRegionsReferenced.contains("united states")) { // Weak indirect ref nPrio = 80; // better than nothing... } //TESTED: "washington, d.c." 
have DC as stateorcounty, but US in countries list // Special case: we don't like "village": if ((80 != nPrio) && ent.getDisambiguatedName().contains("village") && !ent.getActual_name().contains("village")) { nPrio = 80; } //TESTED: "Downvoted: Philadelphia (village), New York from Philadelphia" // Debug if (_nDebugLevel >= 2) { System.out.println(pair.getKey() + " SittingTenantScore=" + nPrio); } // Alternatives if (nPrio > 10) { LinkedList<GeoFeaturePojo> geos = pair.getValue().candidates; for (GeoFeaturePojo geo : geos) { int nAltPrio = 140; int nAltCase = -1; String city = (null != geo.getCity()) ? geo.getCity().toLowerCase() : null; String region = (null != geo.getRegion()) ? geo.getRegion().toLowerCase() : null; String country = (null != geo.getCountry()) ? geo.getCountry().toLowerCase() : null; // 2.2] CASE 1: I'm a city with pop > 1M (best score 15) // 15] Large city with strong direct // 30] Large city with strong indirect // 70] Large city with weak direct // 72] Large city with weak indirect // 75] Large city with no reference if ((null != city) && (geo.getPopulation() >= 400000) && (nPrio > 15)) { nAltCase = 1; if ((null != region) && (otherRegions.contains(region))) { nAltPrio = 15; // strong direct } //TESTED: "dallas / Texas / United States = 15" else if ((null != region) && (otherCountriesOrRegionsReferenced.contains(region))) { nAltPrio = 30; // strong indirect } //TESTED: "sacramento / California / United State" else if ((null != country) && (otherCountries.contains(country))) { nAltPrio = 70; // weak direct } //TESTED: "berlin, germany", with "germany" directly mentioned else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) { nAltPrio = 72; // weak indirect } //TESTED: "los angeles / California / United States = 72" else { nAltPrio = 75; // just for being big! 
} //TESTED: "barcelona, spain" } // 2.3] CASE 2: I'm a region (best score=20, can beat current score) // 20] Region with direct // 50] Region with indirect // 120] Region with no reference, if there is only 1 else if ((null == city) && (nPrio > 20)) { nAltCase = 2; if ((null != country) && (otherCountries.contains(country))) { nAltPrio = 20; // strong direct } //TESTED: (region) "Berlin, Germany" with "Germany" mentioned else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) { nAltPrio = 50; // strong indirect } //(haven't seen, but we'll live) else { nAltPrio = 120; // (just for being there) } //TESTED: "null / Portland / Jamaica = 120", also "Shanghai / China" } // 2.4] CASE 3: I'm any foreign possibility (best score=60) // 60] Another foreign possibility with strong direct // 78] Another foreign possibility with strong indirect (>100K population - ie not insignificant) // 90] Another foreign possibility with strong indirect // 100] Another foreign possibility with weak direct // 110] Another foreign possibility with weak indirect else if (nPrio > 60) { nAltCase = 3; if ((null != region) && (otherRegions.contains(region))) { nAltPrio = 60; // strong direct // Double check we're not falling into the trap below: if (!geo.getCountry_code().equals("US")) { Matcher m = this._statesRegex.matcher(geo.getRegion()); if (m.matches()) { // non US state matching against (probably) US state, disregard) nAltPrio = 140; } } //TESTED (same clause as below) } //TESTED: lol "philadelphia / Maryland / Liberia = 60" (before above extra clause) if (nAltPrio > 60) { // (may need to re-run test) if ((null != country) && (otherCountries.contains(country))) { if (geo.getPopulation() < 100000) { nAltPrio = 90; // strong indirect } //TESTED: "washington / Villa Clara / Cuba" else { nAltPrio = 78; // strong indirect, with boost! 
} //TESTED: "geneva, Geneve, Switzerland", pop 180K } else if ((null != region) && (otherCountriesOrRegionsReferenced.contains(region))) { nAltPrio = 100; // weak direct } //TESTED: "lincoln / Lincolnshire / United Kingdom = 100" else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) { nAltPrio = 110; // weak indirect } //(haven't seen, but we'll live) } } // Debug: if ((_nDebugLevel >= 2) && (nAltPrio < 140)) { System.out.println("----Alternative: " + geo.getCity() + " / " + geo.getRegion() + " / " + geo.getCountry() + " score=" + nAltPrio); } // Outcome of results: if (nAltPrio < nPrio) { currLeader = geo; nPrio = nAltPrio; nCase = nAltCase; } } // end loop over alternativse if (null != currLeader) { // Need to change if (1 == nCase) { this._nMovedToLargeCity++; //(Cities are lower case in georef DB for some reason) String city = WordUtils.capitalize(currLeader.getCity()); if (currLeader.getCountry_code().equals("US")) { // Special case: is this just the original? String region = currLeader.getRegion(); if (region.equals("District of Columbia")) { // Special special case region = "D.C."; } String sCandidate = city + ", " + region; if (!sCandidate.equals(ent.getDisambiguatedName())) { ent.setDisambiguatedName(sCandidate); ent.setIndex(ent.getDisambiguatedName() + "/city"); ent.setSemanticLinks(null); bChangedAnything = true; } //TESTED (lots, eg "Philadelphia (village), New York" -> "Philadelphia, PA"; Wash, Ill. 
-> Wash DC) else { this._nMovedToLargeCity--; _nStayedWithOriginal++; } //TESTED ("Washington DC", "San Juan, Puerto Rico") } //TESTED (see above) else { ent.setDisambiguatedName(city + ", " + currLeader.getCountry()); ent.setIndex(ent.getDisambiguatedName() + "/city"); ent.setSemanticLinks(null); bChangedAnything = true; } //TESTED: "london, california/city to London, United Kingdom" } else if (2 == nCase) { this._nMovedToRegion++; ent.setDisambiguatedName(currLeader.getRegion() + ", " + currLeader.getCountry()); ent.setIndex(ent.getDisambiguatedName() + "/region"); ent.setSemanticLinks(null); bChangedAnything = true; } //TESTED: "Moved madrid, new york/city to Madrid, Spain" (treats Madrid as region, like Berlin see above) else { //(Cities are lower case in georef DB for some reason) String city = WordUtils.capitalize(currLeader.getCity()); this._nMovedToForeignCity++; ent.setDisambiguatedName(city + ", " + currLeader.getCountry()); ent.setIndex(ent.getDisambiguatedName() + "/city"); ent.setSemanticLinks(null); bChangedAnything = true; } //TESTED: "Moved geneva, new york/city to Geneva, Switzerland" if ((_nDebugLevel >= 1) && (null == ent.getSemanticLinks())) { System.out.println("++++ Moved " + pair.getKey() + " to " + ent.getDisambiguatedName()); } } else { _nStayedWithOriginal++; } } // (if sitting tenant not holder) } // (end loop over candidates) if ((_nDebugLevel >= 1) && bChangedAnything) { System.out.println("\t(((Doc: " + doc.getTitle() + " / " + doc.getId() + " / " + doc.getUrl() + ")))"); } return bChangedAnything; }
From source file:com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi.AlchemyEntityPersonCleanser.java
License:Open Source License
public void doProcessing(int nSkip, BasicDBObject userQuery, int nLimit, boolean bAlterDB) throws NumberFormatException, UnknownHostException, MongoException { // Initialization (regexes and stuff) this.initialize(); // Launch MongoDB query BasicDBObject query = userQuery;/*from w w w.j a v a2s. c o m*/ if (null == query) { new BasicDBObject(); } // Just get the entity list out to save a few CPU cycles BasicDBObject outFields = new BasicDBObject(); outFields.append(DocumentPojo.entities_, 1); outFields.append(DocumentPojo.url_, 1); // (help with debugging) outFields.append(DocumentPojo.title_, 1); // (help with debugging) DBCursor dbc = null; if (nLimit > 0) { dbc = docsDB.find(query, outFields).limit(nLimit).skip(nSkip); } else { // Everything! dbc = docsDB.find(query, outFields).skip(nSkip); } // Create POJO array of documents (definitely not the most efficient, but // will make integration with the harvester easier) List<DocumentPojo> docus = DocumentPojo.listFromDb(dbc, DocumentPojo.listType()); // Loop over array and invoke the cleansing function for each one for (DocumentPojo docu : docus) { if (this.cleansePeopleInDocu(docu)) { this._nDocusModified++; if (bAlterDB) { BasicDBObject inner0 = new BasicDBObject(DocumentPojo.entities_, (DBObject) com.mongodb.util.JSON.parse(new Gson().toJson(docu.getEntities()))); BasicDBObject inner1 = new BasicDBObject(MongoDbManager.set_, inner0); // Overwrite the existing entities list with the new one docsDB.update(new BasicDBObject(DocumentPojo._id_, docu.getId()), inner1, false, true); // (need the multi-update in case _id isn't the shard key - documentation claims this is not necessary but 2.4.6/shell still enforces it) } //TESTED: checked on "Feed: Japan's Three Elections / 4c92863751cc2e59d612000b / 30" } this._nDocusProcessed++; } }
From source file:com.ikanow.infinit.e.processing.custom.scheduler.CustomScheduleManager.java
License:Open Source License
/** * Look for jobs that have not started yet but are scheduled for some point in the future *///from w ww .j av a 2s . com public static CustomMapReduceJobPojo getJobsToRun(PropertiesManager prop_custom, boolean bLocalMode, boolean bHadoopEnabled) { try { // First off, check the number of running jobs - don't exceed the max // (seem to run into memory problems if this isn't limited?) if (!availableSlots(prop_custom)) { return null; } BasicDBObject query = new BasicDBObject(); query.append(CustomMapReduceJobPojo.jobidS_, null); query.append(CustomMapReduceJobPojo.waitingOn_, new BasicDBObject(MongoDbManager.size_, 0)); query.append(CustomMapReduceJobPojo.nextRunTime_, new BasicDBObject(MongoDbManager.lt_, new Date().getTime())); if (!bHadoopEnabled && !bLocalMode) { // Can only get shared queries: query.append("jarURL", null); } BasicDBObject updates = new BasicDBObject(CustomMapReduceJobPojo.jobidS_, ""); updates.append("lastRunTime", new Date()); BasicDBObject update = new BasicDBObject(MongoDbManager.set_, updates); DBObject dbo = DbManager.getCustom().getLookup().findAndModify(query, null, null, false, update, true, false); if (dbo != null) { return CustomMapReduceJobPojo.fromDb(dbo, CustomMapReduceJobPojo.class); } } catch (Exception ex) { //oh noes! ex.printStackTrace(); } return null; }
From source file:com.ikanow.infinit.e.processing.custom.scheduler.CustomScheduleManager.java
License:Open Source License
/** * Look for running jobs, decide if they are complete *///w w w. ja v a2 s . c om public static CustomMapReduceJobPojo getJobsToMakeComplete(boolean bHadoopEnabled, Map<ObjectId, String> incompleteJobsMap) { try { BasicDBObject query = new BasicDBObject(); BasicDBObject nors[] = new BasicDBObject[3]; nors[0] = new BasicDBObject(CustomMapReduceJobPojo.jobidS_, null); nors[1] = new BasicDBObject(CustomMapReduceJobPojo.jobidS_, "CHECKING_COMPLETION"); nors[2] = new BasicDBObject(CustomMapReduceJobPojo.jobidS_, ""); query.put(MongoDbManager.nor_, Arrays.asList(nors)); BasicDBObject updates = new BasicDBObject(CustomMapReduceJobPojo.jobidS_, "CHECKING_COMPLETION"); updates.put(CustomMapReduceJobPojo.lastChecked_, new Date()); BasicDBObject update = new BasicDBObject(MongoDbManager.set_, updates); if (!bHadoopEnabled) { // Can only get shared queries: query.append(CustomMapReduceJobPojo.jarURL_, null); } DBObject dbo = DbManager.getCustom().getLookup().findAndModify(query, update); if (dbo != null) { CustomMapReduceJobPojo cmr = CustomMapReduceJobPojo.fromDb(dbo, CustomMapReduceJobPojo.class); incompleteJobsMap.put(cmr._id, cmr.jobidS); return cmr; } } catch (Exception ex) { //oh noes! ex.printStackTrace(); } return null; }
From source file:com.ikanow.infinit.e.processing.custom.status.CustomStatusManager.java
License:Open Source License
/** * Sets the custom mr pojo to be complete for the * current job. Currently this is done by removing the * jobid and updating the next runtime, increments the * amount of timeRan counter as well so we can calculate nextRunTime * /*from w w w.jav a 2 s . c o m*/ * Also set lastCompletion time to now (best we can approx) * * @param cmr */ public void setJobComplete(CustomMapReduceJobPojo cmr, boolean isComplete, boolean isError, float mapProgress, float reduceProgress, String errorMessage) { // First off, if complete then run custom internal engine finish routines: if ((null != cmr.mapper) && !cmr.mapper.isEmpty() && !cmr.mapper.equalsIgnoreCase("none")) { StringBuffer postTaskActivityErrors = new StringBuffer(); int errLen = 0; if (null != errorMessage) { postTaskActivityErrors = new StringBuffer(errorMessage); errLen = postTaskActivityErrors.length(); } InfiniteHadoopUtils.handlePostTaskActivities(cmr, isError, postTaskActivityErrors); if (postTaskActivityErrors.length() > errLen) { errorMessage = postTaskActivityErrors.toString(); } } //TESTED // (Note, inc_ and unset_ are added in one place each, so can't use them without ensuring you combine existing uses) BasicDBObject updates = new BasicDBObject(); BasicDBObject update = new BasicDBObject(); try { long nNew = 0; long nTotal = 0; if (isComplete) { long runtime = new Date().getTime() - cmr.lastRunTime.getTime(); long timeFromSchedule = cmr.lastRunTime.getTime() - cmr.nextRunTime; updates.append(CustomMapReduceJobPojo.jobidS_, null); updates.append(CustomMapReduceJobPojo.jobidN_, 0); try { //if next run time reschedules to run before now, keep rescheduling until its later //the server could have been turned off for days and would try to rerun all jobs once a day long nextRunTime = CustomScheduleManager.getNextRunTime(cmr.scheduleFreq, cmr.firstSchedule, cmr.nextRunTime); updates.append(CustomMapReduceJobPojo.nextRunTime_, nextRunTime); } catch (Exception e) { } // just carry on, we'll live... 
updates.append(CustomMapReduceJobPojo.lastCompletionTime_, new Date()); updates.append(CustomMapReduceJobPojo.tempConfigXMLLocation_, null); updates.append(CustomMapReduceJobPojo.tempJarLocation_, null); try { InfiniteHadoopUtils.removeTempFile(cmr.tempConfigXMLLocation); InfiniteHadoopUtils.removeTempFile(cmr.tempJarLocation); } catch (Exception e) { _logger.info("job_error_removing_tempfiles=" + InfiniteHadoopUtils.createExceptionMessage(e)); } BasicDBObject incs = new BasicDBObject(CustomMapReduceJobPojo.timesRan_, 1); //copy depencies to waitingOn updates.append(CustomMapReduceJobPojo.waitingOn_, cmr.jobDependencies); if (!isError) { // Counts and move and output nNew = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollectionTemp).count(); //TODO (INF-1159): this shouldn't really be here but it makes life much easier for now (really should be part of the m/r OutputFormat...) CustomOutputManager.completeOutput(cmr, prop_custom); //if job was successfully, mark off dependencies removeJobFromChildren(cmr._id); // More counts: nTotal = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollection).count(); // Status: String completionStatus = "Schedule Delta: " + timeFromSchedule + "ms\nCompletion Time: " + runtime + "ms\nNew Records: " + nNew + "\nTotal Records: " + nTotal; if (null == errorMessage) { // (I think will always be the case?) 
errorMessage = completionStatus; } else { errorMessage += "\n" + completionStatus; } if ((null != cmr.tempErrors) && !cmr.tempErrors.isEmpty()) { // Individual errors reported from map/combine/reduce StringBuffer sb = new StringBuffer(errorMessage).append("\n\nLog Messages:\n\n"); for (String err : cmr.tempErrors) { sb.append(err).append("\n"); } errorMessage = sb.toString(); update.put(MongoDbManager.unset_, new BasicDBObject(CustomMapReduceJobPojo.tempErrors_, 1)); } updates.append(CustomMapReduceJobPojo.errorMessage_, errorMessage); // (will often be null) } else { if ((null != cmr.tempErrors) && !cmr.tempErrors.isEmpty()) { // Individual errors reported from map/combine/reduce StringBuffer sb = new StringBuffer(errorMessage).append("\n\nLog Messages:\n\n"); for (String err : cmr.tempErrors) { sb.append(err).append("\n"); } errorMessage = sb.toString(); update.put(MongoDbManager.unset_, new BasicDBObject(CustomMapReduceJobPojo.tempErrors_, 1)); } //failed, just append error message updates.append(CustomMapReduceJobPojo.errorMessage_, errorMessage); incs.append(CustomMapReduceJobPojo.timesFailed_, 1); cmr.timesFailed++; // (so that in memory processes can tell if a job failed) } update.append(MongoDbManager.inc_, incs); if (null != cmr.jobidS) { _logger.info("job_completion_title=" + cmr.jobtitle + " job_completion_id=" + cmr._id.toString() + " job_completion_time=" + runtime + " job_schedule_delta=" + timeFromSchedule + " job_completion_success=" + !isError + " job_hadoop_id=" + cmr.jobidS + "_" + cmr.jobidN + " job_new_records=" + nNew + " job_total_records=" + nTotal); } else { _logger.info("job_completion_title=" + cmr.jobtitle + " job_completion_id=" + cmr._id.toString() + " job_completion_time=" + runtime + " job_schedule_delta=" + timeFromSchedule + " job_completion_success=" + !isError + " job_new_records=" + nNew + " job_total_records=" + nTotal); } } updates.append(CustomMapReduceJobPojo.mapProgress_, mapProgress); 
updates.append(CustomMapReduceJobPojo.reduceProgress_, reduceProgress); } catch (Exception ex) { //ex.printStackTrace(); _logger.info("job_error_updating_status_title=" + cmr.jobtitle + " job_error_updating_status_id=" + cmr._id.toString() + " job_error_updating_status_message=" + InfiniteHadoopUtils.createExceptionMessage(ex)); } finally { // It's really bad if this doesn't happen, so do it here so that it always gets called if (!updates.isEmpty()) { update.append(MongoDbManager.set_, updates); // (if isComplete, should always include resetting jobidS and jobidN) DbManager.getCustom().getLookup().update(new BasicDBObject(CustomMapReduceJobPojo._id_, cmr._id), update); // (also set local version) cmr.errorMessage = errorMessage; } if (isComplete || isError) { // If we're derived from a source then update the source: if (null != cmr.derivedFromSourceKey) { // For a source's first run, need to grab the entire source to check if we need to override the tmin/tmax SourcePojo srcJustRun = null; if ((isComplete && !isError) && (0 == cmr.timesRan)) { BasicDBObject srcQuery = new BasicDBObject(SourcePojo.key_, cmr.derivedFromSourceKey); srcJustRun = SourcePojo.fromDb(DbManager.getIngest().getSource().findOne(srcQuery), SourcePojo.class); if (null == srcJustRun.getHarvestStatus()) { // (don't allow initial override, if one is set) srcJustRun.setHarvestStatus(new SourceHarvestStatusPojo()); } srcJustRun.getHarvestStatus().setHarvest_status(HarvestEnum.success); if (null != srcJustRun) { try { LinkedList<CustomMapReduceJobPojo> updatedJobs = new LinkedList<CustomMapReduceJobPojo>(); SourcePipelineToCustomConversion.convertSourcePipeline(srcJustRun, updatedJobs, false); for (CustomMapReduceJobPojo cmrUpdate : updatedJobs) { if (cmrUpdate._id.equals(cmr._id)) { DbManager.getCustom().getLookup().save(cmrUpdate.toDb()); } } } catch (Exception e) { } // just carry on } } //TESTED (by hand) BasicDBObject query = new BasicDBObject(SourcePojo.key_, cmr.derivedFromSourceKey); 
BasicDBObject setUpdate = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvest_status_, isError ? HarvestEnum.error.toString() : HarvestEnum.success.toString()); if (null != cmr.errorMessage) { setUpdate.put(SourceHarvestStatusPojo.sourceQuery_harvest_message_, cmr.errorMessage); } BasicDBObject srcUpdate = new BasicDBObject(DbManager.set_, setUpdate); DbManager.getIngest().getSource().update(query, srcUpdate, false, false); } } //TESTED (by hand) } }
From source file:com.ikanow.infinit.e.processing.custom.status.CustomStatusManager.java
License:Open Source License
/** * Updates the status of the current, active, job *///from w ww . jav a 2s . c o m public void updateJobPojo(ObjectId _id, String jobids, int jobidn, String xmlLocation, String jarLocation, CustomMapReduceJobPojo job) { try { BasicDBObject set = new BasicDBObject(); set.append(CustomMapReduceJobPojo.jobidS_, jobids); set.append(CustomMapReduceJobPojo.jobidN_, jobidn); set.append(CustomMapReduceJobPojo.tempConfigXMLLocation_, xmlLocation); set.append(CustomMapReduceJobPojo.tempJarLocation_, jarLocation); set.append(CustomMapReduceJobPojo.errorMessage_, null); BasicDBObject updateObject = new BasicDBObject(MongoDbManager.set_, set); DbManager.getCustom().getLookup().update(new BasicDBObject(CustomMapReduceJobPojo._id_, _id), updateObject); if ((null != job) && (null != job.derivedFromSourceKey)) { //update to success_iteration BasicDBObject query = new BasicDBObject(SourcePojo.key_, job.derivedFromSourceKey); BasicDBObject setUpdate = new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvest_status_, HarvestEnum.success_iteration.toString()); BasicDBObject srcUpdate = new BasicDBObject(DbManager.set_, setUpdate); DbManager.getIngest().getSource().update(query, srcUpdate, false, false); } } catch (Exception ex) { ex.printStackTrace(); } }