Java tutorial
/******************************************************************************* * Copyright 2012, The Infinit.e Open Source Project. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. ******************************************************************************/ package com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi; import java.lang.reflect.Type; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.WordUtils; import com.mongodb.BasicDBObject; import com.mongodb.DBCollection; import com.mongodb.DBCursor; import com.mongodb.DBObject; import com.mongodb.MongoException; import com.google.gson.Gson; import com.google.gson.reflect.TypeToken; import com.ikanow.infinit.e.data_model.store.MongoDbManager; import com.ikanow.infinit.e.data_model.store.document.DocumentPojo; import com.ikanow.infinit.e.data_model.store.document.EntityPojo; import com.ikanow.infinit.e.data_model.store.feature.geo.GeoFeaturePojo; //______________________________________________________________________________________ public class AlchemyEntityGeoCleanser { // Stats: private int _nDocusModified = 0; private int _nDocusProcessed = 0; private int _nStayedWithOriginal = 0; private int _nMovedToRegion = 0; private int _nMovedToLargeCity = 0; private int _nMovedToForeignCity = 0; public int getDocusModified() { return _nDocusModified; } public int getDocusProcessed() { return _nDocusProcessed; } public int getStayedWithOriginal() { return _nStayedWithOriginal; } public int getMovedToRegion() { return _nMovedToRegion; } public int getMovedToLargeCity() { return _nMovedToLargeCity; } public int getMovedToForeignCity() { return _nMovedToForeignCity; } // Debug: private int _nDebugLevel = 0; public void setDebugLevel(int nDebugLevel) { //1==replacements, 2=feeds/candidate entities, 3=entities, 4=decomposition _nDebugLevel = nDebugLevel; } //______________________________________________________________________________________ // Processing code //______________________________________________________________________________________ // Top level logic // For running remotely // For cleaning local feeds, just call cleansePeopleInDocu(feed) // Host/Port - obvious // HexSlice - sub-samples somewhat efficiently, on last specified digits of _id // userQuery - lets the calling function decide what data to run on (probably for debugging) // nLimit - the max number of entries returned (for debugging) // bAlterDB - writes the results back to the DB (else it's just for debugging) public void doProcessing(int nSkip, BasicDBObject userQuery, int nLimit, boolean bAlterDB) throws NumberFormatException, UnknownHostException, MongoException { // Initialization (regexes and stuff) this.initialize(); // Launch MongoDB query BasicDBObject query = userQuery; if (null == query) { new BasicDBObject(); } // Just get the entity list out to save a few CPU cycles BasicDBObject outFields = new BasicDBObject(); outFields.append(DocumentPojo.entities_, 1); outFields.append(DocumentPojo.url_, 1); // (help with debugging) outFields.append(DocumentPojo.title_, 1); // (help with debugging) DBCursor dbc = null; if (nLimit > 0) { dbc = _docsDB.find(query, outFields).limit(nLimit).skip(nSkip); } else { // Everything! dbc = _docsDB.find(query, outFields).skip(nSkip); } // Create POJO array of documents (definitely not the most efficient, but // will make integration with the harvester easier) List<DocumentPojo> docus = DocumentPojo.listFromDb(dbc, DocumentPojo.listType()); // Loop over array and invoke the cleansing function for each one for (DocumentPojo docu : docus) { if (this.cleanseGeoInDocu(docu)) { this._nDocusModified++; if (bAlterDB) { BasicDBObject inner0 = new BasicDBObject(DocumentPojo.entities_, (DBObject) com.mongodb.util.JSON.parse(new Gson().toJson(docu.getEntities()))); BasicDBObject inner1 = new BasicDBObject(MongoDbManager.set_, inner0); // Overwrite the existing entities list with the new one _docsDB.update(new BasicDBObject(DocumentPojo._id_, docu.getId()), inner1, false, true); // (need the multi-update in case _id isn't the shard key - documentation claims this is not necessary but 2.4.6/shell still enforces it) } //TESTED } this._nDocusProcessed++; } } //________________________________________________ // Initialization variables private DBCollection _docsDB = null; private DBCollection _georefDB = null; private static final String _stateList = "Alabama|Alaska|American Samoa|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|D\\.C\\.|District of Columbia|Florida|Georgia|Guam|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Northern Marianas Islands|Ohio|Oklahoma|Oregon|Pennsylvania|Puerto Rico|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|Virginia|Virgin Islands|Washington|West Virginia|Wisconsin|Wyoming"; private Pattern _statesRegex = null; private static final String _abbrStateList = "(?:m\\.d|n\\.j|n.m|conn|mich|al\\.|d\\.c|vt|calif|wash\\.|ore\\.|ind\\.)\\.?"; private Pattern _abbrStateRegex = null; //________________________________________________ // Initialization code // Call with null/null to act on local objects vs fetching them from the DB public void initialize() throws NumberFormatException, UnknownHostException, MongoException { // MongoDB _docsDB = MongoDbManager.getDocument().getMetadata(); _georefDB = MongoDbManager.getFeature().getGeo(); // Regex of US states _statesRegex = Pattern.compile(_stateList); _abbrStateRegex = Pattern.compile(_abbrStateList); } //________________________________________________ // Inner loop processing logic public static class Candidate { EntityPojo entity; LinkedList<GeoFeaturePojo> candidates; String state; Candidate(EntityPojo ent, LinkedList<GeoFeaturePojo> cands, String st) { entity = ent; candidates = cands; state = st; } } public boolean cleanseGeoInDocu(DocumentPojo doc) { boolean bChangedAnything = false; Map<String, Candidate> dubiousLocations = new HashMap<String, Candidate>(); Set<String> otherRegions = new HashSet<String>(); Set<String> otherCountries = new HashSet<String>(); Set<String> otherCountriesOrRegionsReferenced = new HashSet<String>(); //Debug if (_nDebugLevel >= 2) { System.out.println( "+++++++ Doc: " + doc.getTitle() + " / " + doc.getId() + " / " + doc.getEntities().size()); } // 1] First off, let's find anything location-based and also determine if it's bad or not if (null != doc.getEntities()) for (EntityPojo ent : doc.getEntities()) { boolean bStrongCandidate = false; // People: decompose names if (EntityPojo.Dimension.Where == ent.getDimension()) { // So locations get disambiguated to one of: // "<city-etc>, <region-or-country>", or "<region-or-country>" // though can also just be left as they are. String sActualName = ent.getActual_name().toLowerCase(); if (!ent.getDisambiguatedName().toLowerCase().equals(sActualName)) { // It's been disambiguated //Debug if (_nDebugLevel >= 3) { System.out.println("disambiguous candidate: " + ent.getDisambiguatedName() + " VS " + ent.getActual_name() + " (" + ((null != ent.getSemanticLinks()) ? ent.getSemanticLinks().size() : 0) + ")"); } // OK next step, is it a disambiguation to a US town? String splitMe[] = ent.getDisambiguatedName().split(", "); if (2 == splitMe.length) { String stateOrCountry = splitMe[1]; Matcher m = _statesRegex.matcher(stateOrCountry); if (m.find()) { // This is a US disambiguation - high risk case // Short cut if state is already directly mentioned? stateOrCountry = stateOrCountry.toLowerCase(); if (!otherRegions.contains(stateOrCountry)) { // See list below - no need to go any further // OK next step - is it a possible ambiguity: ArrayList<BasicDBObject> x = new ArrayList<BasicDBObject>(); BasicDBObject inner0_0 = new BasicDBObject(MongoDbManager.not_, Pattern.compile("US")); BasicDBObject inner1_0 = new BasicDBObject("country_code", inner0_0); x.add(inner1_0); BasicDBObject inner0_1 = new BasicDBObject(MongoDbManager.gte_, 400000); BasicDBObject inner1_1 = new BasicDBObject("population", inner0_1); x.add(inner1_1); BasicDBObject dbo = new BasicDBObject(); dbo.append("search_field", sActualName); dbo.append(MongoDbManager.or_, x); DBCursor dbc = _georefDB.find(dbo); if (dbc.size() >= 1) { // Problems! //Create list of candidates Type listType = new TypeToken<LinkedList<GeoFeaturePojo>>() { }.getType(); LinkedList<GeoFeaturePojo> grpl = new Gson() .fromJson(dbc.toArray().toString(), listType); //Debug if (_nDebugLevel >= 2) { System.out.println("\tERROR CANDIDATE: " + ent.getDisambiguatedName() + " VS " + ent.getActual_name() + " (" + dbc.count() + ")"); if (_nDebugLevel >= 3) { for (GeoFeaturePojo grp : grpl) { System.out.println("\t\tCandidate:" + grp.getCity() + " / " + grp.getRegion() + " / " + grp.getCountry()); } } } Candidate candidate = new Candidate(ent, grpl, stateOrCountry); dubiousLocations.put(ent.getIndex(), candidate); bStrongCandidate = true; } // if strong candidate } //TESTED ("reston, virginia" after "virginia/stateorcounty" mention) // (end if can't shortcut past all this) } // end if a US town } // end if in the format "A, B" } // if weak candidate //TESTED if (!bStrongCandidate) { // Obv can't count on a disambiguous candidate: String type = ent.getType().toLowerCase(); if (type.equals("stateorcounty")) { String disName = ent.getDisambiguatedName().toLowerCase(); if (_abbrStateRegex.matcher(disName).matches()) { otherRegions.add(getStateFromAbbr(disName)); } else { otherRegions.add(ent.getDisambiguatedName().toLowerCase()); } otherCountriesOrRegionsReferenced.add("united states"); } //TESTED: "mich./stateorcounty" else if (type.equals("country")) { String disName = ent.getDisambiguatedName().toLowerCase(); // Translation of known badly transcribed countries: // (England->UK) if (disName.equals("england")) { otherCountries.add("united kingdom"); } //TESTED else { otherCountries.add(ent.getDisambiguatedName().toLowerCase()); } } else if (type.equals("region")) { otherRegions.add(ent.getDisambiguatedName().toLowerCase()); } else if (type.equals("city")) { String splitMe[] = ent.getDisambiguatedName().split(",\\s*"); if (2 == splitMe.length) { otherCountriesOrRegionsReferenced.add(splitMe[1].toLowerCase()); if (this._statesRegex.matcher(splitMe[1]).find()) { otherCountriesOrRegionsReferenced.add("united states"); } //TESTED: "lexingon, kentucky/city" } } } //TESTED: just above clauses } // if location } // (end loop over entities) // Debug: if ((_nDebugLevel >= 3) && (!dubiousLocations.isEmpty())) { for (String s : otherRegions) { System.out.println("Strong region: " + s); } for (String s : otherCountries) { System.out.println("Strong countries: " + s); } for (String s : otherCountriesOrRegionsReferenced) { System.out.println("Weak regionscountries: " + s); } } // 2] The requirements and algorithm are discussed in // http://ikanow.jira.com/wiki/display/INF/Beta...+improving+AlchemyAPI+extraction+%28geo%29 // Canonical cases: // Darfur -> Darfur, MN even though Sudan and sometimes Darfur, Sudan are present // Shanghai -> Shanghai, WV even though China is mentioned (and not WV) // Manchester -> Manchester village, NY (not Manchester, UK) // Philadelphia -> Philadelphia (village), NY (though NY is mentioned and not PA) // We're generating the following order // 10] Sitting tenant with strong direct // 15] Large city with strong direct // 20] Region with direct // 30] Large city with strong indirect // 40] Sitting tenant with strong indirect // 50] Region with indirect // 60] Another foreign possibility with strong direct // 70] Large city with weak direct // 72] Large city with weak indirect // 75] Large city with no reference // 78] Another foreign possibility with strong indirect (>100K population - ie not insignificant) // 80] Sitting tenant with any weak (US) direct or indirect // 90] Another foreign possibility with strong indirect // 100] Another foreign possibility with weak direct // 110] Another foreign possibility with weak indirect // 120] Region with no reference, if there is only 1 // 130] Sitting tenant with none of the above (ie default) // 140] Anything else! for (Map.Entry<String, Candidate> pair : dubiousLocations.entrySet()) { EntityPojo ent = pair.getValue().entity; Candidate candidate = pair.getValue(); // 2.1] Let's analyse the "sitting tenant" int nPrio = 130; GeoFeaturePojo currLeader = null; int nCase = 0; // (just for debugging, 0=st, 1=large city, 2=region, 3=other) if (otherRegions.contains(candidate.state)) { // Strong direct ref, winner! nPrio = 10; // winner! } //TESTED: "san antonio, texas/city" vs "texas" else if (otherCountriesOrRegionsReferenced.contains(candidate.state)) { // Indirect ref nPrio = 40; // good, but beatable... } //TESTED: "philadelphia (village), new york/city" else if (otherCountries.contains("united states")) { // Weak direct ref nPrio = 80; // better than nothing... } //TESTED: "apache, oklahoma/city" else if (otherCountriesOrRegionsReferenced.contains("united states")) { // Weak indirect ref nPrio = 80; // better than nothing... } //TESTED: "washington, d.c." have DC as stateorcounty, but US in countries list // Special case: we don't like "village": if ((80 != nPrio) && ent.getDisambiguatedName().contains("village") && !ent.getActual_name().contains("village")) { nPrio = 80; } //TESTED: "Downvoted: Philadelphia (village), New York from Philadelphia" // Debug if (_nDebugLevel >= 2) { System.out.println(pair.getKey() + " SittingTenantScore=" + nPrio); } // Alternatives if (nPrio > 10) { LinkedList<GeoFeaturePojo> geos = pair.getValue().candidates; for (GeoFeaturePojo geo : geos) { int nAltPrio = 140; int nAltCase = -1; String city = (null != geo.getCity()) ? geo.getCity().toLowerCase() : null; String region = (null != geo.getRegion()) ? geo.getRegion().toLowerCase() : null; String country = (null != geo.getCountry()) ? geo.getCountry().toLowerCase() : null; // 2.2] CASE 1: I'm a city with pop > 1M (best score 15) // 15] Large city with strong direct // 30] Large city with strong indirect // 70] Large city with weak direct // 72] Large city with weak indirect // 75] Large city with no reference if ((null != city) && (geo.getPopulation() >= 400000) && (nPrio > 15)) { nAltCase = 1; if ((null != region) && (otherRegions.contains(region))) { nAltPrio = 15; // strong direct } //TESTED: "dallas / Texas / United States = 15" else if ((null != region) && (otherCountriesOrRegionsReferenced.contains(region))) { nAltPrio = 30; // strong indirect } //TESTED: "sacramento / California / United State" else if ((null != country) && (otherCountries.contains(country))) { nAltPrio = 70; // weak direct } //TESTED: "berlin, germany", with "germany" directly mentioned else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) { nAltPrio = 72; // weak indirect } //TESTED: "los angeles / California / United States = 72" else { nAltPrio = 75; // just for being big! } //TESTED: "barcelona, spain" } // 2.3] CASE 2: I'm a region (best score=20, can beat current score) // 20] Region with direct // 50] Region with indirect // 120] Region with no reference, if there is only 1 else if ((null == city) && (nPrio > 20)) { nAltCase = 2; if ((null != country) && (otherCountries.contains(country))) { nAltPrio = 20; // strong direct } //TESTED: (region) "Berlin, Germany" with "Germany" mentioned else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) { nAltPrio = 50; // strong indirect } //(haven't seen, but we'll live) else { nAltPrio = 120; // (just for being there) } //TESTED: "null / Portland / Jamaica = 120", also "Shanghai / China" } // 2.4] CASE 3: I'm any foreign possibility (best score=60) // 60] Another foreign possibility with strong direct // 78] Another foreign possibility with strong indirect (>100K population - ie not insignificant) // 90] Another foreign possibility with strong indirect // 100] Another foreign possibility with weak direct // 110] Another foreign possibility with weak indirect else if (nPrio > 60) { nAltCase = 3; if ((null != region) && (otherRegions.contains(region))) { nAltPrio = 60; // strong direct // Double check we're not falling into the trap below: if (!geo.getCountry_code().equals("US")) { Matcher m = this._statesRegex.matcher(geo.getRegion()); if (m.matches()) { // non US state matching against (probably) US state, disregard) nAltPrio = 140; } } //TESTED (same clause as below) } //TESTED: lol "philadelphia / Maryland / Liberia = 60" (before above extra clause) if (nAltPrio > 60) { // (may need to re-run test) if ((null != country) && (otherCountries.contains(country))) { if (geo.getPopulation() < 100000) { nAltPrio = 90; // strong indirect } //TESTED: "washington / Villa Clara / Cuba" else { nAltPrio = 78; // strong indirect, with boost! } //TESTED: "geneva, Geneve, Switzerland", pop 180K } else if ((null != region) && (otherCountriesOrRegionsReferenced.contains(region))) { nAltPrio = 100; // weak direct } //TESTED: "lincoln / Lincolnshire / United Kingdom = 100" else if ((null != country) && (otherCountriesOrRegionsReferenced.contains(country))) { nAltPrio = 110; // weak indirect } //(haven't seen, but we'll live) } } // Debug: if ((_nDebugLevel >= 2) && (nAltPrio < 140)) { System.out.println("----Alternative: " + geo.getCity() + " / " + geo.getRegion() + " / " + geo.getCountry() + " score=" + nAltPrio); } // Outcome of results: if (nAltPrio < nPrio) { currLeader = geo; nPrio = nAltPrio; nCase = nAltCase; } } // end loop over alternativse if (null != currLeader) { // Need to change if (1 == nCase) { this._nMovedToLargeCity++; //(Cities are lower case in georef DB for some reason) String city = WordUtils.capitalize(currLeader.getCity()); if (currLeader.getCountry_code().equals("US")) { // Special case: is this just the original? String region = currLeader.getRegion(); if (region.equals("District of Columbia")) { // Special special case region = "D.C."; } String sCandidate = city + ", " + region; if (!sCandidate.equals(ent.getDisambiguatedName())) { ent.setDisambiguatedName(sCandidate); ent.setIndex(ent.getDisambiguatedName() + "/city"); ent.setSemanticLinks(null); bChangedAnything = true; } //TESTED (lots, eg "Philadelphia (village), New York" -> "Philadelphia, PA"; Wash, Ill. -> Wash DC) else { this._nMovedToLargeCity--; _nStayedWithOriginal++; } //TESTED ("Washington DC", "San Juan, Puerto Rico") } //TESTED (see above) else { ent.setDisambiguatedName(city + ", " + currLeader.getCountry()); ent.setIndex(ent.getDisambiguatedName() + "/city"); ent.setSemanticLinks(null); bChangedAnything = true; } //TESTED: "london, california/city to London, United Kingdom" } else if (2 == nCase) { this._nMovedToRegion++; ent.setDisambiguatedName(currLeader.getRegion() + ", " + currLeader.getCountry()); ent.setIndex(ent.getDisambiguatedName() + "/region"); ent.setSemanticLinks(null); bChangedAnything = true; } //TESTED: "Moved madrid, new york/city to Madrid, Spain" (treats Madrid as region, like Berlin see above) else { //(Cities are lower case in georef DB for some reason) String city = WordUtils.capitalize(currLeader.getCity()); this._nMovedToForeignCity++; ent.setDisambiguatedName(city + ", " + currLeader.getCountry()); ent.setIndex(ent.getDisambiguatedName() + "/city"); ent.setSemanticLinks(null); bChangedAnything = true; } //TESTED: "Moved geneva, new york/city to Geneva, Switzerland" if ((_nDebugLevel >= 1) && (null == ent.getSemanticLinks())) { System.out.println("++++ Moved " + pair.getKey() + " to " + ent.getDisambiguatedName()); } } else { _nStayedWithOriginal++; } } // (if sitting tenant not holder) } // (end loop over candidates) if ((_nDebugLevel >= 1) && bChangedAnything) { System.out.println("\t(((Doc: " + doc.getTitle() + " / " + doc.getId() + " / " + doc.getUrl() + ")))"); } return bChangedAnything; } ///////////////////////////////////////////////////////////////////////////////////////////// // Utility: state abbrievations: private static String getStateFromAbbr(String s) { if (s.endsWith(".")) { s = s.substring(0, s.length() - 1); } if (s.equals("m.d")) { s = "maryland"; } else if (s.equals("n.m")) { s = "new mexico"; } else if (s.equals("conn")) { s = "connecticut"; } else if (s.equals("mich")) { s = "michigan"; } else if (s.equals("n.j")) { s = "new jersey"; } else if (s.equals("al")) { s = "alabama"; } else if (s.equals("d.c")) { s = "district of columbia"; } else if (s.equals("vt")) { s = "vermont"; } else if (s.equals("calif")) { s = "california"; } else if (s.equals("wash")) { s = "washington"; } else if (s.equals("ore")) { s = "oregon"; } else if (s.equals("ind")) { s = "indiana"; } return s; } }