Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.apps.object_dedup;

import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import opennlp.tools.similarity.apps.BingQueryRunner;
import opennlp.tools.similarity.apps.HitBase;
import opennlp.tools.similarity.apps.utils.LevensteinDistanceFinder;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
import opennlp.tools.similarity.apps.utils.Utils;
import opennlp.tools.textsimilarity.TextProcessor;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Template class for a deduplicator: decides whether two names (for example event titles) denote
 * the same object by combining token-level rules, string distances and web search results.
 *
 * <p>Call {@link #init()} before using the "both sides" rules so that the list returned by
 * {@link #getWordsThatShouldBeOnBothSidesEvents()} is loaded; until then that list is treated as
 * empty.
 */
public class SimilarityAccessorBase {
    private static final Logger LOG = LoggerFactory.getLogger(SimilarityAccessorBase.class);

    public static final int MAX_EV_TO_RECOMM = 6;

    /** Matches a whole token that is a dollar amount, e.g. {@code $5}, {@code $1,000}, {@code $12.50}. */
    private static final Pattern DOLLAR_AMOUNT =
            Pattern.compile("^\\$(\\d{1,3}(\\,\\d{3})*|(\\d+))(\\.\\d{2})?$");

    /** Words that must occur in both names or in neither; loaded by {@link #init()}. */
    private List<String> namesBothSides;

    protected static final String[] englishPrepositions = new String[] { "a", "aboard", "about", "above", "absent",
            "across", "after", "against", "along", "alongside", "among", "around", "as", "at", "before", "behind",
            "below", "beneath", "between", "beyond", "but", "by", "despite", "down", "during", "except",
            "excluding", "failing", "following", "for", "from", "in", "including", "inside", "into", "like", "near",
            "next", "of", "off", "on", "onto", "only", "opposite", "out", "outside", "over", "pace", "past", "per",
            "since", "than", "through", "and", "thru", "till", "to", "toward", "under", "up", "upon", "versus",
            "with", "within", "you", "must", "know", "when" };

    /** List view over {@link #englishPrepositions}, created once instead of on every lookup. */
    private static final List<String> PREPOSITIONS = Arrays.asList(englishPrepositions);

    protected List<String> commonWordsInEventTitles = Arrays.asList(new String[] { "community", "party", "film",
            "music", "exhibition", "kareoke", "guitar", "quartet", "reggae", "r&b", "band", "dj ", "piano", "pray",
            "worship", "god", "training", "class", "development", "training", "class", "course", "our", "comedy",
            ",fun", "musical", "group", "alliance", "session", "feeding", "introduction", "school", "conversation",
            "learning", "nursery", "unity", "trivia", "chat", "conference", "tuition", "technology", "teen",
            "communication", "reception", "management", "beginner", "beginning", "collabora", "reuninon",
            "political", "course", "age", "ages", "through", "grade", "networking", "workshop", "demonstration",
            "tuning", "program", "summit", "convention", "day", "night", "one", "two", "outfest", "three", "online",
            "writing", "seminar", "coach", ",expo", "advanced", "beginner", "intermediate", "earn", "free", "ii",
            "iii", "skills", "skill", "artist", "summer", "winter", "autumn", "spring", "camp", "vacation",
            "miscrosoft", "kid", "child", "kids", "children", "every", "everyone", "dancer", "dancers", "senior",
            "seniors", "basic", "elementary", "outfest", "2008", "2009", "2010", "2011", "2012", "monday", "tuesday",
            "wednesday", "thirsday", "friday", "saturday", "sunday", "mondays", "tuesdays", "wednesdays",
            "thirsdays", "fridays", "saturdays", "sundays", "men", // ?
            // Correct spellings of the misspelled entries above; the misspelled ones are kept so that
            // nothing that used to be filtered stops being filtered.
            "karaoke", "reunion", "microsoft", "thursday", "thursdays" });

    private BingQueryRunner webSearch = new BingQueryRunner();

    private StringDistanceMeasurer stringDistanceMeasurer = new StringDistanceMeasurer();

    public SimilarityAccessorBase() {
    }

    /** Loads the list of words that have to be present on both sides of a comparison. */
    public void init() {
        namesBothSides = getWordsThatShouldBeOnBothSidesEvents();
    }

    /**
     * Filters a token list, keeping only alphanumeric tokens that are not dollar amounts and that are
     * either at least three characters long or contain a non-letter character.
     *
     * @param list tokens to filter
     * @return a new list with the retained tokens, in their original order
     */
    protected List<String> removeDollarWordAndNonAlphaFromList(List<String> list) {
        List<String> result = new ArrayList<String>();
        for (String w : list) {
            boolean dollarAmount = DOLLAR_AMOUNT.matcher(w).find();
            boolean longEnoughOrHasDigit = w.length() >= 3 || !StringUtils.isAlpha(w);
            if (!dollarAmount && StringUtils.isAlphanumeric(w) && longEnoughOrHasDigit) {
                result.add(w);
            }
        }
        return result;
    }

    /**
     * Returns the words that must occur either in both compared names or in neither of them.
     *
     * <p>This template implementation returns an empty list (it used to return {@code null}, which
     * made {@link #applyBothSidesRule} and {@link #applyBothSidesRuleEvent} throw). A concrete
     * deduplicator is expected to override it; the words previously sketched here were: "woman", "man",
     * "women", "men", "womans", "mans", "womens", "mens", "boy", "girl", "boys", "girls", "men's",
     * "women's", "woman's", "ice" (for disney), "flight", "intermediate", "advanced", "beginner",
     * "helicopter", "sexual", "junior", "jr" ("tour" needs special consideration).
     *
     * @return the list of required-on-both-sides words, never {@code null} in this implementation
     */
    public List<String> getWordsThatShouldBeOnBothSidesEvents() {
        return new ArrayList<String>();
    }

    /**
     * Hook for comparing attributes of the two objects themselves; this template accepts everything.
     *
     * @param es1 first object
     * @param es2 second object
     * @return always {@code true} in this implementation
     */
    protected Boolean applySemanticNameSimilarityRule(Object es1, Object es2) {
        // TODO check attributes of objects, e.g.:
        //   if (!(es1.getVenueName().endsWith(es2.getVenueName())
        //       || es2.getVenueName().endsWith(es1.getVenueName()))) return false;
        //   if (Math.abs(es1.getStarttime().getTime() - es2.getStarttime().getTime()) > 100000) return false;
        return true;
    }

    /**
     * Extracts the "OF" part of both names and treats it as the whole expression: when both token
     * lists contain "of" (not as the first token), each list is replaced in place by its main noun
     * phrase, see {@link #extractMainNounPhrase(List)}.
     *
     * <p>The original version reassigned its parameters and therefore had no effect at all; the lists
     * are now modified in place, so they must be modifiable.
     *
     * @param name1Tokens tokens of the first name, modified in place
     * @param name2Tokens tokens of the second name, modified in place
     */
    protected void applySubPhraseExtractionRule(List<String> name1Tokens, List<String> name2Tokens) {
        if (name1Tokens.indexOf("of") > 0 && name2Tokens.indexOf("of") > 0) {
            List<String> phrase1 = extractMainNounPhrase(name1Tokens);
            List<String> phrase2 = extractMainNounPhrase(name2Tokens);
            name1Tokens.clear();
            name1Tokens.addAll(phrase1);
            name2Tokens.clear();
            name2Tokens.addAll(phrase2);
        }
    }

    /**
     * Checks whether one title is a fuzzy prefix/suffix of the other (ignoring apostrophes and
     * treating dashes as spaces) and, for short titles, falls back to a web search comparison.
     *
     * @return {@code true} if the titles are considered the same
     */
    private Boolean attemptShortTitlesSimilarityInWebSpace(String name1, String name2) {
        // first delimiter processing:
        //   *v   - apostrophes removed and dashes replaced with spaces
        //   *vv  - apostrophes removed
        //   *vvv - dashes replaced with spaces
        String name1v = name1.replace("'", "").replace("-", " ");
        String name2v = name2.replace("'", "").replace("-", " ");
        String name1vv = name1.replace("'", "");
        String name2vv = name2.replace("'", "");
        String name1vvv = name1.replace("-", " ");
        String name2vvv = name2.replace("-", " ");

        // The combinations below are kept exactly as they were tuned originally.
        boolean name1StartsWithName2 = name1.startsWith(name2) || name1vv.startsWith(name2)
                || name1.startsWith(name2v) || name1.startsWith(name2vv) || name1.startsWith(name2vvv)
                || name1v.startsWith(name2v) || name1v.startsWith(name2vv);
        boolean name2StartsWithName1 = name2.startsWith(name1) || name2vv.startsWith(name1)
                || name2.startsWith(name1v) || name2vvv.startsWith(name1vv) || name2.startsWith(name1vvv)
                || name2v.startsWith(name1v) || name2v.startsWith(name1vv);
        boolean name1EndsWithName2 = name1.endsWith(name2) || name1vv.endsWith(name2)
                || name1.endsWith(name2v) || name1.endsWith(name2vv) || name1.endsWith(name2vvv)
                || name1v.endsWith(name2v) || name1v.endsWith(name2vv);
        boolean name2EndsWithName1 = name2.endsWith(name1) || name2vv.endsWith(name1)
                || name2.endsWith(name1v) || name1vvv.endsWith(name2vv) || name2.endsWith(name1vvv)
                || name2v.endsWith(name1v) || name2v.endsWith(name1vv);

        if (name1StartsWithName2 || name2StartsWithName1 || name1EndsWithName2 || name2EndsWithName1) {
            LOG.info("Found fuzzy substring of name1 and name2");
            return true;
        }
        // web search is only attempted when at least one of the titles is short
        if (name1.length() > 12 && name2.length() > 12) {
            return false;
        }
        return areNamesSemanticallyCloseInWebSearchSpace(name1, name2, 0.8f, false).isDecision();
    }

    /**
     * Collects the "required on both sides" words that occur in exactly one of the two token lists.
     * Neither argument is modified. When the required-word list has not been loaded (see
     * {@link #init()}) it is treated as empty instead of causing a NullPointerException.
     */
    private List<String> oneSidedRequiredWords(List<String> name1Tokens, List<String> name2Tokens) {
        List<String> required = namesBothSides;
        if (required == null) {
            return new ArrayList<String>();
        }
        List<String> onlyInName1 = new ArrayList<String>(name1Tokens);
        List<String> onlyInName2 = new ArrayList<String>(name2Tokens);
        onlyInName1.removeAll(name2Tokens);
        onlyInName2.removeAll(name1Tokens);
        onlyInName1.addAll(onlyInName2);
        onlyInName1.retainAll(required);
        return onlyInName1;
    }

    /**
     * Event flavour of the "both sides" rule: fails when a required word occurs on one side only and
     * the names do not share enough tokens to outweigh it (one or more such words with fewer than 3
     * shared tokens, or two or more with fewer than 5 shared tokens).
     *
     * @return {@code false} if the rule is violated, {@code true} otherwise
     */
    public Boolean applyBothSidesRuleEvent(String name1, String name2) {
        List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), false);
        List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), false);
        List<String> oneSided = oneSidedRequiredWords(name1Tokens, name2Tokens);
        // tokens shared by both names
        name1Tokens.retainAll(name2Tokens);
        int shared = name1Tokens.size();

        // TODO 'mens == men' case: two one-sided words where one contains the other
        if ((oneSided.size() > 0 && shared < 3) || (oneSided.size() > 1 && shared < 5)) {
            LOG.info("Found required common word present on one side and not on the other: " + oneSided.toString()
                    + " and less than 3 keywords overlap (or >1 common words and less than 5 overl");
            return false;
        }
        return true;
    }

    /** Spelling normalization shared by {@link #tokenizeAndStem} and {@link #stemList}. */
    private static String normalizeSpelling(String word) {
        return "theatre".equals(word) ? "theater" : word;
    }

    /**
     * Lower-cases and tokenizes the input and normalizes the spelling of each token
     * ("theatre" becomes "theater"); no real stemming is performed.
     */
    protected List<String> tokenizeAndStem(String input) {
        List<String> results = new ArrayList<String>();
        for (String word : TextProcessor.fastTokenize(input.toLowerCase(), false)) {
            results.add(normalizeSpelling(word));
        }
        return results;
    }

    /** Returns a copy of the tokens with spelling normalized ("theatre" becomes "theater"). */
    protected List<String> stemList(List<String> toks) {
        List<String> results = new ArrayList<String>();
        for (String word : toks) {
            results.add(normalizeSpelling(word));
        }
        return results;
    }

    /**
     * Removes the venue part of a title: everything from "at" or "@" up to (not including) the next
     * non-alphanumeric token. Tokens starting with {@code <punc} are always dropped.
     *
     * <p>Previously "@" switched the venue flag on and immediately off again (because "@" itself is
     * not alphanumeric), so it never started a venue part.
     *
     * @param toks title tokens
     * @return a new list without the venue part
     */
    public List<String> removeVenuePart(ArrayList<String> toks) {
        List<String> results = new ArrayList<String>();
        boolean bVenuePart = false;
        for (String word : toks) {
            if (word.equals("at") || word.equals("@")) {
                // beginning of venue part
                bVenuePart = true;
            } else if (!StringUtils.isAlphanumeric(word) || word.startsWith("<punc")) {
                // end of venue part
                bVenuePart = false;
            }
            if (!bVenuePart && !word.startsWith("<punc")) {
                results.add(word);
            }
        }
        return results;
    }

    /**
     * Intended to reject titles containing long lower-case, non-preposition words. The rejecting
     * branch was deliberately disabled ("was return false"), so this method currently returns
     * {@code true} for every input; the scan is kept so the rule can be re-enabled.
     */
    protected boolean isCapitalized(String lookup) {
        String[] titleWords = lookup.split(" ");
        int count = 0;
        for (String word : titleWords) {
            if (word.length() < 2) { // '-', '|', ':'
                break;
            }
            if (word.equals(word.toLowerCase()) && !PREPOSITIONS.contains(word) && word.length() > 3
                    && StringUtils.isAlphanumeric(word)) {
                continue; // was return false;
            }
            if (count > 3) {
                break;
            }
            count++;
        }
        return true;
    }

    /**
     * Returns the tokens that follow the first "of", up to (not including) the next preposition.
     *
     * <p>The end index of {@code subList} is exclusive; the original passed {@code size() - 1}, which
     * silently dropped the last token and threw when "of" was the last token or the list was empty.
     *
     * @param name1Tokens tokens of a name
     * @return a new list holding the main noun phrase (possibly empty)
     */
    protected List<String> extractMainNounPhrase(List<String> name1Tokens) {
        List<String> results = new ArrayList<String>();
        int ofPos = name1Tokens.indexOf("of");
        List<String> ofList = name1Tokens.subList(ofPos + 1, name1Tokens.size());
        // now iterate till next preposition towards the end of noun phrase
        for (String preposCand : ofList) {
            if (PREPOSITIONS.contains(preposCand)) {
                break;
            }
            results.add(preposCand);
        }
        return results;
    }

    /**
     * Compares the token that FOLLOWS each known attribute word (e.g. "age 5" vs "age 10") when the
     * attribute occurs in both names.
     *
     * @return {@code false} if some attribute has clearly different values in the two names;
     *         {@code true} otherwise, including when the check itself fails unexpectedly
     */
    public boolean verifyEventAttributesPost(List<String> name1Tokens, List<String> name2Tokens) {
        String[] attributeNamesPost = { "age", "ages", "game", "games", "grade", "grades", "level", "levels", "vs",
                "vs.", "versus", "pottery", "competition", "contest", "skill", "skills", "day", "only", "basic",
                "class", "completed",
                // "tour", ?
                "advanced", "beginner", "intermediate", "flight", "workshop", "latin", "adobe", "ballet", "dinner",
                "breakfast", "lunch", "summer",
                // "canyon"
                "tfestival", "festival", "mfestival" };
        try {
            for (String attr : attributeNamesPost) {
                int agePos1 = name1Tokens.indexOf(attr);
                int agePos2 = name2Tokens.indexOf(attr);
                if (agePos1 > -1 && agePos2 > -1 && agePos1 < name1Tokens.size() - 1
                        && agePos2 < name2Tokens.size() - 1) {
                    String value1 = name1Tokens.get(agePos1 + 1);
                    String value2 = name2Tokens.get(agePos2 + 1);
                    double dist = LevensteinDistanceFinder.levensteinDistance(value1, value2, 1, 10, 1, 10);
                    if (!value1.equalsIgnoreCase(value2) && (dist > 2.99 || value1.length() < 4)) {
                        LOG.info("Found disagreement in the attrib value for " + attr + " value = " + value1
                                + " <=> " + value2);
                        return false;
                    }
                }
            }
        } catch (Exception e) {
            // best effort: a failure of this check must not reject the pair
            LOG.warn("Failed to verify attributes following the attribute name", e);
        }
        return true;
    }

    /**
     * Compares the token that PRECEDES each known attribute word (e.g. "2 hour" vs "3 hour") when the
     * attribute occurs in both names and is not their first token. A shift by one position on either
     * side is tolerated.
     *
     * @return {@code false} if some attribute has different preceding values in the two names;
     *         {@code true} otherwise, including when the check itself fails unexpectedly
     */
    public boolean verifyEventAttributesPre(List<String> name1Tokens, List<String> name2Tokens) {
        String[] attributeNamesPre = { "hour", "vs", "vs.", "versus", "pottery", "program", "day", "only",
                // dance styles followed by a param
                "swing", "rumba", "samba", "doble", "violence",
                // "level",
                "class", "classes", "kid", "kids", "test", "west", "summer_camp", "session", "tfestival",
                "festival", "mfestival" };
        try {
            for (String attr : attributeNamesPre) {
                int agePos1 = name1Tokens.indexOf(attr);
                int agePos2 = name2Tokens.indexOf(attr);
                if (agePos1 > 0 && agePos2 > 0) { // not the first word is attr name
                    String before1 = name1Tokens.get(agePos1 - 1);
                    String before2 = name2Tokens.get(agePos2 - 1);
                    boolean directMismatch = !before1.equalsIgnoreCase(before2);
                    boolean shiftedLeftMismatch =
                            agePos1 < 2 || !name1Tokens.get(agePos1 - 2).equalsIgnoreCase(before2);
                    boolean shiftedRightMismatch =
                            agePos2 < 2 || !before1.equalsIgnoreCase(name2Tokens.get(agePos2 - 2));
                    if (directMismatch && shiftedLeftMismatch && shiftedRightMismatch) {
                        LOG.info("Found disagreement in the attrib value for " + attr + " value = " + before1
                                + " and " + before2);
                        return false;
                    }
                }
            }
        } catch (Exception e) {
            // best effort: a failure of this check must not reject the pair
            LOG.warn("Failed to verify attributes preceding the attribute name", e);
        }
        return true;
    }

    /**
     * Special case check that both names are DIFFERENT groups according to external tag data
     * (originally last.fm). The tag lookup is disabled in this template, so both tag maps are
     * {@code null} and the method returns {@code false}.
     *
     * <p>The original dereferenced the first map without a null check and therefore threw a
     * NullPointerException on every call.
     *
     * @return {@code true} if tags are available for the first name and some remain after removing
     *         those shared with the second name
     */
    protected boolean bDifferentGroupOneSubnameOfAnother(String name1, String name2) {
        Map<String, Integer> map1 = null; // LastFM_APIManager.extractTagsForArtist(name1);
        Map<String, Integer> map2 = null; // LastFM_APIManager.extractTagsForArtist(name2);
        if (map1 == null) {
            return false;
        }
        if (map2 != null && map1.size() > 0 && map2.size() > 0) {
            map1.entrySet().removeAll(map2.entrySet());
        }
        return map1.size() > 0;
    }

    /**
     * Strict "both sides" rule: fails as soon as a required word (see
     * {@link #getWordsThatShouldBeOnBothSidesEvents()}) occurs in exactly one of the two names.
     *
     * @return {@code false} if the rule is violated, {@code true} otherwise
     */
    public boolean applyBothSidesRule(String name1, String name2) {
        List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), false);
        List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), false);
        return oneSidedRequiredWords(name1Tokens, name2Tokens).isEmpty();
    }

    /**
     * For names mentioning "men", "women" or "disney": every token of the first name that is neither
     * a preposition nor a common event-title word must also occur in the second name.
     *
     * <p>The original wrapped {@code commonWordsInEventTitles} (already a list) in
     * {@code Arrays.asList}, producing a list with a single list element, so no common word was ever
     * removed.
     */
    private boolean succeededMenWomenSportsRule(String name1, String name2) {
        List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), false);
        List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), false);
        boolean ruleApplies = name1Tokens.contains("men") || name2Tokens.contains("men")
                || name1Tokens.contains("women") || name2Tokens.contains("women")
                || name1Tokens.contains("disney") || name2Tokens.contains("disney");
        if (!ruleApplies) {
            return true;
        }
        // all words should be the same
        name1Tokens.removeAll(name2Tokens);
        name1Tokens.removeAll(PREPOSITIONS);
        name1Tokens.removeAll(commonWordsInEventTitles);
        return name1Tokens.size() < 1;
    }

    /**
     * For names mentioning an orchestra, symphony, band, trio, "soleil", "disney" or "lang": apart
     * from prepositions, both names must consist of the same tokens.
     */
    private boolean succeededSpecialGroupsSymphoniesRule(String name1, String name2) {
        List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), false);
        List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), false);
        boolean ruleApplies = name1Tokens.contains("orchestra") || name2Tokens.contains("symphony")
                || name2Tokens.contains("orchestra") || name1Tokens.contains("symphony")
                || name2Tokens.contains("band") || name1Tokens.contains("band")
                || name2Tokens.contains("trio") || name1Tokens.contains("trio")
                || name1Tokens.contains("soleil") || name2Tokens.contains("soleil")
                || name1Tokens.contains("disney") || name2Tokens.contains("disney")
                || name1Tokens.contains("lang") || name2Tokens.contains("lang"); // special group 'lang lang'
        if (!ruleApplies) {
            return true;
        }
        // all words should be the same: build the symmetric difference of the two token lists
        List<String> name1TokensClone = new ArrayList<String>(name1Tokens);
        name1Tokens.removeAll(name2Tokens);
        name2Tokens.removeAll(name1TokensClone);
        name1Tokens.addAll(name2Tokens);
        name1Tokens.removeAll(PREPOSITIONS);
        // name1Tokens.removeAll(commonWordsInEventTitles);
        return name1Tokens.size() < 1;
    }

    /**
     * Scores how likely the two names are the same name up to merged/split words.
     *
     * <p>Fixes over the original: the containment test is now symmetric (the original repeated
     * {@code name1.endsWith(name2)} and {@code name1.indexOf(name2)} where the mirrored checks were
     * meant), and the condition {@code name2r.endsWith(name2)}, which is true for every name2 without
     * spaces and made unrelated names score 1, now compares against name1. The string distance is
     * computed once.
     *
     * @return 2 if one name contains the other or the string similarity is above 0.95; 1 if, ignoring
     *         spaces, one name starts or ends with the other or the similarity is above 0.70;
     *         0 otherwise
     */
    public int getAttemptedNameMerge(String name1, String name2) {
        // suspected word merge if higher case is in the middle of word: "fooBar" -> "foo Bar"
        name1 = name1.replaceAll("[a-z][A-Z]", "$0&$0").replaceAll(".&.", " ");
        name2 = name2.replaceAll("[a-z][A-Z]", "$0&$0").replaceAll(".&.", " ");

        name1 = name1.toLowerCase();
        name2 = name2.toLowerCase();
        // containment covers equality, prefix and suffix
        if (name1.contains(name2) || name2.contains(name1)) {
            return 2;
        }

        // Compare with all spaces removed. A space-free string can only start/end with another string
        // that is itself space-free, so these four checks subsume the original mixed comparisons
        // between the spaced and the space-free variants.
        String name1r = name1.replace(" ", "");
        String name2r = name2.replace(" ", "");
        if (name1r.startsWith(name2r) || name1r.endsWith(name2r) || name2r.startsWith(name1r)
                || name2r.endsWith(name1r)) {
            return 1;
        }

        double similarity = stringDistanceMeasurer.measureStringDistance(name1, name2);
        if (similarity > 0.95) {
            return 2;
        }
        if (similarity > 0.70) {
            return 1;
        }
        return 0;
    }

    /**
     * Normalizes a name before comparison: expands "w/" to "with", turns "/" and "!" into spaces,
     * converts to ASCII and lower case, and unifies gender words and a few known phrases.
     *
     * <p>The original replaced "/" with a space before looking for "w/", so the abbreviation was
     * never expanded. It is now expanded first, case-insensitively and only when "w" starts a word
     * (so "show/tell" is not affected).
     */
    private String normalizeGenderAndOtherAttributes(String name1) {
        String expanded = name1.replaceAll("(?i)\\bw/", "with ").replace("/", " ");
        name1 = Utils.convertToASCII(expanded).replace('!', ' ').toLowerCase();

        name1 = name1.replace("woman", "women").replace("womans", "women").replace("womens", "women")
                .replace("women's", "women").replace("woman's", "women");
        // need regexp for this
        name1 = name1.replace(" man ", " men ").replace(" mans ", " men ").replace(" men's ", " men ")
                .replace(" man's ", " men ").replace(" mens ", " men ").replace("summer camp", "summer_camp")
                .replace("gaea theatre festival", "tfestival");
        return name1;
    }

    /**
     * Main semantic similarity function, which applies boundary case rules and then the web mining
     * rule. The main criterion for a commonality between titles is that the common part forms an
     * entity that is searchable on the web.
     *
     * @param name1 first title
     * @param name2 second title
     * @param venue venue name whose tokens are excluded from the common part; {@code null} is treated
     *        as an empty venue
     * @return the decision together with its score and the reasoning behind it
     */
    public DedupResult areNamesSemanticallyCloseWebMineCommonPart(String name1, String name2, String venue) {
        // normalize gender
        name1 = normalizeGenderAndOtherAttributes(name1);
        name2 = normalizeGenderAndOtherAttributes(name2);

        Boolean bShortTitlesSimilarInWebSpace = attemptShortTitlesSimilarityInWebSpace(name1, name2);
        if (bShortTitlesSimilarInWebSpace) {
            return new DedupResult("Accepted as short title by web mining", 2, true);
        }

        StringBuilder reason = new StringBuilder();
        String venueText = (venue == null) ? "" : venue;
        List<String> venueToks = removeVenuePart(TextProcessor.fastTokenize(venueText.toLowerCase(), false));

        LOG.info("\nComputing similarity between name = '" + name1 + "' and name = '" + name2 + "'");
        // convert titles into token lists
        List<String> name1Tokens = removeVenuePart(TextProcessor.fastTokenize(name1.toLowerCase(), true));
        List<String> name2Tokens = removeVenuePart(TextProcessor.fastTokenize(name2.toLowerCase(), true));
        // applySubPhraseExtractionRule()
        boolean bSameAttrib = verifyEventAttributesPost(name1Tokens, name2Tokens)
                && verifyEventAttributesPre(name1Tokens, name2Tokens);
        if (!bSameAttrib) {
            LOG.info("similar events but different attributes");
            return new DedupResult("similar events but different attributes", 0, false);
        }

        boolean bothSidesSuccess = applyBothSidesRuleEvent(name1, name2);
        if (!bothSidesSuccess) {
            return new DedupResult("Failed common words test for sports", 0, false);
        }

        float dist = (float) LevensteinDistanceFinder.levensteinDistance(name1, name2, 1, 10, 1, 10);
        if (dist < 5.1) {
            LOG.info("Found low LevensteinDistance for name1 and name2");
            return new DedupResult("Found low LevensteinDistance", 2, true);
        }

        int nameMergeScore = getAttemptedNameMerge(name1, name2);
        if (nameMergeScore > 0) {
            LOG.info("Found low NameMerge Distance for name1 and name2");
            return new DedupResult("Found low NameMerge Distance", 2, true);
        }

        // todo take into account order
        // form common sub-list of tokens
        name1Tokens.retainAll(name2Tokens);
        name1Tokens.removeAll(venueToks);
        name1Tokens.removeAll(commonWordsInEventTitles);
        name1Tokens.removeAll(PREPOSITIONS);
        name1Tokens = removeDollarWordAndNonAlphaFromList(name1Tokens);
        // todo : to use full string measure

        // boundary case: too many words => just do counts
        float commonPortion = (float) name1Tokens.size() / (float) name2Tokens.size();
        if (commonPortion > 0.8 || name1Tokens.size() >= 4) {
            // after typical title words are removed 4 looks OK
            LOG.info("Accepted since substantial common part");
            return new DedupResult("Accepted since substantial common part",
                    Math.max((int) (commonPortion * 5.0), 2), true);
        }
        // boundary case: no overlap
        if (name1Tokens.size() < 1) {
            LOG.info("Rejected since nothing in common");
            return new DedupResult("Rejected since nothing in common", 0, false);
        }

        // Get from the list of tokens back to words to form the search expression. The tokens are
        // purely alphanumeric at this point, so joining them with single spaces is what the original
        // toString()/replace chain was meant to produce.
        String entityExpression = StringUtils.join(name1Tokens, " ").trim();

        // TODO before doing web mining, make sure the overlap between titles is NOT just a common
        // English word (needs a stop list / frequent-word vocabulary, which is not wired in here);
        // if all words are common, the overlap is NOT an entity.
        // TODO also try the name merge on the reduced (common part) strings.

        // accept common expression
        LOG.info("Formed common entity = " + entityExpression);
        reason.append("Formed common entity = " + entityExpression + "\n");

        // now go to the web / bing api with this common expression
        List<HitBase> searchResult = webSearch.runSearch(entityExpression);
        float entityScore = 0f;
        if (searchResult != null) {
            // the common tokens do not change while the hits are scanned
            List<String> commonStems = stemList(name1Tokens);
            int count = 0;
            for (HitBase item : searchResult) {
                String lookup = item.getTitle();
                LOG.info("Bing hit title = '" + lookup + "'");
                reason.append("Bing hit title = '" + lookup + "'\n");
                if (count > 4) {
                    break;
                }
                count++;
                // if occurrence is not capitalized then rejected, do not take into account in score
                if (!isCapitalized(lookup)) {
                    LOG.info("Rejected hit title since not capitalized");
                    reason.append("Rejected hit title since not capitalized\n");
                    continue;
                }

                // now compute overlap between what found on the web for hit's title and the common
                // expression between events
                List<String> lookupTokens = tokenizeAndStem(lookup);
                lookupTokens.retainAll(commonStems);
                if (lookupTokens.size() >= name1Tokens.size()) {
                    // increment score if found hit title is acceptable
                    entityScore += 1.0;
                } else {
                    LOG.info("Found hit title " + lookupTokens + " does not cover comonality expr = "
                            + name1Tokens);
                    entityScore += 0.25;
                }
            }
        }
        return new DedupResult(reason.toString(), (int) entityScore, entityScore > 1.0);
    }

    /** Removes social network names from a search hit title before titles are compared. */
    private static String stripSocialNetworkNames(String title) {
        return title.replace("Facebook", "").replace("LinkedIn", "").replace("MySpace", "");
    }

    /**
     * Decides whether two names denote the same entity by running a web search for each name and
     * counting pairs of hit titles whose similarity exceeds the threshold.
     *
     * @param name1 first name
     * @param name2 second name
     * @param thresh title similarity threshold; {@code null} or 0 selects the default of 0.8
     * @param bStem whether the title comparison uses stemming
     * @return the decision (positive when more than one similar title pair was found and no rule
     *         vetoed it) together with its score and the reasoning behind it
     */
    public DedupResult areNamesSemanticallyCloseInWebSearchSpace(String name1, String name2, Float thresh,
            boolean bStem) {
        if (thresh == null || thresh == 0f) {
            thresh = 0.8f;
        }

        // normalize gender
        name1 = normalizeGenderAndOtherAttributes(name1);
        name2 = normalizeGenderAndOtherAttributes(name2);

        StringBuilder reason = new StringBuilder();

        if (!succeededMenWomenSportsRule(name1, name2)) {
            return new DedupResult("Sports rule: different teams or teams of different venues", 0, false);
        }
        if (!succeededSpecialGroupsSymphoniesRule(name1, name2)) {
            return new DedupResult("SpecialGroupsSymphoniesRule: different circus/band", 0, false);
        }

        LOG.info("\nComputing similarity between name = '" + name1 + "' and name = '" + name2 + "'");

        List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), true);
        List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), true);
        boolean bSameAttrib = verifyEventAttributesPost(name1Tokens, name2Tokens)
                && verifyEventAttributesPre(name1Tokens, name2Tokens);
        if (!bSameAttrib) {
            LOG.info("similar events but different attributes");
            return new DedupResult("similar events but different attributes", 0, false);
        }

        List<HitBase> searchResult1 = webSearch.runSearch(name1);
        List<HitBase> searchResult2 = webSearch.runSearch(name2);
        int score = 0;
        if (searchResult1 != null && searchResult2 != null) {
            for (HitBase item1 : searchResult1) {
                if (item1.getUrl().indexOf("myspace") > -1 || item1.getUrl().indexOf("wiki") > -1) {
                    continue;
                }
                String lookup1 = stripSocialNetworkNames(item1.getTitle());
                for (HitBase item2 : searchResult2) {
                    String lookup2 = stripSocialNetworkNames(item2.getTitle());
                    double d;
                    if (bStem) {
                        d = stringDistanceMeasurer.measureStringDistance(lookup1, lookup2);
                    } else {
                        d = stringDistanceMeasurer.measureStringDistanceNoStemming(lookup1, lookup2);
                    }
                    if (d > thresh) { // 0.8
                        reason.append("Found common search result title for group names '" + lookup1 + " < > "
                                + lookup2 + " sim = " + d + "\n");
                        LOG.info(("Found common search result title for group names '" + lookup1 + " < > "
                                + lookup2 + " sim = " + d));
                        score++;
                    }
                }
            }
        }

        // a score of 1 is below the acceptance bar (score > 1), so these rules act as vetoes
        boolean bothSidesSuccess = applyBothSidesRule(name1, name2);
        if (!bothSidesSuccess) {
            score = 1;
            reason.append("Failed common words test for sports");
        }
        if (score > 0) {
            boolean bDifferentGroup = bDifferentGroupOneSubnameOfAnother(name1, name2);
            if (bDifferentGroup) {
                score = 1;
                reason.append("Failed common words test for sports");
            }
        }
        return new DedupResult(reason.toString(), score, score > 1);
    }
}