List of usage examples for org.apache.commons.lang3 StringUtils getLevenshteinDistance
public static int getLevenshteinDistance(CharSequence s, CharSequence t)
From source file:com.netcrest.pado.index.provider.lucene.TopNLuceneSearch.java
public static double getLSScore(String fromValue, String targetFieldValue) { // TODO Auto-generated method stub double maxLen = Math.max(fromValue.length(), targetFieldValue.length()); double lsDistance = StringUtils.getLevenshteinDistance(fromValue, targetFieldValue); return 1d - lsDistance / maxLen; }
From source file:edu.usc.ir.geo.gazetteer.GeoNameResolver.java
/**
 * Selects the best matches for each location name extracted from a document,
 * choosing from among a list of lists of candidate matches.
 *
 * <p>Every candidate is assigned a weight that is the sum of:
 * <ol>
 *   <li>{@code WEIGHT_NAME_MATCH} if the extracted name occurs as a space-delimited
 *       word in the candidate's name, or {@code WEIGHT_NAME_PART_MATCH} if it occurs
 *       only as a substring;</li>
 *   <li>{@code getCalibratedWeight(...)} of the number of alternate names and the
 *       summed edit distance to those alternate names that contain the extracted name;</li>
 *   <li>{@code (cur.size() - i) * WEIGHT_SORT_ORDER}, which favours candidates that
 *       appear earlier in the incoming list.</li>
 * </ol>
 * The weight is stored on the candidate via {@code setWeight}, and the {@code count}
 * heaviest candidates are put into {@code resolvedEntities}. Names whose candidate
 * list is empty get no entry.
 *
 * @param resolvedEntities
 *            final result for the input stream; receives one list per extracted name
 * @param allCandidates
 *            each location name may hit several documents, this is the
 *            collection of all hit documents
 * @param count
 *            maximum number of results kept for one location name
 */
private void pickBestCandidates(HashMap<String, List<Location>> resolvedEntities,
        HashMap<String, List<Location>> allCandidates, int count) {

    for (String extractedName : allCandidates.keySet()) {
        List<Location> cur = allCandidates.get(extractedName);
        if (cur.isEmpty()) {
            continue; // no results found for this name
        }

        // Loop-invariant: the extracted name padded with spaces, used for whole-word matching.
        String paddedExtractedName = String.format(" %s ", extractedName);

        // Priority queue ordered by descending weight, so poll() yields the top elements.
        PriorityQueue<Location> pq = new PriorityQueue<>(cur.size(), new Comparator<Location>() {
            @Override
            public int compare(Location o1, Location o2) {
                return Integer.compare(o2.getWeight(), o1.getWeight());
            }
        });

        for (int i = 0; i < cur.size(); ++i) {
            Location candidate = cur.get(i);
            int weight = 0;

            // Pad the resolved name as well so a match at either end still counts as a whole word.
            String resolvedName = String.format(" %s ", candidate.getName());
            if (resolvedName.contains(paddedExtractedName)) {
                // Extracted name is found as an exact word in the name.
                weight = WEIGHT_NAME_MATCH;
            } else if (resolvedName.contains(extractedName)) {
                // Extracted name is found only partly in the name.
                weight = WEIGHT_NAME_PART_MATCH;
            }

            // Accumulate the edit distance over the alternate names that contain the extracted name.
            String[] altNames = candidate.getAlternateNames().split(",");
            float altEditDist = 0;
            for (String altName : altNames) {
                if (altName.contains(extractedName)) {
                    altEditDist += StringUtils.getLevenshteinDistance(extractedName, altName);
                }
            }
            // The lesser the edit distance, the more should be the weight.
            weight += getCalibratedWeight(altNames.length, altEditDist);

            // Give preference to sorted results: the 0th result gets the largest bonus.
            weight += (cur.size() - i) * WEIGHT_SORT_ORDER;

            candidate.setWeight(weight);
            pq.add(candidate);
        }

        List<Location> resultList = new ArrayList<>();
        for (int i = 0; i < count && !pq.isEmpty(); i++) {
            resultList.add(pq.poll());
        }
        resolvedEntities.put(extractedName, resultList);
    }
}
From source file:com.moviejukebox.plugin.ComingSoonPlugin.java
/** * Returns difference between two titles. * * Since ComingSoon returns strange results on some researches, difference is defined as follows: abs(word count difference) - * (searchedTitle word count - matched words); * * @param searchedTitle/*from w ww. j av a 2 s . c o m*/ * @param returnedTitle * @return */ private static int compareTitles(String searchedTitle, String returnedTitle) { if (StringTools.isNotValidString(returnedTitle)) { return COMINGSOON_MAX_DIFF; } LOG.trace("Comparing {} and {}", searchedTitle, returnedTitle); String title1 = searchedTitle.toLowerCase().replaceAll("[,.\\!\\?\"']", ""); String title2 = returnedTitle.toLowerCase().replaceAll("[,.\\!\\?\"']", ""); return StringUtils.getLevenshteinDistance(title1, title2); }
From source file:com.omertron.themoviedbapi.TheMovieDbApi.java
/** * Compare the Levenshtein Distance between the two strings * * @param title1/*from w ww. j a v a2 s . c o m*/ * @param title2 * @param distance */ private static boolean compareDistance(String title1, String title2, int distance) { return StringUtils.getLevenshteinDistance(title1, title2) <= distance; }
From source file:GIST.IzbirkomExtractor.TableExtractor.java
/** * Tests the row if it looks like the 1st row of a parsable table * @param row/*from w ww.j a v a 2 s.c o m*/ * @return */ private boolean isParsableTable(Element row) { Elements cells = row.getElementsByTag("td"); /* number of columns should be 4 */ if (cells.size() != 4) return false; /* look for number signs in 1st cell*/ if (StringUtils.getLevenshteinDistance(cleanupUNICODE(cells.first().text()), " . -") < 3) return true; /* discard the table if any of the cells is empty */ for (Element cell : cells) { if (cleanupUNICODE(cell.text()).isEmpty()) return false; } /* 1st column should be a number */ try { Integer.parseInt(cleanupUNICODE(cells.first().text()).trim()); return true; } catch (NumberFormatException e) { return false; } }
From source file:edu.umass.cs.gnsclient.console.ConsoleBasedTest.java
/** * Run inCommands commands through the CLI and compare the output with * expectedOutput. Can also check if a default GNS and/or GUID have been set. * * @param inCommands list of console commands separated by '\n' * @param expectedOutput list of expected console output (can use Java regular * expressions, each line separated by '\n') * @param requireDefaultGns check if a default GNS has been defined (will * error out if not)/*from w w w . ja va 2 s . c o m*/ * @param requireDefaultAccountGuid check if a default GUID has been defined * (will error out if not) */ protected void runCommandsInConsole(String inCommands, String expectedOutput, boolean requireDefaultGns, boolean requireDefaultAccountGuid) { boolean success = false; StringWriter output = new StringWriter(); try { ConsoleReader consoleReader = new ConsoleReader(new ByteArrayInputStream(inCommands.getBytes("UTF-8")), output); ConsoleModule module = new ConsoleModule(consoleReader); module.printString("GNS Client Version: " + GNSClientConfig.readBuildVersion() + "\n"); // Run the commands module.handlePrompt(); // Check the output StringTokenizer expected = new StringTokenizer(expectedOutput, "\n"); StringTokenizer actual = new StringTokenizer(output.toString(), "\n"); if (!actual.hasMoreTokens()) { fail("No console output"); } if (!actual.nextToken().startsWith("GNS Client Version")) { fail("Unexpected console output, should start with 'GNS Client Version'"); } // Check that default GNS defaults is set if (requireDefaultGns) { if (!actual.hasMoreTokens()) { fail("Default GNS not set"); } String defaultGns = actual.nextToken(); if ("Default GNS: null".equals(defaultGns)) { fail("Default GNS not set"); } if (!defaultGns.startsWith("Default GNS: ")) { fail("Unexpected console output, should start with 'Default GNS: '"); } // Check GNS Connectivity . 
if (!actual.hasMoreTokens()) { fail("No console output during GNS connectivity check"); } if (!actual.nextToken().startsWith("Checking GNS connectivity")) { fail("Unexpected console output during GNS connectivity check"); } if (!actual.hasMoreTokens()) { fail("No console output during GNS connectivity check"); } if (!actual.nextToken().startsWith("Connected to GNS")) { fail("Default GNS is not reachable"); } } else { // Consume lines until we connected or not to a default GNS while (actual.hasMoreTokens()) { String line = actual.nextToken(); if (line.startsWith("Connected to GNS") || line.startsWith("Failed to connect to GNS") || line.startsWith("Couldn't connect to default GNS")) { break; } } } if (requireDefaultAccountGuid) { // Check default GUID if (!actual.hasMoreTokens()) { fail("Default GUID not set"); } String defaultGuid = actual.nextToken(); if (defaultGuid.matches("Default GUID: null")) { fail("Default GUID not set"); } if (!actual.hasMoreTokens()) { fail("Default GUID not set"); } defaultGuid = actual.nextToken(); if (!defaultGuid.matches("Looking up alias .*")) { fail("Unexpected console output, should start with 'Looking up alias'"); } if (!actual.hasMoreTokens()) { fail("Default GUID not set"); } if (!actual.nextToken().startsWith("Current GUID set to ")) { fail("Default GUID not set or not valid"); } if (!actual.hasMoreTokens()) { fail("Default GUID not set or not valid"); } // Next should be the console prompt if (!actual.nextToken().startsWith(GNS_CLI_PROMPT)) { fail("Default GUID not properly set, expected console prompt"); } } else { // Consume all input until first prompt while (actual.hasMoreTokens() && !actual.nextToken().startsWith(GNS_CLI_PROMPT)) ; } // Diff outputs while (expected.hasMoreTokens()) { String nextExpected = expected.nextToken(); String nextActual = null; if (actual.hasMoreTokens()) { // Skip command line prompts to get real output nextActual = actual.nextToken(); while (nextActual.startsWith(GNS_CLI_PROMPT) && 
actual.hasMoreTokens()) { nextActual = actual.nextToken(); } // Treat expected as a regular expression if (!nextActual.matches("(?s)" + nextExpected)) { if (StringUtils.getLevenshteinDistance(nextActual, nextExpected) < 5) { for (int i = 0; i < nextActual.length(); i++) { final char actualChar = nextActual.charAt(i); if (i < expectedOutput.length()) { final char expectedChar = nextExpected.charAt(i); if (actualChar != expectedChar) { System.out.println("Character " + i + " differs: " + ((int) actualChar) + " vs expected " + ((int) expectedChar) + " - " + actualChar + "<>" + expectedChar + "\n"); } } else { System.out.println("Character " + i + " is extra: " + ((int) actualChar) + " - " + actualChar); } } } fail("Got : '" + nextActual + "'\n" + "Expected: '" + nextExpected + "'\n"); } } else { fail("Missing expected output: " + nextExpected); } } // Check extra output while (actual.hasMoreTokens()) { String nextActual = actual.nextToken(); while (nextActual.startsWith(GNS_CLI_PROMPT) && actual.hasMoreTokens()) { nextActual = actual.nextToken(); } if (!nextActual.startsWith(GNS_CLI_PROMPT)) { fail("Extra output: " + actual.nextToken()); } } success = true; } catch (IOException e) { fail("Error during console execution " + e); } finally { if (!success) { System.out.println("**** COMPLETE OUTPUT for " + testName.getMethodName() + " ****"); System.out.println(output.toString()); } } }
From source file:com.zilbo.flamingSailor.TE.model.TextPage.java
/** * Try and remove boilerplate text from the page. * this was designed to be called just after the pieces have been converted to lines, * and before higher order structures (tables/headings) have been determined. * * @param headerTemplate potential template text (in components) * @param maxLevenshteinDistance max distance to allow. (to take into account page-numbers) * @param headerTemplateString the template text (in a string) * @param boundingBox the bounding box of the template text * @param isHeader is this at the top of the page (true) or bottom. * @param doUpdate actually modify the header/footer. * @return true if matched the boilerplate *//*from w ww .j ava 2 s .c om*/ public boolean removeBoilerPlateComponent(Component headerTemplate, int maxLevenshteinDistance, String headerTemplateString, Rectangle2D boundingBox, boolean isHeader, boolean doUpdate) { List<Component> topC = this.findByGeomByLines(boundingBox); if (topC.size() == 0) { if (lines.size() < 2) { return false; } if (isHeader) { topC.add(this.getTopLine()); } else { topC.add(this.getBottomLine()); } } String topCAsString = componentListToString(topC); int distance = StringUtils.getLevenshteinDistance(headerTemplateString, topCAsString); if (distance <= maxLevenshteinDistance) { if (doUpdate) { for (Component c : topC) { assert (c instanceof TextLine); if (isHeader) { header.addChild(c); } else { footer.addChildAtTop(c); } for (Component tpC : c.getChildren()) { assert (tpC instanceof TextPiece); adjustFontTally((TextPiece) tpC); } if (c instanceof TextLine) { int index = lines.indexOf(c); if (index >= 0) { lines.remove(index); } else { logger.error("Component not found in lines?"); } } else { logger.error("BoilerPlate! need to remove other components than lines:"); } } } return true; } else { // logger.info(headerTemplateString + "\t" + topCAsString + "\t distance:" + distance); } return false; }
From source file:codeu.chat.client.commandline.Chat.java
private Set<String> splitMessage(String body) { Set<String> sentences = new HashSet<String>(); for (String sentence : body.split("(?<=[!\\?\\.])")) { // Split the user message by punctuation if (sentence.trim().length() > 1) { // Avoid phrases of length 1, which are likely meaningless sentences.add(sentence.trim()); // Store the trimmed sentences in a set }//w w w.ja v a 2 s .c o m } for (String trimmed : sentences) { // Don't add anything that matches a general greeting if (!((StringUtils.getLevenshteinDistance(trimmed, "How are you?")) <= "How are you?".length() / 3.0 || trimmed.contains("hello") || trimmed.contains("Hello"))) { if (!(userPhraseSet.contains(trimmed + ".") || userPhraseSet.contains(trimmed + "!") || userPhraseSet.contains(trimmed + "?"))) { // This will check for verbatim duplicates on its own and add phrases accordingly if (userPhraseSet.add(trimmed)) { // Also add this phrase to the list, which will be used for random access userPhraseList.add(trimmed); } } // Finally, also map this phrase to the previous response, which hasn't been updated yet. if (response.length() >= 1) { userPhraseMap.get(response).add(trimmed); } } } return sentences; }
From source file:codeu.chat.client.commandline.Chat.java
private boolean mostRecentContainsSubstring(Set<String> mostRecent) { int distance; int randResponse; double bestPercent = 1; String bestResponse = ""; // Check the prewritten responses for (String key : responseMap.keySet()) { for (String phrase : mostRecent) { // Check for a match between user phrase and stored phrases using edit distance if ((!phraseReferencesSelf(phrase) && (distance = StringUtils.getLevenshteinDistance(key, phrase)) <= phrase.length() / 3.0)) { // If a match is found, record the percent match and response and move on if ((double) distance / key.length() < bestPercent) { bestPercent = (double) distance / key.length(); bestResponse = responseMap.get(key); }/*from www . ja v a 2 s . c om*/ } else if (phrase.toLowerCase().contains(key) && phrase.length() <= key.length() * 1.3) { bestResponse = responseMap.get(key); } } } // Check the responses gained from online movie scripts for (String key : scriptMap.keySet()) { for (String phrase : mostRecent) { // Check for a match between user phrase and stored phrases using edit distance if ((!phraseReferencesSelf(phrase) && (distance = StringUtils.getLevenshteinDistance(key, phrase)) <= phrase.length() / 3.0)) { // If a match is found, record the percent match and response and move on if ((double) distance / key.length() < bestPercent) { bestPercent = (double) distance / key.length(); randResponse = generator.nextInt(scriptMap.get(key).size()); bestResponse = scriptMap.get(key).get(randResponse); } } else if (phrase.toLowerCase().contains(key)) { randResponse = generator.nextInt(scriptMap.get(key).size()); bestResponse = scriptMap.get(key).get(randResponse); } } } if (bestResponse.length() == 0) // Return false if no match gets made return false; response = bestResponse; return true; }
From source file:com.moviejukebox.plugin.TheMovieDbPlugin.java
/** * Attempt to find the TMDB ID for a person using their full name * * @param name/*from w w w . ja v a2 s . c o m*/ * @return */ public String getPersonId(String name) { String tmdbId = ""; PersonFind closestPerson = null; int closestMatch = Integer.MAX_VALUE; boolean foundPerson = Boolean.FALSE; boolean includeAdult = PropertiesUtil.getBooleanProperty("themoviedb.includeAdult", Boolean.FALSE); try { ResultList<PersonFind> results = tmdb.searchPeople(name, 0, includeAdult, SearchType.PHRASE); LOG.info("Found {} person results for {}", results.getResults().size(), name); for (PersonFind person : results.getResults()) { if (name.equalsIgnoreCase(person.getName())) { tmdbId = String.valueOf(person.getId()); foundPerson = Boolean.TRUE; break; } LOG.trace("Checking {} against {}", name, person.getName()); int lhDistance = StringUtils.getLevenshteinDistance(name, person.getName()); LOG.trace("{}: Current closest match is {}, this match is {}", name, closestMatch, lhDistance); if (lhDistance < closestMatch) { LOG.trace("{}: TMDB ID {} is a better match", name, person.getId()); closestMatch = lhDistance; closestPerson = person; } } if (foundPerson) { LOG.debug("{}: Matched against TMDB ID: {}", name, tmdbId); } else if (closestMatch < Integer.MAX_VALUE && closestPerson != null) { tmdbId = String.valueOf(closestPerson.getId()); LOG.debug("{}: Closest match is '{}' differing by {} characters", name, closestPerson.getName(), closestMatch); } else { LOG.debug("{}: No match found", name); } } catch (MovieDbException ex) { LOG.warn("Failed to get information on '{}', error: {}", name, ex.getMessage(), ex); } return tmdbId; }