Example usage for org.apache.commons.lang3 StringUtils getLevenshteinDistance

List of usage examples for org.apache.commons.lang3 StringUtils getLevenshteinDistance

Introduction

On this page you can find example usages of org.apache.commons.lang3 StringUtils getLevenshteinDistance.

Prototype

public static int getLevenshteinDistance(CharSequence s, CharSequence t) 

Source Link

Document

<p>Find the Levenshtein distance between two Strings.</p> <p>This is the number of changes needed to change one String into another, where each change is a single character modification (deletion, insertion or substitution).</p> <p>The previous implementation of the Levenshtein distance algorithm was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError which can occur when my Java implementation is used with very large strings.<br> This implementation of the Levenshtein distance algorithm is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> <pre> StringUtils.getLevenshteinDistance(null, *) = IllegalArgumentException StringUtils.getLevenshteinDistance(*, null) = IllegalArgumentException StringUtils.getLevenshteinDistance("","") = 0 StringUtils.getLevenshteinDistance("","a") = 1 StringUtils.getLevenshteinDistance("aaapppp", "") = 7 StringUtils.getLevenshteinDistance("frog", "fog") = 1 StringUtils.getLevenshteinDistance("fly", "ant") = 3 StringUtils.getLevenshteinDistance("elephant", "hippo") = 7 StringUtils.getLevenshteinDistance("hippo", "elephant") = 7 StringUtils.getLevenshteinDistance("hippo", "zzzzzzzz") = 8 StringUtils.getLevenshteinDistance("hello", "hallo") = 1 </pre>

Usage

From source file:com.netcrest.pado.index.provider.lucene.TopNLuceneSearch.java

/**
 * Computes a similarity score for two strings from their Levenshtein distance.
 *
 * <p>The score is {@code 1 - distance / max(length)}: identical strings score
 * {@code 1.0}, and the score decreases towards {@code 0.0} as more edits are
 * needed to turn one string into the other.
 *
 * @param fromValue        the value being matched; must not be null
 * @param targetFieldValue the candidate value to compare against; must not be null
 * @return the similarity score in the range {@code [0.0, 1.0]}
 */
public static double getLSScore(String fromValue, String targetFieldValue) {
    int maxLen = Math.max(fromValue.length(), targetFieldValue.length());
    if (maxLen == 0) {
        // Two empty strings are identical; without this guard 0/0 would yield NaN.
        return 1d;
    }
    double lsDistance = StringUtils.getLevenshteinDistance(fromValue, targetFieldValue);
    return 1d - lsDistance / maxLen;
}

From source file:edu.usc.ir.geo.gazetteer.GeoNameResolver.java

/**
 * Select the best matches for each location name extracted from a document,
 * choosing from among the candidate matches found for that name. Every
 * candidate is assigned a weight built from: 1) whether the extracted name
 * occurs in the candidate's name as a whole word or only as a substring,
 * 2) the edit distance between the extracted name and those alternate names
 * of the candidate that contain it, and 3) the candidate's position in the
 * incoming list (earlier entries get more weight). The weight is stored on
 * the candidate via {@code setWeight}, and the {@code count} heaviest
 * candidates are kept.
 *
 * @param resolvedEntities
 *            final result for the input stream; receives one entry for each
 *            extracted name that has at least one candidate
 * @param allCandidates
 *            each location name may hit several documents, this is the
 *            collection of all hit documents
 * @param count
 *            maximum number of results to keep for one location
 */

private void pickBestCandidates(HashMap<String, List<Location>> resolvedEntities,
        HashMap<String, List<Location>> allCandidates, int count) {

    for (String extractedName : allCandidates.keySet()) {

        List<Location> cur = allCandidates.get(extractedName);
        if (cur.isEmpty()) {
            continue; // no results found for this name
        }

        // Priority queue ordered by descending weight, so poll() returns the top elements first
        PriorityQueue<Location> pq = new PriorityQueue<>(cur.size(), new Comparator<Location>() {
            @Override
            public int compare(Location o1, Location o2) {
                return Integer.compare(o2.getWeight(), o1.getWeight());
            }
        });

        // Loop-invariant: the extracted name padded with spaces, used for whole-word matching
        String paddedExtractedName = String.format(" %s ", extractedName);

        for (int i = 0; i < cur.size(); ++i) {
            Location candidate = cur.get(i);
            int weight = 0;
            // pad the candidate's name the same way so words at either end can match too
            String resolvedName = String.format(" %s ", candidate.getName());
            if (resolvedName.contains(paddedExtractedName)) {
                // Assign a weight as per configuration if extracted name is found as an exact word in name
                weight = WEIGHT_NAME_MATCH;
            } else if (resolvedName.contains(extractedName)) {
                // Assign a weight as per configuration if extracted name is found partly in name
                weight = WEIGHT_NAME_PART_MATCH;
            }
            // accumulate the edit distance over the alternate names that contain the extracted name
            String[] altNames = candidate.getAlternateNames().split(",");
            float altEditDist = 0;
            for (String altName : altNames) {
                if (altName.contains(extractedName)) {
                    altEditDist += StringUtils.getLevenshteinDistance(extractedName, altName);
                }
            }
            // the lesser the edit distance, the more the weight should be
            weight += getCalibratedWeight(altNames.length, altEditDist);

            // Give preference to sorted results. 0th result should have more priority
            weight += (cur.size() - i) * WEIGHT_SORT_ORDER;

            candidate.setWeight(weight);
            pq.add(candidate);
        }

        List<Location> resultList = new ArrayList<>();

        for (int i = 0; i < count && !pq.isEmpty(); i++) {
            resultList.add(pq.poll());
        }

        resolvedEntities.put(extractedName, resultList);
    }
}

From source file:com.moviejukebox.plugin.ComingSoonPlugin.java

/**
 * Returns difference between two titles.
 *
 * Since ComingSoon returns strange results on some researches, difference is defined as follows: abs(word count difference) -
 * (searchedTitle word count - matched words);
 *
 * @param searchedTitle/*from  w  ww.  j av a  2 s . c o  m*/
 * @param returnedTitle
 * @return
 */
private static int compareTitles(String searchedTitle, String returnedTitle) {
    if (StringTools.isNotValidString(returnedTitle)) {
        return COMINGSOON_MAX_DIFF;
    }

    LOG.trace("Comparing {} and {}", searchedTitle, returnedTitle);

    String title1 = searchedTitle.toLowerCase().replaceAll("[,.\\!\\?\"']", "");
    String title2 = returnedTitle.toLowerCase().replaceAll("[,.\\!\\?\"']", "");
    return StringUtils.getLevenshteinDistance(title1, title2);
}

From source file:com.omertron.themoviedbapi.TheMovieDbApi.java

/**
 * Compare the Levenshtein Distance between the two strings
 *
 * @param title1/*from   w ww.  j  a v a2  s .  c  o  m*/
 * @param title2
 * @param distance
 */
private static boolean compareDistance(String title1, String title2, int distance) {
    return StringUtils.getLevenshteinDistance(title1, title2) <= distance;
}

From source file:GIST.IzbirkomExtractor.TableExtractor.java

/**
     * Tests the row if it looks like the 1st row of a parsable table
     * @param row/*from w ww.j a  v  a  2 s.c  o  m*/
     * @return
     */
    private boolean isParsableTable(Element row) {

        Elements cells = row.getElementsByTag("td");

        /* number of columns should be 4 */
        if (cells.size() != 4)
            return false;

        /* look for number signs in 1st cell*/
        if (StringUtils.getLevenshteinDistance(cleanupUNICODE(cells.first().text()),
                " . -") < 3)
            return true;

        /* discard the table if any of the cells is empty */
        for (Element cell : cells) {
            if (cleanupUNICODE(cell.text()).isEmpty())
                return false;
        }

        /* 1st column should be a number */
        try {
            Integer.parseInt(cleanupUNICODE(cells.first().text()).trim());
            return true;
        } catch (NumberFormatException e) {
            return false;
        }
    }

From source file:edu.umass.cs.gnsclient.console.ConsoleBasedTest.java

/**
 * Run inCommands commands through the CLI and compare the output with
 * expectedOutput. Can also check if a default GNS and/or GUID have been set.
 *
 * @param inCommands list of console commands separated by '\n'
 * @param expectedOutput list of expected console output (can use Java regular
 * expressions, each line separated by '\n')
 * @param requireDefaultGns check if a default GNS has been defined (will
 * error out if not)/*from w w  w  . ja  va  2  s  .  c  o  m*/
 * @param requireDefaultAccountGuid check if a default GUID has been defined
 * (will error out if not)
 */
protected void runCommandsInConsole(String inCommands, String expectedOutput, boolean requireDefaultGns,
        boolean requireDefaultAccountGuid) {
    boolean success = false;
    StringWriter output = new StringWriter();
    try {
        ConsoleReader consoleReader = new ConsoleReader(new ByteArrayInputStream(inCommands.getBytes("UTF-8")),
                output);
        ConsoleModule module = new ConsoleModule(consoleReader);
        module.printString("GNS Client Version: " + GNSClientConfig.readBuildVersion() + "\n");

        // Run the commands
        module.handlePrompt();

        // Check the output
        StringTokenizer expected = new StringTokenizer(expectedOutput, "\n");
        StringTokenizer actual = new StringTokenizer(output.toString(), "\n");

        if (!actual.hasMoreTokens()) {
            fail("No console output");
        }

        if (!actual.nextToken().startsWith("GNS Client Version")) {
            fail("Unexpected console output, should start with 'GNS Client Version'");
        }

        // Check that default GNS defaults is set
        if (requireDefaultGns) {
            if (!actual.hasMoreTokens()) {
                fail("Default GNS not set");
            }
            String defaultGns = actual.nextToken();
            if ("Default GNS: null".equals(defaultGns)) {
                fail("Default GNS not set");
            }
            if (!defaultGns.startsWith("Default GNS: ")) {
                fail("Unexpected console output, should start with 'Default GNS: '");
            }

            // Check GNS Connectivity .
            if (!actual.hasMoreTokens()) {
                fail("No console output during GNS connectivity check");
            }

            if (!actual.nextToken().startsWith("Checking GNS connectivity")) {
                fail("Unexpected console output during GNS connectivity check");
            }
            if (!actual.hasMoreTokens()) {
                fail("No console output during GNS connectivity check");
            }
            if (!actual.nextToken().startsWith("Connected to GNS")) {
                fail("Default GNS is not reachable");
            }
        } else { // Consume lines until we connected or not to a default GNS
            while (actual.hasMoreTokens()) {
                String line = actual.nextToken();
                if (line.startsWith("Connected to GNS") || line.startsWith("Failed to connect to GNS")
                        || line.startsWith("Couldn't connect to default GNS")) {
                    break;
                }
            }
        }

        if (requireDefaultAccountGuid) {
            // Check default GUID
            if (!actual.hasMoreTokens()) {
                fail("Default GUID not set");
            }
            String defaultGuid = actual.nextToken();
            if (defaultGuid.matches("Default GUID: null")) {
                fail("Default GUID not set");
            }
            if (!actual.hasMoreTokens()) {
                fail("Default GUID not set");
            }
            defaultGuid = actual.nextToken();
            if (!defaultGuid.matches("Looking up alias .*")) {
                fail("Unexpected console output, should start with 'Looking up alias'");
            }
            if (!actual.hasMoreTokens()) {
                fail("Default GUID not set");
            }
            if (!actual.nextToken().startsWith("Current GUID set to ")) {
                fail("Default GUID not set or not valid");
            }
            if (!actual.hasMoreTokens()) {
                fail("Default GUID not set or not valid");
            }

            // Next should be the console prompt
            if (!actual.nextToken().startsWith(GNS_CLI_PROMPT)) {
                fail("Default GUID not properly set, expected console prompt");
            }
        } else {
            // Consume all input until first prompt
            while (actual.hasMoreTokens() && !actual.nextToken().startsWith(GNS_CLI_PROMPT))
                ;
        }

        // Diff outputs
        while (expected.hasMoreTokens()) {
            String nextExpected = expected.nextToken();
            String nextActual = null;
            if (actual.hasMoreTokens()) { // Skip command line prompts to get real output
                nextActual = actual.nextToken();
                while (nextActual.startsWith(GNS_CLI_PROMPT) && actual.hasMoreTokens()) {
                    nextActual = actual.nextToken();
                }
                // Treat expected as a regular expression
                if (!nextActual.matches("(?s)" + nextExpected)) {
                    if (StringUtils.getLevenshteinDistance(nextActual, nextExpected) < 5) {
                        for (int i = 0; i < nextActual.length(); i++) {
                            final char actualChar = nextActual.charAt(i);
                            if (i < expectedOutput.length()) {
                                final char expectedChar = nextExpected.charAt(i);
                                if (actualChar != expectedChar) {
                                    System.out.println("Character " + i + " differs: " + ((int) actualChar)
                                            + " vs expected " + ((int) expectedChar) + " - " + actualChar + "<>"
                                            + expectedChar + "\n");
                                }
                            } else {
                                System.out.println("Character " + i + " is extra: " + ((int) actualChar) + " - "
                                        + actualChar);
                            }
                        }
                    }
                    fail("Got     : '" + nextActual + "'\n" + "Expected: '" + nextExpected + "'\n");
                }
            } else {
                fail("Missing expected output: " + nextExpected);
            }
        }

        // Check extra output
        while (actual.hasMoreTokens()) {
            String nextActual = actual.nextToken();
            while (nextActual.startsWith(GNS_CLI_PROMPT) && actual.hasMoreTokens()) {
                nextActual = actual.nextToken();
            }
            if (!nextActual.startsWith(GNS_CLI_PROMPT)) {
                fail("Extra output: " + actual.nextToken());
            }
        }

        success = true;
    } catch (IOException e) {
        fail("Error during console execution " + e);
    } finally {
        if (!success) {
            System.out.println("**** COMPLETE OUTPUT for " + testName.getMethodName() + " ****");
            System.out.println(output.toString());
        }
    }
}

From source file:com.zilbo.flamingSailor.TE.model.TextPage.java

/**
 * Attempts to strip boilerplate (header/footer) text from the page.
 * Intended to run right after the pieces have been converted to lines,
 * and before higher order structures (tables/headings) have been determined.
 *
 * @param headerTemplate         potential template text (in components); not referenced by this method
 * @param maxLevenshteinDistance largest edit distance still counted as a match (leaves room for page numbers)
 * @param headerTemplateString   the template text (as a string)
 * @param boundingBox            the bounding box of the template text
 * @param isHeader               true if the template sits at the top of the page, false for the bottom
 * @param doUpdate               whether to actually move the matched lines into the header/footer
 * @return true if the page text matched the boilerplate
 */
public boolean removeBoilerPlateComponent(Component headerTemplate, int maxLevenshteinDistance,
        String headerTemplateString, Rectangle2D boundingBox, boolean isHeader, boolean doUpdate) {
    final List<Component> candidates = this.findByGeomByLines(boundingBox);
    if (candidates.isEmpty()) {
        if (lines.size() < 2) {
            return false;
        }
        // Nothing inside the bounding box: fall back to the outermost line on the relevant side
        candidates.add(isHeader ? this.getTopLine() : this.getBottomLine());
    }

    final String candidateText = componentListToString(candidates);
    final int distance = StringUtils.getLevenshteinDistance(headerTemplateString, candidateText);
    if (distance > maxLevenshteinDistance) {
        return false;
    }
    if (!doUpdate) {
        // Matched, but the caller only wanted to know whether it would match
        return true;
    }

    for (Component line : candidates) {
        assert (line instanceof TextLine);
        if (isHeader) {
            header.addChild(line);
        } else {
            footer.addChildAtTop(line);
        }
        for (Component piece : line.getChildren()) {
            assert (piece instanceof TextPiece);
            adjustFontTally((TextPiece) piece);
        }

        if (!(line instanceof TextLine)) {
            logger.error("BoilerPlate! need to remove other components than lines:");
            continue;
        }

        final int position = lines.indexOf(line);
        if (position >= 0) {
            lines.remove(position);
        } else {
            logger.error("Component not found in lines?");
        }
    }
    return true;
}

From source file:codeu.chat.client.commandline.Chat.java

/**
 * Breaks a user message into sentences and records the non-greeting ones as user phrases.
 *
 * Sentences that are not greetings are added to the phrase set and list (unless already known, with or
 * without trailing punctuation) and, when a previous response exists, are mapped to that response.
 *
 * @param body the raw user message
 * @return the trimmed sentences of {@code body} that are longer than one character
 */
private Set<String> splitMessage(String body) {
    final Set<String> sentences = new HashSet<String>();
    // Split after each '!', '?' or '.'; the lookbehind keeps the punctuation with its sentence
    for (String piece : body.split("(?<=[!\\?\\.])")) {
        final String candidate = piece.trim();
        if (candidate.length() > 1) { // Phrases of length 1 are likely meaningless
            sentences.add(candidate);
        }
    }

    final String greeting = "How are you?";
    for (String phrase : sentences) {
        // Skip anything that matches a general greeting
        final boolean isGreeting = StringUtils.getLevenshteinDistance(phrase, greeting) <= greeting.length() / 3.0
                || phrase.contains("hello") || phrase.contains("Hello");
        if (isGreeting) {
            continue;
        }

        final boolean knownWithPunctuation = userPhraseSet.contains(phrase + ".")
                || userPhraseSet.contains(phrase + "!") || userPhraseSet.contains(phrase + "?");
        // The set rejects verbatim duplicates itself; only new phrases go into the random-access list
        if (!knownWithPunctuation && userPhraseSet.add(phrase)) {
            userPhraseList.add(phrase);
        }

        // Also map this phrase to the previous response, which hasn't been updated yet
        if (response.length() >= 1) {
            userPhraseMap.get(response).add(phrase);
        }
    }
    return sentences;
}

From source file:codeu.chat.client.commandline.Chat.java

/**
 * Looks for a stored response matching any of the given user phrases, first among the prewritten
 * responses and then among the script-derived ones. On a match the {@code response} field is updated.
 *
 * @param mostRecent the phrases of the most recent user message
 * @return true if a response was selected, false if nothing matched
 */
private boolean mostRecentContainsSubstring(Set<String> mostRecent) {
    double bestPercent = 1;
    String bestResponse = "";

    // Pass 1: the prewritten responses
    for (String key : responseMap.keySet()) {
        for (String phrase : mostRecent) {
            // Edit distance is only computed for phrases that do not reference the bot itself
            int editDistance = -1;
            boolean closeEnough = false;
            if (!phraseReferencesSelf(phrase)) {
                editDistance = StringUtils.getLevenshteinDistance(key, phrase);
                closeEnough = editDistance <= phrase.length() / 3.0;
            }

            if (closeEnough) {
                // Keep the response only if it is a better match than the best one so far
                final double percent = (double) editDistance / key.length();
                if (percent < bestPercent) {
                    bestPercent = percent;
                    bestResponse = responseMap.get(key);
                }
            } else if (phrase.toLowerCase().contains(key) && phrase.length() <= key.length() * 1.3) {
                bestResponse = responseMap.get(key);
            }
        }
    }

    // Pass 2: the responses gained from online movie scripts
    for (String key : scriptMap.keySet()) {
        for (String phrase : mostRecent) {
            int editDistance = -1;
            boolean closeEnough = false;
            if (!phraseReferencesSelf(phrase)) {
                editDistance = StringUtils.getLevenshteinDistance(key, phrase);
                closeEnough = editDistance <= phrase.length() / 3.0;
            }

            if (closeEnough) {
                final double percent = (double) editDistance / key.length();
                if (percent < bestPercent) {
                    bestPercent = percent;
                    // Several responses may be stored per key; pick one at random
                    final int pick = generator.nextInt(scriptMap.get(key).size());
                    bestResponse = scriptMap.get(key).get(pick);
                }
            } else if (phrase.toLowerCase().contains(key)) {
                final int pick = generator.nextInt(scriptMap.get(key).size());
                bestResponse = scriptMap.get(key).get(pick);
            }
        }
    }

    // An empty best response means no match was made
    if (bestResponse.isEmpty()) {
        return false;
    }

    response = bestResponse;
    return true;
}

From source file:com.moviejukebox.plugin.TheMovieDbPlugin.java

/**
 * Attempt to find the TMDB ID for a person using their full name
 *
 * @param name/*from  w  w w .  ja v  a2 s  .  c  o m*/
 * @return
 */
public String getPersonId(String name) {
    String tmdbId = "";
    PersonFind closestPerson = null;
    int closestMatch = Integer.MAX_VALUE;
    boolean foundPerson = Boolean.FALSE;
    boolean includeAdult = PropertiesUtil.getBooleanProperty("themoviedb.includeAdult", Boolean.FALSE);

    try {
        ResultList<PersonFind> results = tmdb.searchPeople(name, 0, includeAdult, SearchType.PHRASE);
        LOG.info("Found {} person results for {}", results.getResults().size(), name);
        for (PersonFind person : results.getResults()) {
            if (name.equalsIgnoreCase(person.getName())) {
                tmdbId = String.valueOf(person.getId());
                foundPerson = Boolean.TRUE;
                break;
            }

            LOG.trace("Checking {} against {}", name, person.getName());
            int lhDistance = StringUtils.getLevenshteinDistance(name, person.getName());
            LOG.trace("{}: Current closest match is {}, this match is {}", name, closestMatch, lhDistance);
            if (lhDistance < closestMatch) {
                LOG.trace("{}: TMDB ID {} is a better match", name, person.getId());
                closestMatch = lhDistance;
                closestPerson = person;
            }
        }

        if (foundPerson) {
            LOG.debug("{}: Matched against TMDB ID: {}", name, tmdbId);
        } else if (closestMatch < Integer.MAX_VALUE && closestPerson != null) {
            tmdbId = String.valueOf(closestPerson.getId());
            LOG.debug("{}: Closest match is '{}' differing by {} characters", name, closestPerson.getName(),
                    closestMatch);
        } else {
            LOG.debug("{}: No match found", name);
        }
    } catch (MovieDbException ex) {
        LOG.warn("Failed to get information on '{}', error: {}", name, ex.getMessage(), ex);
    }
    return tmdbId;
}