Example usage for org.apache.commons.lang3 StringUtils getLevenshteinDistance

List of usage examples for org.apache.commons.lang3 StringUtils getLevenshteinDistance

Introduction

On this page you can find example usages of org.apache.commons.lang3.StringUtils.getLevenshteinDistance.

Prototype

public static int getLevenshteinDistance(CharSequence s, CharSequence t) 

Source Link

Document

<p>Find the Levenshtein distance between two Strings.</p> <p>This is the number of changes needed to change one String into another, where each change is a single character modification (deletion, insertion or substitution).</p> <p>The previous implementation of the Levenshtein distance algorithm was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError which can occur when my Java implementation is used with very large strings.<br> This implementation of the Levenshtein distance algorithm is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> <pre> StringUtils.getLevenshteinDistance(null, *) = IllegalArgumentException StringUtils.getLevenshteinDistance(*, null) = IllegalArgumentException StringUtils.getLevenshteinDistance("","") = 0 StringUtils.getLevenshteinDistance("","a") = 1 StringUtils.getLevenshteinDistance("aaapppp", "") = 7 StringUtils.getLevenshteinDistance("frog", "fog") = 1 StringUtils.getLevenshteinDistance("fly", "ant") = 3 StringUtils.getLevenshteinDistance("elephant", "hippo") = 7 StringUtils.getLevenshteinDistance("hippo", "elephant") = 7 StringUtils.getLevenshteinDistance("hippo", "zzzzzzzz") = 8 StringUtils.getLevenshteinDistance("hello", "hallo") = 1 </pre>

Usage

From source file:codeu.chat.client.commandline.Chat.java

/**
 * Fetches the script at {@code link} and scans it for a sentence that either
 * contains {@code phrase} or lies within a Levenshtein distance of one third
 * of the phrase length from it.
 *
 * @param link        address of the page to download
 * @param phrase      phrase to look for
 * @param springfield when {@code true}, the page's lines are separated on
 *                    {@code <br>} tags instead of new line characters
 * @return {@code true} as soon as {@code findNextScriptResponse} accepts a
 *         matching line; {@code false} if no line is accepted or the page
 *         could not be retrieved
 */
private boolean parseScript(String link, String phrase, boolean springfield) {
    try {
        final Document page = Jsoup.connect(link).get();

        // Pages flagged as "springfield" mark line breaks with <br> tags, so
        // those tags are swapped for a marker token that survives text extraction.
        final String[] rawLines;
        if (springfield) {
            final String flattened = Jsoup.parse(page.html().replaceAll("(?i)<br[^>]*>", "br2n")).text();
            rawLines = flattened.split("br2n");
        } else {
            rawLines = page.body().text().split("\n");
        }
        final String[] script = mergeScriptSentences(rawLines);

        // Walk the script line by line; a rejected match does not stop the
        // search, a later sentence or line may still be accepted.
        for (int lineNum = 0; lineNum < script.length; lineNum++) {
            final String normalised = script[lineNum].trim().toLowerCase();
            script[lineNum] = normalised;

            for (String sentence : normalised.split("(?<=[!\\?\\.])")) {
                final boolean resemblesPhrase = sentence.contains(phrase)
                        || StringUtils.getLevenshteinDistance(sentence, phrase) <= phrase.length() / 3.0;
                if (resemblesPhrase && findNextScriptResponse(lineNum, phrase, script)) {
                    return true;
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    // Nothing matched, or the download failed
    return false;
}

From source file:codeu.chat.client.commandline.Chat.java

/**
 * Picks a response for the script line at {@code lineNum}, which matched
 * {@code phrase}.
 *
 * @param lineNum index of the matching line within {@code script}
 * @param phrase  phrase that was matched
 * @param script  lines of the script being searched
 * @return {@code true} if a response was found; {@code false} if there is no
 *         following line or that line is too close to the phrase itself
 */
private boolean findNextScriptResponse(int lineNum, String phrase, String[] script) {
    // Either helper succeeding means a response has been found: first a line
    // from a different character, then a further sentence on the same line.
    if (canFindNewCharacter(lineNum, script) || lineContainsMoreSentences(script[lineNum], phrase)) {
        return true;
    }

    // Otherwise fall back to the next non-blank line of dialogue, if any
    final int nextLine = advanceToNextNonBlankLine(lineNum, script);
    if (nextLine >= script.length) {
        return false;
    }

    // Reject a follow-up line that is closer to the phrase than a third of its length
    final String candidate = script[nextLine].trim();
    if (StringUtils.getLevenshteinDistance(candidate, phrase) < phrase.length() / 3.0) {
        return false;
    }

    // Remember the response under MATCHED_PHRASE for later use
    scriptMap.get(MATCHED_PHRASE).add(adjustScriptLine(candidate));
    return true;
}

From source file:net.stargraph.rank.impl.LevenshteinRanker.java

/**
 * Computes the distance between two character sequences as their
 * Levenshtein edit distance.
 *
 * @param s1 first sequence
 * @param s2 second sequence
 * @return the edit distance, widened to {@code double}
 */
@Override
double computeStringDistance(CharSequence s1, CharSequence s2) {
    final int editDistance = StringUtils.getLevenshteinDistance(s1, s2);
    return editDistance;
}

From source file:nl.mvdr.umvc3replayanalyser.ocr.TesseractOCREngine.java

/**
 * Matches the given string to a character's name.
 *
 * <p>The candidate whose upper-cased name has the smallest Levenshtein distance to the text is chosen. When
 * several candidates share that smallest distance, the decision is delegated to
 * {@code handleMultipleMatches}.</p>
 *
 * @param text
 *            text to be matched, should be a Marvel character name
 * @param possibleCharacters
 *            the characters that text may match, may not be empty
 * @return the character to whose name the given text is closest
 * @throws OCRException
 *             in case the matching character cannot be uniquely determined
 * @throws IllegalArgumentException
 *             if possibleCharacters is empty
 */
private Umvc3Character matchToCharacterName(String text, Set<Umvc3Character> possibleCharacters)
        throws OCRException {
    // Enforce the documented precondition up front instead of failing later with an unexplained
    // NoSuchElementException when picking the single match.
    if (possibleCharacters.isEmpty()) {
        throw new IllegalArgumentException("possibleCharacters may not be empty");
    }

    if (log.isDebugEnabled()) {
        if (possibleCharacters.size() == Umvc3Character.values().length) {
            log.debug(String.format("Attempting to match %s to the UMvC3 characters", text));
        } else {
            log.debug(String.format("Attempting to match %s to the following characters: %s", text,
                    possibleCharacters));
        }
    }

    // Compute the minimal Levenshtein distance between the given text and the uppercase character names,
    // collecting every character that attains it.
    int minimalDistance = Integer.MAX_VALUE;
    Set<Umvc3Character> matchingCharacters = EnumSet.noneOf(Umvc3Character.class);

    for (Umvc3Character character : possibleCharacters) {
        int distance = StringUtils.getLevenshteinDistance(character.getName().toUpperCase(), text);
        if (distance < minimalDistance) {
            // Strictly better: discard the previous candidates.
            minimalDistance = distance;
            matchingCharacters.clear();
            matchingCharacters.add(character);
        } else if (distance == minimalDistance) {
            // Tie with the current best.
            matchingCharacters.add(character);
        }
    }

    // matchingCharacters is not empty: possibleCharacters has at least one element and every distance is
    // less than Integer.MAX_VALUE.
    Umvc3Character result;
    if (1 < matchingCharacters.size()) {
        // More than one match found.
        result = handleMultipleMatches(text, minimalDistance, matchingCharacters);
    } else {
        // Exactly one match, return it.
        result = matchingCharacters.iterator().next();
    }

    if (log.isDebugEnabled()) {
        log.debug(String.format("Match found: %s. levenshtein(%s, %s) = %s", result,
                result.getName().toUpperCase(), text, "" + minimalDistance));
    }
    return result;
}

From source file:nl.mvdr.umvc3replayanalyser.video.ReplayAnalyserIntegrationTest.java

/**
 * Verifies a recognised player name against the expected one, tolerating a few wrong characters.
 *
 * <p>The assertion passes when the Levenshtein distance between both names is below 5. The comparison
 * is logged at info level either way.</p>
 *
 * @param expected
 *            expected name
 * @param actual
 *            actual name
 */
private void assertPlayerName(String expected, String actual) {
    // Player names are hard to OCR, so an exact match is not demanded.
    final int editDistance = StringUtils.getLevenshteinDistance(expected, actual);
    final String description = String.format("Expected \"%s\", got \"%s\", Levenshtein distance: %s",
            expected, actual, Integer.toString(editDistance));
    log.info(description);

    final boolean closeEnough = editDistance < 5;
    Assert.assertTrue(description, closeEnough);
}

From source file:org.apache.hadoop.hive.ql.udf.generic.GenericUDFLevenshtein.java

/**
 * Evaluates the Levenshtein distance between the first two arguments.
 *
 * @param arguments deferred arguments; positions 0 and 1 are read as strings
 * @return the shared {@code output} holder set to the distance, or {@code null}
 *         when either argument reads as {@code null}
 * @throws HiveException declared by the overridden method
 */
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
    final String left = getStringValue(arguments, 0, converters);
    final String right = getStringValue(arguments, 1, converters);

    // A missing operand yields a null result
    final boolean bothPresent = left != null && right != null;
    if (!bothPresent) {
        return null;
    }

    output.set(StringUtils.getLevenshteinDistance(left, right));
    return output;
}

From source file:org.cellcore.code.engine.page.extractor.AbstractEditionsExtractor.java

/**
 * Looks up the edition whose key best corresponds to the given name.
 *
 * <p>Both the name and each key are compared in a normalised form: NFD-decomposed, lower-cased and
 * stripped of non-ASCII characters. A key matches when one normalised form contains the other or their
 * Levenshtein distance is at most 2. If the name contains digits, the key must contain exactly the
 * same digits.</p>
 *
 * @param name the edition name to look up; names containing "foil" (case-insensitively) are rejected
 * @return the value stored in {@code getEditions()} for the first matching key, or {@code null} if the
 *         name is a foil, its normalised form has 3 characters or fewer, or no key matches
 */
protected String checkList(String name) {
    if (name.toLowerCase().contains("foil")) {
        return null;
    }

    // Strip leading/trailing spaces, then normalise once: none of this depends on the key being tested.
    final String trimmed = name.replaceAll("(^( )+|( )+$)", "");
    final String iname = Normalizer.normalize(trimmed, Normalizer.Form.NFD).toLowerCase()
            .replaceAll("[^\\p{ASCII}]", "");

    // Names this short can never match, whatever the key.
    if (iname.length() <= 3) {
        return null;
    }

    final String inameDigits = iname.replaceAll("\\D+", "");
    final boolean numeral = !inameDigits.isEmpty();

    for (String key : getEditions().keySet()) {
        final String kname = Normalizer.normalize(key, Normalizer.Form.NFD).toLowerCase()
                .replaceAll("[^\\p{ASCII}]", "");

        // A name carrying digits only matches a key carrying exactly the same digits.
        if (numeral && !inameDigits.equals(kname.replaceAll("\\D+", ""))) {
            continue;
        }

        // The cheap containment tests run first; the edit distance is only computed when they fail.
        if (kname.contains(iname) || iname.contains(kname)
                || StringUtils.getLevenshteinDistance(iname, kname) <= 2) {
            final String edition = getEditions().get(key);
            logger.info("Found " + iname + " " + edition);
            return edition;
        }
    }
    return null;
}

From source file:org.codarama.haxsync.services.ContactsSyncAdapterService.java

/**
 * Finds the phone contact that best matches a Facebook contact name.
 *
 * <p>With a maximum distance of 0 only an exact, case-sensitive member of the set counts. Otherwise
 * names are compared case-insensitively by Levenshtein distance, with {@code null} names treated as
 * empty strings. On equal distances the contact met later in the iteration wins.</p>
 *
 * @param phoneContacts names of the contacts on the phone
 * @param fbContact     name of the Facebook contact
 * @param maxdistance   largest edit distance still accepted as a match
 * @return the matching phone contact, or {@code null} if none is within {@code maxdistance}
 */
private static String matches(Set<String> phoneContacts, String fbContact, int maxdistance) {
    if (maxdistance == 0) {
        return phoneContacts.contains(fbContact) ? fbContact : null;
    }

    String closest = null;
    int closestDistance = maxdistance;
    for (String contact : phoneContacts) {
        final String phoneName = contact == null ? "" : contact.toLowerCase();
        final String fbName = fbContact == null ? "" : fbContact.toLowerCase();
        final int distance = StringUtils.getLevenshteinDistance(phoneName, fbName);

        if (distance > closestDistance) {
            continue;
        }
        closest = contact;
        closestDistance = distance;
    }
    return closest;
}

From source file:org.dbgl.util.StringRelatedUtils.java

/**
 * Returns the index of the title closest to the search text.
 *
 * <p>Titles are lower-cased and stripped of their file extension before being compared with the
 * lower-cased search text by Levenshtein distance. An exact match is returned immediately.</p>
 *
 * @param search the text to look for
 * @param titles the candidate titles
 * @return the index of the closest title (the earliest one on ties), or -1 if {@code titles} is
 *         {@code null} or empty
 */
public static int findBestMatchIndex(final String search, final String[] titles) {
    if (titles == null || titles.length == 0) {
        return -1;
    }

    final String needle = search.toLowerCase();
    int bestIndex = 0;
    int bestDistance = Integer.MAX_VALUE;

    for (int i = 0; i < titles.length; i++) {
        final String candidate = FilenameUtils.removeExtension(titles[i].toLowerCase());

        // The first title is measured without a bound; later ones only need to beat the best so far,
        // so the thresholded overload is asked for distances of at most bestDistance - 1.
        final int distance;
        if (i == 0) {
            distance = StringUtils.getLevenshteinDistance(needle, candidate);
        } else {
            distance = StringUtils.getLevenshteinDistance(needle, candidate, bestDistance - 1);
        }

        if (distance == 0) {
            return i;
        }
        if (distance == -1) {
            // -1: this title did not come in under the threshold
            continue;
        }
        bestDistance = distance;
        bestIndex = i;
    }
    return bestIndex;
}

From source file:org.krobot.util.MessageUtils.java

/**
 * Get the most similar message of a list to a base<br><br>
 *
 * <b>Example:</b><br><br>
 *
 * base = hello<br>/*from ww w . java  2  s .  c o  m*/
 * messages = [haul, hella, yay]<br><br>
 *
 * It returns <b>hella</b>
 *
 * @param base The base message
 * @param messages The messages where to get the most similar
 *
 * @return The most similar message to the base
 */
public static String getMostSimilar(String base, String[] messages) {
    ArrayList<Integer> matches = new ArrayList<>();

    for (String message : messages) {
        matches.add(StringUtils.getLevenshteinDistance(base, message));
    }

    int candidateIndex = 0;
    int candidate = Integer.MAX_VALUE;

    for (int i = 0; i < matches.size(); i++) {
        int entry = matches.get(i);

        if (entry < candidate) {
            candidate = entry;
            candidateIndex = i;
        }
    }

    if (candidate > 10) {
        return null;
    }

    return messages[matches.get(candidateIndex)];
}