Example usage for org.apache.commons.lang3 StringUtils getLevenshteinDistance

Introduction

On this page you can find example usages of org.apache.commons.lang3 StringUtils getLevenshteinDistance.

Prototype

public static int getLevenshteinDistance(CharSequence s, CharSequence t)

Source Link

Document

Find the Levenshtein distance between two Strings. This is the number of changes needed to change one String into another, where each change is a single character modification (deletion, insertion or substitution). The previous implementation of the Levenshtein distance algorithm was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a>. Chas Emerick has written an implementation in Java which avoids an OutOfMemoryError that can occur when the previous Java implementation is used with very large strings. This implementation of the Levenshtein distance algorithm is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a>.
<pre>
StringUtils.getLevenshteinDistance(null, *)             = IllegalArgumentException
StringUtils.getLevenshteinDistance(*, null)             = IllegalArgumentException
StringUtils.getLevenshteinDistance("","")               = 0
StringUtils.getLevenshteinDistance("","a")              = 1
StringUtils.getLevenshteinDistance("aaapppp", "")       = 7
StringUtils.getLevenshteinDistance("frog", "fog")       = 1
StringUtils.getLevenshteinDistance("fly", "ant")        = 3
StringUtils.getLevenshteinDistance("elephant", "hippo") = 7
StringUtils.getLevenshteinDistance("hippo", "elephant") = 7
StringUtils.getLevenshteinDistance("hippo", "zzzzzzzz") = 8
StringUtils.getLevenshteinDistance("hello", "hallo")    = 1
</pre>

Usage

From source file:annis.gui.flatquerybuilder.ValueField.java

/**
 * Refreshes the suggestion items of {@code scb} whenever the typed text changes.
 *
 * <p>For every filter mechanism other than {@code "generic"} the keys of {@code values} are
 * added in three passes using the reducing string comparator: keys that compare equal to the
 * typed text first, then keys that start with it, then keys that contain it. An empty text
 * rebuilds the value list via {@code buildValues}.
 *
 * <p>For the {@code "generic"} mechanism, and only when more than one character has been typed,
 * the keys are grouped by their Levenshtein distance to the typed text (accents removed, lower
 * case). Groups with a distance of 0 to 9 are added nearest first, each group sorted
 * case-insensitively.
 *
 * @param event the text change event carrying the text currently typed
 */
@Override
public void textChange(TextChangeEvent event) {
    ReducingStringComparator rsc = sq.getRSC();
    String fm = sq.getFilterMechanism();
    String txt = event.getText();

    if (!"generic".equals(fm)) {
        if (txt.isEmpty()) {
            buildValues(this.vm);
            return;
        }

        ConcurrentSkipListSet<String> notInYet = new ConcurrentSkipListSet<>();
        scb.removeAllItems();

        // equal according to the comparator
        for (String s : values.keySet()) {
            if (rsc.compare(s, txt, fm) == 0) {
                scb.addItem(s);
            } else {
                notInYet.add(s);
            }
        }
        // startsWith
        for (String s : notInYet) {
            if (rsc.startsWith(s, txt, fm)) {
                scb.addItem(s);
                // Removing while iterating is only safe because notInYet is a
                // ConcurrentSkipListSet, whose iterators tolerate concurrent modification.
                notInYet.remove(s);
            }
        }
        // contains
        for (String s : notInYet) {
            if (rsc.contains(s, txt, fm)) {
                scb.addItem(s);
            }
        }
        return;
    }

    if (txt.length() <= 1) {
        // a single character is too short for a meaningful distance ranking
        return;
    }

    scb.removeAllItems();

    // The normalised input does not change per candidate, so compute it once.
    String normalizedTxt = removeAccents(txt).toLowerCase();

    // Levenshtein distance -> all values at that distance (kept sorted by the TreeSet)
    HashMap<Integer, Set<String>> levdistvals = new HashMap<>();
    for (String s : values.keySet()) {
        Integer d = StringUtils.getLevenshteinDistance(normalizedTxt, removeAccents(s).toLowerCase());
        Set<String> sameDistance = levdistvals.get(d);
        if (sameDistance == null) {
            sameDistance = new TreeSet<>();
            levdistvals.put(d, sameDistance);
        }
        sameDistance.add(s);
    }

    SortedSet<Integer> keys = new TreeSet<>(levdistvals.keySet());
    // only distances 0..9 are offered, nearest first
    for (Integer k : keys.subSet(0, 10)) {
        List<String> valueList = new ArrayList<>(levdistvals.get(k));
        Collections.sort(valueList, String.CASE_INSENSITIVE_ORDER);
        for (String v : valueList) {
            scb.addItem(v);
        }
    }
}

From source file:model.SongMeaningsScraper.java

/**
 * Tries to resolve an artist name against the known artists from {@code DataManager}.
 *
 * <p>Each known artist is compared case-insensitively (upper-cased) with the given name using
 * the Levenshtein distance. The similarity is {@code (length - distance) / length}, where
 * {@code length} is the length of the known artist name. A similarity of exactly 1.0 is
 * reported on the status bar and returned immediately. Known artists with a similarity of at
 * least 0.5 are recorded as possible matches under the given name in the artist-matches map.
 *
 * @param artist the artist name to validate
 * @return the exactly matching known artist name, or an empty string if there is none
 */
public static String validateArtist(String artist) {
    HashMap<String, String> knownArtists = DataManager.getArtistMap();
    for (String candidate : knownArtists.keySet()) {
        int distance = StringUtils.getLevenshteinDistance(candidate.toUpperCase(), artist.toUpperCase());
        double similarity = (double) (candidate.length() - distance) / (double) candidate.length();

        if (similarity == 1.0) {
            Logger.LogToStatusBar(candidate + " exactly matches");
            return candidate;
        }

        // Written as a negated ">=" so that a NaN similarity (empty candidate) is skipped too.
        if (!(similarity >= 0.5)) {
            continue;
        }

        ArrayList<String> partialMatches = DataManager.getArtistMatches().get(artist);
        if (partialMatches != null) {
            partialMatches.add(candidate);
            DataManager.getArtistMatches().remove(artist);
            DataManager.getArtistMatches().put(artist, partialMatches);
        } else {
            partialMatches = new ArrayList<String>();
            partialMatches.add(candidate);
            DataManager.getArtistMatches().put(artist, partialMatches);
        }
    }
    return "";
}

From source file:annis.gui.flatquerybuilder.SearchBox.java

/**
 * Refreshes the suggestion items of {@code cb} whenever the typed text changes.
 *
 * <p>With the {@code "specific"} filter mechanism the annotation names are added in three
 * passes using a reducing string comparator: names that compare equal to the typed text first,
 * then names that start with it, then names that contain it. An empty text rebuilds the box
 * values via {@code SpanBox.buildBoxValues}.
 *
 * <p>With the {@code "levenshtein"} filter mechanism, and only when more than one character has
 * been typed, the names are grouped by their Levenshtein distance to the typed text (accents
 * removed). Groups with a distance of 0 to 4 are added nearest first, each group sorted
 * case-insensitively.
 *
 * @param event the text change event carrying the text currently typed
 */
@Override
public void textChange(TextChangeEvent event) {
    if ("specific".equals(sq.getFilterMechanism())) {
        ConcurrentSkipListSet<String> notInYet = new ConcurrentSkipListSet<String>();
        reducingStringComparator esc = new reducingStringComparator();
        String txt = event.getText();
        if (!txt.isEmpty()) {
            cb.removeAllItems();
            // equal according to the comparator
            for (String s : annonames) {
                if (esc.compare(s, txt) == 0) {
                    cb.addItem(s);
                } else {
                    notInYet.add(s);
                }
            }
            // startsWith
            for (String s : notInYet) {
                if (esc.startsWith(s, txt)) {
                    cb.addItem(s);
                    // Removing while iterating is only safe because notInYet is a
                    // ConcurrentSkipListSet, whose iterators tolerate concurrent modification.
                    notInYet.remove(s);
                }
            }
            // contains
            for (String s : notInYet) {
                if (esc.contains(s, txt)) {
                    cb.addItem(s);
                }
            }
        } else {
            //have a look and speed it up
            SpanBox.buildBoxValues(cb, ebene, sq);
        }
    }

    if ("levenshtein".equals(sq.getFilterMechanism())) {
        String txt = event.getText();
        if (txt.length() > 1) {
            cb.removeAllItems();

            // The normalised input does not change per candidate, so compute it once.
            String normalizedTxt = removeAccents(txt);

            // Levenshtein distance -> all names at that distance (kept sorted by the TreeSet)
            HashMap<Integer, Set<String>> levdistvals = new HashMap<Integer, Set<String>>();
            for (String s : annonames) {
                Integer d = StringUtils.getLevenshteinDistance(normalizedTxt, removeAccents(s));
                Set<String> sameDistance = levdistvals.get(d);
                if (sameDistance == null) {
                    sameDistance = new TreeSet<String>();
                    levdistvals.put(d, sameDistance);
                }
                sameDistance.add(s);
            }

            SortedSet<Integer> keys = new TreeSet<Integer>(levdistvals.keySet());
            // only distances 0..4 are offered, nearest first
            for (Integer k : keys.subSet(0, 5)) {
                List<String> values = new ArrayList<String>(levdistvals.get(k));
                Collections.sort(values, String.CASE_INSENSITIVE_ORDER);
                for (String v : values) {
                    cb.addItem(v);
                }
            }
        }
    }
}

From source file:edu.jhuapl.dorset.agents.StockAgent.java

/**
 * Finds the company information for a company name.
 *
 * <p>An exact key match in {@code stockSymbolMap} wins. Otherwise every key that contains the
 * name as a whole word (case-insensitive) is collected; a single hit is returned directly, and
 * among several hits the one with the smallest Levenshtein distance to the name is chosen.
 *
 * @param stockCompanyName the company name to look up
 * @return the matching company information, or {@code null} if nothing matches
 */
protected CompanyInfo findStockSymbol(String stockCompanyName) {
    CompanyInfo exactMatch = this.stockSymbolMap.get(stockCompanyName);
    if (exactMatch != null) {
        return exactMatch;
    }
    if (stockCompanyName == null) {
        // Without a name there is nothing to search for; previously the literal word
        // "null" would have been matched against the company names.
        return null;
    }

    // Quote the name so that regex metacharacters in it (e.g. '.', '(', '+') are matched
    // literally instead of being interpreted or causing a PatternSyntaxException.
    String regex = "\\b" + Pattern.quote(stockCompanyName) + "\\b";
    Pattern pat = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);

    ArrayList<String> regexMatches = new ArrayList<String>();
    for (Map.Entry<String, CompanyInfo> entry : stockSymbolMap.entrySet()) {
        Matcher matcher = pat.matcher(entry.getKey());
        if (matcher.find()) {
            regexMatches.add(entry.getKey());
        }
    }

    if (regexMatches.isEmpty()) {
        return null;
    }
    if (regexMatches.size() == 1) {
        return this.stockSymbolMap.get(regexMatches.get(0));
    }

    // Several candidates: rank them by edit distance to the requested name.
    HashMap<String, Integer> matchDistanceMap = new HashMap<String, Integer>();
    for (String candidate : regexMatches) {
        matchDistanceMap.put(candidate, StringUtils.getLevenshteinDistance(candidate, stockCompanyName));
    }

    // Strict '>' keeps the first entry encountered when distances tie.
    Entry<String, Integer> minDistancePair = null;
    for (Entry<String, Integer> entry : matchDistanceMap.entrySet()) {
        if (minDistancePair == null || minDistancePair.getValue() > entry.getValue()) {
            minDistancePair = entry;
        }
    }

    return this.stockSymbolMap.get(minDistancePair.getKey());
}

From source file:com.epam.ta.reportportal.util.analyzer.IssuesAnalyzerService.java

/**
 * Compares the error logs of each item in {@code resources} with the error logs of the items in
 * {@code scope} and, when a sufficiently similar scope item is found, transfers that item's
 * issue to the resource item.
 *
 * <p>Two items are only compared when they have the same number of error logs. Logs are
 * compared pairwise by index after stripping digits and whitespace-delimited "at" tokens; the
 * similarity of a pair is the percentage {@code (maxLength - levenshteinDistance) / maxLength * 100}.
 * The first scope item whose combined rate (as computed by {@code mathMiddle}) reaches
 * {@code acceptRate} is taken and the search stops.
 *
 * @param launchId  id of the launch, passed to {@code analyzeFinished} once all items are processed
 * @param resources items to analyze
 * @param scope     items whose issues may be reused
 */
@Override
public void analyze(String launchId, List<TestItem> resources, List<TestItem> scope) {
    for (TestItem current : resources) {
        // similarity percentages of the log pairs for the scope item currently being compared
        List<Double> curRate = Lists.newArrayList();
        // issue of the matching scope item, set once a match is found
        TestItemIssue issue = null;
        boolean isInvestigated = false;
        List<Log> curItemErr = logRepository.findTestItemErrorLogs(current.getId());

        Launch launch = launchRepository.findOne(current.getLaunchRef());
        Project project = projectRepository.findOne(launch.getProjectRef());

        for (TestItem item : scope) {
            /*
             * Avoid comparing the item with itself as an investigated item
             * during an in-progress launch, because a manually investigated
             * item will be included in the history of the current one.
             */
            if (item.getId().equalsIgnoreCase(current.getId()))
                continue;

            List<Log> errors = logRepository.findTestItemErrorLogs(item.getId());
            // only items with the same number of error logs are compared, log by log
            if (errors.size() == curItemErr.size()) {
                for (int i = 0; i < curItemErr.size(); i++) {
                    // strip digits and whitespace-delimited "at" tokens before comparing
                    String curMsg = curItemErr.get(i).getLogMsg().replaceAll("\\d+", "")
                            .replaceAll("\\s(at)\\s", "");
                    String scopeMsg = errors.get(i).getLogMsg().replaceAll("\\d+", "").replaceAll("\\s(at)\\s",
                            "");
                    /*
                     * Get the Levenshtein distance between the two log
                     * strings being compared
                     */
                    int maxString = Math.max(curMsg.length(), scopeMsg.length());
                    int diff = StringUtils.getLevenshteinDistance(curMsg, scopeMsg);
                    /*
                     * Store the percentage of equality.
                     * NOTE(review): when both stripped messages are empty, maxString is 0
                     * and this evaluates to NaN - confirm how mathMiddle treats NaN.
                     */
                    curRate.add(((double) (maxString - diff)) / maxString * 100);
                }
            }
            if (!curRate.isEmpty() && (this.mathMiddle(curRate) >= acceptRate)) {
                isInvestigated = true;
                issue = item.getIssue();
                /* Stop looping because an acceptable item has already been found. */
                break;
            } else
                // rates belong to one scope item only; start fresh for the next one
                curRate.clear();
        }

        if (isInvestigated) {
            TestItemIssue currentIssue = current.getIssue();
            /*
             * The item was already investigated before the launch finished:
             * keep its issue and only update the description via suggest().
             */
            if ((null != currentIssue.getExternalSystemIssues())
                    || (!currentIssue.getIssueType()
                            .equalsIgnoreCase(TestItemIssueType.TO_INVESTIGATE.getLocator()))
                    || (null != currentIssue.getIssueDescription())) {
                currentIssue.setIssueDescription(
                        this.suggest(currentIssue.getIssueDescription(), issue, project.getConfiguration()));
                current.setIssue(currentIssue);
                testItemRepository.save(current);
                /*
                 * Otherwise the system investigates the item from scratch:
                 * take over the matched issue with a marked description.
                 */
            } else {
                // NOTE(review): this modifies the issue object obtained from the scope item -
                // confirm that getIssue() returning a shared instance is intended here.
                issue.setIssueDescription(this.mark(issue.getIssueDescription()));
                // statistics are reset before the issue changes and updated again after saving
                current = statisticsFacadeFactory
                        .getStatisticsFacade(project.getConfiguration().getStatisticsCalculationStrategy())
                        .resetIssueStatistics(current);
                current.setIssue(issue);
                testItemRepository.save(current);
                statisticsFacadeFactory
                        .getStatisticsFacade(project.getConfiguration().getStatisticsCalculationStrategy())
                        .updateIssueStatistics(current);
            }
        }
    }
    analyzeFinished(launchId);
}

From source file:controllers.WidgetAdmin.java

/**
 * Checks a password against the strength rules and returns the reason it was rejected.
 *
 * <p>The checks run in this order: minimum length of 8, the composition pattern, at least three
 * distinct characters (ignoring case), and a Levenshtein distance of at least 5 to both the
 * local part and the domain part of the e-mail address.
 *
 * @param password the password to check
 * @param email    the user's e-mail address; parts that are missing (no {@code @}, or a
 *                 {@code null} address) are skipped in the similarity check
 * @return a message describing the first failed rule, or {@code null} if the password is accepted
 */
private static String isPasswordStrongEnough(String password, String email) {
    if (StringUtils.length(password) < 8) {
        return "Password is too short";
    }
    // NOTE(review): the pattern failure is only reported when the e-mail does NOT contain the
    // password; confirm that '&&' (rather than '||') is the intended combination.
    if (!Pattern.matches("(?=^.{8,}$)((?=.*\\d)|(?=.*\\W+))(?![.\\n])(?=.*[A-Z])(?=.*[a-z]).*$", password)
            && !StringUtils.containsIgnoreCase(email, password)) {
        return "Password must match requirements";
    }

    // Collect the distinct characters of the password, ignoring case.
    Set<String> strSet = new HashSet<String>();
    for (String s : password.split("")) {
        // split("") may yield a leading empty string; skip it
        if (StringUtils.length(s) > 0) {
            strSet.add(s.toLowerCase());
        }
    }

    if (strSet.size() < 3) {
        return "Too many repeating letters";
    }

    // Split once and only look at the parts that exist: an address without '@' (or with an
    // empty domain) has no second part, which previously caused an
    // ArrayIndexOutOfBoundsException.
    String[] emailParts = email == null ? new String[0] : email.split("@");
    int partsToCheck = Math.min(emailParts.length, 2);
    for (int i = 0; i < partsToCheck; i++) {
        if (StringUtils.getLevenshteinDistance(password, emailParts[i]) < 5) {
            return "Password similar to email";
        }
    }

    return null;
}

From source file:net.samuelbjohnson.javadev.crosstopix.Joiner.java

/**
 * Computes the Levenshtein distance between the two strings.
 *
 * @param string1 the first string
 * @param string2 the second string
 * @return the edit distance wrapped in a {@code BigDecimal}
 */
protected BigDecimal computeDistance(String string1, String string2) {
    final int editDistance = StringUtils.getLevenshteinDistance(string1, string2);
    return new BigDecimal(editDistance);
}

From source file:airportApp.Query.java

/**
 * This method suggests the closest matching word and country code.
 * /*www  .  ja v a  2s . co m*/
 * The matching is done based on the Levenshtein distance: the minimum number 
 * of single-character edits (i.e. insertions, deletions or substitutions) 
 * required to change one word into the other. A disadvantage of the Levenshtein 
 * distance is that any change (edit) has an equal influence on the matching 
 * distance, whereas an algorithm that penalizes the different errors in a 
 * different degree (i.e. insertions penalty = 1; deletions penalty = 2; 
 * substitutions penalty = 3) can produce more accurate results in some cases.
 * 
 * To demonstrate the above statement, let us consider the input "Zimb" compared 
 * to the two strings "Zimbabwe" and "Fiji".
 * The Levenshtein distance between "Zimb" and "Zimbabwe" is 4; while between 
 * "Zimb" and "Fiji" it is 3. Therefore, "Fiji" is the closest match. However, 
 * if the input is "Zimbab", the distance between "Zimbab" and "Zimbabwe" is 
 * 2; while between "Zimbab" and "Fiji" is 5. Therefore, "Zimbabwe" is the 
 * closest match.
 * 
 * The Levenshtein distance is not necessarily a bad similarity test. It all 
 * depends on our preferences of how we would like to do the comparisons. In 
 * other cases than the one mentioned above, the Levenshtein distance can 
 * produce more accurate results. There are also many other similarity matching 
 * algorithms (some based on longest common subsequence).
 * 
 * Attention! This method uses a third-party library from commons.apache.org. 
 * Make sure that the library is added to the compilation libraries of your 
 * IDE. The library's Levenshtein algorithm is updated with a newer version 
 * (by Chas Emerick) that avoids OutOfMemory errors that can occur for very 
 * large Strings. The library also contains implementations for finding the 
 * longest common prefix, the Jaro-Winkler distance and another Fuzzy distance. 
 * So they are all alternatives, if one is not satisfied with the Levenshtein's 
 * one, and can also possibly be combined in different ways. Assuming the user 
 * does not misspell the beginning of the country, an algorithm counting the 
 * length of the longest common prefix towards the length of the larger 
 * sequence (a check larger or equal to 25% ) can precede a check for the 
 * Levenshtein's distance.
 * 
 * @param country User input for which a match will be returned.
 * @return Returns a Suggestion object (the closest match to the input 
 * argument and its associated country code) or null , if the match fails 
 * for any reason.
 */
private static Suggestion suggestCountry(String country) {

    Suggestion suggestion = null;
    String suggestionMatch = "";
    String suggestionCode = "";
    int suggestionDistance = -1;

    try {
        countriesReader = new BufferedReader(new FileReader("resources/countries.csv"));

        String line; // pointer (line reader) used with the countries

        countriesReader.readLine(); // skip first line (column names)

        while ((line = countriesReader.readLine()) != null) {

            Country c = Utils.readCountry(line);

            // use the smaller distance of code name and country name
            int codeDistance = StringUtils.getLevenshteinDistance(country, c.getCode());
            int nameDistance = StringUtils.getLevenshteinDistance(country, c.getName());
            int distance = ((codeDistance < nameDistance) ? codeDistance : nameDistance);

            // keep track of the associated match
            String match = ((codeDistance < nameDistance) ? c.getCode() : c.getName());

            // keep track of the associated country code
            String code = c.getCode();

            // for the first country just overwrite global values
            if (suggestionDistance == -1) {

                suggestionDistance = distance;
                suggestionMatch = match;
                suggestionCode = code;
            } else { // for the rest use the global comaprison

                if (distance < suggestionDistance) {

                    suggestionDistance = distance;
                    suggestionMatch = match;
                    suggestionCode = code;
                }
            }

            suggestion = new Suggestion(suggestionMatch, suggestionCode);

        }

    } catch (IllegalArgumentException ex) {
        System.err.println("Error: " + ex.getMessage());
    } catch (FileNotFoundException ex) {
        System.err.println("Error: " + ex.getMessage());
    } catch (IOException ex) {
        System.err.println("Error: " + ex.getMessage());
    }

    return suggestion;
}

From source file:bear.plugins.groovy.GroovyCodeCompleter.java

/**
 * Computes the completion proposals for the expression ending at {@code position}.
 *
 * <p>If the expression is a single unfinished token (e.g. {@code fo|}), the variables of the
 * binding are proposed, scored by how often the typed characters occur in the variable name;
 * the first typed character weighs 3, the second 2 and all others 1.
 *
 * <p>Otherwise the expression is resolved token by token, starting from the class of the shell
 * variable named by the first token and following the return types of matching methods or the
 * types of matching fields. If the expression ends with a dot, all fields and methods of the
 * resulting classes are proposed. If it ends with an unfinished member name, members whose
 * name starts with the typed text are proposed first, ordered by Levenshtein distance to it,
 * followed by the remaining members that share at least one character with the typed text.
 *
 * @param script   the script text
 * @param position the caret position within {@code script}
 * @return the replacements proposed for the caret position
 */
public Replacements completeCode(String script, int position) {
    int[] range = scanForStart(script, position, -1);

    int start = range[0];
    int end = range[1] + 1;

    boolean endsWithDot = script.charAt(firstNonSpace(script, Math.min(end, script.length() - 1), -1)) == '.';

    List<Token> tokens = tokenize(script, start, end);

    //fo| => size 1
    //foo. => size 1
    if (tokens.size() == 1 && !endsWithDot) {
        //match vars from binding, there should be a few

        Set<Map.Entry<String, ?>> entries = binding.getVariables().entrySet();
        List<Candidate> candidates = new ArrayList<Candidate>();

        // the typed characters are the same for every variable
        char[] chars = tokens.get(0).name.toCharArray();

        for (Map.Entry<String, ?> entry : entries) {
            String varName = entry.getKey();

            int score = 0;

            for (int i = 0; i < chars.length; i++) {
                // The weight must be parenthesised: without it the expression parsed as
                // "(frequency * i == 0) ? 3 : ...", which gave every variable a positive score.
                int weight = i == 0 ? 3 : (i == 1 ? 2 : 1);
                score += frequency(varName, chars[i]) * weight;
            }

            if (score > 0) {
                // NOTE(review): a binding variable whose value is null would fail here - confirm
                // whether null values can occur.
                candidates.add(new Candidate(
                        new Replacement(varName, entry.getValue().getClass().getSimpleName()), score));
            }
        }

        Collections.sort(candidates);

        return new Replacements(start, end).addAll(candidates);
    }

    Object variable = shell.getVariable(tokens.get(0).name);
    Class<?> firstTokenClass = variable == null ? null : variable.getClass();

    List<Class<?>> currentClasses = firstTokenClass == null ? new ArrayList<Class<?>>()
            : Lists.<Class<?>>newArrayList(firstTokenClass);

    // Walk the remaining tokens and follow the member types to find the classes being completed.
    for (int i = 1; i < tokens.size(); i++) {
        Token token = tokens.get(i);
        boolean lastToken = i == tokens.size() - 1;

        // an unfinished last token is the pattern to complete, not a member to resolve
        if (lastToken && !endsWithDot) {
            break;
        }

        //strict match
        List<Class<?>> returnTypes = new ArrayList<Class<?>>();

        if (token.method) {
            for (Class<?> currentClass : currentClasses) {
                for (MethodDesc method : OpenBean.methods(currentClass)) {
                    if (method.getName().equals(token.name)) {
                        returnTypes.add(method.getMethod().getReturnType());
                    }
                }
            }
        } else {
            for (Class<?> currentClass : currentClasses) {
                for (Field field : OpenBean.fields(currentClass)) {
                    if (field.getName().equals(token.name)) {
                        returnTypes.add(field.getType());
                    }
                }
            }
        }

        if (returnTypes.size() > 1) {
            // drop duplicate types while keeping their order
            currentClasses = Lists.newArrayList(new LinkedHashSet<Class<?>>(returnTypes));
        } else {
            currentClasses = returnTypes;
        }
    }

    if (endsWithDot) {
        // nothing typed after the dot: offer every field and method
        Replacements replacements = new Replacements(position, position);

        for (Class<?> currentClass : currentClasses) {
            for (Field field : OpenBean.fields(currentClass)) {
                replacements.add(new Replacement(field));
            }

            for (MethodDesc method : OpenBean.methods(currentClass)) {
                replacements.add(new Replacement(method));
            }
        }

        return replacements;
    }

    Token lastTypedToken = tokens.get(tokens.size() - 1);
    final String patternLC = lastTypedToken.name.toLowerCase();

    Replacements replacements = new Replacements(lastTypedToken.start, position);

    Set<Field> usedFields = new HashSet<Field>();
    Set<Method> usedMethods = new HashSet<Method>();

    List<Candidate> candidates = new ArrayList<Candidate>();

    // First tier: members whose name starts with the typed text.
    {
        int score = 10000;
        for (Class<?> currentClass : currentClasses) {
            for (Field field : OpenBean.fields(currentClass)) {
                if (field.getName().toLowerCase().startsWith(patternLC)) {
                    usedFields.add(field);
                    candidates.add(new Candidate(new Replacement(field), score--));
                }
            }

            for (MethodDesc method : OpenBean.methods(currentClass)) {
                if (method.getName().toLowerCase().startsWith(patternLC)) {
                    usedMethods.add(method.getMethod());
                    candidates.add(new Candidate(new Replacement(method), score--));
                }
            }
        }
    }

    Collections.sort(candidates, new Comparator<Candidate>() {
        @Override
        public int compare(Candidate o1, Candidate o2) {
            // Both distances are computed without a threshold. The thresholded overload
            // returns -1 when the limit is exceeded, which made a farther candidate compare
            // as "smaller" and the ordering inconsistent.
            int d1 = StringUtils.getLevenshteinDistance(o1.r.name.toLowerCase(), patternLC);
            int d2 = StringUtils.getLevenshteinDistance(o2.r.name.toLowerCase(), patternLC);
            return Integer.compare(d1, d2);
        }
    });

    // re-score the prefix matches so that the nearest one gets the highest score
    for (int i = 0; i < candidates.size(); i++) {
        Candidate candidate = candidates.get(i);
        candidate.score = 10000 - i;
    }

    // Second tier: remaining members, scored by how often the typed characters occur in the name.
    char[] chars = patternLC.toCharArray();
    for (Class<?> currentClass : currentClasses) {
        for (Field field : OpenBean.fields(currentClass)) {
            if (usedFields.contains(field)) {
                continue;
            }

            int r = 0;

            for (char aChar : chars) {
                r += frequency(field.getName(), aChar);
            }

            if (r > 0) {
                candidates.add(new Candidate(new Replacement(field), r));
            }
        }

        for (MethodDesc method : OpenBean.methods(currentClass)) {
            if (usedMethods.contains(method.getMethod())) {
                continue;
            }

            int r = 0;

            for (char aChar : chars) {
                r += frequency(method.getName(), aChar);
            }

            if (r > 0) {
                candidates.add(new Candidate(new Replacement(method), r));
            }
        }
    }

    // final order is Candidate's natural ordering (presumably by score - confirm in Candidate)
    Collections.sort(candidates);

    replacements.addAll(candidates);

    return replacements;
}

From source file:edu.harvard.mcz.nametools.AuthorNameComparator.java

/**
 * Measures how similar two strings are on a scale from 0 (nothing in common)
 * to 1 (identical), based on the Levenshtein edit distance relative to the
 * length of the longer of the two strings.
 *
 * @param string1 one string for comparison
 * @param string2 the string to compare with string1
 * @return a double in the range 0 to 1; 1.0 when both strings are empty
 */
public static double stringSimilarity(String string1, String string2) {
    final boolean firstIsShorter = string1.length() < string2.length();
    final String longer = firstIsShorter ? string2 : string1;
    final String shorter = firstIsShorter ? string1 : string2;

    final int longerLength = longer.length();
    if (longerLength == 0) {
        // both strings are empty and therefore identical
        return 1.0;
    }

    final int distance = StringUtils.getLevenshteinDistance(longer, shorter);
    return (longerLength - distance) / (double) longerLength;
}