// Java tutorial
// Copyright 2013 Michel Kraemer
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package de.undercouch.citeproc.helper;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.lang3.StringUtils;

/**
 * Uses {@link StringUtils#getLevenshteinDistance(CharSequence, CharSequence)}
 * to calculate the edit distance between two strings. Provides useful helper
 * methods to traverse a set of strings and select the most similar ones
 * to a given input string.
 * @author Michel Kraemer
 */
public class Levenshtein {
    /**
     * Number of surplus candidates that may accumulate beyond the requested
     * maximum before the candidate list is sorted and truncated again
     */
    private static final int RESORT_SLACK = 10;

    /**
     * Maximum number of distance-based matches added by
     * {@link #findSimilar(Collection, CharSequence)}
     */
    private static final int SIMILAR_MAX_RESULTS = 5;

    /**
     * Upper bound for the distance threshold used by
     * {@link #findSimilar(Collection, CharSequence)}
     */
    private static final int SIMILAR_MAX_DISTANCE = 7;

    /**
     * A candidate string together with its edit distance to the search
     * string. Items are ordered by ascending distance only.
     * @param <T> the type of the wrapped string
     */
    private static class Item<T> implements Comparable<Item<T>> {
        private final T str;
        private final int distance;

        public Item(T str, int distance) {
            this.str = str;
            this.distance = distance;
        }

        @Override
        public int compareTo(Item<T> o) {
            return (distance < o.distance ? -1 : (distance == o.distance ? 0 : 1));
        }
    }

    /**
     * Searches the given collection of strings and returns the string that
     * has the lowest Levenshtein distance to a given second string <code>t</code>.
     * If the collection contains multiple strings with the same distance to
     * <code>t</code> only the first one will be returned.
     * @param <T> the type of the strings in the given collection
     * @param ss the collection to search
     * @param t the second string
     * @return the string with the lowest Levenshtein distance or
     * <code>null</code> if the collection is empty
     */
    public static <T extends CharSequence> T findMinimum(Collection<T> ss,
            CharSequence t) {
        int min = Integer.MAX_VALUE;
        T result = null;
        for (T s : ss) {
            int d = StringUtils.getLevenshteinDistance(s, t);
            //strict comparison: on a tie the string found first wins
            if (d < min) {
                min = d;
                result = s;
            }
        }
        return result;
    }

    /**
     * Searches the given collection of strings and returns a collection of at
     * most <code>n</code> strings that have the lowest Levenshtein distance
     * to a given string <code>t</code>. The returned collection will be
     * sorted according to the distance with the string with the lowest
     * distance at the first position.
     * @param <T> the type of the strings in the given collection
     * @param ss the collection to search
     * @param t the string to compare to
     * @param n the maximum number of strings to return. Values less than
     * or equal to zero yield an empty result.
     * @param threshold a threshold for individual item distances. Only items
     * with a distance below this threshold will be included in the result.
     * @return the strings with the lowest Levenshtein distance
     */
    public static <T extends CharSequence> Collection<T> findMinimum(
            Collection<T> ss, CharSequence t, int n, int threshold) {
        //a negative maximum is treated like zero instead of running
        //removeLast() on an empty list
        int limit = Math.max(n, 0);

        LinkedList<Item<T>> result = new LinkedList<Item<T>>();
        for (T s : ss) {
            int d = StringUtils.getLevenshteinDistance(s, t);
            if (d < threshold) {
                result.offer(new Item<T>(s, d));
                //resort, but not too often. The difference is compared
                //against the slack (instead of size > n + slack) so that
                //a very large maximum cannot overflow.
                if (result.size() - limit > RESORT_SLACK) {
                    sortAndTrim(result, limit);
                }
            }
        }

        sortAndTrim(result, limit);

        //size the result by what was actually kept, not by the requested
        //maximum, which may be arbitrarily large
        List<T> arr = new ArrayList<T>(result.size());
        for (Item<T> i : result) {
            arr.add(i.str);
        }
        return arr;
    }

    /**
     * Sorts the given items by ascending distance and drops items from the
     * end of the list until at most <code>limit</code> items remain
     * @param <T> the type of the strings wrapped by the items
     * @param items the items to sort and truncate (modified in place)
     * @param limit the non-negative maximum number of items to keep
     */
    private static <T> void sortAndTrim(LinkedList<Item<T>> items, int limit) {
        Collections.sort(items);
        while (items.size() > limit) {
            items.removeLast();
        }
    }

    /**
     * Searches the given collection of strings and returns a collection of
     * strings similar to a given string <code>t</code>. Uses reasonable default
     * values for human-readable strings. The returned collection will be
     * sorted according to their similarity with the string with the best
     * match at the first position. Strings starting with <code>t</code>
     * (ignoring case) come first, followed by at most five strings selected
     * by their Levenshtein distance. No string is contained twice.
     * @param <T> the type of the strings in the given collection
     * @param ss the collection to search
     * @param t the string to compare to
     * @return a collection with similar strings
     */
    public static <T extends CharSequence> Collection<T> findSimilar(
            Collection<T> ss, CharSequence t) {
        //look for strings prefixed by 't'
        Collection<T> result = new LinkedHashSet<T>();
        for (T s : ss) {
            if (StringUtils.startsWithIgnoreCase(s, t)) {
                result.add(s);
            }
        }

        //find strings according to their levenshtein distance. The
        //threshold depends on the length of 't' so that short inputs
        //only match strings that are very close to them.
        int threshold = Math.min(t.length() - 1, SIMILAR_MAX_DISTANCE);
        Collection<T> mins = findMinimum(ss, t, SIMILAR_MAX_RESULTS, threshold);
        result.addAll(mins);

        return result;
    }
}