de.undercouch.citeproc.helper.Levenshtein.java Source code

Java tutorial

Introduction

Here is the source code for de.undercouch.citeproc.helper.Levenshtein.java

Source

// Copyright 2013 Michel Kraemer
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package de.undercouch.citeproc.helper;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.lang3.StringUtils;

/**
 * Uses {@link StringUtils#getLevenshteinDistance(CharSequence, CharSequence)}
 * to calculate the edit distance between two strings. Provides helper
 * methods to traverse a collection of strings and select the ones most
 * similar to a given input string.
 * <p>All methods are static and keep no state between calls. Handling of
 * <code>null</code> strings is delegated to {@link StringUtils}.</p>
 * @author Michel Kraemer
 */
public class Levenshtein {
    /**
     * Number of surplus candidates tolerated on top of the requested
     * maximum before the candidate list is sorted and truncated again.
     * Keeps memory bounded without sorting after every single insertion.
     */
    private static final int RESORT_SLACK = 10;

    /**
     * Maximum number of distance-based matches that
     * {@link #findSimilar(Collection, CharSequence)} adds to its result
     */
    private static final int SIMILAR_MAX_RESULTS = 5;

    /**
     * Upper bound for the distance threshold used by
     * {@link #findSimilar(Collection, CharSequence)}
     */
    private static final int SIMILAR_MAX_THRESHOLD = 7;

    /**
     * Pairs a string with its edit distance to the reference string.
     * Items are ordered by ascending distance only.
     * @param <T> the type of the wrapped string
     */
    private static class Item<T> implements Comparable<Item<T>> {
        private final T str;
        private final int distance;

        public Item(T str, int distance) {
            this.str = str;
            this.distance = distance;
        }

        @Override
        public int compareTo(Item<T> o) {
            //explicit comparison instead of subtraction (no overflow)
            if (distance < o.distance) {
                return -1;
            }
            return (distance == o.distance ? 0 : 1);
        }
    }

    /**
     * Searches the given collection of strings and returns the string that
     * has the lowest Levenshtein distance to a given second string <code>t</code>.
     * If the collection contains multiple strings with the same distance to
     * <code>t</code> only the first one (in iteration order) will be returned.
     * @param <T> the type of the strings in the given collection
     * @param ss the collection to search
     * @param t the second string
     * @return the string with the lowest Levenshtein distance or
     * <code>null</code> if the collection is empty
     */
    public static <T extends CharSequence> T findMinimum(Collection<T> ss, CharSequence t) {
        int min = Integer.MAX_VALUE;
        T result = null;
        for (T s : ss) {
            int d = StringUtils.getLevenshteinDistance(s, t);
            //strict comparison: on ties the earlier string wins
            if (d < min) {
                min = d;
                result = s;
            }
        }
        return result;
    }

    /**
     * Searches the given collection of strings and returns a collection of at
     * most <code>n</code> strings that have the lowest Levenshtein distance
     * to a given string <code>t</code>. The returned collection will be
     * sorted according to the distance with the string with the lowest
     * distance at the first position. Strings with the same distance keep
     * the order in which they were encountered.
     * @param <T> the type of the strings in the given collection
     * @param ss the collection to search
     * @param t the string to compare to
     * @param n the maximum number of strings to return. Values less than
     * or equal to zero yield an empty result. Large values (up to
     * {@link Integer#MAX_VALUE}) effectively disable the limit.
     * @param threshold a threshold for individual item distances. Only items
     * with a distance strictly below this threshold will be included in
     * the result.
     * @return the strings with the lowest Levenshtein distance (never
     * <code>null</code>)
     */
    public static <T extends CharSequence> Collection<T> findMinimum(Collection<T> ss, CharSequence t, int n,
            int threshold) {
        //a negative limit makes no sense; treat it like zero
        int limit = Math.max(n, 0);

        List<Item<T>> candidates = new ArrayList<Item<T>>();
        for (T s : ss) {
            int d = StringUtils.getLevenshteinDistance(s, t);
            if (d < threshold) {
                candidates.add(new Item<T>(s, d));

                //resort, but not too often. The subtraction is on the
                //size side so that a huge limit cannot overflow.
                if (candidates.size() - RESORT_SLACK > limit) {
                    keepBest(candidates, limit);
                }
            }
        }

        keepBest(candidates, limit);

        //size the result by what was actually found, not by the limit
        List<T> arr = new ArrayList<T>(candidates.size());
        for (Item<T> i : candidates) {
            arr.add(i.str);
        }
        return arr;
    }

    /**
     * Sorts the given items by ascending distance and drops everything
     * beyond the first <code>limit</code> entries.
     * @param <T> the type of the wrapped strings
     * @param items the (mutable) list of items to sort and truncate
     * @param limit the maximum number of items to keep (must not be negative)
     */
    private static <T> void keepBest(List<Item<T>> items, int limit) {
        Collections.sort(items);
        if (items.size() > limit) {
            items.subList(limit, items.size()).clear();
        }
    }

    /**
     * Searches the given collection of strings and returns a collection of
     * strings similar to a given string <code>t</code>. Uses reasonable default
     * values for human-readable strings. The result first contains all
     * strings that start with <code>t</code> (ignoring case) in the order
     * in which they appear in <code>ss</code>, followed by at most
     * {@value #SIMILAR_MAX_RESULTS} further strings sorted by ascending
     * Levenshtein distance to <code>t</code>. Each string appears only once.
     * @param <T> the type of the strings in the given collection
     * @param ss the collection to search
     * @param t the string to compare to
     * @return a collection with similar strings
     */
    public static <T extends CharSequence> Collection<T> findSimilar(Collection<T> ss, CharSequence t) {
        //look for strings prefixed by 't'. The linked set removes
        //duplicates but keeps the insertion order.
        Collection<T> result = new LinkedHashSet<T>();
        for (T s : ss) {
            if (StringUtils.startsWithIgnoreCase(s, t)) {
                result.add(s);
            }
        }

        //find strings according to their levenshtein distance. The
        //threshold depends on the length of 't' so that short inputs
        //do not match almost everything.
        int threshold = Math.min(t.length() - 1, SIMILAR_MAX_THRESHOLD);
        Collection<T> mins = findMinimum(ss, t, SIMILAR_MAX_RESULTS, threshold);
        result.addAll(mins);

        return result;
    }
}