Java examples for java.lang:String Algorithm
Get Damerau-Levenshtein Substring Distance
//package com.java2s;

public class Main {

    /**
     * Computes a weighted, Damerau-Levenshtein-style distance between a phrase {@code s} and a
     * searched string {@code t}.
     *
     * <p>Only three rolling rows of length {@code s.length() + 1} are kept (current, previous and
     * the one before that for transpositions) instead of a full
     * {@code (s.length() + 1) x (t.length() + 1)} matrix.
     *
     * <p>Cost model, exactly as implemented:
     * <ul>
     *   <li>row zero is seeded with {@code 0.5 + i / n} for column {@code i >= 1}
     *       ({@code n = s.length()}), column zero of row {@code j} is {@code j};</li>
     *   <li>a mismatching character costs 1.5, a step from the upper neighbour ("remove") costs
     *       1.5, a step from the left neighbour ("add") costs {@code 0.5 + 1 / n};</li>
     *   <li>an adjacent transposition may reuse the cell two rows and two columns back plus the
     *       current match/mismatch cost;</li>
     *   <li>after every row the running minimum becomes
     *       {@code min(runningMin + 0.9, lastCellOfRow)}; this update is applied once more after
     *       the last row, and finally every cell of the last row plus {@code 1 / n} is also
     *       considered.</li>
     * </ul>
     *
     * @param t string searched
     * @param s word or phrase; per the original author's note, additional characters at the
     *          start or end are meant to be ignored (they are in fact only discounted by the
     *          cost model above)
     * @return {@code 0} if both strings are equal, {@code t.length()} if {@code s} is empty,
     *         {@code s.length()} if {@code t} is empty, otherwise the weighted distance
     * @throws IllegalArgumentException if either argument is {@code null}
     */
    private static double getDamerauLevenshteinSubstringDistance(String t, String s) {
        if (s == null || t == null) {
            throw new IllegalArgumentException("Strings must not be null");
        }
        if (s.equals(t)) {
            return 0;
        }

        int n = s.length();
        int m = t.length();
        if (n == 0) {
            return m;
        }
        if (m == 0) {
            return n;
        }

        double[] previous = new double[n + 1];       // row j - 1
        double[] beforePrevious = new double[n + 1]; // row j - 2, needed for transpositions
        double[] current = new double[n + 1];        // row j, being filled

        double addCost = 1.0 / n;
        double removeCost = 1.5d;
        double editCost = 1.5d;

        // Row zero: skipping the first i characters of s has a small, bounded cost.
        for (int i = 1; i <= n; i++) {
            previous[i] = 0.5d + i * addCost;
        }
        double min = previous[n];

        for (int j = 1; j <= m; j++) {
            char tj = t.charAt(j - 1);
            current[0] = j;

            for (int i = 1; i <= n; i++) {
                double cost = s.charAt(i - 1) == tj ? 0 : editCost;
                double diagonal = previous[i - 1] + cost;

                // NOTE(review): the gate below omits the 0.5 surcharge that the "add" step
                // actually pays and compares against editCost rather than removeCost, so a
                // cheaper "remove" step can be skipped. Kept as-is to preserve existing results;
                // confirm whether a plain three-way minimum was intended.
                if (current[i - 1] + addCost < previous[i] + editCost) {
                    current[i] = Math.min(current[i - 1] + 0.5d + addCost, diagonal);
                } else {
                    current[i] = Math.min(previous[i] + removeCost, diagonal);
                }

                // Damerau extension: the last two characters of both prefixes are swapped.
                boolean swapped = i > 1 && j > 1
                        && s.charAt(i - 1) == t.charAt(j - 2)
                        && s.charAt(i - 2) == tj;
                if (swapped) {
                    current[i] = Math.min(current[i], beforePrevious[i - 2] + cost);
                }
            }

            // Rotate the rows instead of copying; the oldest row is recycled as scratch space
            // and is fully overwritten before it is read again.
            double[] recycled = beforePrevious;
            beforePrevious = previous;
            previous = current;
            current = recycled;

            // Either stop matching at this row, or keep the earlier best and pay 0.9 for this
            // additional character of t.
            min = Math.min(min + 0.9d, previous[n]);
        }

        // After the rotation above, `previous` holds the most recent row.
        // NOTE(review): this repeats the per-row update once more after the loop, so the 0.9
        // surcharge is applied twice for the last character of t; kept to preserve results.
        min = Math.min(min + 0.9d, previous[n]);

        // Any cell of the last row may end the match for a flat surcharge of addCost.
        for (double cell : previous) {
            min = Math.min(min, cell + addCost);
        }
        return min;
    }
}