Example usage for org.apache.commons.lang3 StringUtils getLevenshteinDistance

List of usage examples for org.apache.commons.lang3 StringUtils getLevenshteinDistance

Introduction

On this page you can find usage examples for org.apache.commons.lang3 StringUtils getLevenshteinDistance.

Prototype

public static int getLevenshteinDistance(CharSequence s, CharSequence t) 

Source Link

Document

<p>Find the Levenshtein distance between two Strings.</p> <p>This is the number of changes needed to change one String into another, where each change is a single character modification (deletion, insertion or substitution).</p> <p>The previous implementation of the Levenshtein distance algorithm was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError which can occur when my Java implementation is used with very large strings.<br> This implementation of the Levenshtein distance algorithm is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> <pre> StringUtils.getLevenshteinDistance(null, *) = IllegalArgumentException StringUtils.getLevenshteinDistance(*, null) = IllegalArgumentException StringUtils.getLevenshteinDistance("","") = 0 StringUtils.getLevenshteinDistance("","a") = 1 StringUtils.getLevenshteinDistance("aaapppp", "") = 7 StringUtils.getLevenshteinDistance("frog", "fog") = 1 StringUtils.getLevenshteinDistance("fly", "ant") = 3 StringUtils.getLevenshteinDistance("elephant", "hippo") = 7 StringUtils.getLevenshteinDistance("hippo", "elephant") = 7 StringUtils.getLevenshteinDistance("hippo", "zzzzzzzz") = 8 StringUtils.getLevenshteinDistance("hello", "hallo") = 1 </pre>

Usage

From source file:at.jps.sanction.core.util.TokenTool.java

/**
 * Scores how well {@code text} matches the best-fitting entry of {@code textTokens}.
 *
 * <p>Tokens shorter than {@code minlen} are ignored. In fuzzy mode the penalty of a token is
 * its Levenshtein distance to {@code text}; in exact mode it is 0 for an equal token and 100
 * otherwise. The smallest penalty over all considered tokens is scaled by the length of
 * {@code text} and subtracted from 100.</p>
 *
 * @param textTokens candidate tokens to compare against; must not be null
 * @param text       the text to look for
 * @param fuzzy      {@code true} to compare by Levenshtein distance, {@code false} for exact equality
 * @param minlen     minimum token length for a token to be considered
 * @return a hit rate between 0 and 100; 0 when {@code textTokens} is empty or nothing matches
 */
public static float compareCheck(final List<String> textTokens, final String text, final boolean fuzzy,
        final int minlen) {

    if (textTokens.isEmpty()) {
        return 0;
    }

    // 100 doubles as the "no token matched" sentinel.
    float deltaValue = 100;

    for (final String token : textTokens) {
        if (token.length() < minlen) {
            continue;
        }

        final float tokenDelta = fuzzy
                ? StringUtils.getLevenshteinDistance(text, token)
                : (text.equals(token) ? 0 : 100);

        // Keep the best (smallest) penalty in both modes; previously the exact-match branch
        // overwrote the value, so only the last token in the list decided the result.
        if (tokenDelta < deltaValue) {
            deltaValue = tokenDelta;
        }
    }

    if (text.isEmpty()) {
        // Scaling by a zero length would yield NaN or -Infinity; an empty text can only be a
        // full hit (penalty 0) or no hit at all.
        return deltaValue == 0 ? 100 : 0;
    }

    final float percentHitrate = 100 - ((100 / ((float) (text.length()))) * deltaValue);

    // A penalty larger than the text length would push the rate below zero.
    return Math.max(0, percentHitrate);
}

From source file:GIST.IzbirkomExtractor.TableExtractor.java

/**
 * Extracts polling-station rows from every parsable table of an HTML file and commits them
 * to the result sink.
 *
 * <p>The file is parsed as UTF-8. A table is processed only if its first row passes
 * {@code isParsableTable}. Within such a table, rows without any {@code td} cell, rows whose
 * first cell looks like the table header, and rows whose cells are all empty are skipped.
 * The first four cells of every remaining row are taken as station ID, address list,
 * organisation address and station address and handed to {@code extractAddressesFromText}.</p>
 *
 * <p>If a station ID contains no parsable number, the problem is logged and the method
 * returns immediately without calling {@code resultSink.commit()}.</p>
 *
 * @param input_html the HTML file to process
 * @throws IOException if the file cannot be read or parsed
 * @throws TableExtractorException if a data row lacks the station ID or the address list cell
 */
public void processHTMLfile(File input_html) throws IOException, TableExtractorException,
            CloneNotSupportedException, SQLException, ResultSinkException {

        logger.info("Start processing " + input_html);

        Document doc = Jsoup.parse(input_html, "UTF-8");
        Elements tables = doc.getElementsByTag("table");

        /* count of parseable tables found */
        int tables_found = 0;

        /* determine raion name */
        String raion_name = extractRaionFromFileName(input_html.getName());

        // TODO: inflect raion name in  case

        /* only tables whose first row is accepted by isParsableTable are processed */
        for (Element table : tables) {
            Elements rows = table.getElementsByTag("tr");
            boolean firstRow = true;

            row_loop: for (Element row : rows) {
                Elements cells = row.getElementsByTag("td");

                if (firstRow) {
                    if (isParsableTable(row)) {
                        firstRow = false;
                        logger.info("Processing table #" + ++tables_found + " in " + input_html);
                    } else {
                        break row_loop;
                    }
                }

                /*
                 * A row without any <td> (for instance a header row built from <th>) has no first
                 * cell: Elements.first() would return null and the header check below would fail
                 * with a NullPointerException. Such a row carries no data, so skip it.
                 */
                if (cells.isEmpty()) {
                    continue row_loop;
                }

                if (StringUtils.getLevenshteinDistance(cleanupUNICODE(cells.first().text()),
                        " . -") < 3) {
                    continue row_loop; /* skip the row if it looks like a table header */
                }

                /* skip rows with all cells empty */
                boolean emptyRow = true;
                for (Element cell : cells) {
                    emptyRow = emptyRow && cleanupUNICODE(cell.text()).isEmpty();
                }
                if (emptyRow) {
                    continue;
                }

                int i_cell = 0;
                Element station_id = null;
                Element address_field = null;
                Element org_address = null;
                Element station_address = null;

                /* the first four cells are positional; any further cells are ignored */
                for (Element cell : cells) {
                    switch (i_cell) {
                    case 0:
                        station_id = cell;
                        break;
                    case 1:
                        address_field = cell;
                        break;
                    case 2:
                        org_address = cell;
                        break;
                    case 3:
                        station_address = cell;
                        break;
                    default:
                        break;
                    }
                    i_cell++;
                }

                if (station_id == null) {
                    throw new TableExtractorException("Polling station ID not found", row, input_html);
                }
                if (address_field == null) {
                    throw new TableExtractorException("Address list not found", row, input_html);
                }

                /* extract int from poll station id: everything that is not a digit is dropped */
                int psid;
                try {
                    psid = Integer.parseInt(cleanupUNICODE(station_id.text()).trim().replaceAll("[^\\d]", ""));
                } catch (NumberFormatException e) {
                    /* the exception object is only built for its formatted message; it is not thrown */
                    Exception te = new TableExtractorException("Failed to parse polling station ID >"
                            + cleanupUNICODE(station_id.text()).trim() + "<: ", station_id, input_html);
                    logger.severe(te.getMessage() + "; rest of " + input_html + " ignored.");
                    return;
                }

                /*
                 * extraction from HTML completely finished, now we work only with the addresses in the text form
                 * NOTE(review): org_address and station_address are null for rows with fewer than
                 * three/four cells - confirm that cleanLeftoverHTML accepts null.
                 */
                extractAddressesFromText(raion_name.trim(), psid, cleanLeftoverHTML(address_field),
                        cleanLeftoverHTML(org_address), cleanLeftoverHTML(station_address));
            }
        }

        if (tables_found == 0) {
            logger.severe("No parsable tables found in " + input_html);
        }
        resultSink.commit();

        logger.info("" + tables_found + " table(s) processed in " + input_html);
    }

From source file:com.omertron.themoviedbapi.CompareTest.java

/**
 * Close match: titles within the allowed edit distance must be accepted by
 * {@code Compare.movies}, with and without a year, case sensitive or not.
 */
@Test
public void testCloseMatch() {
    final int maxDistance = 6;
    final String closeMain = "bloderannar";
    final String closeOther = "Blade Runner Dir Cut";

    // Precondition: both candidates really are within the allowed distance
    final int mainDistance = StringUtils.getLevenshteinDistance(TITLE_MAIN, closeMain);
    LOG.info("Distance between '{}' and '{}' is {}", TITLE_MAIN, closeMain, mainDistance);
    assertTrue(mainDistance <= maxDistance);

    final int otherDistance = StringUtils.getLevenshteinDistance(TITLE_OTHER, closeOther);
    LOG.info("Distance between '{}' and '{}' is {}", TITLE_OTHER, closeOther, otherDistance);
    assertTrue(otherDistance <= maxDistance);

    // With a year
    assertTrue(Compare.movies(moviedb, closeMain, YEAR_SHORT, maxDistance, CASE_SENSITIVE));
    assertTrue(Compare.movies(moviedb, closeOther, YEAR_SHORT, maxDistance, CASE_SENSITIVE));
    assertTrue(Compare.movies(moviedb, closeMain, YEAR_SHORT, maxDistance, NOT_CASE_SENSITIVE));
    assertTrue(Compare.movies(moviedb, closeOther, YEAR_SHORT, maxDistance, NOT_CASE_SENSITIVE));

    // Without a year
    assertTrue(Compare.movies(moviedb, closeMain, "", maxDistance, CASE_SENSITIVE));
    assertTrue(Compare.movies(moviedb, closeOther, "", maxDistance, CASE_SENSITIVE));
    assertTrue(Compare.movies(moviedb, closeMain, "", maxDistance, NOT_CASE_SENSITIVE));
    assertTrue(Compare.movies(moviedb, closeOther, "", maxDistance, NOT_CASE_SENSITIVE));
}

From source file:fr.lip6.segmentations.ProcessHTML5.java

/**
 * Compares the two segmentation descriptors of every row in {@code html5repo} that has a
 * non-empty {@code descriptorbom}.
 *
 * <p>For each row both descriptors are reduced to a string with one letter per segment, the
 * Levenshtein distance between the two strings and the number of distinct letters they share
 * are computed, the figures are printed and written back to the row(s) with the same
 * {@code datafolder}, and the row's {@code src} column is dumped to a {@code .5.html} file in
 * the dataset directory.</p>
 *
 * <p>SQL, driver-loading and I/O failures are logged at SEVERE level and end the run.</p>
 */
public void run() {
    try {
        Class.forName("com.mysql.jdbc.Driver");
        // try-with-resources: the connection, both statements and the result set used to leak.
        try (Connection con = DriverManager.getConnection(
                "jdbc:mysql://" + Config.mysqlHost + "/" + Config.mysqlDatabase + "", Config.mysqlUser,
                Config.mysqlPassword);
                Statement st2 = con.createStatement();
                ResultSet rs = st2.executeQuery("select * from html5repo where descriptorbom<>''");
                // Parameterised instead of concatenated SQL: a quote in datafolder no longer
                // breaks (or alters) the statement, and one statement is reused for all rows.
                java.sql.PreparedStatement update = con.prepareStatement(
                        "update html5repo set distance=?,base=?,found=?, expected=? where datafolder=?")) {
            while (rs.next()) {
                String s1 = encodeDescriptor(rs.getString("descriptor"));
                String s2 = encodeDescriptor(rs.getString("descriptorbom"));

                int ed = StringUtils.getLevenshteinDistance(s1, s2);
                // base = maximum possible distance
                int edtotal = Math.max(s1.length(), s2.length());

                Set<Character> set1 = distinctChars(s1);
                Set<Character> set2 = distinctChars(s2);
                Set<Character> common = new HashSet<Character>(set1);
                common.retainAll(set2);
                int inter = common.size();
                int total = set1.size();

                String datafolder = rs.getString("datafolder");

                System.out.println(set1);
                System.out.println(set2);
                System.out.println(s1);
                System.out.println(s2);
                System.out.println(rs.getString("id") + ". " + datafolder + "=" + ed + "/" + edtotal
                        + "=" + ((double) ed / edtotal) + "," + inter + " of " + total + " Prec:("
                        + ((double) inter / total) + ")");

                // Values are bound as strings, matching the quoted literals of the former SQL text.
                update.setString(1, String.valueOf(ed));
                update.setString(2, String.valueOf(edtotal));
                update.setString(3, String.valueOf(inter));
                update.setString(4, String.valueOf(total));
                update.setString(5, datafolder);
                update.execute();

                File f = new File("/home/sanojaa/Documents/00_Tesis/work/dataset/dataset/data/"
                        + datafolder + "/" + datafolder + ".5.html");
                // FileOutputStream creates the file if needed; the stream is closed even when
                // the write fails.
                // NOTE(review): getBytes() uses the platform default charset - confirm that is intended.
                try (FileOutputStream fop = new FileOutputStream(f)) {
                    fop.write(rs.getString("src").getBytes());
                }
            }
        }
    } catch (SQLException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(ProcessHTML5.class.getName()).log(Level.SEVERE, null, ex);
    }
}

/**
 * Reduces a comma-separated list of {@code key=TYPE} entries to one letter per entry:
 * SECTION=S, ARTICLE=A, ASIDE=D, HEADER=H, FOOTER=F, NAV=N. Entries whose key is
 * {@code PAGE}, entries without a {@code =} and entries of any other type contribute nothing.
 */
private static String encodeDescriptor(String descriptor) {
    StringBuilder code = new StringBuilder();
    if (descriptor == null) {
        return code.toString();
    }
    for (String entry : descriptor.split(",")) {
        String[] part = entry.split("=");
        if (part.length < 2 || part[0].equals("PAGE")) {
            continue;
        }
        switch (part[1]) {
        case "SECTION":
            code.append('S');
            break;
        case "ARTICLE":
            code.append('A');
            break;
        case "ASIDE":
            code.append('D');
            break;
        case "HEADER":
            code.append('H');
            break;
        case "FOOTER":
            code.append('F');
            break;
        case "NAV":
            code.append('N');
            break;
        default:
            break;
        }
    }
    return code.toString();
}

/** Returns the set of distinct characters occurring in {@code s}. */
private static Set<Character> distinctChars(String s) {
    Set<Character> chars = new HashSet<Character>();
    for (int i = 0; i < s.length(); i++) {
        chars.add(s.charAt(i));
    }
    return chars;
}

From source file:com.omertron.themoviedbapi.Compare.java

/**
 * Compare the Levenshtein Distance between the two strings
 *
 * @param title1/* www . j ava  2s .com*/
 * @param title2
 * @param distance
 */
private static boolean compareDistance(final String title1, final String title2, int distance) {
    return StringUtils.getLevenshteinDistance(title1, title2) <= distance;
}

From source file:deprecate.compare.TokenizerJava_old.java

/**
 * Computes the similarity between two strings as a percentage.
 *
 * <p>The Levenshtein distance is scaled by the combined length of both strings, so two
 * equally long strings that differ in every position (for example "fly" and "ant") score 50,
 * not 0.</p>
 *
 * @param s0 first string
 * @param s1 second string
 * @return the similarity in percent, truncated to an int; 100 for identical strings,
 *         including two empty strings
 */
public static int percentSimilar(String s0, String s1) {
    final int distance = StringUtils.getLevenshteinDistance(s0, s1);
    final int combinedLength = s0.length() + s1.length();
    if (combinedLength == 0) {
        // Both strings are empty. The division below would be 0/0 = NaN, which the int cast
        // turns into 0 - i.e. two identical strings would be reported as 0% similar.
        return 100;
    }
    return (int) (100 - (float) distance * 100 / (float) combinedLength);
}

From source file:model.SongMeaningsScraper.java

/**
 * Looks up {@code title} among the known songs of {@code artist}.
 *
 * <p>Each known song name is compared case-insensitively by Levenshtein distance. A name with
 * distance 0 is returned immediately. A name whose similarity ratio
 * {@code (length - distance) / length} is at least 0.5 is recorded as a candidate under the
 * key {@code artist + " " + title} in {@code DataManager.getSongMatches()}. Candidates
 * recorded before an exact match is found stay recorded.</p>
 *
 * @param artist artist whose song list is searched
 * @param title  song title to validate
 * @return the exactly matching song name, or an empty string if there is none
 *         (also when the artist is unknown)
 */
private static String validateSong(String artist, String title) {
    HashMap<String, String> songList = DataManager.getSongMap().get(artist);
    if (songList == null) {
        // Unknown artist: there is nothing to compare against. Previously this ended in a
        // NullPointerException on songList.keySet().
        return "";
    }

    final String matchKey = artist + " " + title;
    for (String songFromMap : songList.keySet()) {
        int levDist = StringUtils.getLevenshteinDistance(songFromMap.toUpperCase(), title.toUpperCase());
        double ratio = (songFromMap.length() - levDist) / (double) songFromMap.length();
        if (ratio == 1.0) {
            Logger.LogToStatusBar(songFromMap + " exactly matches");
            return songFromMap;
        } else if (ratio >= 0.5) {
            ArrayList<String> matches = DataManager.getSongMatches().get(matchKey);
            if (matches == null) {
                matches = new ArrayList<String>();
                DataManager.getSongMatches().put(matchKey, matches);
            }
            // The list is held by reference, so adding to it is enough; removing and
            // re-inserting the map entry is not needed.
            matches.add(songFromMap);
        }
    }
    return "";
}

From source file:annis.gui.flatquerybuilder.SpanBox.java

/**
 * Refills the combo box with the annotation names closest to the typed text.
 *
 * Inputs of fewer than two characters are ignored. Otherwise all names are grouped by their
 * accent-insensitive Levenshtein distance to the input, and the groups with a distance below 5
 * are added in order of increasing distance, each group sorted case-insensitively.
 */
@Override
public void textChange(FieldEvents.TextChangeEvent event) {
    final String typed = event.getText();
    if (typed.length() <= 1) {
        return;
    }

    cb.removeAllItems();

    // distance -> names at that distance
    final HashMap<Integer, Set<String>> byDistance = new HashMap<Integer, Set<String>>();
    for (String candidate : annonames) {
        final Integer dist = StringUtils.getLevenshteinDistance(removeAccents(typed), removeAccents(candidate));
        Set<String> bucket = byDistance.get(dist);
        if (bucket == null) {
            bucket = new TreeSet<String>();
            byDistance.put(dist, bucket);
        }
        bucket.add(candidate);
    }

    final SortedSet<Integer> distances = new TreeSet<Integer>(byDistance.keySet());
    for (Integer dist : distances.subSet(0, 5)) {
        final List<String> names = new ArrayList<String>(byDistance.get(dist));
        Collections.sort(names, String.CASE_INSENSITIVE_ORDER);
        for (String name : names) {
            cb.addItem(name);
        }
    }
}

From source file:com.streamsets.pipeline.lib.fuzzy.FuzzyMatch.java

/**
 * Converts the Levenshtein distance between two strings into a similarity score:
 * 100 minus the truncated percentage of the distance relative to the longer string's length.
 */
private static int calculateLevenshteinDistance(String s1, String s2) {
    final int edits = StringUtils.getLevenshteinDistance(s1, s2);
    final int longest = Math.max(s1.length(), s2.length());
    final double mismatchFraction = edits / (double) longest;
    final int mismatchPercent = (int) (mismatchFraction * 100);
    return 100 - mismatchPercent;
}

From source file:name.martingeisse.common.util.Month.java

/**
 * Looks up a month by name with fuzzy matching based on the Levenshtein distance.
 *
 * The distance between the argument and the long name of every month is computed.
 * If even the smallest distance exceeds {@code maxDistance}, there is no match and
 * null is returned. If exactly one month has the smallest distance, that month is
 * returned. If several months share the smallest distance, the name is ambiguous
 * and an {@link AmbiguousFuzzyMatchException} is thrown.
 *
 * @param name the name to find. Must not be null.
 * @param maxDistance the maximum distance to accept a match.
 * @return returns the matching month with shortest distance, or null if none matches
 * @throws AmbiguousFuzzyMatchException if the shortest distance is shared by more than one month
 */
public static Month fuzzyFind(String name, int maxDistance) {
    if (name == null) {
        throw new IllegalArgumentException("name argument is null");
    }

    Month closest = null;
    int closestDistance = Integer.MAX_VALUE;
    boolean tied = false;

    for (Month candidate : values()) {
        final int candidateDistance = StringUtils.getLevenshteinDistance(candidate.getLongName(), name);
        if (candidateDistance > closestDistance) {
            // Worse than the best so far: irrelevant
            continue;
        }
        // Equal to the best so far means a tie; strictly better starts over with a new best
        tied = (candidateDistance == closestDistance);
        if (!tied) {
            closestDistance = candidateDistance;
            closest = candidate;
        }
    }

    if (closestDistance > maxDistance) {
        return null;
    }
    if (tied) {
        throw new AmbiguousFuzzyMatchException();
    }
    return closest;
}