List of usage examples for org.apache.commons.lang3 StringUtils getLevenshteinDistance
public static int getLevenshteinDistance(CharSequence s, CharSequence t)
From source file:at.jps.sanction.core.util.TokenTool.java
public static float compareCheck(final List<String> textTokens, final String text, final boolean fuzzy, final int minlen) { float deltaValue = 100; float percentHitrate = 0; // +/- fuzzy string compare if (textTokens.size() > 0) { for (final String token : textTokens) { if (token.length() >= minlen) { if (fuzzy) { final float lsHitValue = StringUtils.getLevenshteinDistance(text, token); if (lsHitValue < deltaValue) { deltaValue = lsHitValue; }/*ww w.ja v a 2 s . c o m*/ } else { deltaValue = (text.equals(token) ? 0 : 100); } } } percentHitrate = 100 - ((100 / ((float) (text.length()))) * deltaValue); } return percentHitrate; }
From source file:GIST.IzbirkomExtractor.TableExtractor.java
public void processHTMLfile(File input_html) throws IOException, TableExtractorException, CloneNotSupportedException, SQLException, ResultSinkException { logger.info("Start processing " + input_html); Document doc = Jsoup.parse(input_html, "UTF-8"); Elements tables = doc.getElementsByTag("table"); /* count of parseable tables found */ int tables_found = 0; /* determine raion name */ String raion_name = extractRaionFromFileName(input_html.getName()); //System.err.println(raion_name); // TODO: inflect raion name in case /* searches for a table that has " . -" in its very 1st cell */ for (Element table : tables) { Elements rows = table.getElementsByTag("tr"); boolean firstRow = true; row_loop: for (Element row : rows) { Elements cells = row.getElementsByTag("td"); if (firstRow) { //System.err.println(row.text()); if (isParsableTable(row)) { firstRow = false; logger.info("Processing table #" + ++tables_found + " in " + input_html); } else break row_loop; }/*from ww w.j av a 2s. co m*/ if (StringUtils.getLevenshteinDistance(cleanupUNICODE(cells.first().text()), " . -") < 3) continue row_loop; /* skip the row if it looks like a table header */ /* skip rows with all cells empty */ boolean emptyRow = true; for (Element cell : cells) emptyRow = emptyRow && cleanupUNICODE(cell.text()).isEmpty(); if (emptyRow) continue; int i_cell = 0; Element station_id = null; Element address_field = null; Element org_address = null; /* address of the ??? 
*/ Element station_address = null; for (Element cell : cells) { switch (i_cell) { case 0: station_id = cell; break; case 1: address_field = cell; break; case 2: org_address = cell; break; case 3: station_address = cell; default: break; } i_cell++; } if (station_id == null) throw new TableExtractorException("Polling station ID not found", row, input_html); if (address_field == null) throw new TableExtractorException("Address list not found", row, input_html); /* extract int from poll station id */ int psid; try { psid = Integer.valueOf(cleanupUNICODE(station_id.text()).trim().replaceAll("[^\\d]", "")); } catch (NumberFormatException e) { Exception te = new TableExtractorException("Failed to parse polling station ID >" + cleanupUNICODE(station_id.text()).trim() + "<: ", station_id, input_html); logger.severe(te.getMessage() + "; rest of " + input_html + " ignored."); return; } /* extraction from HTML completely finished, now we work only with the addresses in the text form */ extractAddressesFromText(raion_name.trim(), psid, cleanLeftoverHTML(address_field), cleanLeftoverHTML(org_address), cleanLeftoverHTML(station_address)); } } if (tables_found == 0) logger.severe("No parsable tables found in " + input_html); resultSink.commit(); logger.info("" + tables_found + " table(s) processed in " + input_html); }
From source file:com.omertron.themoviedbapi.CompareTest.java
/** * Close match//w ww . j ava 2 s . c o m */ @Test public void testCloseMatch() { int maxDistance = 6; boolean result; String closeMain = "bloderannar"; String closeOther = "Blade Runner Dir Cut"; // Make sure they are close enough int currentDistance; currentDistance = StringUtils.getLevenshteinDistance(TITLE_MAIN, closeMain); LOG.info("Distance between '{}' and '{}' is {}", TITLE_MAIN, closeMain, currentDistance); assertTrue(currentDistance <= maxDistance); currentDistance = StringUtils.getLevenshteinDistance(TITLE_OTHER, closeOther); LOG.info("Distance between '{}' and '{}' is {}", TITLE_OTHER, closeOther, currentDistance); assertTrue(currentDistance <= maxDistance); result = Compare.movies(moviedb, closeMain, YEAR_SHORT, maxDistance, CASE_SENSITIVE); assertTrue(result); result = Compare.movies(moviedb, closeOther, YEAR_SHORT, maxDistance, CASE_SENSITIVE); assertTrue(result); result = Compare.movies(moviedb, closeMain, YEAR_SHORT, maxDistance, NOT_CASE_SENSITIVE); assertTrue(result); result = Compare.movies(moviedb, closeOther, YEAR_SHORT, maxDistance, NOT_CASE_SENSITIVE); assertTrue(result); result = Compare.movies(moviedb, closeMain, "", maxDistance, CASE_SENSITIVE); assertTrue(result); result = Compare.movies(moviedb, closeOther, "", maxDistance, CASE_SENSITIVE); assertTrue(result); result = Compare.movies(moviedb, closeMain, "", maxDistance, NOT_CASE_SENSITIVE); assertTrue(result); result = Compare.movies(moviedb, closeOther, "", maxDistance, NOT_CASE_SENSITIVE); assertTrue(result); }
From source file:fr.lip6.segmentations.ProcessHTML5.java
public void run() { //ArrayList<String> s1 = new ArrayList<String>(); //ArrayList<String> s2= new ArrayList<String>();; String s1 = ""; String s2 = ""; try {//from w w w. ja va2s.c om Class.forName("com.mysql.jdbc.Driver"); Connection con = DriverManager.getConnection( "jdbc:mysql://" + Config.mysqlHost + "/" + Config.mysqlDatabase + "", Config.mysqlUser, Config.mysqlPassword); Statement st2 = con.createStatement(); ResultSet rs = st2.executeQuery("select * from html5repo where descriptorbom<>''"); while (rs.next()) { s1 = ""; s2 = ""; String d1 = rs.getString("descriptor"); String d2 = rs.getString("descriptorbom"); int dsize = d1.split(",").length; int d2size = d2.split(",").length; for (String s : d1.split(",")) { String[] part = s.split("="); if (!part[0].equals("PAGE")) { if (part[1].equals("SECTION")) s1 += "S"; if (part[1].equals("ARTICLE")) s1 += "A"; if (part[1].equals("ASIDE")) s1 += "D"; if (part[1].equals("HEADER")) s1 += "H"; if (part[1].equals("FOOTER")) s1 += "F"; if (part[1].equals("NAV")) s1 += "N"; } } for (String s : d2.split(",")) { String[] part = s.split("="); if (!part[0].equals("PAGE")) { if (part[1].equals("SECTION")) s2 += "S"; if (part[1].equals("ARTICLE")) s2 += "A"; if (part[1].equals("ASIDE")) s2 += "D"; if (part[1].equals("HEADER")) s2 += "H"; if (part[1].equals("FOOTER")) s2 += "F"; if (part[1].equals("NAV")) s2 += "N"; } } int ed = StringUtils.getLevenshteinDistance(s1.toString(), s2.toString()); int edtotal = Math.max(s1.length(), s2.length()); HashSet<Character> h1 = new HashSet<Character>(), h2 = new HashSet<Character>(); for (int i = 0; i < s1.length(); i++) { h1.add(s1.charAt(i)); } for (int i = 0; i < s2.length(); i++) { h2.add(s2.charAt(i)); } h1.retainAll(h2); int inter = h1.size(); char[] code1 = s1.toCharArray(); char[] code2 = s2.toCharArray(); Set set1 = new HashSet(); for (char c : code1) { set1.add(c); } Set set2 = new HashSet(); for (char c : code2) { set2.add(c); } int total = set1.size(); System.out.println(set1); 
System.out.println(set2); System.out.println(s1); System.out.println(s2); System.out.println(rs.getString("id") + ". " + rs.getString("datafolder") + "=" + ed + "/" + edtotal + "=" + ((double) ed / edtotal) + "," + inter + " of " + total + " Prec:(" + ((double) inter / total) + ")"); Statement st3 = con.createStatement(); //base=distancemax st3.execute("update html5repo set distance='" + ed + "',base='" + edtotal + "',found='" + inter + "', expected='" + total + "' where datafolder='" + rs.getString("datafolder") + "'"); File f = new File("/home/sanojaa/Documents/00_Tesis/work/dataset/dataset/data/" + rs.getString("datafolder") + "/" + rs.getString("datafolder") + ".5.html"); if (!f.exists()) { f.createNewFile(); } FileOutputStream fop = new FileOutputStream(f); fop.write(rs.getString("src").getBytes()); fop.flush(); fop.close(); } } catch (SQLException ex) { Logger.getLogger(SeleniumWrapper.class.getName()).log(Level.SEVERE, null, ex); } catch (ClassNotFoundException ex) { Logger.getLogger(HTML5Bom.class.getName()).log(Level.SEVERE, null, ex); } catch (FileNotFoundException ex) { Logger.getLogger(ProcessHTML5.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException ex) { Logger.getLogger(ProcessHTML5.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:com.omertron.themoviedbapi.Compare.java
/** * Compare the Levenshtein Distance between the two strings * * @param title1/* www . j ava 2s .com*/ * @param title2 * @param distance */ private static boolean compareDistance(final String title1, final String title2, int distance) { return StringUtils.getLevenshteinDistance(title1, title2) <= distance; }
From source file:deprecate.compare.TokenizerJava_old.java
/**
 * Computes the similarity between two strings as a percentage.
 *
 * <p>The Levenshtein distance of the strings is scaled by their combined length and
 * subtracted from 100; the result is truncated to an {@code int}. Two empty strings are
 * identical and therefore yield 100.
 *
 * @param s0 first string; must not be null
 * @param s1 second string; must not be null
 * @return the similarity percentage, 100 for identical strings
 */
public static int percentSimilar(String s0, String s1) {
    int combinedLength = s0.length() + s1.length();
    if (combinedLength == 0) {
        // Without this guard the formula divides 0 by 0, and the NaN is cast to 0.
        return 100;
    }
    int percentage = (int) (100
            - (float) StringUtils.getLevenshteinDistance(s0, s1) * 100 / (float) combinedLength);
    return percentage;
}
From source file:model.SongMeaningsScraper.java
private static String validateSong(String artist, String title) { HashMap<String, String> songList = DataManager.getSongMap().get(artist); for (String songFromMap : songList.keySet()) { int levDist = StringUtils.getLevenshteinDistance(songFromMap.toUpperCase(), title.toUpperCase()); double ratio = (songFromMap.length() - levDist + 0.0) / (songFromMap.length() + 0.0); if (ratio == 1.0) { Logger.LogToStatusBar(songFromMap + " exactly matches"); return songFromMap; } else if (ratio >= 0.5) { ArrayList<String> matches = DataManager.getSongMatches().get(artist + " " + title); if (matches == null) { matches = new ArrayList<String>(); matches.add(songFromMap); DataManager.getSongMatches().put(artist + " " + title, matches); } else { matches.add(songFromMap); DataManager.getSongMatches().remove(artist + " " + title); DataManager.getSongMatches().put(artist + " " + title, matches); }//from w w w . j a va 2 s . c om } } return ""; }
From source file:annis.gui.flatquerybuilder.SpanBox.java
@Override public void textChange(FieldEvents.TextChangeEvent event) { String txt = event.getText(); HashMap<Integer, Collection> levdistvals = new HashMap<Integer, Collection>(); if (txt.length() > 1) { cb.removeAllItems();// www . ja v a2s.co m for (String s : annonames) { Integer d = StringUtils.getLevenshteinDistance(removeAccents(txt), removeAccents(s)); if (levdistvals.containsKey(d)) { levdistvals.get(d).add(s); } if (!levdistvals.containsKey(d)) { Set<String> newc = new TreeSet<String>(); newc.add(s); levdistvals.put(d, newc); } } SortedSet<Integer> keys = new TreeSet<Integer>(levdistvals.keySet()); for (Integer k : keys.subSet(0, 5)) { List<String> values = new ArrayList(levdistvals.get(k)); Collections.sort(values, String.CASE_INSENSITIVE_ORDER); for (String v : values) { cb.addItem(v); } } } }
From source file:com.streamsets.pipeline.lib.fuzzy.FuzzyMatch.java
/**
 * Expresses the Levenshtein distance between two strings as a similarity score.
 *
 * <p>The distance is divided by the length of the longer string, scaled to a percentage,
 * truncated to an {@code int} and subtracted from 100, so identical strings score 100.
 *
 * @param s1 first string
 * @param s2 second string
 * @return 100 minus the truncated percentage of the longer string that had to be edited
 */
private static int calculateLevenshteinDistance(String s1, String s2) {
    final int edits = StringUtils.getLevenshteinDistance(s1, s2);
    final int longest = Math.max(s1.length(), s2.length());
    final double editedFraction = ((double) edits) / longest;
    final int penalty = (int) (editedFraction * 100);
    return 100 - penalty;
}
From source file:name.martingeisse.common.util.Month.java
/**
 * Finds a month by its name, using fuzzy matching (Levenshtein distance).
 *
 * <p>The Levenshtein distance from {@code name} to the long name of every month is
 * determined. If even the smallest distance is greater than {@code maxDistance}, null is
 * returned. If exactly one month has the smallest distance, that month is returned. If
 * several months share the smallest distance, the name is ambiguous and an
 * {@link AmbiguousFuzzyMatchException} is thrown.
 *
 * @param name the name to find. Must not be null.
 * @param maxDistance the maximum distance to accept a match.
 * @return the matching month with shortest distance, or null if none matches
 * @throws IllegalArgumentException if {@code name} is null
 * @throws AmbiguousFuzzyMatchException if the shortest distance is shared by more than one month
 */
public static Month fuzzyFind(String name, int maxDistance) {
    if (name == null) {
        throw new IllegalArgumentException("name argument is null");
    }

    Month best = null;
    int bestDistance = Integer.MAX_VALUE;
    boolean tied = false;

    for (Month candidate : values()) {
        int d = StringUtils.getLevenshteinDistance(candidate.getLongName(), name);
        if (d > bestDistance) {
            continue;
        }
        // Equal distance marks a tie; a strictly smaller one replaces the leader and clears it.
        tied = (d == bestDistance);
        if (!tied) {
            bestDistance = d;
            best = candidate;
        }
    }

    if (bestDistance > maxDistance) {
        return null;
    }
    if (tied) {
        throw new AmbiguousFuzzyMatchException();
    }
    return best;
}