List of usage examples for org.apache.commons.lang3 StringUtils getLevenshteinDistance
public static int getLevenshteinDistance(CharSequence s, CharSequence t)
From source file:annis.gui.flatquerybuilder.ValueField.java
@Override public void textChange(TextChangeEvent event) { ReducingStringComparator rsc = sq.getRSC(); String fm = sq.getFilterMechanism(); if (!"generic".equals(fm)) { ConcurrentSkipListSet<String> notInYet = new ConcurrentSkipListSet<>(); String txt = event.getText(); if (!txt.equals("")) { scb.removeAllItems();//from w w w . j a v a 2s.com for (Iterator<String> it = values.keySet().iterator(); it.hasNext();) { String s = it.next(); if (rsc.compare(s, txt, fm) == 0) { scb.addItem(s); } else { notInYet.add(s); } } //startsWith for (String s : notInYet) { if (rsc.startsWith(s, txt, fm)) { scb.addItem(s); notInYet.remove(s); } } //contains for (String s : notInYet) { if (rsc.contains(s, txt, fm)) { scb.addItem(s); } } } else { buildValues(this.vm); } } else { String txt = event.getText(); HashMap<Integer, Collection> levdistvals = new HashMap<>(); if (txt.length() > 1) { scb.removeAllItems(); for (String s : values.keySet()) { Integer d = StringUtils.getLevenshteinDistance(removeAccents(txt).toLowerCase(), removeAccents(s).toLowerCase()); if (levdistvals.containsKey(d)) { levdistvals.get(d).add(s); } if (!levdistvals.containsKey(d)) { Set<String> newc = new TreeSet<>(); newc.add(s); levdistvals.put(d, newc); } } SortedSet<Integer> keys = new TreeSet<>(levdistvals.keySet()); for (Integer k : keys.subSet(0, 10)) { List<String> valueList = new ArrayList(levdistvals.get(k)); Collections.sort(valueList, String.CASE_INSENSITIVE_ORDER); for (String v : valueList) { scb.addItem(v); } } } } }
From source file:model.SongMeaningsScraper.java
public static String validateArtist(String artist) { HashMap<String, String> artists = DataManager.getArtistMap(); for (String artistFromMap : artists.keySet()) { int levDist = StringUtils.getLevenshteinDistance(artistFromMap.toUpperCase(), artist.toUpperCase()); double ratio = (artistFromMap.length() - levDist + 0.0) / (artistFromMap.length() + 0.0); if (ratio == 1.0) { Logger.LogToStatusBar(artistFromMap + " exactly matches"); return artistFromMap; } else if (ratio >= 0.5) { ArrayList<String> matches = DataManager.getArtistMatches().get(artist); if (matches == null) { matches = new ArrayList<String>(); matches.add(artistFromMap); DataManager.getArtistMatches().put(artist, matches); } else { matches.add(artistFromMap); DataManager.getArtistMatches().remove(artist); DataManager.getArtistMatches().put(artist, matches); }//from ww w .j a v a2 s . co m } } return ""; }
From source file:annis.gui.flatquerybuilder.SearchBox.java
@Override public void textChange(TextChangeEvent event) { if ("specific".equals(sq.getFilterMechanism())) { ConcurrentSkipListSet<String> notInYet = new ConcurrentSkipListSet<String>(); reducingStringComparator esc = new reducingStringComparator(); String txt = event.getText(); if (!txt.equals("")) { cb.removeAllItems();/*w ww .j a v a 2 s. com*/ for (Iterator<String> it = annonames.iterator(); it.hasNext();) { String s = it.next(); if (esc.compare(s, txt) == 0) { cb.addItem(s); } else { notInYet.add(s); } } //startsWith for (String s : notInYet) { if (esc.startsWith(s, txt)) { cb.addItem(s); notInYet.remove(s); } } //contains for (String s : notInYet) { if (esc.contains(s, txt)) { cb.addItem(s); } } } else { //have a look and speed it up SpanBox.buildBoxValues(cb, ebene, sq); } } if ("levenshtein".equals(sq.getFilterMechanism())) { String txt = event.getText(); HashMap<Integer, Collection> levdistvals = new HashMap<Integer, Collection>(); if (txt.length() > 1) { cb.removeAllItems(); for (String s : annonames) { Integer d = StringUtils.getLevenshteinDistance(removeAccents(txt), removeAccents(s)); if (levdistvals.containsKey(d)) { levdistvals.get(d).add(s); } if (!levdistvals.containsKey(d)) { Set<String> newc = new TreeSet<String>(); newc.add(s); levdistvals.put(d, newc); } } SortedSet<Integer> keys = new TreeSet<Integer>(levdistvals.keySet()); for (Integer k : keys.subSet(0, 5)) { List<String> values = new ArrayList(levdistvals.get(k)); Collections.sort(values, String.CASE_INSENSITIVE_ORDER); for (String v : values) { cb.addItem(v); } } } } }
From source file:edu.jhuapl.dorset.agents.StockAgent.java
/**
 * Resolves a company name to its {@code CompanyInfo}.
 *
 * <p>An exact key hit in {@code stockSymbolMap} wins. Otherwise every key that contains
 * the name as a whole word (case-insensitive) is a candidate, and the candidate with the
 * smallest Levenshtein distance to the name is returned.
 *
 * <p>The name is matched literally: it is escaped with {@link Pattern#quote(String)}
 * before being embedded in the regular expression, so names containing regex
 * metacharacters (for example {@code "E*TRADE"} or {@code "Amazon.com"}) neither throw
 * a {@code PatternSyntaxException} nor match unintended keys.
 *
 * @param stockCompanyName the company name to look up
 * @return the matching company info, or {@code null} when nothing matches
 */
protected CompanyInfo findStockSymbol(String stockCompanyName) {
    CompanyInfo exactMatch = this.stockSymbolMap.get(stockCompanyName);
    if (exactMatch != null) {
        return exactMatch;
    }
    if (stockCompanyName == null) {
        // Nothing sensible to search for (and Pattern.quote would reject null).
        return null;
    }

    Pattern wholeWord = Pattern.compile("\\b" + Pattern.quote(stockCompanyName) + "\\b",
            Pattern.CASE_INSENSITIVE);

    // Track the closest whole-word match in a single pass; the first key seen wins ties.
    CompanyInfo closest = null;
    int closestDistance = Integer.MAX_VALUE;
    for (Map.Entry<String, CompanyInfo> entry : stockSymbolMap.entrySet()) {
        String key = entry.getKey();
        if (!wholeWord.matcher(key).find()) {
            continue;
        }
        int distance = StringUtils.getLevenshteinDistance(key, stockCompanyName);
        if (distance < closestDistance) {
            closestDistance = distance;
            closest = entry.getValue();
        }
    }
    return closest;
}
From source file:com.epam.ta.reportportal.util.analyzer.IssuesAnalyzerService.java
@Override public void analyze(String launchId, List<TestItem> resources, List<TestItem> scope) { for (TestItem current : resources) { List<Double> curRate = Lists.newArrayList(); TestItemIssue issue = null;// w w w . ja v a 2s .co m boolean isInvestigated = false; List<Log> curItemErr = logRepository.findTestItemErrorLogs(current.getId()); Launch launch = launchRepository.findOne(current.getLaunchRef()); Project project = projectRepository.findOne(launch.getProjectRef()); for (TestItem item : scope) { /* * Avoid comparison with itself as investigated item during * in_progress launch. Cause manually investigated item will be * included in history of current one. */ if (item.getId().equalsIgnoreCase(current.getId())) continue; List<Log> errors = logRepository.findTestItemErrorLogs(item.getId()); if (errors.size() == curItemErr.size()) { for (int i = 0; i < curItemErr.size(); i++) { String curMsg = curItemErr.get(i).getLogMsg().replaceAll("\\d+", "") .replaceAll("\\s(at)\\s", ""); String scopeMsg = errors.get(i).getLogMsg().replaceAll("\\d+", "").replaceAll("\\s(at)\\s", ""); /* * Get Levenshtein distance for two comparing log * strings */ int maxString = Math.max(curMsg.length(), scopeMsg.length()); int diff = StringUtils.getLevenshteinDistance(curMsg, scopeMsg); /* * Store percentage of equality */ curRate.add(((double) (maxString - diff)) / maxString * 100); } } if (!curRate.isEmpty() && (this.mathMiddle(curRate) >= acceptRate)) { isInvestigated = true; issue = item.getIssue(); /* Stop looping cause acceptable item found already. */ break; } else curRate.clear(); } if (isInvestigated) { TestItemIssue currentIssue = current.getIssue(); /* If item was investigated till Launch finished. 
*/ if ((null != currentIssue.getExternalSystemIssues()) || (!currentIssue.getIssueType() .equalsIgnoreCase(TestItemIssueType.TO_INVESTIGATE.getLocator())) || (null != currentIssue.getIssueDescription())) { currentIssue.setIssueDescription( this.suggest(currentIssue.getIssueDescription(), issue, project.getConfiguration())); current.setIssue(currentIssue); testItemRepository.save(current); /* If system investigate item from scratch */ } else { issue.setIssueDescription(this.mark(issue.getIssueDescription())); current = statisticsFacadeFactory .getStatisticsFacade(project.getConfiguration().getStatisticsCalculationStrategy()) .resetIssueStatistics(current); current.setIssue(issue); testItemRepository.save(current); statisticsFacadeFactory .getStatisticsFacade(project.getConfiguration().getStatisticsCalculationStrategy()) .updateIssueStatistics(current); } } } analyzeFinished(launchId); }
From source file:controllers.WidgetAdmin.java
/** Password policy expression, compiled once instead of on every call. */
private static final Pattern PASSWORD_POLICY_PATTERN = Pattern
        .compile("(?=^.{8,}$)((?=.*\\d)|(?=.*\\W+))(?![.\\n])(?=.*[A-Z])(?=.*[a-z]).*$");

/**
 * Checks a password against the strength rules and reports the first violated rule.
 *
 * <p>Rules, in order: at least 8 characters; must satisfy the policy expression
 * (see the review note in the body); at least 3 distinct characters ignoring case;
 * Levenshtein distance of at least 5 to both the part of the email before the first
 * {@code '@'} and the part after it.
 *
 * <p>An email that is {@code null} or has no domain part no longer causes a
 * {@code NullPointerException} / {@code ArrayIndexOutOfBoundsException}; only the
 * parts that are actually present are compared.
 *
 * @param password the candidate password, may be {@code null}
 * @param email    the account email, may be {@code null}
 * @return a human readable rejection reason, or {@code null} if the password is accepted
 */
private static String isPasswordStrongEnough(String password, String email) {
    if (StringUtils.length(password) < 8) {
        return "Password is too short";
    }

    // NOTE(review): as written, a password that fails the policy expression is still
    // let through here when the email contains it; behavior kept as is - confirm intent.
    if (!PASSWORD_POLICY_PATTERN.matcher(password).matches()
            && !StringUtils.containsIgnoreCase(email, password)) {
        return "Password must match requirements";
    }

    Set<String> distinctChars = new HashSet<String>();
    for (String s : password.split("")) {
        // split("") may yield an empty leading element, which must not be counted
        if (StringUtils.length(s) > 0) {
            distinctChars.add(s.toLowerCase());
        }
    }
    if (distinctChars.size() < 3) {
        return "Too many repeating letters";
    }

    // Compare against at most the first two '@'-separated parts, as before, but without
    // assuming that both of them exist.
    String[] emailParts = (email == null) ? new String[0] : email.split("@");
    int partsToCheck = Math.min(emailParts.length, 2);
    for (int i = 0; i < partsToCheck; i++) {
        if (StringUtils.getLevenshteinDistance(password, emailParts[i]) < 5) {
            return "Password similar to email";
        }
    }
    return null;
}
From source file:net.samuelbjohnson.javadev.crosstopix.Joiner.java
/**
 * Computes the Levenshtein distance between the two strings.
 *
 * @param string1 first string
 * @param string2 second string
 * @return the edit distance wrapped in a {@code BigDecimal}
 */
protected BigDecimal computeDistance(String string1, String string2) {
    int editDistance = StringUtils.getLevenshteinDistance(string1, string2);
    return new BigDecimal(editDistance);
}
From source file:airportApp.Query.java
/** * This method suggests the closest matching word and country code. * /*www . ja v a 2s . co m*/ * The matching is done based on the Levenshtein distance: the minimum number * of single-character edits (i.e. insertions, deletions or substitutions) * required to change one word into the other. A disadvantage of the Levenshtein * distance is that any change (edit) has an equal influence on the matching * distance, whereas an algorithm that penalizes the different errors in a * different degree (i.e. insertions penalty = 1; deletions penalty = 2; * substitutions penalty = 3) can produce more accurate results in some cases. * * To demonstrate the above statement, let us consider the input "Zimb" compared * to the two strings "Zimbabwe" and "Fiji". * The Levenshtein distance between "Zimb" and "Zimbabwe" is 4; while between * "Zimb" and "Fiji" it is 3. Therefore, "Fiji" is the closest match. However, * if the input is "Zimbab", the distance between "Zimbab" and "Zimbabwe" is * 2; while between "Zimbab" and "Fiji" is 5. Therefore, "Zimbabwe" is the * closest match. * * The Levenshtein distance is not necessarily a bad similarity test. It all * depends on our preferences of how we would like to do the comparisons. In * other cases than the one mentioned above, the Levenshtein distance can * produce more accurate results. There are also many other similarity matching * algorithms (some based on longest common subsequence). * * Attention! This method uses a third-party library from commons.apache.org. * Make sure that the library is added to the compilation libraries of your * IDE. The library's Levenshtein algorithm is updated with a newer version * (by Chas Emerick) that avoids OutOfMemory errors that can occur for very * large Strings. The library also contains implementations for finding the * longest common prefix, the Jaro-Winkler distance and another Fuzzy distance. 
* So they are all alternatives, if one is not satisfied with the Levenshtein's * one, and can also possibly be combined in different ways. Assuming the user * does not misspell the beginning of the country, an algorithm counting the * length of the longest common prefix towards the length of the larger * sequence (a check larger or equal to 25% ) can precede a check for the * Levenshtein's distance. * * @param country User input for which a match will be returned. * @return Returns a Suggestion object (the closest match to the input * argument and its associated country code) or null , if the match fails * for any reason. */ private static Suggestion suggestCountry(String country) { Suggestion suggestion = null; String suggestionMatch = ""; String suggestionCode = ""; int suggestionDistance = -1; try { countriesReader = new BufferedReader(new FileReader("resources/countries.csv")); String line; // pointer (line reader) used with the countries countriesReader.readLine(); // skip first line (column names) while ((line = countriesReader.readLine()) != null) { Country c = Utils.readCountry(line); // use the smaller distance of code name and country name int codeDistance = StringUtils.getLevenshteinDistance(country, c.getCode()); int nameDistance = StringUtils.getLevenshteinDistance(country, c.getName()); int distance = ((codeDistance < nameDistance) ? codeDistance : nameDistance); // keep track of the associated match String match = ((codeDistance < nameDistance) ? 
c.getCode() : c.getName()); // keep track of the associated country code String code = c.getCode(); // for the first country just overwrite global values if (suggestionDistance == -1) { suggestionDistance = distance; suggestionMatch = match; suggestionCode = code; } else { // for the rest use the global comaprison if (distance < suggestionDistance) { suggestionDistance = distance; suggestionMatch = match; suggestionCode = code; } } suggestion = new Suggestion(suggestionMatch, suggestionCode); } } catch (IllegalArgumentException ex) { System.err.println("Error: " + ex.getMessage()); } catch (FileNotFoundException ex) { System.err.println("Error: " + ex.getMessage()); } catch (IOException ex) { System.err.println("Error: " + ex.getMessage()); } return suggestion; }
From source file:bear.plugins.groovy.GroovyCodeCompleter.java
public Replacements completeCode(String script, int position) { int[] range = scanForStart(script, position, -1); int start = range[0]; int end = range[1] + 1; boolean endsWithDot = script.charAt(firstNonSpace(script, Math.min(end, script.length() - 1), -1)) == '.'; List<Token> tokens = tokenize(script, start, end); //fo| => size 1 //foo. => size 1 Class<?> firstTokenClass; if (tokens.size() == 1 && !endsWithDot) { firstTokenClass = null;/*w w w . j a va 2 s. c o m*/ //match vars from binding, there should be a few Set<Map.Entry<String, ?>> entries = binding.getVariables().entrySet(); List<Candidate> candidates = new ArrayList<Candidate>(); for (Map.Entry<String, ?> entry : entries) { String varName = entry.getKey(); char[] chars = tokens.get(0).name.toCharArray(); int score = 0; for (int i = 0; i < chars.length; i++) { score += frequency(varName, chars[i]) * i == 0 ? 3 : (i == 1 ? 2 : 1); } if (score > 0) { candidates.add(new Candidate( new Replacement(varName, entry.getValue().getClass().getSimpleName()), score)); } } Collections.sort(candidates); return new Replacements(start, end).addAll(candidates); } else { Object variable = shell.getVariable(tokens.get(0).name); firstTokenClass = variable == null ? null : variable.getClass(); } List<Class<?>> currentClasses = firstTokenClass == null ? 
new ArrayList<Class<?>>() : Lists.<Class<?>>newArrayList(firstTokenClass); for (int i = 1; i < tokens.size(); i++) { Token token = tokens.get(i); boolean lastToken = i == tokens.size() - 1; if (lastToken && !endsWithDot) { break; } //strict match List<Class<?>> returnTypes = new ArrayList<Class<?>>(); if (token.method) { for (Class<?> currentClass : currentClasses) { for (MethodDesc method : OpenBean.methods(currentClass)) { if (method.getName().equals(token.name)) { returnTypes.add(method.getMethod().getReturnType()); } } } } else { for (Class<?> currentClass : currentClasses) { for (Field field : OpenBean.fields(currentClass)) { if (field.getName().equals(token.name)) { returnTypes.add(field.getType()); } } } } if (returnTypes.size() > 1) { currentClasses = Lists.newArrayList(new LinkedHashSet<Class<?>>(returnTypes)); } else { currentClasses = returnTypes; } } String pattern = null; if (!endsWithDot) { pattern = tokens.get(tokens.size() - 1).name.toLowerCase(); } Replacements replacements = endsWithDot ? 
new Replacements(position, position) : new Replacements(tokens.get(tokens.size() - 1).start, position); if (endsWithDot) { for (Class<?> currentClass : currentClasses) { for (Field field : OpenBean.fields(currentClass)) { replacements.add(new Replacement(field)); } for (MethodDesc method : OpenBean.methods(currentClass)) { replacements.add(new Replacement(method)); } } } else { final String patternLC = pattern.toLowerCase(); Set<Field> usedFields = new HashSet<Field>(); Set<Method> usedMethods = new HashSet<Method>(); List<Candidate> candidates = new ArrayList<Candidate>(); { int score = 10000; for (Class<?> currentClass : currentClasses) { for (Field field : OpenBean.fields(currentClass)) { if (field.getName().toLowerCase().startsWith(patternLC)) { usedFields.add(field); candidates.add(new Candidate(new Replacement(field), score--)); } } for (MethodDesc method : OpenBean.methods(currentClass)) { if (method.getName().toLowerCase().startsWith(patternLC)) { usedMethods.add(method.getMethod()); candidates.add(new Candidate(new Replacement(method), score--)); } } } } Collections.sort(candidates, new Comparator<Candidate>() { @Override public int compare(Candidate o1, Candidate o2) { int d1 = StringUtils.getLevenshteinDistance(o1.r.name.toLowerCase(), patternLC); int d2 = StringUtils.getLevenshteinDistance(o2.r.name.toLowerCase(), patternLC, d1 + 1); return Integer.compare(d1, d2); } }); for (int i = 0; i < candidates.size(); i++) { Candidate candidate = candidates.get(i); candidate.score = 10000 - i; } char[] chars = pattern.toCharArray(); for (Class<?> currentClass : currentClasses) { for (Field field : OpenBean.fields(currentClass)) { if (usedFields.contains(field)) continue; int r = 0; for (char aChar : chars) { r += frequency(field.getName(), aChar); } if (r > 0) candidates.add(new Candidate(new Replacement(field), r)); } for (MethodDesc method : OpenBean.methods(currentClass)) { if (usedMethods.contains(method.getMethod())) continue; int r = 0; for (char aChar : 
chars) { r += frequency(method.getName(), aChar); } if (r > 0) candidates.add(new Candidate(new Replacement(method), r)); } } Collections.sort(candidates); replacements.addAll(candidates); } return replacements; }
From source file:edu.harvard.mcz.nametools.AuthorNameComparator.java
/** * Return a measure of the similarity between two strings in the range of * 0 (no similarity) to 1 (exact same strings), using a measure of the * string edit distance scaled to the length differences of the two strings. * //from w ww.ja v a2 s .c o m * @param string1 one string for comparison * @param string2 the string to compare with string1 * @return a double in the range 0 to 1. */ public static double stringSimilarity(String string1, String string2) { double result = 0d; String longer = string1; String shorter = string2; if (string1.length() < string2.length()) { // flip so that longer string is the longest. longer = string2; shorter = string1; } if (longer.length() == 0) { result = 1.0; } else { result = (longer.length() - StringUtils.getLevenshteinDistance(longer, shorter)) / (double) longer.length(); } return result; }