List of usage examples for org.apache.commons.lang3 StringUtils getJaroWinklerDistance
public static double getJaroWinklerDistance(final CharSequence first, final CharSequence second)
Find the Jaro Winkler Distance which indicates the similarity score between two Strings.
The Jaro measure is the weighted sum of the percentage of matched characters from each string and the percentage of transposed characters.
From source file:br.pucminas.ri.jsearch.rest.controller.ApiController.java
private static AutoComplete autocomplete(Request req, Response res, LogController log) { AutoComplete ac = new AutoComplete(new String[] {}); String query = req.queryParams("query"); try {// w ww .jav a 2 s . c om List<Log> logs = log.getLogsByIp(req.ip()); int count = 0; StringList suggestions = new StringList(); for (Log l : logs) { if (StringUtils.getJaroWinklerDistance(query, l.getQuery()) >= 0.80 && !suggestions.contains(l.getQuery()) && count < 5) { suggestions.add(l.getQuery()); count++; } } ac.setSugestions(suggestions.toArray()); } catch (Exception e) { System.err.println(e.getMessage()); } return ac; }
From source file:io.microprofile.showcase.speaker.persistence.SpeakerDAO.java
/** * Really simple fuzzy match using JaroWinklerDistance * * @param left String to match/*w w w. ja va2 s . c o m*/ * @param right String to compare * @return True is match is gt 0.85 */ private static boolean isMatch(String left, String right) { //No empty strings left = null != left ? left : ""; right = null != right ? right : ""; return StringUtils.getJaroWinklerDistance(left, right) > 0.85; }
From source file:com.moviejukebox.plugin.MovieMeterPlugin.java
/** * Get the ID for the movie/*w ww .ja v a 2 s . c om*/ * * @param title Movie title to get the ID for * @param year Movie year to get the ID for * @return The ID, or empty if no idea found */ public String getMovieId(final String title, final String year) { String id = UNKNOWN; LOG.debug("Looking for MovieMeter ID for {} ({})", title, year); List<SearchResult> results; try { results = api.search(title); } catch (MovieMeterException ex) { LOG.warn("Failed to get Movie Meter search results for {} ({}): {}", title, year, ex.getMessage(), ex); return id; } if (results.isEmpty()) { return id; } int fYear = NumberUtils.toInt(year, 0); double maxMatch = 0.0; for (SearchResult sr : results) { // if we have a year, check that first if (fYear > 0 && sr.getYear() != fYear) { continue; } // Check for best text similarity double result = StringUtils.getJaroWinklerDistance(title, sr.getTitle()); if (result > maxMatch) { LOG.trace("Better match found for {} ({}) = {} ({}) [{}]", title, year, sr.getTitle(), sr.getYear(), maxMatch); maxMatch = result; // Update the best result id = Integer.toString(sr.getId()); } } if (isValidString(id)) { LOG.debug("MovieMeter ID '{}' found for {} ({}), Match confidence: {}", id, title, year, maxMatch); } return id; }
From source file:net.longfalcon.newsj.TVRageService.java
private long getRageMatch(ShowInfo showInfo) { String cleanName = showInfo.getCleanName(); try {/* w w w . j a va 2s.c o m*/ TraktResult[] traktResults = new TraktResult[0]; try { traktResults = traktService.searchTvShowByName(cleanName.toLowerCase()); } catch (Exception e) { _log.error("error fetching show data for " + cleanName + " - " + e.toString()); _log.debug("", e); } if (traktResults.length > 0) { TraktResult firstResult = traktResults[0]; if (firstResult.getScore() > 50) { // probably the best match, we wont bother looking elsewhere. if (firstResult.getShowResult() != null) { _log.info("found +50% match: " + firstResult.getShowResult().getTitle()); return getRageIdFromTraktResultsSafe(firstResult); } else { return -2;// error } } else if (traktResults.length > 1) { String firstResultName = firstResult.getShowResult().getTitle(); String secondResultName = traktResults[1].getShowResult().getTitle(); double firstSimilarityScore = StringUtils.getJaroWinklerDistance(cleanName, firstResultName); double secondSimilarityScore = StringUtils.getJaroWinklerDistance(cleanName, secondResultName); if (firstSimilarityScore > secondSimilarityScore) { return getRageIdFromTraktResultsSafe(firstResult); } else { return getRageIdFromTraktResultsSafe(traktResults[1]); } } return getRageIdFromTraktResultsSafe(firstResult); } _log.warn("no trakt show found for name " + cleanName); return -2; // not found } catch (Exception e) { _log.error(e.toString(), e); return -2; // error } }
From source file:net.longfalcon.newsj.TVRageService.java
private long getTraktMatch(ShowInfo showInfo) { String cleanName = showInfo.getCleanName(); _log.info("searching trakt for show " + cleanName); try {// w w w .j ava2 s . c om TraktResult[] traktResults = new TraktResult[0]; try { traktResults = traktService.searchTvShowByName(cleanName.toLowerCase()); } catch (Exception e) { _log.error("error fetching show data for " + cleanName + " - " + e.toString()); _log.debug("", e); } if (traktResults.length > 0) { TraktResult firstResult = traktResults[0]; if (firstResult.getScore() > 50) { // probably the best match, we wont bother looking elsewhere. if (firstResult.getShowResult() != null) { _log.info("found +50% match: " + firstResult.getShowResult().getTitle()); return getTraktIdFromTraktResultsSafe(firstResult); } else { return -2;// error } } else if (traktResults.length > 1) { String firstResultName = firstResult.getShowResult().getTitle(); String secondResultName = traktResults[1].getShowResult().getTitle(); double firstSimilarityScore = StringUtils.getJaroWinklerDistance(cleanName, firstResultName); double secondSimilarityScore = StringUtils.getJaroWinklerDistance(cleanName, secondResultName); if (firstSimilarityScore > secondSimilarityScore) { return getTraktIdFromTraktResultsSafe(firstResult); } else { return getTraktIdFromTraktResultsSafe(traktResults[1]); } } return getTraktIdFromTraktResultsSafe(firstResult); } return -1; // not found } catch (Exception e) { _log.error(e.toString(), e); return -2; // error } }
From source file:net.longfalcon.newsj.TVRageService.java
private TraktResult getTraktMatch(String showname) { _log.info("searching trakt for show " + showname); try {/*from w ww . ja v a2 s. c o m*/ TraktResult[] traktResults = new TraktResult[0]; try { traktResults = traktService.searchTvShowByName(showname.toLowerCase()); } catch (Exception e) { _log.error("error fetching show data for " + showname + " - " + e.toString()); _log.debug("", e); } if (traktResults.length > 0) { TraktResult firstResult = traktResults[0]; if (firstResult.getScore() > 50) { // probably the best match, we wont bother looking elsewhere. if (firstResult.getShowResult() != null) { _log.info("found +50% match: " + firstResult.getShowResult().getTitle()); return firstResult; } else { return null;// error } } else if (traktResults.length > 1) { String firstResultName = firstResult.getShowResult().getTitle(); String secondResultName = traktResults[1].getShowResult().getTitle(); double firstSimilarityScore = StringUtils.getJaroWinklerDistance(showname, firstResultName); double secondSimilarityScore = StringUtils.getJaroWinklerDistance(showname, secondResultName); if (firstSimilarityScore > secondSimilarityScore) { return firstResult; } else { return traktResults[1]; } } return firstResult; } return null; // not found } catch (Exception e) { _log.error(e.toString(), e); return null; // error } }
From source file:com.moviejukebox.plugin.SratimPlugin.java
public void downloadSubtitle(Movie movie, MovieFile mf) throws IOException { if (!subtitleDownload) { mf.setSubtitlesExchange(true);/*w w w . j a v a 2 s .c o m*/ return; } // Get the file base name String path = mf.getFile().getName().toUpperCase(); int lindex = path.lastIndexOf('.'); if (lindex == -1) { return; } String basename = path.substring(0, lindex); // Check if this is a bluray file boolean bluRay = false; if (path.endsWith(".M2TS") && path.startsWith("0")) { bluRay = true; } if (movie.isExtra()) { mf.setSubtitlesExchange(true); return; } // Check if this movie already have subtitles for it (.srt and .sub) if (hasExistingSubtitles(mf, bluRay)) { mf.setSubtitlesExchange(true); return; } basename = basename.replace('.', ' ').replace('-', ' ').replace('_', ' '); LOG.debug("Download Subtitle: {}", mf.getFile().getAbsolutePath()); LOG.debug("Basename : {}", basename); LOG.debug("BluRay : {}", bluRay); int bestFPSCount = 0; int bestBlurayCount = 0; int bestBlurayFPSCount = 0; String bestFPSID = ""; String bestBlurayID = ""; String bestBlurayFPSID = ""; String bestFileID = ""; String bestSimilar = ""; // retrieve subtitles page String subID = movie.getId(SRATIM_PLUGIN_SUBTITLE_ID); String mainXML = httpClient.request("http://www.sratim.co.il/subtitles.php?mid=" + subID); int index = 0; int endIndex; // find the end of hebrew subtitles section, to prevent downloading non-hebrew ones int endHebrewSubsIndex = findEndOfHebrewSubtitlesSection(mainXML); // Check that hebrew subtitle exist String hebrewSub = HTMLTools.getTextAfterElem(mainXML, "<img src=\"images/Flags/1.png"); LOG.debug("hebrewSub: {}", hebrewSub); // Check that there is no 0 hebrew sub if (Movie.UNKNOWN.equals(hebrewSub)) { LOG.debug("No Hebrew subtitles"); return; } double maxMatch = 0.0; double matchThreshold = PropertiesUtil.getFloatProperty("sratim.textMatchSimilarity", 0.8f); while (index < endHebrewSubsIndex) { // // scanID // index = mainXML.indexOf("href=\"downloadsubtitle.php?id=", index); if 
(index == -1) { break; } index += 30; endIndex = mainXML.indexOf('\"', index); if (endIndex == -1) { break; } String scanID = mainXML.substring(index, endIndex); // // scanDiscs // index = mainXML.indexOf("src=\"images/cds/cd", index); if (index == -1) { break; } index += 18; endIndex = mainXML.indexOf('.', index); if (endIndex == -1) { break; } String scanDiscs = mainXML.substring(index, endIndex); // // scanFileName // index = mainXML.indexOf("subtitle_title\" style=\"direction:ltr;\" title=\"", index); if (index == -1) { break; } index += 46; endIndex = mainXML.indexOf('\"', index); if (endIndex == -1) { break; } String scanFileName = mainXML.substring(index, endIndex).toUpperCase().replace('.', ' '); // removing all characters causing metric to hang. scanFileName = scanFileName.replaceAll("-|\u00A0", " ").replaceAll(" ++", " "); // // scanFormat // index = mainXML.indexOf("\u05e4\u05d5\u05e8\u05de\u05d8", index); // the hebrew letters for the word "format" if (index == -1) { break; } index += 6; endIndex = mainXML.indexOf(',', index); if (endIndex == -1) { break; } String scanFormat = mainXML.substring(index, endIndex); // // scanFPS // index = mainXML.indexOf("\u05dc\u05e9\u05e0\u0027\u003a", index); // the hebrew letters for the word "for sec':" lamed shin nun ' : if (index == -1) { break; } index += 5; endIndex = mainXML.indexOf('<', index); if (endIndex == -1) { break; } String scanFPS = mainXML.substring(index, endIndex); // // scanCount // index = mainXML.indexOf("subt_date\"><span class=\"smGray\">", index); if (index == -1) { break; } index += 32; endIndex = mainXML.indexOf(' ', index); if (endIndex == -1) { break; } String scanCount = mainXML.substring(index, endIndex); // Check for best text similarity double result = StringUtils.getJaroWinklerDistance(basename, scanFileName); if (result > maxMatch) { maxMatch = result; bestSimilar = scanID; } LOG.debug( "scanFileName: {} scanFPS: {} scanID: {} scanCount: {} scanDiscs: {} scanFormat: {} similarity: 
{}", scanFileName, scanFPS, scanID, scanCount, scanDiscs, scanFormat, result); // Check if movie parts matches int nDiscs = movie.getMovieFiles().size(); if (!String.valueOf(nDiscs).equals(scanDiscs)) { continue; } // Check for exact file name if (scanFileName.equals(basename)) { bestFileID = scanID; break; } int scanCountInt = NumberUtils.toInt(scanCount, 0); float scanFPSFloat = NumberUtils.toFloat(scanFPS, 0F); LOG.debug("FPS: {} scanFPS: {}", movie.getFps(), scanFPSFloat); if (bluRay && ((scanFileName.contains("BRRIP")) || (scanFileName.contains("BDRIP")) || (scanFileName.contains("BLURAY")) || (scanFileName.contains("BLU-RAY")) || (scanFileName.contains("HDDVD")))) { if ((Float.compare(scanFPSFloat, 0F) == 0) && (scanCountInt > bestBlurayCount)) { bestBlurayCount = scanCountInt; bestBlurayID = scanID; } if ((Float.compare(movie.getFps(), scanFPSFloat) == 0) && (scanCountInt > bestBlurayFPSCount)) { bestBlurayFPSCount = scanCountInt; bestBlurayFPSID = scanID; } } if ((Float.compare(movie.getFps(), scanFPSFloat) == 0) && (scanCountInt > bestFPSCount)) { bestFPSCount = scanCountInt; bestFPSID = scanID; } } // Select the best subtitles ID String bestID; // Check for exact file name match if (StringUtils.isNotBlank(bestFileID)) { LOG.debug("Best Filename"); bestID = bestFileID; } else if (maxMatch >= matchThreshold) { // Check for text similarity match, similarity threshold takes precedence over FPS check LOG.debug("Best Text Similarity threshold"); bestID = bestSimilar; } else if (StringUtils.isNotBlank(bestBlurayFPSID)) { // Check for bluray match LOG.debug("Best Bluray FPS"); bestID = bestBlurayFPSID; } else if (StringUtils.isNotBlank(bestBlurayID)) { // Check for bluray match LOG.debug("Best Bluray"); bestID = bestBlurayID; } else if (StringUtils.isNotBlank(bestFPSID)) { // Check for fps match LOG.debug("Best FPS"); bestID = bestFPSID; } else if (maxMatch > 0) { // Check for text match, now just choose the best similar name LOG.debug("Best Similar"); bestID = 
bestSimilar; } else { LOG.debug("No subtitle found"); return; } LOG.debug("bestID: {}", bestID); // reconstruct movie filename with full path String orgName = mf.getFile().getAbsolutePath(); File subtitleFile = new File(orgName.substring(0, orgName.lastIndexOf('.'))); if (!downloadSubtitleZip(movie, "http://www.sratim.co.il/downloadsubtitle.php?id=" + bestID, subtitleFile, bluRay)) { LOG.error("Error - Subtitle download failed"); return; } mf.setSubtitlesExchange(true); SubtitleTools.addMovieSubtitle(movie, "YES"); }
From source file:net.stargraph.rank.impl.JarowinklerRanker.java
/**
 * Scores the two character sequences with
 * {@code StringUtils.getJaroWinklerDistance} and returns that value unchanged.
 *
 * @param s1 first sequence
 * @param s2 second sequence
 * @return the Jaro-Winkler score of {@code s1} and {@code s2}
 */
@Override
double computeStringDistance(CharSequence s1, CharSequence s2) {
    final double jaroWinklerScore = StringUtils.getJaroWinklerDistance(s1, s2);
    return jaroWinklerScore;
}
From source file:org.darsana.nlp.Scorer.java
public static Map<String, Double> ScoreGram(String srcCorp, String dstCorp, int method, int gramSize) { // Generate grams from corpora, store in Maps Map<String, Double> conceptMap = generateCommonConceptMap(srcCorp, dstCorp, gramSize); if (method == Type.RAW_FREQUENCY.getValue()) { ArrayList<String> toRemove = new ArrayList<>(); // Return raw frequency after nixing any terms that occur only once. conceptMap.keySet().stream().filter((key) -> (conceptMap.get(key) == 1)).forEachOrdered((key) -> { toRemove.add(key);/*w ww. ja v a 2s . co m*/ }); toRemove.forEach((key) -> { conceptMap.remove(key); }); return conceptMap; } else if (method == Type.RELATIVE_FREQUENCY.getValue()) { // Return harmonic frequency of terms as they occur across all documents. Map<String, Double> termFrequencyMap = generateCommonConceptMap(srcCorp, dstCorp, 1); Map<String, Double> relativeFrequencyMap = new TreeMap<>(); conceptMap.keySet().forEach((key) -> { double freq = harmonicFrequency(key, termFrequencyMap); if (freq > 1.0) { relativeFrequencyMap.put(key, freq); } }); return relativeFrequencyMap; } else { // Return most similar strings across all documents, likely needs trigrams or larger // to be truly useful. Map<String, Double> srcMap = generateConceptMap(srcCorp, gramSize); Map<String, Double> dstMap = generateConceptMap(dstCorp, gramSize); Map<String, Double> distanceMap = new TreeMap<>(); Object[] srcKeys = srcMap.keySet().toArray(); Object[] dstKeys = dstMap.keySet().toArray(); for (int i = 0; i < srcKeys.length - 1; i++) { for (int j = 0; j < dstKeys.length - 1; j++) { double score = StringUtils.getJaroWinklerDistance(srcKeys[i].toString(), dstKeys[j].toString()); if (score >= 0.9) { distanceMap.put(srcKeys[i] + "," + dstKeys[j], score); } } } return distanceMap; } }
From source file:org.eclipse.ebr.maven.eclipseip.KnownLicenses.java
public KnownLicense findByUrl(final String url) { for (final KnownLicense l : licensesByName.values()) { for (final String knownUrl : l.getKnownUrls()) { if (StringUtils.getJaroWinklerDistance(url, knownUrl) >= 0.99) return l; }//from w w w . j a v a 2 s .c o m } return null; }