Example usage for org.apache.commons.lang3 StringUtils getJaroWinklerDistance

List of usage examples for org.apache.commons.lang3 StringUtils getJaroWinklerDistance

Introduction

On this page you can find example usages of org.apache.commons.lang3 StringUtils getJaroWinklerDistance.

Prototype

public static double getJaroWinklerDistance(final CharSequence first, final CharSequence second) 

Source Link

Document

Find the Jaro Winkler Distance which indicates the similarity score between two Strings.

The Jaro measure is the weighted sum of the percentage of matched characters from each string and of transposed characters.

Usage

From source file:br.pucminas.ri.jsearch.rest.controller.ApiController.java

/**
 * Builds auto-complete suggestions for the "query" request parameter from the
 * queries previously logged for the caller's IP address.
 *
 * <p>A logged query becomes a suggestion when its Jaro-Winkler similarity to
 * the requested query is at least 0.80. At most five distinct suggestions are
 * collected, in the order the log entries are iterated.
 *
 * @param req request supplying the "query" parameter and the client IP
 * @param res response (not used by this method)
 * @param log source of the previously logged queries
 * @return the auto-complete holder; it is returned as constructed (with an
 *         empty array) when there is no query or the lookup fails
 */
private static AutoComplete autocomplete(Request req, Response res, LogController log) {
    final int maxSuggestions = 5;
    final double minSimilarity = 0.80;

    AutoComplete ac = new AutoComplete(new String[] {});
    String query = req.queryParams("query");
    if (query == null) {
        // getJaroWinklerDistance rejects null arguments, so there is nothing to compare.
        return ac;
    }

    try {
        List<Log> logs = log.getLogsByIp(req.ip());
        StringList suggestions = new StringList();
        int count = 0;

        for (Log l : logs) {
            if (count >= maxSuggestions) {
                // Cap reached: no later entry could be added, stop scanning.
                break;
            }

            String candidate = l.getQuery();
            if (candidate == null) {
                // A null entry would make the similarity call throw and discard
                // every suggestion collected so far; skip it instead.
                continue;
            }

            if (StringUtils.getJaroWinklerDistance(query, candidate) >= minSimilarity
                    && !suggestions.contains(candidate)) {
                suggestions.add(candidate);
                count++;
            }
        }

        ac.setSugestions(suggestions.toArray());
    } catch (Exception e) {
        // Best effort: a failed lookup still returns the (empty) holder.
        System.err.println(e.getMessage());
    }
    return ac;
}

From source file:io.microprofile.showcase.speaker.persistence.SpeakerDAO.java

/**
 * Really simple fuzzy match using JaroWinklerDistance
 *
 * @param left  String to match/*w w  w. ja  va2 s  . c  o m*/
 * @param right String to compare
 * @return True is match is gt 0.85
 */
private static boolean isMatch(String left, String right) {

    //No empty strings
    left = null != left ? left : "";
    right = null != right ? right : "";

    return StringUtils.getJaroWinklerDistance(left, right) > 0.85;
}

From source file:com.moviejukebox.plugin.MovieMeterPlugin.java

/**
 * Get the ID for the movie/*w ww .ja v a 2  s  . c  om*/
 *
 * @param title Movie title to get the ID for
 * @param year Movie year to get the ID for
 * @return The ID, or empty if no idea found
 */
public String getMovieId(final String title, final String year) {
    String id = UNKNOWN;

    LOG.debug("Looking for MovieMeter ID for {} ({})", title, year);
    List<SearchResult> results;
    try {
        results = api.search(title);
    } catch (MovieMeterException ex) {
        LOG.warn("Failed to get Movie Meter search results for {} ({}): {}", title, year, ex.getMessage(), ex);
        return id;
    }

    if (results.isEmpty()) {
        return id;
    }

    int fYear = NumberUtils.toInt(year, 0);
    double maxMatch = 0.0;

    for (SearchResult sr : results) {
        // if we have a year, check that first
        if (fYear > 0 && sr.getYear() != fYear) {
            continue;
        }

        // Check for best text similarity
        double result = StringUtils.getJaroWinklerDistance(title, sr.getTitle());
        if (result > maxMatch) {
            LOG.trace("Better match found for {} ({}) = {} ({}) [{}]", title, year, sr.getTitle(), sr.getYear(),
                    maxMatch);
            maxMatch = result;
            // Update the best result
            id = Integer.toString(sr.getId());
        }
    }

    if (isValidString(id)) {
        LOG.debug("MovieMeter ID '{}' found for {} ({}), Match confidence: {}", id, title, year, maxMatch);
    }

    return id;
}

From source file:net.longfalcon.newsj.TVRageService.java

/**
 * Looks up the show on trakt by its clean name and resolves the chosen
 * result through {@code getRageIdFromTraktResultsSafe}.
 *
 * <p>The first result wins outright when its score is above 50. Otherwise,
 * if there are at least two results, the one of the first two whose title has
 * the higher Jaro-Winkler score against the clean name is used (the second
 * one on a tie). With a single result, that result is used.
 *
 * @param showInfo show whose clean name is searched
 * @return the resolved id, or -2 when nothing was found or an error occurred
 */
private long getRageMatch(ShowInfo showInfo) {
    final String cleanName = showInfo.getCleanName();

    try {
        TraktResult[] candidates;
        try {
            candidates = traktService.searchTvShowByName(cleanName.toLowerCase());
        } catch (Exception e) {
            _log.error("error fetching show data for " + cleanName + " - " + e.toString());
            _log.debug("", e);
            // A failed search is treated as an empty result set
            candidates = new TraktResult[0];
        }

        if (candidates.length == 0) {
            _log.warn("no trakt show found for name " + cleanName);
            return -2; // not found
        }

        final TraktResult top = candidates[0];
        if (top.getScore() > 50) {
            // probably the best match, no need to look at the other results
            if (top.getShowResult() == null) {
                return -2; // error
            }
            _log.info("found +50% match: " + top.getShowResult().getTitle());
            return getRageIdFromTraktResultsSafe(top);
        }

        if (candidates.length == 1) {
            return getRageIdFromTraktResultsSafe(top);
        }

        // Low-confidence search: decide between the first two by title similarity
        final TraktResult runnerUp = candidates[1];
        final String topTitle = top.getShowResult().getTitle();
        final String runnerUpTitle = runnerUp.getShowResult().getTitle();
        final double topSimilarity = StringUtils.getJaroWinklerDistance(cleanName, topTitle);
        final double runnerUpSimilarity = StringUtils.getJaroWinklerDistance(cleanName, runnerUpTitle);
        return getRageIdFromTraktResultsSafe(topSimilarity > runnerUpSimilarity ? top : runnerUp);
    } catch (Exception e) {
        _log.error(e.toString(), e);
        return -2; // error
    }
}

From source file:net.longfalcon.newsj.TVRageService.java

/**
 * Looks up the show on trakt by its clean name and resolves the chosen
 * result through {@code getTraktIdFromTraktResultsSafe}.
 *
 * <p>The first result wins outright when its score is above 50. Otherwise,
 * if there are at least two results, the one of the first two whose title has
 * the higher Jaro-Winkler score against the clean name is used (the second
 * one on a tie). With a single result, that result is used.
 *
 * @param showInfo show whose clean name is searched
 * @return the resolved id, -1 when the search returned nothing, or -2 on error
 */
private long getTraktMatch(ShowInfo showInfo) {
    final String cleanName = showInfo.getCleanName();
    _log.info("searching trakt for show " + cleanName);

    try {
        TraktResult[] candidates;
        try {
            candidates = traktService.searchTvShowByName(cleanName.toLowerCase());
        } catch (Exception e) {
            _log.error("error fetching show data for " + cleanName + " - " + e.toString());
            _log.debug("", e);
            // A failed search is treated as an empty result set
            candidates = new TraktResult[0];
        }

        if (candidates.length == 0) {
            return -1; // not found
        }

        final TraktResult top = candidates[0];
        if (top.getScore() > 50) {
            // probably the best match, no need to look at the other results
            if (top.getShowResult() == null) {
                return -2; // error
            }
            _log.info("found +50% match: " + top.getShowResult().getTitle());
            return getTraktIdFromTraktResultsSafe(top);
        }

        if (candidates.length == 1) {
            return getTraktIdFromTraktResultsSafe(top);
        }

        // Low-confidence search: decide between the first two by title similarity
        final TraktResult runnerUp = candidates[1];
        final String topTitle = top.getShowResult().getTitle();
        final String runnerUpTitle = runnerUp.getShowResult().getTitle();
        final double topSimilarity = StringUtils.getJaroWinklerDistance(cleanName, topTitle);
        final double runnerUpSimilarity = StringUtils.getJaroWinklerDistance(cleanName, runnerUpTitle);
        return getTraktIdFromTraktResultsSafe(topSimilarity > runnerUpSimilarity ? top : runnerUp);
    } catch (Exception e) {
        _log.error(e.toString(), e);
        return -2; // error
    }
}

From source file:net.longfalcon.newsj.TVRageService.java

/**
 * Searches trakt for a show name and returns the best of the results.
 *
 * <p>The first result wins outright when its score is above 50. Otherwise,
 * if there are at least two results, the one of the first two whose title has
 * the higher Jaro-Winkler score against {@code showname} is returned (the
 * second one on a tie). With a single result, that result is returned.
 *
 * @param showname show name to search for
 * @return the chosen result, or {@code null} when nothing was found or an
 *         error occurred
 */
private TraktResult getTraktMatch(String showname) {
    _log.info("searching trakt for show " + showname);

    try {
        TraktResult[] candidates;
        try {
            candidates = traktService.searchTvShowByName(showname.toLowerCase());
        } catch (Exception e) {
            _log.error("error fetching show data for " + showname + " - " + e.toString());
            _log.debug("", e);
            // A failed search is treated as an empty result set
            candidates = new TraktResult[0];
        }

        if (candidates.length == 0) {
            return null; // not found
        }

        final TraktResult top = candidates[0];
        if (top.getScore() > 50) {
            // probably the best match, no need to look at the other results
            if (top.getShowResult() == null) {
                return null; // error
            }
            _log.info("found +50% match: " + top.getShowResult().getTitle());
            return top;
        }

        if (candidates.length == 1) {
            return top;
        }

        // Low-confidence search: decide between the first two by title similarity
        final TraktResult runnerUp = candidates[1];
        final String topTitle = top.getShowResult().getTitle();
        final String runnerUpTitle = runnerUp.getShowResult().getTitle();
        final double topSimilarity = StringUtils.getJaroWinklerDistance(showname, topTitle);
        final double runnerUpSimilarity = StringUtils.getJaroWinklerDistance(showname, runnerUpTitle);
        return topSimilarity > runnerUpSimilarity ? top : runnerUp;
    } catch (Exception e) {
        _log.error(e.toString(), e);
        return null; // error
    }
}

From source file:com.moviejukebox.plugin.SratimPlugin.java

/**
 * Finds and downloads the best-matching Hebrew subtitle for a movie file
 * from sratim.co.il.
 *
 * <p>The subtitles page for the movie's Sratim ID is fetched and scanned
 * entry by entry (ID, disc count, file name, format, FPS, count). The
 * subtitle to download is then chosen in this order of preference:
 * <ol>
 * <li>an entry whose normalised file name equals the movie file's base name;</li>
 * <li>the most similar file name, if its Jaro-Winkler score reaches the
 * "sratim.textMatchSimilarity" property (default 0.8);</li>
 * <li>for Blu-ray files, the Blu-ray entry with matching FPS and the highest count;</li>
 * <li>for Blu-ray files, the Blu-ray entry with FPS 0 and the highest count;</li>
 * <li>the entry with matching FPS and the highest count;</li>
 * <li>the most similar file name regardless of threshold.</li>
 * </ol>
 *
 * <p>The method returns without downloading when subtitle download is
 * disabled, the file name has no extension, the movie is an extra, subtitles
 * already exist, or no Hebrew subtitles are listed.
 *
 * @param movie movie the file belongs to
 * @param mf movie file to fetch a subtitle for
 * @throws IOException declared by the method; presumably raised by the HTTP
 *         request or the download helper (not visible here)
 */
public void downloadSubtitle(Movie movie, MovieFile mf) throws IOException {

    if (!subtitleDownload) {
        mf.setSubtitlesExchange(true);
        return;
    }

    // Get the file base name
    String path = mf.getFile().getName().toUpperCase();
    int lindex = path.lastIndexOf('.');
    if (lindex == -1) {
        return;
    }

    String basename = path.substring(0, lindex);

    // Check if this is a bluray file
    boolean bluRay = false;
    if (path.endsWith(".M2TS") && path.startsWith("0")) {
        bluRay = true;
    }

    if (movie.isExtra()) {
        mf.setSubtitlesExchange(true);
        return;
    }

    // Check if this movie already have subtitles for it (.srt and .sub)
    if (hasExistingSubtitles(mf, bluRay)) {
        mf.setSubtitlesExchange(true);
        return;
    }

    // Normalise separators so the base name is comparable with the scanned file names
    basename = basename.replace('.', ' ').replace('-', ' ').replace('_', ' ');

    LOG.debug("Download Subtitle: {}", mf.getFile().getAbsolutePath());
    LOG.debug("Basename         : {}", basename);
    LOG.debug("BluRay           : {}", bluRay);

    int bestFPSCount = 0;
    int bestBlurayCount = 0;
    int bestBlurayFPSCount = 0;

    String bestFPSID = "";
    String bestBlurayID = "";
    String bestBlurayFPSID = "";
    String bestFileID = "";
    String bestSimilar = "";

    // retrieve subtitles page
    String subID = movie.getId(SRATIM_PLUGIN_SUBTITLE_ID);
    String mainXML = httpClient.request("http://www.sratim.co.il/subtitles.php?mid=" + subID);

    int index = 0;
    int endIndex;

    // find the end of hebrew subtitles section, to prevent downloading non-hebrew ones
    int endHebrewSubsIndex = findEndOfHebrewSubtitlesSection(mainXML);

    // Check that hebrew subtitle exist
    String hebrewSub = HTMLTools.getTextAfterElem(mainXML, "<img src=\"images/Flags/1.png");

    LOG.debug("hebrewSub: {}", hebrewSub);

    // Check that there is no 0 hebrew sub
    if (Movie.UNKNOWN.equals(hebrewSub)) {
        LOG.debug("No Hebrew subtitles");
        return;
    }

    double maxMatch = 0.0;
    double matchThreshold = PropertiesUtil.getFloatProperty("sratim.textMatchSimilarity", 0.8f);

    // Each iteration scans one subtitle entry; every field search starts where the
    // previous one ended, and a missing marker ends the scan.
    while (index < endHebrewSubsIndex) {

        //
        // scanID
        //
        index = mainXML.indexOf("href=\"downloadsubtitle.php?id=", index);
        if (index == -1) {
            break;
        }

        // skip past the 30-character marker
        index += 30;

        endIndex = mainXML.indexOf('\"', index);
        if (endIndex == -1) {
            break;
        }

        String scanID = mainXML.substring(index, endIndex);

        //
        // scanDiscs
        //
        index = mainXML.indexOf("src=\"images/cds/cd", index);
        if (index == -1) {
            break;
        }

        index += 18;

        endIndex = mainXML.indexOf('.', index);
        if (endIndex == -1) {
            break;
        }

        String scanDiscs = mainXML.substring(index, endIndex);

        //
        // scanFileName
        //
        index = mainXML.indexOf("subtitle_title\" style=\"direction:ltr;\" title=\"", index);
        if (index == -1) {
            break;
        }

        index += 46;

        endIndex = mainXML.indexOf('\"', index);
        if (endIndex == -1) {
            break;
        }

        String scanFileName = mainXML.substring(index, endIndex).toUpperCase().replace('.', ' ');
        // removing all characters causing metric to hang.
        scanFileName = scanFileName.replaceAll("-|\u00A0", " ").replaceAll(" ++", " ");

        //
        // scanFormat
        //
        index = mainXML.indexOf("\u05e4\u05d5\u05e8\u05de\u05d8", index); // the hebrew letters for the word "format"
        if (index == -1) {
            break;
        }

        index += 6;

        endIndex = mainXML.indexOf(',', index);
        if (endIndex == -1) {
            break;
        }

        String scanFormat = mainXML.substring(index, endIndex);

        //
        // scanFPS
        //
        index = mainXML.indexOf("\u05dc\u05e9\u05e0\u0027\u003a", index); // the hebrew letters for the word "for sec':" lamed shin nun ' :
        if (index == -1) {
            break;
        }

        index += 5;

        endIndex = mainXML.indexOf('<', index);
        if (endIndex == -1) {
            break;
        }

        String scanFPS = mainXML.substring(index, endIndex);

        //
        // scanCount
        //
        index = mainXML.indexOf("subt_date\"><span class=\"smGray\">", index);
        if (index == -1) {
            break;
        }

        index += 32;

        endIndex = mainXML.indexOf(' ', index);
        if (endIndex == -1) {
            break;
        }

        String scanCount = mainXML.substring(index, endIndex);

        // Check for best text similarity
        // NOTE(review): this is tracked before the disc-count check below, so the
        // most similar entry may have a different number of discs - confirm intended.
        double result = StringUtils.getJaroWinklerDistance(basename, scanFileName);
        if (result > maxMatch) {
            maxMatch = result;
            bestSimilar = scanID;
        }

        LOG.debug(
                "scanFileName: {} scanFPS: {} scanID: {} scanCount: {} scanDiscs: {} scanFormat: {} similarity: {}",
                scanFileName, scanFPS, scanID, scanCount, scanDiscs, scanFormat, result);

        // Check if movie parts matches
        int nDiscs = movie.getMovieFiles().size();
        if (!String.valueOf(nDiscs).equals(scanDiscs)) {
            continue;
        }

        // Check for exact file name
        if (scanFileName.equals(basename)) {
            bestFileID = scanID;
            break;
        }

        int scanCountInt = NumberUtils.toInt(scanCount, 0);
        float scanFPSFloat = NumberUtils.toFloat(scanFPS, 0F);

        LOG.debug("FPS: {} scanFPS: {}", movie.getFps(), scanFPSFloat);

        // Hyphens in scanFileName were replaced by spaces above, so the Blu-ray
        // tag has to be matched as "BLU RAY" ("BLU-RAY" could never match).
        if (bluRay && ((scanFileName.contains("BRRIP")) || (scanFileName.contains("BDRIP"))
                || (scanFileName.contains("BLURAY")) || (scanFileName.contains("BLU RAY"))
                || (scanFileName.contains("HDDVD")))) {

            if ((Float.compare(scanFPSFloat, 0F) == 0) && (scanCountInt > bestBlurayCount)) {
                bestBlurayCount = scanCountInt;
                bestBlurayID = scanID;
            }

            if ((Float.compare(movie.getFps(), scanFPSFloat) == 0) && (scanCountInt > bestBlurayFPSCount)) {
                bestBlurayFPSCount = scanCountInt;
                bestBlurayFPSID = scanID;
            }

        }

        if ((Float.compare(movie.getFps(), scanFPSFloat) == 0) && (scanCountInt > bestFPSCount)) {
            bestFPSCount = scanCountInt;
            bestFPSID = scanID;
        }

    }

    // Select the best subtitles ID
    String bestID;

    // Check for exact file name match
    if (StringUtils.isNotBlank(bestFileID)) {
        LOG.debug("Best Filename");
        bestID = bestFileID;
    } else if (maxMatch >= matchThreshold) {
        // Check for text similarity match, similarity threshold takes precedence over FPS check
        LOG.debug("Best Text Similarity threshold");
        bestID = bestSimilar;
    } else if (StringUtils.isNotBlank(bestBlurayFPSID)) {
        // Check for bluray match
        LOG.debug("Best Bluray FPS");
        bestID = bestBlurayFPSID;
    } else if (StringUtils.isNotBlank(bestBlurayID)) {
        // Check for bluray match
        LOG.debug("Best Bluray");
        bestID = bestBlurayID;
    } else if (StringUtils.isNotBlank(bestFPSID)) {
        // Check for fps match
        LOG.debug("Best FPS");
        bestID = bestFPSID;
    } else if (maxMatch > 0) {
        // Check for text match, now just choose the best similar name
        LOG.debug("Best Similar");
        bestID = bestSimilar;
    } else {
        LOG.debug("No subtitle found");
        return;
    }

    LOG.debug("bestID: {}", bestID);

    // reconstruct movie filename with full path
    String orgName = mf.getFile().getAbsolutePath();
    File subtitleFile = new File(orgName.substring(0, orgName.lastIndexOf('.')));
    if (!downloadSubtitleZip(movie, "http://www.sratim.co.il/downloadsubtitle.php?id=" + bestID, subtitleFile,
            bluRay)) {
        LOG.error("Error - Subtitle download failed");
        return;
    }

    mf.setSubtitlesExchange(true);
    SubtitleTools.addMovieSubtitle(movie, "YES");
}

From source file:net.stargraph.rank.impl.JarowinklerRanker.java

/**
 * Scores two character sequences by delegating to
 * {@code StringUtils.getJaroWinklerDistance}.
 *
 * @param s1 first sequence
 * @param s2 second sequence
 * @return the value returned by {@code StringUtils.getJaroWinklerDistance}
 */
@Override
double computeStringDistance(CharSequence s1, CharSequence s2) {
    final double jaroWinkler = StringUtils.getJaroWinklerDistance(s1, s2);
    return jaroWinkler;
}

From source file:org.darsana.nlp.Scorer.java

/**
 * Scores the grams of two corpora using one of three methods.
 *
 * <ul>
 * <li>{@code Type.RAW_FREQUENCY}: the common concept map of both corpora with
 * every entry whose value is 1 removed.</li>
 * <li>{@code Type.RELATIVE_FREQUENCY}: for each key of the common concept map,
 * its {@code harmonicFrequency} against the unigram map; only values above
 * 1.0 are kept.</li>
 * <li>any other value: every source/destination gram pair whose Jaro-Winkler
 * score is at least 0.9, keyed as {@code "src,dst"}.</li>
 * </ul>
 *
 * @param srcCorp source corpus
 * @param dstCorp destination corpus
 * @param method scoring method, compared against the {@code Type} values
 * @param gramSize size of the grams to generate
 * @return map from gram (or gram pair) to its score
 */
public static Map<String, Double> ScoreGram(String srcCorp, String dstCorp, int method, int gramSize) {
    // Generate grams from corpora, store in Maps
    Map<String, Double> conceptMap = generateCommonConceptMap(srcCorp, dstCorp, gramSize);

    if (method == Type.RAW_FREQUENCY.getValue()) {
        // Return raw frequency after nixing any terms that occur only once.
        conceptMap.values().removeIf(frequency -> frequency == 1);
        return conceptMap;
    }

    if (method == Type.RELATIVE_FREQUENCY.getValue()) {
        // Return harmonic frequency of terms as they occur across all documents.
        Map<String, Double> termFrequencyMap = generateCommonConceptMap(srcCorp, dstCorp, 1);
        Map<String, Double> relativeFrequencyMap = new TreeMap<>();

        for (String key : conceptMap.keySet()) {
            double freq = harmonicFrequency(key, termFrequencyMap);
            if (freq > 1.0) {
                relativeFrequencyMap.put(key, freq);
            }
        }

        return relativeFrequencyMap;
    }

    // Return most similar strings across all documents, likely needs trigrams or larger
    // to be truly useful.
    Map<String, Double> srcMap = generateConceptMap(srcCorp, gramSize);
    Map<String, Double> dstMap = generateConceptMap(dstCorp, gramSize);
    Map<String, Double> distanceMap = new TreeMap<>();

    // Compare every source gram with every destination gram. Iterating the key
    // sets directly covers the last key of each map, which an index bound of
    // length - 1 would skip.
    for (String srcKey : srcMap.keySet()) {
        for (String dstKey : dstMap.keySet()) {
            double score = StringUtils.getJaroWinklerDistance(srcKey, dstKey);

            if (score >= 0.9) {
                distanceMap.put(srcKey + "," + dstKey, score);
            }
        }
    }

    return distanceMap;
}

From source file:org.eclipse.ebr.maven.eclipseip.KnownLicenses.java

/**
 * Finds the known license that has a known URL nearly identical to the
 * given one (Jaro-Winkler score of at least 0.99).
 *
 * @param url license URL to look up; may be {@code null}
 * @return the first matching license in iteration order, or {@code null} if
 *         {@code url} is {@code null} or nothing matches
 */
public KnownLicense findByUrl(final String url) {
    if (url == null) {
        // getJaroWinklerDistance throws on null; a missing URL simply has no match.
        return null;
    }

    for (final KnownLicense license : licensesByName.values()) {
        for (final String knownUrl : license.getKnownUrls()) {
            if (knownUrl != null && StringUtils.getJaroWinklerDistance(url, knownUrl) >= 0.99) {
                return license;
            }
        }
    }

    return null;
}