Example usage for org.apache.commons.lang3 StringUtils getLevenshteinDistance

List of usage examples for org.apache.commons.lang3 StringUtils getLevenshteinDistance

Introduction

On this page you can find example usages of org.apache.commons.lang3 StringUtils getLevenshteinDistance.

Prototype

public static int getLevenshteinDistance(CharSequence s, CharSequence t) 

Source Link

Document

<p>Find the Levenshtein distance between two Strings.</p> <p>This is the number of changes needed to change one String into another, where each change is a single character modification (deletion, insertion or substitution).</p> <p>The previous implementation of the Levenshtein distance algorithm was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError which can occur when my Java implementation is used with very large strings.<br> This implementation of the Levenshtein distance algorithm is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> <pre> StringUtils.getLevenshteinDistance(null, *) = IllegalArgumentException StringUtils.getLevenshteinDistance(*, null) = IllegalArgumentException StringUtils.getLevenshteinDistance("","") = 0 StringUtils.getLevenshteinDistance("","a") = 1 StringUtils.getLevenshteinDistance("aaapppp", "") = 7 StringUtils.getLevenshteinDistance("frog", "fog") = 1 StringUtils.getLevenshteinDistance("fly", "ant") = 3 StringUtils.getLevenshteinDistance("elephant", "hippo") = 7 StringUtils.getLevenshteinDistance("hippo", "elephant") = 7 StringUtils.getLevenshteinDistance("hippo", "zzzzzzzz") = 8 StringUtils.getLevenshteinDistance("hello", "hallo") = 1 </pre>

Usage

From source file:org.starnub.utilities.strings.StringUtilities.java

/**
 * Computes, ignoring case, how far apart two words are as a rounded percentage.
 * <p>
 * Both inputs are lower-cased, their Levenshtein distance is divided by the
 * length of the longer (lower-cased) input, and the ratio is scaled by 100 and
 * rounded to a whole number. The value therefore grows with the number of
 * edits: inputs that are equal ignoring case yield {@code 0}, and a distance
 * equal to the longer length yields {@code 100}. Two empty strings yield
 * {@code 0}.
 *
 * @param s String string to be compared against, must not be null
 * @param s2 String string to compare, must not be null
 * @return double the edit distance as a whole-number percentage of the longer string's length
 * @throws ArithmeticException kept in the signature for compatibility; the floating-point
 *         arithmetic used here does not raise it
 */
public static double similarityCalculationCaseInsensitive(String s, String s2) throws ArithmeticException {
    // Locale.ROOT keeps the case folding independent of the JVM's default locale;
    // under a Turkish locale, for example, "I" would lower-case to a dotless "ı"
    // and "TITLE" would no longer match "title".
    String lowerS = s.toLowerCase(java.util.Locale.ROOT);
    String lowerS2 = s2.toLowerCase(java.util.Locale.ROOT);
    int levDist = StringUtils.getLevenshteinDistance(lowerS, lowerS2);
    int longest = Math.max(lowerS.length(), lowerS2.length());
    // 0/0 for two empty strings is NaN, which Math.round maps to 0.
    return Math.round(((double) levDist / (double) longest) * 100);
}

From source file:org.starnub.utilities.strings.StringUtilities.java

/**
 * Computes how far apart two words are as a rounded percentage (case-sensitive).
 * <p>
 * The Levenshtein distance between the inputs is divided by the length of the
 * longer input, scaled by 100 and rounded to a whole number. The value grows
 * with the number of edits: equal inputs yield {@code 0}.
 *
 * @param s String string to be compared against
 * @param s2 String string to compare
 * @return double the edit distance as a whole-number percentage of the longer string's length
 * @throws ArithmeticException declared by the signature; the floating-point arithmetic
 *         used here does not raise it
 */
public static double similarityCalculation(String s, String s2) throws ArithmeticException {
    int editDistance = StringUtils.getLevenshteinDistance(s, s2);
    int longerLength = Math.max(s.length(), s2.length());
    double ratio = (double) editDistance / (double) longerLength;
    long percentage = Math.round(ratio * 100);
    return percentage;
}

From source file:org.xlrnet.metadict.core.aggregation.LevenstheinRelevanceOrderStrategy.java

/**
 * Scores an entry against the query string using Levenshtein distance.
 * <p>
 * The lower-cased query is compared with the lower-cased general form of the
 * entry's source and, when present, of its target. The smaller of the two
 * distances {@code d} is mapped to {@code 1 - d / (1 + d)}, so a distance of
 * zero scores {@code 1.0} and larger distances approach {@code 0}.
 *
 * @param entry the entry whose source (and optional target) form is compared
 * @param queryString the query to compare against
 * @return the relevance score derived from the smaller distance
 */
double calculateEntryScore(@NotNull ResultEntry entry, @NotNull String queryString) {
    String sourceForm = entry.getSource().getGeneralForm().toLowerCase();
    String loweredQuery = queryString.toLowerCase();
    int sourceDistance = StringUtils.getLevenshteinDistance(sourceForm, loweredQuery);

    // A missing target must never win the comparison below.
    int targetDistance = Integer.MAX_VALUE;
    if (entry.getTarget() != null && entry.getTarget().getGeneralForm() != null) {
        String targetForm = entry.getTarget().getGeneralForm().toLowerCase();
        targetDistance = StringUtils.getLevenshteinDistance(targetForm, loweredQuery);
    }

    int smallestDistance = Math.min(sourceDistance, targetDistance);
    return 1.0 - ((double) smallestDistance / (1 + (double) smallestDistance));
}

From source file:org.xlrnet.metadict.impl.aggregation.LevenstheinRelevanceOrderStrategy.java

/**
 * Scores an entry against the query string using Levenshtein distance.
 * <p>
 * The lower-cased query is compared with the lower-cased general form of the
 * entry's input and, when present, of its output. The smaller of the two
 * distances {@code d} is mapped to {@code 1 - d / (1 + d)}, so a distance of
 * zero scores {@code 1.0} and larger distances approach {@code 0}.
 *
 * @param entry the entry whose input (and optional output) form is compared
 * @param queryString the query to compare against
 * @return the relevance score derived from the smaller distance
 */
double calculateEntryScore(@NotNull ResultEntry entry, @NotNull String queryString) {
    String inputForm = entry.getInput().getGeneralForm().toLowerCase();
    String loweredQuery = queryString.toLowerCase();
    int inputDistance = StringUtils.getLevenshteinDistance(inputForm, loweredQuery);

    // A missing output must never win the comparison below.
    int outputDistance = Integer.MAX_VALUE;
    if (entry.getOutput() != null && entry.getOutput().getGeneralForm() != null) {
        String outputForm = entry.getOutput().getGeneralForm().toLowerCase();
        outputDistance = StringUtils.getLevenshteinDistance(outputForm, loweredQuery);
    }

    int smallestDistance = Math.min(inputDistance, outputDistance);
    return 1.0 - ((double) smallestDistance / (1 + (double) smallestDistance));
}

From source file:org.yamj.core.service.metadata.online.TheMovieDbApiWrapper.java

/**
 * Searches TheMovieDb for a person and returns the id of the best match.
 * <p>
 * A result whose name equals {@code name} ignoring case wins immediately.
 * Otherwise the result with the smallest Levenshtein distance to {@code name}
 * is used; on ties the earliest result is kept.
 *
 * @param name the person name to search for
 * @param throwTempError whether a temporary service error should be rethrown
 * @return the id as a string, or {@code null} when there are no results or the search failed
 * @throws TemporaryUnavailableException if {@code throwTempError} is set and the
 *         failure is classified as temporary by {@code ResponseTools}
 */
public String getPersonId(String name, boolean throwTempError) {
    final boolean includeAdult = configService.getBooleanProperty("themoviedb.includeAdult", Boolean.FALSE);
    String id = null;

    try {
        ResultList<PersonFind> results = tmdbApi.searchPeople(name, 0, includeAdult, SearchType.PHRASE);
        LOG.info("{}: Found {} results", name, results.getResults().size());

        PersonFind exactMatch = null;
        PersonFind nearestPerson = null;
        int nearestDistance = Integer.MAX_VALUE;

        for (PersonFind candidate : results.getResults()) {
            if (name.equalsIgnoreCase(candidate.getName())) {
                // An exact (case-insensitive) name match ends the search.
                exactMatch = candidate;
                break;
            }

            LOG.trace("{}: Checking against '{}'", name, candidate.getName());
            int distance = StringUtils.getLevenshteinDistance(name, candidate.getName());
            LOG.trace("{}: Current closest match is {}, this match is {}", name, nearestDistance, distance);
            if (distance < nearestDistance) {
                LOG.trace("{}: TMDB ID {} is a better match ", name, candidate.getId());
                nearestDistance = distance;
                nearestPerson = candidate;
            }
        }

        if (exactMatch != null) {
            id = String.valueOf(exactMatch.getId());
            LOG.debug("{}: Matched against TMDB ID: {}", name, id);
        } else if (nearestPerson != null && nearestDistance < Integer.MAX_VALUE) {
            id = String.valueOf(nearestPerson.getId());
            LOG.debug("{}: Closest match is '{}' differing by {} characters", name, nearestPerson.getName(),
                    nearestDistance);
        } else {
            LOG.debug("{}: No match found", name);
        }
    } catch (MovieDbException ex) {
        boolean temporary = throwTempError && ResponseTools.isTemporaryError(ex);
        if (temporary) {
            throw new TemporaryUnavailableException(
                    "TheMovieDb service temporary not available: " + ex.getResponseCode(), ex);
        }
        LOG.error("Failed retrieving TMDb id for person '{}': {}", name, ex.getMessage());
        LOG.trace("TheMovieDb error", ex);
    }
    return id;
}

From source file:org.yamj.core.service.plugin.TheMovieDbScanner.java

/**
 * Searches TheMovieDb for a person and returns the id of the best match.
 * <p>
 * A result whose name equals {@code name} ignoring case wins immediately.
 * Otherwise the result with the smallest Levenshtein distance to {@code name}
 * is used; on ties the earliest result is kept. A failed search is logged as a
 * warning and not rethrown.
 *
 * @param name the person name to search for
 * @return the id as a string, or an empty string when there are no results or the search failed
 */
@Override
public String getPersonId(String name) {
    final boolean includeAdult = configService.getBooleanProperty("themoviedb.includeAdult", Boolean.FALSE);
    String id = "";

    try {
        TmdbResultsList<com.omertron.themoviedbapi.model.Person> results = tmdbApi.searchPeople(name,
                includeAdult, 0);
        LOG.info("{}: Found {} results", name, results.getResults().size());

        com.omertron.themoviedbapi.model.Person exactMatch = null;
        com.omertron.themoviedbapi.model.Person nearestPerson = null;
        int nearestDistance = Integer.MAX_VALUE;

        for (com.omertron.themoviedbapi.model.Person candidate : results.getResults()) {
            if (name.equalsIgnoreCase(candidate.getName())) {
                // An exact (case-insensitive) name match ends the search.
                exactMatch = candidate;
                break;
            }

            LOG.trace("{}: Checking against '{}'", name, candidate.getName());
            int distance = StringUtils.getLevenshteinDistance(name, candidate.getName());
            LOG.trace("{}: Current closest match is {}, this match is {}", name, nearestDistance, distance);
            if (distance < nearestDistance) {
                LOG.trace("{}: TMDB ID {} is a better match ", name, candidate.getId());
                nearestDistance = distance;
                nearestPerson = candidate;
            }
        }

        if (exactMatch != null) {
            id = String.valueOf(exactMatch.getId());
            LOG.debug("{}: Matched against TMDB ID: {}", name, id);
        } else if (nearestPerson != null && nearestDistance < Integer.MAX_VALUE) {
            id = String.valueOf(nearestPerson.getId());
            LOG.debug("{}: Closest match is '{}' differing by {} characters", name, nearestPerson.getName(),
                    nearestDistance);
        } else {
            LOG.debug("{}: No match found", name);
        }
    } catch (MovieDbException ex) {
        LOG.warn("Failed to get information on '{}' from {}, error: {}", name, SCANNER_ID, ex.getMessage());
    }
    return id;
}

From source file:org.zeroturnaround.isjrebel.IsJRebel.java

/**
 * Tells whether the input, read forwards or reversed, matches {@code JREBEL}.
 * <p>
 * Each variant is passed through {@code dropPunctuation} and then
 * {@code l33tReplace}. A variant matches when the result equals {@code JREBEL}
 * ignoring case or is within a Levenshtein distance of 1 from it.
 *
 * @param input the text to test
 * @return {@code true} if either variant matches
 */
public static boolean isJRebel(String input) {
    return Stream.of(input, reverse(input))
            .map(IsJRebel::dropPunctuation)
            .map(IsJRebel::l33tReplace)
            .anyMatch(candidate -> JREBEL.equalsIgnoreCase(candidate)
                    || StringUtils.getLevenshteinDistance(JREBEL, candidate) <= 1);
}

From source file:pl.piotr.TessOCR.java

/**
 * Runs OCR on a receipt image and builds the receipt for the best-matching shop.
 * <p>
 * The first line of the upper-cased OCR text is compared with every entry of
 * {@code shopHeaderList} by Levenshtein distance. The index of the closest
 * entry (distance below 100) selects the receipt type; if no entry is that
 * close, index 0 is used. The receipt is then populated from the full OCR text.
 *
 * @param img the image file handed to the OCR engine
 * @return the populated receipt, or {@code null} when OCR fails, the OCR text
 *         has no lines, or the best-matching index has no receipt type
 */
public static Receipt recognizeReceipt(File img) {
    Receipt receipt = null;
    try {
        String text = ocr.doOCR(img).toUpperCase();
        System.out.println(text);

        // Read only the first line; try-with-resources closes the scanner.
        String headerLine;
        try (Scanner scanner = new Scanner(text)) {
            if (!scanner.hasNextLine()) {
                // Empty OCR output: nothing to match, so report "no receipt"
                // the same way an OCR failure does instead of throwing.
                Logger.getLogger(TessOCR.class.getName()).log(Level.WARNING,
                        "OCR produced no text for {0}", img);
                return null;
            }
            headerLine = scanner.nextLine();
        }

        // Find the shop header closest to the first OCR line.
        int bestIndex = 0;
        int minEditLength = 100;
        for (int i = 0; i < shopHeaderList.size(); i++) {
            int distance = StringUtils.getLevenshteinDistance(headerLine, shopHeaderList.get(i));
            if (distance < minEditLength) {
                minEditLength = distance;
                bestIndex = i;
            }
        }

        // The mapping assumes shopHeaderList is ordered Biedronka, Lidl, Tesco,
        // Zabka — TODO confirm against the list's definition.
        switch (bestIndex) {
        case 0:
            receipt = new Biedronka();
            break;
        case 1:
            receipt = new Lidl();
            break;
        case 2:
            receipt = new Tesco();
            break;
        case 3:
            receipt = new Zabka();
            break;
        default:
            // A header index without a receipt type would otherwise cause a
            // NullPointerException when the receipt is populated below.
            Logger.getLogger(TessOCR.class.getName()).log(Level.WARNING,
                    "No receipt type for shop header index {0}", bestIndex);
            return null;
        }

        receipt.setDate(text);
        receipt.setProductList(text);
        receipt.setSum(text);

    } catch (TesseractException ex) {
        Logger.getLogger(TessOCR.class.getName()).log(Level.SEVERE, null, ex);
    }
    return receipt;
}

From source file:principal.Main.java

/**
 * Finds the "catmat" description closest to a fixed search phrase and runs an
 * advanced search with it.
 * <p>
 * Every document's {@code TIPL_DESCRICAO} value, with commas removed, is
 * compared with the phrase by Levenshtein distance. The description with the
 * smallest distance below 15 is kept (an empty string if none qualifies),
 * printed, and passed to {@code doAdvancedSearch}, whose results are printed.
 *
 * @param args unused
 * @throws UnknownHostException declared by the signature
 */
public static void main(String[] args) throws UnknownHostException {
    MongoDBConection db = new MongoDBConection("dbmedicamentos");
    String pesquisa = "SUPLEMENTO PARA MEIO DE CULTURA, VANCOMICINA, P P/ RECONSTITUIO, 3 M";
    List<BasicDBObject> documents = db.getAllDocs("catmat");

    // Only distances strictly below this starting value can become the best match.
    int bestDistance = 15;
    String bestDescription = "";

    System.out.println("Buscando: " + pesquisa);
    for (BasicDBObject document : documents) {
        String description = document.get("TIPL_DESCRICAO").toString();
        String withoutCommas = description.replace(",", "");
        int distance = StringUtils.getLevenshteinDistance(pesquisa, withoutCommas);
        if (distance < bestDistance) {
            bestDistance = distance;
            bestDescription = description;
        }
    }

    System.out.println("Menor valor txlev " + bestDistance + " Melhor texto: " + bestDescription);
    List<Medicamentos> matches = db.doAdvancedSearch(bestDescription, "catmat");
    for (Medicamentos match : matches) {
        System.out.println("FTS search: " + match.getCodigo() + ":" + match.getDescricao());
    }
}

From source file:qa.aligner.SRLToAligner.java

/**
 * Returns the first sentence whose rebuilt text is more than 80% similar to
 * the tokenized text.
 * <p>
 * The tokens are joined with single spaces. Each sentence is rebuilt from the
 * forms of its elements starting at index 1, each followed by a space, with
 * {@code -LRB-} and {@code -RRB-} replaced by {@code (} and {@code )}.
 * Similarity is {@code (1 - distance / longerLength) * 100}, where distance is
 * the Levenshtein distance between the two texts. When no sentence qualifies,
 * the joined tokens are printed to standard output.
 *
 * @param tokenizedText the tokens to look for
 * @param sentences the candidate sentences, checked in order
 * @return the first sufficiently similar sentence, or {@code null} if there is none
 */
public Sentence getCorrespondingSentence(String[] tokenizedText, ArrayList<Sentence> sentences) {
    final String tokenized = String.join(" ", tokenizedText);
    final StringBuilder rebuilt = new StringBuilder();

    for (Sentence sentence : sentences) {
        // Reuse one builder for every candidate sentence.
        rebuilt.setLength(0);
        for (int j = 1; j < sentence.size(); j++) {
            String form = sentence.get(j).getForm();
            if (form.equalsIgnoreCase("-RRB-")) {
                rebuilt.append(") ");
            } else if (form.equalsIgnoreCase("-LRB-")) {
                rebuilt.append("( ");
            } else {
                rebuilt.append(form).append(" ");
            }
        }

        double distance = StringUtils.getLevenshteinDistance(tokenized, rebuilt.toString());
        int longerLength = Math.max(tokenized.length(), rebuilt.length());
        double similarity = (1 - (distance / longerLength)) * 100;
        if (similarity > 80) {
            return sentence;
        }
    }

    System.out.println(tokenized);
    return null;
}