utils.hashing.similarity.java Source code

Introduction

Here is the source code for utils.hashing.similarity.java
Source

/*
 * SPDXVersion: SPDX-1.1
 * Creator: Person: Nuno Brito (nuno.brito@triplecheck.de)
 * Creator: Organization: TripleCheck (contact@triplecheck.de)
 * Created: 2014-10-23T23:53:20Z
 * LicenseName: AGPL-3.0+
 * FileName: TokenizerJava.java  
 * FileCopyrightText: <text> Copyright 2014 Nuno Brito, TripleCheck </text>
 * FileComment: <text> 
Use this class for generating an outputs that we later use for
comparing similarities between two files or text
</text> 
 */

package utils.hashing;

import org.apache.commons.lang3.StringUtils;

/**
 * Static helpers that score how similar two pieces of text are, expressed as
 * a whole-number percentage where 100 means "no difference detected".
 *
 * @author Nuno Brito, 23rd of October 2014 in Tettnang, Germany
 */
public class similarity {

    /**
     * Compute the similarity between two strings from their Levenshtein
     * distance and provide a percentage; it doesn't really matter in which
     * order they are compared.
     * <p>
     * The distance is scaled by the <em>combined</em> length of both strings,
     * so two strings of equal length that share nothing score 50, and 0 is
     * only reached when exactly one of the strings is empty.
     *
     * @param s0    String 1
     * @param s1    String 2
     * @return  A value ranging from 0 to 100%; 100 when both strings are empty
     * @throws IllegalArgumentException if either string is {@code null}
     *         (raised by {@code StringUtils.getLevenshteinDistance})
     */
    public static int levenshteinPercentage(final String s0, final String s1) {
        final int distance = StringUtils.getLevenshteinDistance(s0, s1);
        final int combinedLength = s0.length() + s1.length();
        if (combinedLength == 0) {
            // Two empty strings are identical. Without this guard the float
            // division below is 0/0 = NaN, which the int cast turns into 0.
            return 100;
        }
        return (int) (100 - (float) distance * 100 / (float) combinedLength);
    }

    /**
     * Enjoy life, keep similarity matching as simple as possible.
     * <p>
     * Only the first {@code min(c1.length, c2.length)} characters of
     * {@code c2} are visited. A cursor into {@code c1} starts at 0 and moves
     * forward only when the character under it equals the visited character
     * of {@code c2}; every such match scores one point.
     *
     * @param c1 The string in your archive
     * @param c2 The string that might be modified
     * @return A percentage (0 to 100) of how closely related both strings
     *         might be; 100 when both arrays are empty, 0 when exactly one is
     */
    public static int britoshteinPercentage(final char[] c1, final char[] c2) {
        // get the smallest of the two arrays to compare
        final int size = Math.min(c1.length, c2.length);
        // number of matches so far; also the cursor into c1, because the
        // cursor advances exactly once per match
        int matched = 0;
        for (int i = 0; i < size; i++) {
            if (c1[matched] == c2[i]) {
                matched++;
            }
        }
        return toPercentage(matched, size, Math.max(c1.length, c2.length));
    }

    /**
     * Same matching as {@link #britoshteinPercentage(char[], char[])}, with
     * the archived text supplied as a {@code StringBuffer}.
     *
     * @param c1 The string in your archive
     * @param c2 The string that might be modified
     * @return A percentage (0 to 100) of how closely related both strings
     *         might be; 100 when both inputs are empty, 0 when exactly one is
     */
    public static int britoshteinPercentage(final StringBuffer c1, final char[] c2) {
        final int length1 = c1.length();
        // get the smallest of the two inputs to compare
        final int size = Math.min(length1, c2.length);
        // number of matches so far; also the cursor into c1
        int matched = 0;
        for (int i = 0; i < size; i++) {
            if (c1.charAt(matched) == c2[i]) {
                matched++;
            }
        }
        return toPercentage(matched, size, Math.max(length1, c2.length));
    }

    /**
     * More tolerant variant of the matching above. For every index below
     * {@code min(c1.length, c2.length)} a point is scored when any of these
     * holds: both arrays carry the same character at that index, the
     * character under the cursor in {@code c1} equals {@code c2} at the
     * index, or {@code c1} at the index equals the character under the
     * cursor in {@code c2}. The cursor advances by one on every point.
     *
     * @param c1 First sequence of characters
     * @param c2 Second sequence of characters
     * @return A percentage (0 to 100) of compared positions that scored;
     *         100 when both arrays are empty, 0 when exactly one is
     */
    public static int britoshteinPercentage2(final char[] c1, final char[] c2) {
        // get the smallest of the two arrays to compare
        final int smallestSize = Math.min(c1.length, c2.length);
        // number of points so far; also the shared cursor (always <= index)
        int matched = 0;
        for (int index = 0; index < smallestSize; index++) {
            final boolean sameIndex = c1[index] == c2[index];
            if (sameIndex
                    || c1[matched] == c2[index]
                    || c1[index] == c2[matched]) {
                matched++;
            }
        }
        return toPercentage(matched, smallestSize, Math.max(c1.length, c2.length));
    }

    /**
     * Converts a match count into a whole percentage of the compared length.
     *
     * @param points        number of positions that matched
     * @param size          number of positions that were compared
     * @param longestLength length of the longer of the two inputs
     * @return {@code points * 100 / size}, or for {@code size == 0}: 100 when
     *         both inputs were empty and 0 otherwise
     */
    private static int toPercentage(final int points, final int size, final int longestLength) {
        if (size == 0) {
            // Nothing was compared, so dividing would throw ArithmeticException.
            // Empty vs. empty is identical; empty vs. non-empty shares nothing.
            return longestLength == 0 ? 100 : 0;
        }
        // widen to long so points * 100 cannot overflow on very large inputs
        return (int) ((points * 100L) / size);
    }

    /**
     * Small manual demo: prints the {@code britoshteinPercentage2} score for
     * two near-identical names, once in each argument order.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        String n1 = "Nuno Garcia da Silva Brito";
        String n2 = "Nun.o Garcia da Silva Brito";

        int result = britoshteinPercentage2(n1.toCharArray(), n2.toCharArray());
        System.out.println(result);
        result = britoshteinPercentage2(n2.toCharArray(), n1.toCharArray());
        System.out.println(result);
    }

}