at.jps.sanction.core.util.TokenTool.java Source code

Introduction

Here is the source code for at.jps.sanction.core.util.TokenTool.java
Source

/*******************************************************************************
 * Copyright (c) 2015 Jim Fandango (The Last Guy Coding) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or
 * substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR
 * A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *******************************************************************************/
package at.jps.sanction.core.util;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.commons.codec.language.ColognePhonetic;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class TokenTool {

    static final Logger logger = LoggerFactory.getLogger(TokenTool.class);

    public static String buildTokenString(final List<String> textTokens, final String delimiter) {
        final StringBuilder text = new StringBuilder();
        for (final String token : textTokens) {
            text.append(token).append(delimiter);
        }
        return text.toString();
    }

    /**
     * simple plain text exists in text search without LSD
     *
     * @param textTokens1
     * @param textTokens2
     * @return true if shorter in longer
     */
    public static boolean checkContains(final List<String> textTokens1, final List<String> textTokens2,
            final String delimiter) {

        final String text1 = buildTokenString(textTokens1, delimiter);
        final String text2 = buildTokenString(textTokens2, delimiter);

        final boolean contains = (((text1.length() > 0) && (text2.length() > 0)) ? checkContains(text1, text2)
                : false);

        return contains;
    }

    public static boolean checkContains(final String text1, final String text2) {

        boolean contains = false;

        if (text1.length() > text2.length()) {
            contains = text1.contains(text2);
        } else {
            contains = text2.contains(text1);
        }

        return contains;
    }

    public static float compareCheck(final List<String> textTokens, final String text, final boolean fuzzy,
            final int minlen) {

        float deltaValue = 100;
        float percentHitrate = 0;

        // +/- fuzzy string compare

        if (textTokens.size() > 0) {
            for (final String token : textTokens) {
                if (token.length() >= minlen) {

                    if (fuzzy) {
                        final float lsHitValue = StringUtils.getLevenshteinDistance(text, token);

                        if (lsHitValue < deltaValue) {
                            deltaValue = lsHitValue;
                        }
                    } else {
                        deltaValue = (text.equals(token) ? 0 : 100);
                    }
                }
            }
            percentHitrate = 100 - ((100 / ((float) (text.length()))) * deltaValue);
        }

        return percentHitrate;
    }

    /**
     * compare single token with list of tokens no modification of tokens is done
     *
     * @param text1
     * @param text2
     * @param fuzzy
     *            ( if true) LevenshteinDistance is used
     * @param minlen
     * @param fuzzyValue
     *            ( 20 - 80 % )
     * @return Percentage of of equality ( 0 - 100 %)
     */

    public static int compareCheck(final String text1, final String text2, final boolean fuzzy, final int minlen,
            final double fuzzyValue) {

        int deltaValue = text1.length();
        int minWordLen = deltaValue;
        int percentHitrate = 0;

        // +/- fuzzy string compare

        // only compare meaningfull !! deltalength > 80%
        // TODO: calc length to fuzzyness

        if (((text1.length() >= minlen) && (text2.length() >= minlen))
                && (Math.abs(text1.length() - text2.length()) <= minlen)) {

            if (fuzzy) {

                minWordLen = Math.min(text1.length(), text2.length());
                final int threshold = (int) (minWordLen * fuzzyValue) + 1;

                deltaValue = StringUtils.getLevenshteinDistance(text1, text2, threshold);

                if (deltaValue == -1) // threshold cutoff
                {
                    deltaValue = minWordLen;
                }
            } else {
                if (text1.equalsIgnoreCase(text2)) { // TODO: ignoreCase ?
                    deltaValue = 0;
                }
            }
        }

        percentHitrate = (int) ((1 - ((double) deltaValue / minWordLen)) * 100);

        // percentHitrate = 100 - ((100 / ((float) (text1.length()))) * deltaValue);
        // percentHitrate = 100 - ((100 / minWordLen) * deltaValue);

        return percentHitrate;
    }

    public static float compareCheckColognePhonetic(final String text1, final String text2, final boolean fuzzy,
            final int minlen, final double fuzzyValue) {

        final ColognePhonetic encoder = new ColognePhonetic(); // TODO: in reallife
        // make
        // this go away !!

        return (compareCheck(encoder.colognePhonetic(text1), encoder.colognePhonetic(text2), fuzzy, minlen,
                fuzzyValue));

    }

    public static float compareCheckDoubleMetaphone(final String text1, final String text2, final boolean fuzzy,
            final int minlen, final double fuzzyValue) {

        final DoubleMetaphone encoder = new DoubleMetaphone(); // TODO: in reallife
        // make
        // this go away !!

        return (compareCheck(encoder.doubleMetaphone(text1), encoder.doubleMetaphone(text2), fuzzy, minlen,
                fuzzyValue));

    }

    public static float compareCheckMetaphone(final String text1, final String text2, final boolean fuzzy,
            final int minlen, final double fuzzyValue) {

        final Metaphone encoder = new Metaphone(); // TODO: in reallife make this go
        // away
        // !!

        return (compareCheck(encoder.metaphone(text1), encoder.metaphone(text2), fuzzy, minlen, fuzzyValue));

    }

    public static float compareRefinedSoundex(final String text1, final String text2, final boolean fuzzy,
            final int minlen, final double fuzzyValue) {

        final RefinedSoundex encoder = new RefinedSoundex(); // TODO: in reallife make
        // this
        // go away !!

        return (compareCheck(encoder.encode(text1), encoder.encode(text2), fuzzy, minlen, fuzzyValue));

    }

    public static float compareSoundex(final String text1, final String text2, final boolean fuzzy,
            final int minlen, final double fuzzyValue) {

        final Soundex encoder = new Soundex(); // TODO: in reallife make this go away
        // !!

        return (compareCheck(encoder.encode(text1), encoder.encode(text2), fuzzy, minlen, fuzzyValue));

    }

    /**
     * @param list1
     *            ( uppercase entries presumed ! )
     * @param list2
     * @return number of elements contained in both lists
     */
    public static int compareTokenLists(final Collection<String> list1, final Collection<String> list2) {
        int count = 0;

        for (final String element : list2) {
            if (list1.contains(element.toUpperCase())) {
                count++;
            }
        }

        return count;
    }

    // filterTokens --> tokens will be checked against list
    // else pretokens will be checked against list
    public static List<String> getTokenList(final String text, final String delimiters, final String deadCharacters,
            final int minlength, final Collection<String> excludeList, final boolean filterTokens) {

        final List<String> textTokens = new ArrayList<>(10);

        if ((text != null) && (text.length() > minlength)) {

            String oriText = text.toUpperCase();

            // remove dead characters

            if (deadCharacters != null) {
                for (int i = 0; i < deadCharacters.length(); i++) {
                    oriText = StringUtils.remove(oriText, deadCharacters.charAt(i)); // oriText.replace(deadCharacters.charAt(i), ' ');
                }
            }

            // TODO: transliterate

            // remove stopwords - !! dead characters MUST NOT exist in stopwords !!
            // must be sorted by length !!
            if (!filterTokens) {
                if (excludeList != null) {
                    for (final String stopword : excludeList) {
                        oriText = oriText.replace(stopword.toUpperCase(), "");
                    }
                }
            }

            // maybe preserve original formatted text ?

            // StrTokenizer textTokenizer = new StrTokenizer(oriText, delimiters);
            // textTokenizer.setEmptyTokenAsNull(false);
            // textTokenizer.setIgnoreEmptyTokens(true); // feature !!

            if (delimiters != null) {
                final StringTokenizer textTokenizer = new StringTokenizer(oriText, delimiters);

                while (textTokenizer.hasMoreTokens()) {

                    final String token = textTokenizer.nextToken().trim();

                    if ((token != null) && (token.length() >= minlength)) {
                        // TODO: clear tokens ( deadchars...)

                        if (filterTokens) {
                            if ((excludeList == null) || (!excludeList.contains(token))) {
                                if (!textTokens.contains(token)) {
                                    textTokens.add(token);
                                }
                            }
                        } else {
                            if (!textTokens.contains(token)) {
                                textTokens.add(token);
                            }
                        }
                    }
                }
            } else {
                textTokens.add(oriText);
            }
        }

        return textTokens;
    }
}