com.streamsets.pipeline.lib.fuzzy.FuzzyMatch.java Source code

Java tutorial

Introduction

Here is the source code for com.streamsets.pipeline.lib.fuzzy.FuzzyMatch.java

Source

/**
 * Copyright 2015 StreamSets Inc.
 *
 * Licensed under the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.streamsets.pipeline.lib.fuzzy;

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.collect.Sets;
import com.google.common.collect.Sets.SetView;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeSet;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

public class FuzzyMatch {
    private static final Logger LOG = LoggerFactory.getLogger(FuzzyMatch.class);
    private static final Pattern removeNonAlphaNum = Pattern.compile("[^\\w+]", Pattern.UNICODE_CHARACTER_CLASS);
    private static final Pattern camelAndSnakeCaseSplitter = Pattern.compile("(?=\\p{Lu})|(_)");

    private FuzzyMatch() {
    }

    // Build a cache with sane defaults. Shouldn't have to ask user to configure this.
    private static LoadingCache<Pair<String, String>, Integer> ratioCache = CacheBuilder.newBuilder()
            .maximumSize(1000).expireAfterAccess(1, TimeUnit.SECONDS)
            .build(new CacheLoader<Pair<String, String>, Integer>() {
                @Override
                public Integer load(Pair<String, String> pair) {
                    return FuzzyMatch.getRatio(pair.getLeft(), pair.getRight());
                }
            });

    public static int getRatio(String s1, String s2, boolean useCache) {
        if (useCache) {
            return ratioCache.getUnchecked(Pair.of(s1, s2));
        } else {
            return getRatio(s1, s2);
        }
    }

    /*
     * t0 = [SORTED_INTERSECTION]
     * t1 = [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING1]
     * t2 = [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING2]
     *
     * outcome = max(t0,t1,t2)
     *
     */

    public static int getRatio(String s1, String s2) {

        if (s1.length() >= s2.length()) {
            // We need to swap s1 and s2
            String temp = s2;
            s2 = s1;
            s1 = temp;
        }

        // Get alpha numeric characters
        Set<String> set1 = tokenizeString(escapeString(s1));
        Set<String> set2 = tokenizeString(escapeString(s2));

        SetView<String> intersection = Sets.intersection(set1, set2);

        TreeSet<String> sortedIntersection = Sets.newTreeSet(intersection);

        if (LOG.isTraceEnabled()) {
            StringBuilder sortedSb = new StringBuilder();
            for (String s : sortedIntersection) {
                sortedSb.append(s).append(" ");
            }
            LOG.trace("Sorted intersection --> {}", sortedSb.toString());
        }

        // Find out difference of sets set1 and intersection of set1,set2
        SetView<String> restOfSet1 = Sets.symmetricDifference(set1, intersection);

        // Sort it
        TreeSet<String> sortedRestOfSet1 = Sets.newTreeSet(restOfSet1);

        SetView<String> restOfSet2 = Sets.symmetricDifference(set2, intersection);
        TreeSet<String> sortedRestOfSet2 = Sets.newTreeSet(restOfSet2);

        if (LOG.isTraceEnabled()) {
            StringBuilder sb1 = new StringBuilder();
            for (String s : sortedRestOfSet1) {
                sb1.append(s).append(" ");
            }
            LOG.trace("Sorted rest of 1 --> {}", sb1.toString());

            StringBuilder sb2 = new StringBuilder();
            for (String s : sortedRestOfSet1) {
                sb2.append(s).append(" ");
            }
            LOG.trace("Sorted rest of 2 --> {}", sb2.toString());
        }

        StringBuilder t0Builder = new StringBuilder("");
        StringBuilder t1Builder = new StringBuilder("");
        StringBuilder t2Builder = new StringBuilder("");

        for (String s : sortedIntersection) {
            t0Builder.append(" ").append(s);
        }
        String t0 = t0Builder.toString().trim();

        Set<String> setT1 = Sets.union(sortedIntersection, sortedRestOfSet1);
        for (String s : setT1) {
            t1Builder.append(" ").append(s);
        }
        String t1 = t1Builder.toString().trim();

        Set<String> setT2 = Sets.union(intersection, sortedRestOfSet2);
        for (String s : setT2) {
            t2Builder.append(" ").append(s);
        }

        String t2 = t2Builder.toString().trim();

        int amt1 = calculateLevenshteinDistance(t0, t1);
        int amt2 = calculateLevenshteinDistance(t0, t2);
        int amt3 = calculateLevenshteinDistance(t1, t2);

        LOG.trace("t0 = {} --> {}", t0, amt1);
        LOG.trace("t1 = {} --> {}", t1, amt2);
        LOG.trace("t2 = {} --> {}", t2, amt3);

        return Math.max(Math.max(amt1, amt2), amt3);
    }

    private static Set<String> tokenizeString(String str) {
        Set<String> set = new HashSet<>();

        // Normalize and tokenize the input strings before storing as a set.
        StringTokenizer st = new StringTokenizer(str);
        while (st.hasMoreTokens()) {
            String t1 = st.nextToken();
            String[] tokens = camelAndSnakeCaseSplitter.split(t1);
            for (int i = 0; i < tokens.length; i++) {
                tokens[i] = tokens[i].toLowerCase();
            }
            Collections.addAll(set, tokens);
        }

        set.remove("");

        return set;
    }

    private static int calculateLevenshteinDistance(String s1, String s2) {
        int distance = StringUtils.getLevenshteinDistance(s1, s2);
        double ratio = ((double) distance) / (Math.max(s1.length(), s2.length()));
        return 100 - (int) (ratio * 100);
    }

    private static String escapeString(String token) {
        return removeNonAlphaNum.matcher(token).replaceAll(" ");
    }
}