com.octo.mbo.domain.ApproachingMatcher.java Source code

Java tutorial

Introduction

Here is the source code for com.octo.mbo.domain.ApproachingMatcher.java

Source

/**
 * Copyright (C) 2017 mbojoly (mbojoly@octo.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.octo.mbo.domain;

import org.apache.commons.text.similarity.LevenshteinDistance;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Finds the key of a source slide that approximately matches the key of a target slide.
 *
 * <p>Keys are compared with the Levenshtein distance after runs of consecutive spaces
 * have been collapsed to a single space (see {@link #normalize(String)}).
 */
public class ApproachingMatcher {
    /** Matches one or more consecutive space characters; compiled once and shared. */
    private static final Pattern CONSECUTIVE_SPACES = Pattern.compile("[ ]+");
    private static final Logger log = LoggerFactory.getLogger(ApproachingMatcher.class);

    private final LevenshteinDistance levenshteinDistance = new LevenshteinDistance();

    /**
     * Collapses every run of consecutive spaces into a single space.
     * Only the space character is affected; tabs and line breaks are left untouched.
     *
     * @param inputString string to normalize, must not be null
     * @return inputString without more than one consecutive space
     */
    public String normalize(String inputString) {
        assert inputString != null;

        Matcher m = CONSECUTIVE_SPACES.matcher(inputString);
        return m.replaceAll(" ");
    }

    /**
     * Groups the given keys by their Levenshtein distance to the reference key.
     * Both sides are normalized before being compared, but the keys stored in the
     * result are the original (non-normalized) ones.
     *
     * @param keysOfSlides keys to classify, must not be null (may be empty)
     * @param key a reference key that will be compared to each key, must not be null
     * @return a map sorted by ascending distance whose values are the sets of keys found
     *         at that distance; empty when keysOfSlides is empty
     */
    public SortedMap<Integer, Set<String>> sortByDistanceWithKey(Collection<String> keysOfSlides, String key) {
        assert keysOfSlides != null;
        assert key != null;

        SortedMap<Integer, Set<String>> keysSortedByDistance = new TreeMap<>();

        // The reference key does not change during the loop: normalize it only once.
        String normalizedKey = normalize(key);
        for (String slideKey : keysOfSlides) {
            int distK2k = levenshteinDistance.apply(normalizedKey, normalize(slideKey));
            keysSortedByDistance.computeIfAbsent(distK2k, d -> new HashSet<>()).add(slideKey);
        }

        log.trace("Sort by least distance to '{}' after normalization : {}", key, keysSortedByDistance);

        return keysSortedByDistance;
    }

    /**
     * Looks for the source key that unambiguously corresponds to {@code tgtKeyToMatch}.
     *
     * <p>A source key is returned only when all of the following hold:
     * <ol>
     *   <li>exactly one source key is at the smallest distance from {@code tgtKeyToMatch};</li>
     *   <li>exactly one target key is at the smallest distance from that source key;</li>
     *   <li>that target key is {@code tgtKeyToMatch} itself.</li>
     * </ol>
     * In every other case, including when one of the collections is empty, the result is empty.
     *
     * @param srcSlidesKeys keys of the source slides, must not be null
     * @param tgtSlideKeys keys of the target slides, must not be null
     * @param tgtKeyToMatch the target key for which a source key is searched, must not be null
     * @return the matching source key, or {@link Optional#empty()} when there is no
     *         unambiguous reciprocal match
     */
    public Optional<String> findBestMatch(Collection<String> srcSlidesKeys, Collection<String> tgtSlideKeys,
            String tgtKeyToMatch) {
        assert srcSlidesKeys != null;
        assert tgtSlideKeys != null;
        assert tgtKeyToMatch != null;

        SortedMap<Integer, Set<String>> sortedSrcSlidesKeys = sortByDistanceWithKey(srcSlidesKeys, tgtKeyToMatch);
        if (sortedSrcSlidesKeys.isEmpty()) {
            // No source key at all: firstKey() would throw NoSuchElementException.
            return Optional.empty();
        }
        int smallestDistanceBtwTgtKeyToMatchAndSrcKey = sortedSrcSlidesKeys.firstKey();
        Set<String> potentialKeys = sortedSrcSlidesKeys.get(smallestDistanceBtwTgtKeyToMatchAndSrcKey);
        if (potentialKeys.size() > 1) {
            log.warn("Tgt key '{}' could both match {} with levenstein distance {}", tgtKeyToMatch, potentialKeys,
                    smallestDistanceBtwTgtKeyToMatchAndSrcKey);
            return Optional.empty();
        }

        String potentialKey = potentialKeys.iterator().next();
        // Be sure that this potential key is not at an equal distance from another tgtKey
        SortedMap<Integer, Set<String>> sortedTgtSlideKeys = sortByDistanceWithKey(tgtSlideKeys, potentialKey);
        if (sortedTgtSlideKeys.isEmpty()) {
            // No target key to check the candidate against, so the match cannot be confirmed.
            return Optional.empty();
        }
        int smallestDistanceBtwPotentialKeyAndTgtKey = sortedTgtSlideKeys.firstKey();
        Set<String> potentialTgtKeys = sortedTgtSlideKeys.get(smallestDistanceBtwPotentialKeyAndTgtKey);
        if (potentialTgtKeys.size() > 1) {
            // Arguments follow the placeholders: candidate src key, looked-up tgt key,
            // ambiguous tgt keys, and the distance between the candidate and those tgt keys.
            log.warn(
                    "Potential Src key '{}' matching Tgt keys {} could both match TgtKeys {} with levenstein distance {}",
                    potentialKey, tgtKeyToMatch, potentialTgtKeys, smallestDistanceBtwPotentialKeyAndTgtKey);
            return Optional.empty();
        }

        String potentialTgtKey = potentialTgtKeys.iterator().next();
        if (potentialTgtKey.equals(tgtKeyToMatch)) {
            log.info("Src key '{}' has been associated with Tgt key '{}'", potentialKey, potentialTgtKey);
            return Optional.of(potentialKey);
        }

        log.warn(
                "Potential Src key '{}' matches Tgt key '{}' with distance '{}' with is different from the Tgt we looked up '{}'",
                potentialKey, potentialTgtKey, smallestDistanceBtwPotentialKeyAndTgtKey, tgtKeyToMatch);
        return Optional.empty();
    }
}