edu.harvard.mcz.nametools.ICNafpAuthorNameComparator.java Source code

Java tutorial

Introduction

Here is the source code for edu.harvard.mcz.nametools.ICNafpAuthorNameComparator.java

Source

/**
 * ICNafpAuthorNameComparator.java
 *
 * Copyright 2015 President and Fellows of Harvard College
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.harvard.mcz.nametools;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Make comparisons between pairs of scientificNameAuthor strings, under the assumption
 * that both strings are from names covered by the botanical code.
 * 
 * @author mole
 *
 */
public class ICNafpAuthorNameComparator extends AuthorNameComparator {

    private static final Log logger = LogFactory.getLog(ICNafpAuthorNameComparator.class);

    /**
     *  Threshold of similarity (0-1) over which strong similarity is asserted.
     *  {@link #compare(String, String)} tests the computed similarity against this
     *  value with a strict greater-than.  Starts at 0.75; the two-argument
     *  constructor overwrites it.
     */
    protected double similarityThreshold = .75d;
    /**
     *  Threshold of similarity (0-1) over which a weak similarity is asserted.
     *  {@link #compare(String, String)} consults this value, again with a strict
     *  greater-than, only after the strong threshold test has failed.  Starts at
     *  0.5; the two-argument constructor overwrites it.
     */
    protected double weakThreshold = .5d;

    /**
     * Reports the cut-off above which this comparator asserts strong similarity.
     *
     * @return the strong similarity threshold currently held by this comparator
     */
    @Override
    public double getSimilarityThreshold() {
        return this.similarityThreshold;
    }

    /**
     * Builds an ICNafp author name comparator using caller-supplied cut-offs for the
     * similarity assertions.  The values are stored as given; no range check is made.
     *
     * @param similarityThreshold value in the range 0 to 1 above which a comparison
     *        is marked as similar
     * @param weakThreshold value in the range 0 to 1 above which a comparison is
     *        marked as weakly similar
     */
    public ICNafpAuthorNameComparator(double similarityThreshold, double weakThreshold) {
        this.weakThreshold = weakThreshold;
        this.similarityThreshold = similarityThreshold;
    }

    /**
     * Classify the relationship between two botanical authorship strings.
     * <p>
     * The returned comparison carries a match type chosen as follows:
     * <ul>
     * <li>either argument null: MATCH_ERROR;</li>
     * <li>the strings are equal, or equal once lower-cased and stripped of spaces,
     *     periods and commas: MATCH_EXACT with similarity 1.0;</li>
     * <li>anAuthor is empty and toOtherAuthor is not: MATCH_ADDSAUTHOR;</li>
     * <li>otherwise a similarity based type (MATCH_AUTHSIMILAR, MATCH_DISSIMILAR or
     *     MATCH_STRONGDISSIMILAR), which is then overridden in turn by the token
     *     count check, the abbreviation check, the " ex " / ":" / "(" marker checks
     *     and finally the parenthesis check.</li>
     * </ul>
     *
     * @param anAuthor authorship string to compare
     * @param toOtherAuthor authorship string to compare anAuthor with
     * @return a NameComparison holding the match type and, when both arguments are
     *         non-null, the similarity
     *
     * @see edu.harvard.mcz.nametools.AuthorNameComparator#compare(java.lang.String, java.lang.String)
     */
    @Override
    public NameComparison compare(String anAuthor, String toOtherAuthor) {
        NameComparison comparison = new NameComparison(anAuthor, toOtherAuthor);
        comparison.setMatchType(NameComparison.MATCH_ERROR);

        // Nothing can be compared without both strings.
        if (anAuthor == null || toOtherAuthor == null) {
            comparison.setMatchType(NameComparison.MATCH_ERROR);
            return comparison;
        }

        // Identical, or identical apart from case, spaces, periods and commas.
        boolean sameText = anAuthor.equals(toOtherAuthor);
        if (!sameText) {
            String left = anAuthor.toLowerCase().replaceAll("[ .,]", "");
            String right = toOtherAuthor.toLowerCase().replaceAll("[ .,]", "");
            sameText = left.equals(right);
        }
        if (sameText) {
            comparison.setMatchType(NameComparison.MATCH_EXACT);
            comparison.setSimilarity(1.0d);
            return comparison;
        }

        double similarity = AuthorNameComparator.calulateSimilarityOfAuthor(anAuthor, toOtherAuthor);
        comparison.setSimilarity(similarity);

        // An empty first author against a populated second one only adds an author.
        if (anAuthor.length() == 0 && toOtherAuthor.length() > 0) {
            comparison.setMatchType(NameComparison.MATCH_ADDSAUTHOR);
            return comparison;
        }

        // Baseline classification from the overall similarity score.
        if (!(similarity > similarityThreshold)) {
            if (similarity > weakThreshold) {
                comparison.setMatchType(NameComparison.MATCH_DISSIMILAR);
            } else {
                comparison.setMatchType(NameComparison.MATCH_STRONGDISSIMILAR);
            }
        } else {
            comparison.setMatchType(NameComparison.MATCH_AUTHSIMILAR);
        }

        double alphaSimilarity = AuthorNameComparator.calulateSimilarityOfAuthorAlpha(anAuthor,
                toOtherAuthor);
        boolean parenSame = ICNafpAuthorNameComparator.calculateHasParen(
                anAuthor) == ICNafpAuthorNameComparator.calculateHasParen(toOtherAuthor);

        // Differing numbers of semantic parts trump the score; equal numbers allow
        // the per-part abbreviation test to upgrade the result.
        int partCount = tokenizeAuthorship(anAuthor).size();
        int otherPartCount = tokenizeAuthorship(toOtherAuthor).size();
        if (partCount == otherPartCount) {
            if (ICNafpAuthorNameComparator.matchedOnWordsInTokens(anAuthor, toOtherAuthor)) {
                comparison.setMatchType(NameComparison.MATCH_SAMEBUTABBREVIATED);
            }
        } else {
            comparison.setMatchType(NameComparison.MATCH_PARTSDIFFER);
        }

        // A structural marker present in anAuthor but absent from toOtherAuthor
        // means the parts differ.
        String[] structuralMarkers = { " ex ", ":", "(" };
        for (String marker : structuralMarkers) {
            if (anAuthor.contains(marker) && !toOtherAuthor.contains(marker)) {
                comparison.setMatchType(NameComparison.MATCH_PARTSDIFFER);
            }
        }

        // Same letters throughout but a different parenthesis state.
        if (!parenSame && (alphaSimilarity == 1d)) {
            comparison.setMatchType(NameComparison.MATCH_PARENTHESIESDIFFER);
        }
        return comparison;
    }

    /**
     * Compare the two author strings to see if they look like differing abbreviations
     * of the same set of authors in the same semantic positions (each parenthetical
     * author, ex author, sanctioning author, etc. is compared separately).
     * <p>
     * Both strings are split with {@link #tokenizeAuthorship(String)}.  If the number
     * of pieces differs the result is false; otherwise every pair of corresponding
     * pieces is checked and the result is true only when no pair is a mismatch.
     * All pairs are examined even after a mismatch so that each one is logged.
     * 
     * @param anAuthor for the comparison
     * @param toOtherAuthor to compare with anAuthor
     * 
     * @return true if each semantic piece of the two author strings is the same, or if 
     * each semantic piece appears to be an abbreviation of the corresponding semantic piece
     * otherwise return false. 
     */
    public static boolean matchedOnWordsInTokens(String anAuthor, String toOtherAuthor) {
        logger.debug("in matchedOnWordsInTokens(" + anAuthor + "," + toOtherAuthor + ")");
        boolean result = false;
        List<String> anAuthorBits = tokenizeAuthorship(anAuthor);
        List<String> toOtherAuthorBits = tokenizeAuthorship(toOtherAuthor);
        if (anAuthorBits.size() == toOtherAuthorBits.size()) {
            boolean foundMissmatch = false;
            for (int i = 0; i < anAuthorBits.size(); i++) {
                if (!authorBitsCorrespond(anAuthorBits.get(i), toOtherAuthorBits.get(i))) {
                    foundMissmatch = true;
                }
            }
            result = !foundMissmatch;
        }
        logger.debug("matchedOnWordsInTokens(): " + result);
        return result;
    }

    /**
     * Check one pair of corresponding authorship pieces: true when they are equal or
     * one looks like an abbreviation of the other, false on any detected mismatch.
     */
    private static boolean authorBitsCorrespond(String anAuthorBit, String toOtherAuthorBit) {
        boolean corresponds = true;
        if (shorterButNotAbbreviation(anAuthorBit, toOtherAuthorBit)) {
            corresponds = false;
            logger.debug("Missmatch shorter but not abbreviation: " + anAuthorBit + ":" + toOtherAuthorBit);
        }
        if (anAuthorBit.equals(toOtherAuthorBit)) {
            return corresponds;
        }

        // Check if initials are the same
        if (initalsAreDifferent(anAuthorBit, toOtherAuthorBit)) {
            corresponds = false;
            logger.debug("Missmatch: " + anAuthorBit + ":" + toOtherAuthorBit);
        }

        // remove punctuation.
        String lettersA = keepLettersAndSpaces(anAuthorBit);
        String lettersO = keepLettersAndSpaces(toOtherAuthorBit);

        // Special case, botany: L abbreviates Linnaeus, never Lamarck.  Both sides
        // are trimmed so that a trailing period on either side cannot hide the pair.
        String trimmedA = lettersA.trim();
        String trimmedO = lettersO.trim();
        if ((trimmedA.equals("L") && trimmedO.equals("Lamarck"))
                || (trimmedA.equals("Lamarck") && trimmedO.equals("L"))) {
            corresponds = false;
            logger.debug("Missmatch: " + trimmedA + ": " + trimmedO);
        }

        String wordsA = dropLoneInitials(lettersA);
        String wordsO = dropLoneInitials(lettersO);
        logger.debug(wordsA);
        logger.debug(wordsO);

        // A listed abbreviation/name pair needs no word by word comparison.
        if (knownMatch(wordsA, wordsO)) {
            return corresponds;
        }

        String[] subBitsA = wordsA.split(" ");
        String[] subBitsO = wordsO.split(" ");
        if (subBitsA.length != subBitsO.length) {
            logger.debug("Missmatch: " + wordsA + " " + subBitsA.length + ": "
                    + wordsO + " " + subBitsO.length);
            return false;
        }
        for (int i = 0; i < subBitsA.length; i++) {
            if (!compareSameOrStartsWith(subBitsA[i], subBitsO[i])) {
                corresponds = false;
                logger.debug("Missmatch: " + subBitsA[i] + ": " + subBitsO[i]);
            }
        }
        return corresponds;
    }

    /**
     * Replace every character that is not a letter, combining mark or space with a space.
     * Letters are matched with Unicode categories: the earlier character class
     * "[^A-Za-z ::alpha::]" is not a POSIX class in Java and turned accented letters
     * into spaces, which let names differing only in such a letter compare as equal.
     */
    private static String keepLettersAndSpaces(String authorBit) {
        return authorBit.replaceAll("[^\\p{L}\\p{M} ]", " ");
    }

    /**
     * Remove single capital letters (initials) from a punctuation free piece, unless the
     * piece is nothing but one capital letter, then trim and collapse double spaces.
     */
    private static String dropLoneInitials(String spacedBit) {
        String words = spacedBit;
        if (!words.trim().matches("^\\p{Lu}$")) {
            words = words.replaceAll("^\\p{Lu} ", " ");
            words = words.replaceAll(" \\p{Lu} ", " ");
            words = words.replaceAll(" \\p{Lu}$", " ");
        }
        words = words.trim();
        // remove double spaces
        return words.replaceAll("  ", " ").trim();
    }

    /**
     * Collect the initials that occur in a name.  An initial is a single capital
     * letter followed by a period, found at the start of the string, after a space,
     * or after an opening parenthesis.
     * 
     * @param name to examine for initials
     * @return the capital letters of all initials found, concatenated in order of
     *  appearance; an empty string if there are none.
     */
    public static String extractInitials(String name) {
        // A space is placed after every period so run-together initials ("A.B.")
        // are each preceded by a space and recognised separately.
        String spacedOut = name.replaceAll("\\.", ". ");
        Matcher initialFinder = Pattern.compile("(^| |\\()[A-Z]\\.").matcher(spacedOut);
        StringBuilder initials = new StringBuilder();
        while (initialFinder.find()) {
            // keep only the capital letter of each hit
            initials.append(initialFinder.group().replaceAll("[^A-Z]", ""));
        }
        return initials.toString();
    }

    /**
     * Compare two strings to see if they are the same or if the longer starts with
     * the shorter.  Periods are ignored, both when deciding which string is the
     * shorter and when comparing, so the result does not depend on argument order.
     * A pair listed by {@link #knownMatch(String, String)} also counts as a match.
     * Special case, "L." is not same or starts with "Lamarck".  
     * 
     * @param aString for comparison
     * @param anotherString to compare with aString
     * @return true if the two strings are the same, or if, periods aside, the longer
     * of the two strings starts with the shorter of the two strings or the two form a
     * known abbreviation pair; false otherwise, and always false for L against Lamarck.
     */
    public static boolean compareSameOrStartsWith(String aString, String anotherString) {
        if (aString.equals(anotherString)) {
            return true;
        }
        // Strip periods first: choosing shorter/longer on the raw lengths made
        // ("Sm.", "Smi") match but ("Smi", "Sm.") fail.
        String shorter = aString.replace(".", "");
        String longer = anotherString.replace(".", "");
        if (shorter.length() > longer.length()) {
            String swap = shorter;
            shorter = longer;
            longer = swap;
        }
        // Special case, for botany, L doesn't abbreviate Lamarck, only Linnaeus:
        if (shorter.equals("L") && longer.equals("Lamarck")) {
            return false;
        }
        return longer.startsWith(shorter) || knownMatch(shorter, longer);
    }

    /**
     * Given a botanical authorship string, split it into a list of component 
     * authors on parenthetical authors, ex authors, and sanctioning authors.
     * <p>
     * The string is split in three passes: a leading parenthetical author followed
     * by further text is separated at ")", each resulting piece is separated at
     * " ex ", and each of those at ":".  At every pass only the first two pieces of
     * a split are kept.  A separator with nothing after it (for example "Fr.:")
     * yields an empty second piece instead of failing.
     * 
     * @param authorship to tokenize
     * @return a list of trimmed authorship strings representing the components of the 
     * authorship string; an empty list if authorship is null or empty.
     */
    public static List<String> tokenizeAuthorship(String authorship) {
        List<String> result = new ArrayList<String>();
        if (authorship == null || authorship.length() == 0) {
            return result;
        }

        // separate out parenthetical author
        logger.debug(authorship);
        List<String> bits = new ArrayList<String>();
        if (authorship.matches("^\\(.*\\).+$")) {
            // limit -1 keeps trailing empty strings, so index 1 always exists
            String[] parBits = authorship.split("\\)", -1);
            logger.debug(parBits.length);
            logger.debug(parBits[0]);
            bits.add(parBits[0].replaceFirst("^\\(", ""));
            bits.add(parBits[1]);
        } else {
            bits.add(authorship);
        }

        // separate out ex author
        List<String> subbits = new ArrayList<String>();
        for (String bit : bits) {
            if (bit.contains(" ex ")) {
                String[] exBits = bit.split(" ex ", -1);
                logger.debug(exBits.length);
                logger.debug(exBits[0]);
                logger.debug(exBits[1]);
                subbits.add(exBits[0]);
                subbits.add(exBits[1]);
            } else {
                logger.debug(bit);
                subbits.add(bit);
            }
        }

        // separate out sanctioning author, stripping any leading/trailing spaces
        for (String bit : subbits) {
            if (bit.contains(":")) {
                String[] sanctionBits = bit.split(":", -1);
                result.add(sanctionBits[0].trim());
                result.add(sanctionBits[1].trim());
            } else {
                result.add(bit.trim());
            }
        }

        return result;
    }

    /**
     * Unusual standard abbreviations (keys, without periods) and the name and
     * initials they stand for (values).  Built once and only read afterwards.
     * NOTE(review): several entries (e.g. "Mller", "Ruz", "Sahagn", "Magalhes") look
     * as if accented characters were lost from them; verify against the upstream
     * source before relying on those pairs.
     */
    private static final HashMap<String, String> KNOWN_ABBREVIATIONS = buildKnownAbbreviations();

    /**
     * Populate the table of known abbreviation and name pairs.
     */
    private static HashMap<String, String> buildKnownAbbreviations() {
        HashMap<String, String> knowns = new HashMap<String, String>();
        knowns.put("DC", "de Candolle");
        knowns.put("Mendoza-Garca", "M Mendoza G");
        knowns.put("L f", "C Linnaeus f");
        knowns.put("Mll Arg", "J. Mller Arg");
        knowns.put("Ruiz", "H Ruz L");
        knowns.put("Germ", "J N E Germain S-P");
        knowns.put("Mll Berol", "K Mller Berol");
        knowns.put("Chick", "J W Chickering, Jr");
        knowns.put("Velez-Nauer", "C Vlez N");
        knowns.put("Ibarra-Manr", "G Ibarra M");
        knowns.put("Vera-Caletti", "P Vera C");
        knowns.put("Chzaro", "M J Chazaro B");
        knowns.put("Sahagun", "E Sahagn G");
        knowns.put("Marrero Rodr", "A Marrero R");
        knowns.put("Magalh", "C T Magalhes G");
        knowns.put("Lucas Rodr", "R L Rodriguez C");
        knowns.put("M Schultz", "Schultz, M");
        knowns.put("Hechav", "L Hechavarra S");
        knowns.put("FrancGut", "J A Francisco Gut");
        knowns.put("Karnk", "A Karnkowska-Ish");
        knowns.put("Sm", "J E Smith");
        knowns.put("Hue", "A-M Hue");
        knowns.put("Day", "M A Day");
        knowns.put("Fr", "E M Fries");
        return knowns;
    }

    /**
     * Are anAuthor and anotherAuthor a known match for a small set of 
     * unusual standard abbreviations linked to name and initials 
     * (e.g. DC. for de Candolle). 
     * <p>
     * Periods are removed from an argument before it is looked up as an
     * abbreviation; the name it maps to must then equal the other argument exactly.
     * The test is made in both directions.
     * 
     * @param anAuthor for comparison
     * @param anotherAuthor to compare with anAuthor
     * @return true if anAuthor and anotherAuthor form a known pair of
     * name and abbreviation.
     */
    public static boolean knownMatch(String anAuthor, String anotherAuthor) {
        String m1 = KNOWN_ABBREVIATIONS.get(anAuthor.replace(".", ""));
        String m2 = KNOWN_ABBREVIATIONS.get(anotherAuthor.replace(".", ""));
        return (m1 != null && m1.equals(anotherAuthor)) || (m2 != null && m2.equals(anAuthor));
    }

    /**
     * Decide whether two author strings carry conflicting initials.  The initials
     * of each string are obtained with {@link #extractInitials(String)}; they are
     * reported as different only when both strings have the same, non-zero number
     * of initials and those initials are not equal.
     * 
     * @param anAuthorBit for comparison
     * @param toOtherAuthorBit to compare with anAuthorBit
     * @return true if both author bits have the same number of initials (at least
     *  one) and the initials differ, otherwise false
     */
    public static boolean initalsAreDifferent(String anAuthorBit, String toOtherAuthorBit) {
        String firstInitials = extractInitials(anAuthorBit);
        String secondInitials = extractInitials(toOtherAuthorBit);
        boolean sameNonZeroCount = firstInitials.length() > 0
                && firstInitials.length() == secondInitials.length();
        if (sameNonZeroCount && !firstInitials.equals(secondInitials)) {
            logger.debug("Different Initials: " + firstInitials + ":" + secondInitials);
            return true;
        }
        return false;
    }

    /**
     * Given two strings, report whether the shorter one appears not to be an
     * abbreviation of the longer one.  The arguments may be passed in either order.
     * 
     * @param shorter a string (doesn't have to be the shorter of the two).
     * @param longer string to compare with (doesn't have to be the longer of the two).
     * 
     * @return true if the strings differ in length, the shorter of them does not end
     *  with a period and the longer of them contains no period; false otherwise,
     *  including when both strings have the same length.
     */
    public static boolean shorterButNotAbbreviation(String shorter, String longer) {
        // Work out which argument really is the shorter one.
        boolean swapped = shorter.length() > longer.length();
        String brief = swapped ? longer : shorter;
        String full = swapped ? shorter : longer;

        // With no period at the end of the brief form and none anywhere in the
        // full form, nothing marks the brief form as an abbreviation.
        boolean lengthsDiffer = brief.length() < full.length();
        if (lengthsDiffer && !(brief.endsWith(".") || full.contains("."))) {
            logger.debug("Missmatch: " + brief + ":" + full);
            return true;
        }
        return false;
    }

}