Java tutorial: the GATE ANNIE OrthoMatcher (OrthoMatcher.java)
/*
 * OrthoMatcher.java
 *
 * Copyright (c) 1995-2012, The University of Sheffield. See the file
 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 * This file is part of GATE (see http://gate.ac.uk/), and is free
 * software, licenced under the GNU Library General Public License,
 * Version 2, June 1991 (in the distribution as file licence.html,
 * and also available at http://gate.ac.uk/gate/licence.html).
 *
 * Kalina Bontcheva, 24/August/2001
 *
 * Major update by Andrew Borthwick of Spock Networks, 11/13/2007 - 8/3/2008:
 * 1. matchWithPrevious now searches for matching annotations in order, starting from current
 *    and working backwards until it finds a match. This compares with the previous behavior,
 *    which searched randomly among previous annotations for a match (because it used an iterator
 *    across an AnnotationSet, whereas now we iterate across an ArrayList<Annotation>)
 * 2. We no longer require that identical strings always refer to the same entity. We can correctly
 *    match the sequence "David Jones ... David ... David Smith ... David" as referring to two people,
 *    tying the first David to "David Jones" and the second David to "David Smith". Ditto with
 *    David Jones .. Mr. Jones .. Richard Jones .. Mr. Jones
 * 3. We now allow for nickname matches for Persons (David = Dave) via the "fuzzyMatch" method
 *    which is referenced in some of the matching rules.
 * 4. Optional parameter highPrecisionOrgs only allows high precision matches for organizations
 *    and turns off the riskier rules. Under this option, need to match on something like IBM = IBM Corp.
 * 5. Various fixes to a number of rules
 *
 * $Id: OrthoMatcher.java 8929 2007-07-12 16:49:55Z ian_roberts $
 */
package gate.creole.orthomatcher;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.BomStrippingInputStreamReader;
import gate.util.GateRuntimeException;
import gate.util.InvalidOffsetException;
import gate.util.OffsetComparator;
import gate.util.Out;

import java.io.BufferedReader;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;

@CreoleResource(name = "ANNIE OrthoMatcher",
    comment = "ANNIE orthographical coreference component.",
    helpURL = "http://gate.ac.uk/userguide/sec:annie:orthomatcher",
    icon = "ortho-matcher")
public class OrthoMatcher extends AbstractLanguageAnalyser {

  private static final long serialVersionUID = -6258229350677707465L;

  protected static final Logger log = Logger.getLogger(OrthoMatcher.class);

  public static final boolean DEBUG = false;

  public static final String OM_DOCUMENT_PARAMETER_NAME = "document";
  public static final String OM_ANN_SET_PARAMETER_NAME = "annotationSetName";
  public static final String OM_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";
  public static final String OM_ANN_TYPES_PARAMETER_NAME = "annotationTypes";
  public static final String OM_ORG_TYPE_PARAMETER_NAME = "organizationType";
  public static final String OM_PERSON_TYPE_PARAMETER_NAME = "personType";
  public static final String OM_EXT_LISTS_PARAMETER_NAME = "extLists";

  protected static final String CDGLISTNAME = "cdg";
  protected static final String ALIASLISTNAME = "alias";
  protected static final String ARTLISTNAME = "def_art";
  protected static final String PREPLISTNAME = "prepos";
  protected static final String CONNECTORLISTNAME = "connector";
  protected static final String SPURLISTNAME = "spur_match";

  protected static final String PUNCTUATION_VALUE = "punctuation";
  protected static final String THE_VALUE = "The";

  /** the name of the annotation set */
  protected String annotationSetName;

  /** the types of the annotation */
  protected List<String> annotationTypes = new ArrayList<String>(10);

  /** the organization type */
  protected String organizationType = ORGANIZATION_ANNOTATION_TYPE;

  /** the person type */
  protected String personType = PERSON_ANNOTATION_TYPE;

  protected String unknownType = "Unknown";

  /** internal or external list */
  protected boolean extLists = true;

  /** Use only high precision rules for Organizations */
  protected Boolean highPrecisionOrgs = false;

  /** matching unknowns or not */
  protected boolean matchingUnknowns = true;

  /** This is an internal variable to indicate whether
   *  we matched using a rule that requires that
   *  the newly matched annotation matches all the others
   *  This is needed, because organizations can share
   *  first/last tokens like News and be different */
  protected boolean allMatchingNeeded = false;

  //** Orthomatching is not case-sensitive by default*/
  protected boolean caseSensitive = false;

  //protected FeatureMap queryFM = Factory.newFeatureMap();

  // name lookup tables (used for namematch)
  //gave them bigger default size, coz rehash is expensive
  protected HashMap<String, String> alias = new HashMap<String, String>(100);
  protected Set<String> cdg = new HashSet<String>();
  protected HashMap<String, String> spur_match = new HashMap<String, String>(100);
  protected HashMap<String, String> def_art = new HashMap<String, String>(20);
  protected HashMap<String, String> connector = new HashMap<String, String>(20);
  protected HashMap<String, String> prepos = new HashMap<String, String>(30);

  protected AnnotationSet nameAllAnnots = null;

  protected HashMap<Integer, String> processedAnnots = new HashMap<Integer, String>(150);
  protected HashMap<Integer, String> annots2Remove = new HashMap<Integer, String>(75);
  protected List<List<Integer>> matchesDocFeature = new ArrayList<List<Integer>>();

  //maps annotation ids to array lists of tokens
  protected HashMap<Integer, List<Annotation>> tokensMap =
      new HashMap<Integer, List<Annotation>>(150);

  public Map<Integer, List<Annotation>> getTokensMap() {
    return tokensMap;
  }

  protected Map<Integer, List<Annotation>> normalizedTokensMap =
      new HashMap<Integer, List<Annotation>>(150);

  protected Annotation shortAnnot;
  protected Annotation longAnnot;

  protected ArrayList<Annotation> tokensLongAnnot;
  protected ArrayList<Annotation> tokensShortAnnot;
  protected ArrayList<Annotation> normalizedTokensLongAnnot, normalizedTokensShortAnnot;

  /**
   * URL to the file containing the definition for this orthomatcher
   */
  private java.net.URL definitionFileURL;

  private Double minimumNicknameLikelihood;

  /** The encoding used for the definition file and associated lists. */
  private String encoding;

  private Map<Integer, OrthoMatcherRule> rules = new HashMap<Integer, OrthoMatcherRule>();

  /** to be initialized in init() */
  private AnnotationOrthography orthoAnnotation;

  /** @link dependency */
  /*#OrthoMatcher lnkOrthoMatcher;*/
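  /*
   * Illustrative note (added for this tutorial, not part of the original GATE source):
   * the matcher reports its results through the "matches" feature, in two places:
   *
   *   - on the document (DOCUMENT_COREF_FEATURE_NAME): a Map<String, List<List<Integer>>>
   *     keyed by annotation set name, where each inner List<Integer> is one coreference
   *     chain of annotation ids, e.g. { "MySet" : [ [3, 17, 42], [5, 29] ] }
   *     (the set name and ids here are hypothetical);
   *
   *   - on each matched annotation (ANNOTATION_COREF_FEATURE_NAME): a List<Integer>
   *     holding the ids of the annotations it has been matched with.
   *
   * During a run, matchesDocFeature above accumulates the chains for the current
   * document, while processedAnnots maps already-seen annotation ids to their
   * (normalized) strings.
   */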
  public OrthoMatcher() {
    annotationTypes.add(organizationType);
    annotationTypes.add(personType);
    annotationTypes.add("Location");
    annotationTypes.add("Date");
  }

  /** Initialise the rules. The orthomatcher loads its built-in rules. */
  private void initRules() {
    //this line should be executed after spur_match is loaded
    rules.put(0, new MatchRule0(this));
    rules.put(1, new MatchRule1(this));
    rules.put(2, new MatchRule2(this));
    rules.put(3, new MatchRule3(this));
    rules.put(4, new MatchRule4(this));
    rules.put(5, new MatchRule5(this));
    rules.put(6, new MatchRule6(this));
    rules.put(7, new MatchRule7(this));
    rules.put(8, new MatchRule8(this));
    rules.put(9, new MatchRule9(this));
    rules.put(10, new MatchRule10(this));
    rules.put(11, new MatchRule11(this));
    rules.put(12, new MatchRule12(this));
    rules.put(13, new MatchRule13(this));
    rules.put(14, new MatchRule14(this));
    rules.put(15, new MatchRule15(this));
    rules.put(16, new MatchRule16(this));
    rules.put(17, new MatchRule17(this));
  }

  /** Override this method to add, replace, remove rules */
  protected void modifyRules(Map<Integer, OrthoMatcherRule> rules) {
  }

  /** Initialise this resource, and return it. */
  @SuppressWarnings("resource")
  @Override
  public Resource init() throws ResourceInstantiationException {
    //initialise the list of annotations which we will match
    if (definitionFileURL == null) {
      throw new ResourceInstantiationException("No URL provided for the definition file!");
    }
    String nicknameFile = null;
    BufferedReader reader = null;
    //at this point we have the definition file
    try {
      reader = new BomStrippingInputStreamReader(definitionFileURL.openStream(), encoding);
      String lineRead = null;
      //boolean foundANickname = false;
      while ((lineRead = reader.readLine()) != null) {
        int index = lineRead.indexOf(":");
        if (index != -1) {
          String nameFile = lineRead.substring(0, index);
          String nameList = lineRead.substring(index + 1, lineRead.length());
          if (nameList.equals("nickname")) {
            if (minimumNicknameLikelihood == null) {
              throw new ResourceInstantiationException(
                  "No value for the required parameter " +
                  "minimumNicknameLikelihood!");
            }
            nicknameFile = nameFile;
          } else {
            createAnnotList(nameFile, nameList);
          }
        } // if
      } //while
      reader.close();
      URL nicknameURL = null;
      if (nicknameFile != null)
        nicknameURL = new URL(definitionFileURL, nicknameFile);
      this.orthoAnnotation = new BasicAnnotationOrthography(personType, extLists, unknownType,
          nicknameURL, minimumNicknameLikelihood, encoding);
      initRules();
      modifyRules(rules);
    } catch (IOException ioe) {
      throw new ResourceInstantiationException(ioe);
    } finally {
      IOUtils.closeQuietly(reader);
    }
    return this;
  } // init()

  /** Run the resource. It doesn't make sense not to override
   *  this in subclasses so the default implementation signals an
   *  exception.
   */
  @Override
  public void execute() throws ExecutionException {
    try {
      //check the input
      if (document == null) {
        throw new ExecutionException("No document for namematch!");
      }
      fireStatusChanged("OrthoMatcher processing: " + document.getName());

      // get the annotations from document
      if ((annotationSetName == null) || (annotationSetName.equals("")))
        nameAllAnnots = document.getAnnotations();
      else
        nameAllAnnots = document.getAnnotations(annotationSetName);

      //if none found, print warning and exit
      if ((nameAllAnnots == null) || nameAllAnnots.isEmpty()) {
        Out.prln("OrthoMatcher Warning: No annotations found for processing");
        return;
      }

      //check if we've been run on this document before
      //and clean the doc if needed
      docCleanup();

      @SuppressWarnings("unchecked")
      Map<String, List<List<Integer>>> matchesMap = (Map<String, List<List<Integer>>>)
          document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME);

      // creates the cdg list from the document
      //no need to create otherwise, coz already done in init()
      if (!extLists)
        cdg = orthoAnnotation.buildTables(nameAllAnnots);

      //Match all name annotations and unknown annotations
      matchNameAnnotations();

      //used to check if the Orthomatcher works properly
      //OrthoMatcherHelper.setMatchesPositions(nameAllAnnots);

      // set the matches of the document
      // determineMatchesDocument();
      if (!matchesDocFeature.isEmpty()) {
        if (matchesMap == null) {
          matchesMap = new HashMap<String, List<List<Integer>>>();
        }
        matchesMap.put(nameAllAnnots.getName(), matchesDocFeature);
        // System.out.println("matchesMap is: " + matchesMap);
        //we need to put it even if it was already present in order to trigger
        //the update events
        document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, matchesMap);
        //cannot do clear() as this has already been put on the document
        //so I need a new one for the next run of matcher
        matchesDocFeature = new ArrayList<List<Integer>>();
        fireStatusChanged("OrthoMatcher completed");
      }
    } finally {
      //make sure the cleanup happens even if there are errors.
      // Out.prln("Processed strings" + processedAnnots.values());
      //clean-up the internal data structures for next run
      nameAllAnnots = null;
      processedAnnots.clear();
      annots2Remove.clear();
      tokensMap.clear();
      normalizedTokensMap.clear();
      matchesDocFeature = new ArrayList<List<Integer>>();
      longAnnot = null;
      shortAnnot = null;
      tokensLongAnnot = null;
      tokensShortAnnot = null;
      //if (log.isDebugEnabled()) OrthoMatcherHelper.saveUsedTable();
    }
  } // run()

  protected void matchNameAnnotations() throws ExecutionException {
    // go through all the annotation types
    Iterator<String> iterAnnotationTypes = annotationTypes.iterator();
    while (iterAnnotationTypes.hasNext()) {
      String annotationType = iterAnnotationTypes.next();
      AnnotationSet nameAnnots = nameAllAnnots.get(annotationType);

      // continue if no such annotations exist
      if (nameAnnots.isEmpty())
        continue;

      AnnotationSet tokensNameAS = nameAllAnnots.get(TOKEN_ANNOTATION_TYPE);
      if (tokensNameAS.isEmpty())
        continue;

      ArrayList<Annotation> sortedNameAnnots = new ArrayList<Annotation>(nameAnnots);
      Collections.<Annotation>sort(sortedNameAnnots, new OffsetComparator());

      for (int snaIndex = 0; snaIndex < sortedNameAnnots.size(); snaIndex++) {
        Annotation tempAnnot = sortedNameAnnots.get(snaIndex);
        Annotation nameAnnot = nameAllAnnots.get(tempAnnot.getId()); // Not sure if this matters

        // get string and value
        String annotString = orthoAnnotation.getStringForAnnotation(nameAnnot, document);

        //convert to lower case if we are not doing a case sensitive match
        if (!caseSensitive)
          annotString = annotString.toLowerCase();

        if (DEBUG) {
          if (log.isDebugEnabled()) {
            log.debug("Now processing the annotation: "
                + orthoAnnotation.getStringForAnnotation(nameAnnot, document)
                + " Id: " + nameAnnot.getId() + " Type: " + nameAnnot.getType()
                + " Offset: " + nameAnnot.getStartNode().getOffset());
          }
        }

        // get the tokens
        List<Annotation> tokens = new ArrayList<Annotation>(tokensNameAS
            .getContained(nameAnnot.getStartNode().getOffset(), nameAnnot.getEndNode().getOffset()));

        //if no tokens to match, do nothing
        if (tokens.isEmpty()) {
          if (log.isDebugEnabled()) {
            log.debug("Didn't find any tokens for the following annotation. We will be unable to perform coref on this annotation. \n String: "
                + orthoAnnotation.getStringForAnnotation(nameAnnot, document)
                + " Id: " + nameAnnot.getId() + " Type: " + nameAnnot.getType());
          }
          continue;
        }
        Collections.sort(tokens, new gate.util.OffsetComparator());

        //check if these actually do not end after the name
        //needed coz new tokeniser conflates
        //strings with dashes. So British Gas-style is two tokens
        //instead of three. So cannot match properly British Gas
        // tokens = checkTokens(tokens);
        tokensMap.put(nameAnnot.getId(), tokens);
        normalizedTokensMap.put(nameAnnot.getId(), new ArrayList<Annotation>(tokens));

        //first check whether we have not matched such a string already
        //if so, just consider it matched, don't bother calling the rules
        // Exception: AB, Spock:
        // Note that we require one-token Person annotations to be matched even if an identical string
        // has been matched earlier because there could be multiple people named "David", for instance,
        // on a page.
if (processedAnnots.containsValue(annotString) && (!(nameAnnot.getType().equals(personType) && (tokens.size() == 1)))) { Annotation returnAnnot = orthoAnnotation.updateMatches(nameAnnot, annotString, processedAnnots, nameAllAnnots, matchesDocFeature); if (returnAnnot != null) { if (DEBUG) { if (log.isDebugEnabled()) { log.debug("Exact match criteria matched " + annotString + " from (id: " + nameAnnot.getId() + ", offset: " + nameAnnot.getStartNode().getOffset() + ") to " + "(id: " + returnAnnot.getId() + ", offset: " + returnAnnot.getStartNode().getOffset() + ")"); } } processedAnnots.put(nameAnnot.getId(), annotString); continue; } } else if (processedAnnots.isEmpty()) { // System.out.println("First item put in processedAnnots: " + annotString); processedAnnots.put(nameAnnot.getId(), annotString); continue; } //if a person, then remove their title before matching if (nameAnnot.getType().equals(personType)) { annotString = orthoAnnotation.stripPersonTitle(annotString, nameAnnot, document, tokensMap, normalizedTokensMap, nameAllAnnots); normalizePersonName(nameAnnot); } else if (nameAnnot.getType().equals(organizationType)) annotString = normalizeOrganizationName(annotString, nameAnnot); if (null == annotString || "".equals(annotString) || tokens.isEmpty()) { if (log.isDebugEnabled()) { log.debug("Annotation ID " + nameAnnot.getId() + " of type" + nameAnnot.getType() + " refers to a null or empty string or one with no tokens after normalization. Unable to process further."); } continue; } //otherwise try matching with previous annotations matchWithPrevious(nameAnnot, annotString, sortedNameAnnots, snaIndex); // Out.prln("Putting in previous " + nameAnnot + ": string " + annotString); //finally add the current annotations to the processed map processedAnnots.put(nameAnnot.getId(), annotString); } //while through name annotations if (matchingUnknowns) { matchUnknown(sortedNameAnnots); } } //while through annotation types } protected void matchUnknown(ArrayList<Annotation> sortedAnnotationsForAType) throws ExecutionException { //get all Unknown annotations AnnotationSet unknownAnnots = nameAllAnnots.get(unknownType); annots2Remove.clear(); if (unknownAnnots.isEmpty()) return; AnnotationSet nameAllTokens = nameAllAnnots.get(TOKEN_ANNOTATION_TYPE); if (nameAllTokens.isEmpty()) return; Iterator<Annotation> iter = unknownAnnots.iterator(); //loop through the unknown annots while (iter.hasNext()) { Annotation unknown = iter.next(); // get string and value String unknownString = orthoAnnotation.getStringForAnnotation(unknown, document); //convert to lower case if we are not doing a case sensitive match if (!caseSensitive) unknownString = unknownString.toLowerCase(); // System.out.println("Now trying to match the unknown string: " + unknownString); //get the tokens List<Annotation> tokens = new ArrayList<Annotation>(nameAllTokens .getContained(unknown.getStartNode().getOffset(), unknown.getEndNode().getOffset())); if (tokens.isEmpty()) continue; Collections.sort(tokens, new gate.util.OffsetComparator()); tokensMap.put(unknown.getId(), tokens); normalizedTokensMap.put(unknown.getId(), tokens); //first check whether we have not matched such a string already //if so, just consider it matched, don't bother calling the rules if (processedAnnots.containsValue(unknownString)) { Annotation matchedAnnot = orthoAnnotation.updateMatches(unknown, unknownString, processedAnnots, nameAllAnnots, matchesDocFeature); if (matchedAnnot == null) { log.debug("Orthomatcher: Unable to find the annotation: " + 
orthoAnnotation.getStringForAnnotation(unknown, document) + " in matchUnknown"); } else { if (matchedAnnot.getType().equals(unknownType)) { annots2Remove.put(unknown.getId(), annots2Remove.get(matchedAnnot.getId())); } else annots2Remove.put(unknown.getId(), matchedAnnot.getType()); processedAnnots.put(unknown.getId(), unknownString); unknown.getFeatures().put("NMRule", unknownType); continue; } } //check if we should do sub-string matching in case it's hyphenated //for example US-led if (tokens.size() == 1 && "hyphen".equals(unknown.getFeatures().get(TOKEN_KIND_FEATURE_NAME))) { if (matchHyphenatedUnknowns(unknown, unknownString, iter)) continue; } //if // TODO: The below results in a assigning the unknown's to the last annotation that it matches in a document. // It would probably be better to first start with things which precede the current unknown and then do // annotations after matchWithPrevious(unknown, unknownString, sortedAnnotationsForAType, sortedAnnotationsForAType.size() - 1); } //while though unknowns if (!annots2Remove.isEmpty()) { Iterator<Integer> unknownIter = annots2Remove.keySet().iterator(); while (unknownIter.hasNext()) { Integer unknId = unknownIter.next(); Annotation unknown = nameAllAnnots.get(unknId); Integer newID = nameAllAnnots.add(unknown.getStartNode(), unknown.getEndNode(), annots2Remove.get(unknId), unknown.getFeatures()); nameAllAnnots.remove(unknown); //change the id in the matches list @SuppressWarnings("unchecked") List<Integer> mList = (List<Integer>) unknown.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME); mList.remove(unknId); mList.add(newID); } //while } //if } private boolean matchHyphenatedUnknowns(Annotation unknown, String unknownString, Iterator<Annotation> iter) { boolean matched = false; //only take the substring before the hyphen int stringEnd = unknownString.indexOf("-"); unknownString = unknownString.substring(0, stringEnd); //check if we've already matched this string //because only exact match of the substring are considered if (processedAnnots.containsValue(unknownString)) { matched = true; Annotation matchedAnnot = orthoAnnotation.updateMatches(unknown, unknownString, processedAnnots, nameAllAnnots, matchesDocFeature); //only do the matching if not a person, because we do not match //those on sub-strings iter.remove(); String newType; if (matchedAnnot.getType().equals(unknownType)) newType = annots2Remove.get(matchedAnnot.getId()); else newType = matchedAnnot.getType(); Integer newID = new Integer(-1); try { newID = nameAllAnnots.add(unknown.getStartNode().getOffset(), new Long(unknown.getStartNode().getOffset().longValue() + stringEnd), newType, unknown.getFeatures()); } catch (InvalidOffsetException ex) { throw new GateRuntimeException(ex.getMessage()); } nameAllAnnots.remove(unknown); //change the id in the matches list @SuppressWarnings("unchecked") List<Integer> mList = (List<Integer>) unknown.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME); mList.remove(unknown.getId()); mList.add(newID); } return matched; } /** * Attempt to match nameAnnot against all previous annotations of the same type, which are passed down * in listOfThisType. Matches are tested in order from most recent to oldest. 
* @param nameAnnot Annotation we are trying to match * @param annotString Normalized string representation of annotation * @param listOfThisType ArrayList of Annotations of the same type as nameAnnot * @param startIndex Index in listOfThisType that we will start from in matching the current annotation */ protected void matchWithPrevious(Annotation nameAnnot, String annotString, ArrayList<Annotation> listOfThisType, int startIndex) { boolean matchedUnknown = false; // Out.prln("matchWithPrevious now processing: " + annotString); for (int curIndex = startIndex - 1; curIndex >= 0; curIndex--) { Integer prevId = listOfThisType.get(curIndex).getId(); Annotation prevAnnot = nameAllAnnots.get(prevId); // Note that this line probably isn't necessary anymore //check if the two are from the same type or the new one is unknown if (prevAnnot == null || (!prevAnnot.getType().equals(nameAnnot.getType()) && !nameAnnot.getType().equals(unknownType))) continue; //do not compare two unknown annotations either //they are only matched to those of known types if (nameAnnot.getType().equals(unknownType) && prevAnnot.getType().equals(unknownType)) continue; //check if we have already matched this annotation to the new one if (orthoAnnotation.matchedAlready(nameAnnot, prevAnnot, matchesDocFeature, nameAllAnnots)) continue; //now changed to a rule, here we just match by gender if (prevAnnot.getType().equals(personType)) { String prevGender = (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME); String nameGender = (String) nameAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME); if (prevGender != null && nameGender != null && ((nameGender.equalsIgnoreCase("female") && prevGender.equalsIgnoreCase("male")) || (prevGender.equalsIgnoreCase("female") && nameGender.equalsIgnoreCase("male")))) //if condition continue; //we don't have a match if the two genders are different } //if //if the two annotations match // // A. Borthwick, Spock: If the earlier annotation is shorter than the current annotation and it // has already been matched with a longer annotations, then don't match it with the current annotation. // Reasoning is that with the sequence David Jones . . . David . . . David Smith, we don't want to match // David Smith with David. However, with the sequence, David . . . David Jones, it's okay to match the // shorter version with the longer, because it hasn't already been matched with a longer. boolean prevAnnotUsedToMatchWithLonger = prevAnnot.getFeatures().containsKey("matchedWithLonger"); if (matchAnnotations(nameAnnot, annotString, prevAnnot)) { orthoAnnotation.updateMatches(nameAnnot, prevAnnot, matchesDocFeature, nameAllAnnots); if (DEBUG) { log.debug( "Just matched nameAnnot " + nameAnnot.getId() + " with prevAnnot " + prevAnnot.getId()); } if (!prevAnnotUsedToMatchWithLonger && prevAnnot.getFeatures().containsKey("matchedWithLonger")) { // We have just matched the previous annotation with a longer annotation for the first time. 
We need // to propagate the matchedWithLonger property to all other annotations which coreffed with the previous annotation // so that we don't match them with a longer annotation propagatePropertyToExactMatchingMatches(prevAnnot, "matchedWithLonger", true); } //if unknown annotation, we need to change to the new type if (nameAnnot.getType().equals(unknownType)) { matchedUnknown = true; if (prevAnnot.getType().equals(unknownType)) annots2Remove.put(nameAnnot.getId(), annots2Remove.get(prevAnnot.getId())); else annots2Remove.put(nameAnnot.getId(), prevAnnot.getType()); //also put an attribute to indicate that nameAnnot.getFeatures().put("NMRule", unknownType); } //if unknown break; //no need to match further } //if annotations matched } //while through previous annotations if (matchedUnknown) processedAnnots.put(nameAnnot.getId(), annotString); }//matchWithPrevious protected void propagatePropertyToExactMatchingMatches(Annotation updateAnnot, String featureName, Object value) { try { @SuppressWarnings("unchecked") List<Integer> matchesList = (List<Integer>) updateAnnot.getFeatures() .get(ANNOTATION_COREF_FEATURE_NAME); if ((matchesList == null) || matchesList.isEmpty()) { return; } else { String updateAnnotString = orthoAnnotation.getStringForAnnotation(updateAnnot, document) .toLowerCase(); for (Integer nextId : matchesList) { Annotation a = nameAllAnnots.get(nextId); if (orthoAnnotation.fuzzyMatch(orthoAnnotation.getStringForAnnotation(a, document), updateAnnotString)) { if (DEBUG) { log.debug("propogateProperty: " + featureName + " " + value + " from: " + updateAnnot.getId() + " to: " + a.getId()); } a.getFeatures().put(featureName, value); } } } } catch (Exception e) { log.error("Error in propogatePropertyToExactMatchingMatches", e); } } protected boolean matchAnnotations(Annotation newAnnot, String annotString, Annotation prevAnnot) { //do not match two annotations that overlap if (newAnnot.overlaps(prevAnnot)) return false; // find which annotation string of the two is longer // this is useful for some of the matching rules String prevAnnotString = processedAnnots.get(prevAnnot.getId()); // Out.prln("matchAnnotations processing " + annotString + " and " + prevAnnotString); if (prevAnnotString == null) { // Out.prln("We discovered that the following string is null!: " + prevAnnot.getId() + // " For the previous annotation " + getStringForAnnotation(prevAnnot, document) + // " which has annotation type " + prevAnnot.getType() + // " Tried to compared it to the annotation string " + annotString); return false; } String longName = prevAnnotString; String shortName = annotString; longAnnot = prevAnnot; shortAnnot = newAnnot; boolean longerPrevious = true; if (shortName.length() > longName.length()) { String temp = longName; longName = shortName; shortName = temp; Annotation tempAnn = longAnnot; longAnnot = shortAnnot; shortAnnot = tempAnn; longerPrevious = false; } //if tokensLongAnnot = (ArrayList<Annotation>) tokensMap.get(longAnnot.getId()); normalizedTokensLongAnnot = (ArrayList<Annotation>) normalizedTokensMap.get(longAnnot.getId()); tokensShortAnnot = (ArrayList<Annotation>) tokensMap.get(shortAnnot.getId()); normalizedTokensShortAnnot = (ArrayList<Annotation>) normalizedTokensMap.get(shortAnnot.getId()); @SuppressWarnings("unchecked") List<Integer> matchesList = (List<Integer>) prevAnnot.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME); if (matchesList == null || matchesList.isEmpty()) return apply_rules_namematch(prevAnnot.getType(), shortName, longName, prevAnnot, newAnnot, 
longerPrevious); //if these two match, then let's see if all the other matching one will too //that's needed, because sometimes names can share a token (e.g., first or //last but not be the same if (apply_rules_namematch(prevAnnot.getType(), shortName, longName, prevAnnot, newAnnot, longerPrevious)) { /** * Check whether we need to ensure that there is a match with the rest * of the matching annotations, because the rule requires that * transtivity is not assummed. */ if (allMatchingNeeded) { allMatchingNeeded = false; List<Integer> toMatchList = new ArrayList<Integer>(matchesList); // if (newAnnot.getType().equals(unknownType)) // Out.prln("Matching new " + annotString + " with annots " + toMatchList); toMatchList.remove(prevAnnot.getId()); return matchOtherAnnots(toMatchList, newAnnot, annotString); } else return true; } return false; } /** This method checkes whether the new annotation matches * all annotations given in the toMatchList (it contains ids) * The idea is that the new annotation needs to match all those, * because assuming transitivity does not always work, when * two different entities share a common token: e.g., BT Cellnet * and BT and British Telecom. */ protected boolean matchOtherAnnots(List<Integer> toMatchList, Annotation newAnnot, String annotString) { //if the list is empty, then we're matching all right :-) if (toMatchList.isEmpty()) return true; boolean matchedAll = true; int i = 0; while (matchedAll && i < toMatchList.size()) { Annotation prevAnnot = nameAllAnnots.get(toMatchList.get(i)); // find which annotation string of the two is longer // this is useful for some of the matching rules String prevAnnotString = processedAnnots.get(prevAnnot.getId()); if (prevAnnotString == null) try { prevAnnotString = document.getContent() .getContent(prevAnnot.getStartNode().getOffset(), prevAnnot.getEndNode().getOffset()) .toString(); } catch (InvalidOffsetException ioe) { return false; } //try String longName = prevAnnotString; String shortName = annotString; longAnnot = prevAnnot; shortAnnot = newAnnot; boolean longerPrevious = true; if (shortName.length() >= longName.length()) { String temp = longName; longName = shortName; shortName = temp; Annotation tempAnn = longAnnot; longAnnot = shortAnnot; shortAnnot = tempAnn; longerPrevious = false; } //if tokensLongAnnot = (ArrayList<Annotation>) tokensMap.get(longAnnot.getId()); normalizedTokensLongAnnot = (ArrayList<Annotation>) normalizedTokensMap.get(longAnnot.getId()); tokensShortAnnot = (ArrayList<Annotation>) tokensMap.get(shortAnnot.getId()); normalizedTokensShortAnnot = (ArrayList<Annotation>) normalizedTokensMap.get(shortAnnot.getId()); matchedAll = apply_rules_namematch(prevAnnot.getType(), shortName, longName, prevAnnot, newAnnot, longerPrevious); // if (newAnnot.getType().equals(unknownType)) // Out.prln("Loop: " + shortName + " and " + longName + ": result: " + matchedAll); i++; } //while return matchedAll; } @SuppressWarnings("unchecked") protected void docCleanup() { Object matchesValue = document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME); if (matchesValue != null && (matchesValue instanceof Map)) ((Map<String, List<List<Integer>>>) matchesValue).remove(nameAllAnnots.getName()); else if (matchesValue != null) { document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, new HashMap<String, List<List<Integer>>>()); } //get all annotations that have a matches feature HashSet<String> fNames = new HashSet<String>(); fNames.add(ANNOTATION_COREF_FEATURE_NAME); AnnotationSet annots = nameAllAnnots.get(null, fNames); // 
Out.prln("Annots to cleanup" + annots); if (annots == null || annots.isEmpty()) return; Iterator<Annotation> iter = annots.iterator(); while (iter.hasNext()) { while (iter.hasNext()) iter.next().getFeatures().remove(ANNOTATION_COREF_FEATURE_NAME); } //while }//cleanup static Pattern periodPat = Pattern.compile("[\\.]+"); protected void normalizePersonName(Annotation annot) throws ExecutionException { ArrayList<Annotation> tokens = (ArrayList<Annotation>) normalizedTokensMap.get(annot.getId()); for (int i = tokens.size() - 1; i >= 0; i--) { String tokenString = ((String) tokens.get(i).getFeatures().get(TOKEN_STRING_FEATURE_NAME)); String kind = (String) tokens.get(i).getFeatures().get(TOKEN_KIND_FEATURE_NAME); //String category = (String) tokens.get(i).getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME); if (!caseSensitive) { tokenString = tokenString.toLowerCase(); } // log.debug("tokenString: " + tokenString + " kind: " + kind + " category: " + category); if (kind.equals(PUNCTUATION_VALUE)) { // log.debug("Now tagging it!"); tokens.get(i).getFeatures().put("ortho_stop", true); } } ArrayList<Annotation> normalizedTokens = new ArrayList<Annotation>(tokens); for (int j = normalizedTokens.size() - 1; j >= 0; j--) { if (normalizedTokens.get(j).getFeatures().containsKey("ortho_stop")) { // log.debug("Now removing " + normalizedTokens.get(j).getFeatures().get(TOKEN_STRING_FEATURE_NAME)); normalizedTokens.remove(j); } } // log.debug("normalizedTokens size is: " + normalizedTokens.size()); normalizedTokensMap.put(annot.getId(), normalizedTokens); } /** return an organization without a designator and starting The*/ protected String normalizeOrganizationName(String annotString, Annotation annot) { ArrayList<Annotation> tokens = (ArrayList<Annotation>) tokensMap.get(annot.getId()); //strip starting The first if (((String) tokens.get(0).getFeatures().get(TOKEN_STRING_FEATURE_NAME)).equalsIgnoreCase(THE_VALUE)) tokens.remove(0); if (tokens.size() > 0) { // New code by A. 
Borthwick of Spock Networks // June 13, 2008 // Strip everything on the cdg list, which now encompasses not just cdg's, but also other stopwords // Start from the right side so we don't mess up the arraylist for (int i = tokens.size() - 1; i >= 0; i--) { String tokenString = ((String) tokens.get(i).getFeatures().get(TOKEN_STRING_FEATURE_NAME)); String kind = (String) tokens.get(i).getFeatures().get(TOKEN_KIND_FEATURE_NAME); String category = (String) tokens.get(i).getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME); if (!caseSensitive) { tokenString = tokenString.toLowerCase(); } // Out.prln("tokenString: " + tokenString + " kind: " + kind + " category: " + category); if (kind.equals(PUNCTUATION_VALUE) || ((category != null) && (category.equals("DT") || category.equals("IN"))) || cdg.contains(tokenString)) { // Out.prln("Now tagging it!"); tokens.get(i).getFeatures().put("ortho_stop", true); } } // AB, Spock: Need to check for CDG even for 1 token so we don't automatically match // a one-token annotation called "Company", for instance String compareString = (String) tokens.get(tokens.size() - 1).getFeatures() .get(TOKEN_STRING_FEATURE_NAME); if (!caseSensitive) { compareString = compareString.toLowerCase(); } if (cdg.contains(compareString)) { tokens.remove(tokens.size() - 1); } } ArrayList<Annotation> normalizedTokens = new ArrayList<Annotation>(tokens); for (int j = normalizedTokens.size() - 1; j >= 0; j--) { if (normalizedTokens.get(j).getFeatures().containsKey("ortho_stop")) { normalizedTokens.remove(j); } } normalizedTokensMap.put(annot.getId(), normalizedTokens); StringBuffer newString = new StringBuffer(50); for (int i = 0; i < tokens.size(); i++) { newString.append((String) tokens.get(i).getFeatures().get(TOKEN_STRING_FEATURE_NAME)); if (i != tokens.size() - 1) newString.append(" "); } // Out.prln("Strip CDG returned: " + newString + "for string " + annotString); if (caseSensitive) return newString.toString(); return newString.toString().toLowerCase(); } /** creates the lookup tables */ protected void createAnnotList(String nameFile, String nameList) throws IOException { // create the relative URL URL fileURL = new URL(definitionFileURL, nameFile); BufferedReader bufferedReader = null; try { bufferedReader = new BomStrippingInputStreamReader(fileURL.openStream(), encoding); String lineRead = null; while ((lineRead = bufferedReader.readLine()) != null) { if (nameList.compareTo(CDGLISTNAME) == 0) { Matcher matcher = punctPat.matcher(lineRead.toLowerCase().trim()); lineRead = matcher.replaceAll(" ").trim(); if (caseSensitive) cdg.add(lineRead); else cdg.add(lineRead.toLowerCase()); } // if else { int index = lineRead.indexOf(""); if (index != -1) { String expr = lineRead.substring(0, index); // if not case-sensitive, we need to downcase all strings if (!caseSensitive) expr = expr.toLowerCase(); String code = lineRead.substring(index + 1, lineRead.length()); if (nameList.equals(ALIASLISTNAME)) { alias.put(expr, code); } else if (nameList.equals(ARTLISTNAME)) { def_art.put(expr, code); } else if (nameList.equals(PREPLISTNAME)) { prepos.put(expr, code); } else if (nameList.equals(CONNECTORLISTNAME)) { connector.put(expr, code); } else if (nameList.equals(SPURLISTNAME)) { spur_match.put(expr, code); } } // if } // else } // while } finally { IOUtils.closeQuietly(bufferedReader); } }// createAnnotList /** * This is the skeleton of a function which should be available in OrthoMatcher to allow a pairwise comparison of two name strings * It should eventually be made public. 
It is private here (and thus non-functional) because OrthoMatcher is currently reliant * on the tokenization of the names, which are held in the global variables tokensShortAnnot and tokensLongAnnot * * @param name1 * @param name2 * @return true if the two names indicate the same person */ @SuppressWarnings("unused") private boolean pairwise_person_name_match(String name1, String name2) { String shortName, longName; if (name1.length() > name2.length()) { longName = name1; shortName = name2; } else { longName = name2; shortName = name1; } if (rules.get(0).value(longName, shortName)) {//matchRule0(longName,shortName) return false; } else { if (longName.equals(shortName) || rules.get(2).value(longName, shortName) || rules.get(3).value(longName, shortName)) { return true; } else { return (rules.get(0).value(longName, shortName)); // boolean throwAway[] = new boolean[17]; // return basic_person_match_criteria(shortName,longName,throwAway); // The above doesn't work because basic_person_match_criteria is reliant on the global // variables tokensShortAnnot and tokensLongAnnot so I just call what I can directly } } } /** * basic_person_match_criteria * Note that this function relies on various global variables in some other match rules. */ private boolean basic_person_match_criteria(String shortName, String longName, boolean mr[]) { if ( // For 4, 5, 14, and 15, need to mark shorter annot //kalina: added 16, so it matches names when contain more than one first and one last name OrthoMatcherHelper.executeDisjunction(rules, new int[] { 1, 5, 6, 13, 15, 16 }, longName, shortName, mr)) { return true; } return false; } /** apply_rules_namematch: apply rules similarly to lasie1.5's namematch */ private boolean apply_rules_namematch(String annotationType, String shortName, String longName, Annotation prevAnnot, Annotation followAnnot, boolean longerPrevious) { boolean mr[] = new boolean[rules.size()]; // first apply rule for spurious matches i.e. 
rule0 if (DEBUG) { log.debug("Now matching " + longName + "(id: " + longAnnot.getId() + ") to " + shortName + "(id: " + shortAnnot.getId() + ")"); } if (rules.get(0).value(longName, shortName)) return false; if ((// rules for all annotations //no longer use rule1, coz I do the check for same string via the hash table OrthoMatcherHelper.executeDisjunction(rules, new int[] { 2, 3 }, longName, shortName, mr) ) // rules for all annotations || (// rules for organisation annotations (annotationType.equals(organizationType) //ACE addition || annotationType.equals("Facility")) && // Should basically only match when you have a match of all tokens other than // CDG's and function words ((!highPrecisionOrgs && OrthoMatcherHelper.executeDisjunction(rules, new int[] { 4, 6, 7, 8, 9, 10, 11, 12, 14 }, longName, shortName, mr)) || (highPrecisionOrgs && OrthoMatcherHelper.executeDisjunction(rules, new int[] { 7, 8, 10, 11, 17 }, longName, shortName, mr))))) {// rules for organisation annotations return true; } if (// rules for person annotations (annotationType.equals(personType))) { if (noMatchRule1(longName, shortName, prevAnnot, longerPrevious) || noMatchRule2(longName, shortName)) { // Out.prln("noMatchRule1 rejected match between " + longName + " and " + shortName); return false; } else { if (basic_person_match_criteria(shortName, longName, mr)) { if ((longName.length() != shortName.length()) && (mr[4] || mr[5] || mr[14] || mr[15])) { if (longerPrevious) { followAnnot.getFeatures().put("matchedWithLonger", true); } else { prevAnnot.getFeatures().put("matchedWithLonger", true); } } else if ((longName.length() == shortName.length()) && (mr[1])) { if (prevAnnot.getFeatures().containsKey("matchedWithLonger")) { followAnnot.getFeatures().put("matchedWithLonger", true); } } return true; } return false; } } return false; }//apply_rules /** set the extLists flag */ @Optional @CreoleParameter(comment = "External lists otherwise internal", defaultValue = "true") public void setExtLists(Boolean newExtLists) { extLists = newExtLists.booleanValue(); }//setextLists /** set the caseSensitive flag */ @Optional @CreoleParameter(comment = "Should this resource diferentiate on case?", defaultValue = "false") public void setCaseSensitive(Boolean newCase) { caseSensitive = newCase.booleanValue(); }//setextLists /** set the annotation set name*/ @RunTime @Optional @CreoleParameter(comment = "Annotation set name where are the annotation types (annotationTypes)") public void setAnnotationSetName(String newAnnotationSetName) { annotationSetName = newAnnotationSetName; }//setAnnotationSetName /** set the types of the annotations*/ @RunTime @Optional @CreoleParameter(comment = "Name of the annotation types to use", defaultValue = "Organization;Person;Location;Date") public void setAnnotationTypes(List<String> newType) { annotationTypes = newType; }//setAnnotationTypes /** set whether to process the Unknown annotations*/ @Optional @CreoleParameter(comment = "Should we process 'Unknown' annotations?", defaultValue = "true") public void setProcessUnknown(Boolean processOrNot) { this.matchingUnknowns = processOrNot.booleanValue(); }//setAnnotationTypes @Optional @CreoleParameter(comment = "Annotation name for the organizations", defaultValue = "Organization") public void setOrganizationType(String newOrganizationType) { organizationType = newOrganizationType; }//setOrganizationType @Optional @CreoleParameter(comment = "Annotation name for the persons", defaultValue = "Person") public void setPersonType(String newPersonType) { 
personType = newPersonType; }//setPersonType /**get the name of the annotation set*/ public String getAnnotationSetName() { return annotationSetName; }//getAnnotationSetName /** get the types of the annotation*/ public List<String> getAnnotationTypes() { return annotationTypes; }//getAnnotationTypes public String getOrganizationType() { return organizationType; } public String getPersonType() { return personType; } public Boolean getExtLists() { return new Boolean(extLists); } /** Are we running in a case-sensitive mode?*/ public Boolean getCaseSensitive() { return new Boolean(caseSensitive); } /** Return whether or not we're processing the Unknown annots*/ public Boolean getProcessUnknown() { return new Boolean(matchingUnknowns); } /** No Match Rule 1: Avoids the problem of matching David Jones ... David ... David Smith Since "David" was matched with David Jones, we don't match David with David Smith. */ public boolean noMatchRule1(String s1, String s2, Annotation previousAnnot, boolean longerPrevious) { // if (DEBUG) { // try { // String annotString = getStringForAnnotation(previousAnnot, document ); // log.debug("Previous annotation was " + annotString + "(id: " + previousAnnot.getId() + ")" + " features are " + previousAnnot.getFeatures()); // } // catch (ExecutionException e) {} // } if (longerPrevious || !previousAnnot.getFeatures().containsKey("matchedWithLonger")) { return false; } else { return true; } }//noMatchRule1 /*** * returns true if it detects a middle name which indicates that the name string contains a nickname or a * compound last name */ private boolean detectBadMiddleTokens(ArrayList<Annotation> tokArray) { for (int j = 1; j < tokArray.size() - 1; j++) { String currentToken = (String) tokArray.get(j).getFeatures().get(TOKEN_STRING_FEATURE_NAME); Matcher matcher = badMiddleTokens.matcher(currentToken.toLowerCase().trim()); if (matcher.find()) { // We have found a case of a ", ', return true; } } return false; } /** * NoMatch Rule #2: Do we have a mismatch of middle initial? * Condition(s): Only applies to person names with more than two tokens in the name * * Want George W. Bush != George H. W. Bush and George Walker Bush != George Herbert Walker Bush * and * John T. Smith != John Q. Smith * however * John T. 
Smith == John Thomas Smith * be careful about * Hillary Rodham Clinton == Hillary Rodham-Clinton * be careful about * Carlos Bueno de Lopez == Bueno de Lopez * and * Cynthia Morgan de Rothschild == Cynthia de Rothschild */ @SuppressWarnings("unused") public boolean noMatchRule2(String s1, String s2) { if (normalizedTokensLongAnnot.size() > 2 && normalizedTokensShortAnnot.size() > 2) { boolean retval = false; if (normalizedTokensLongAnnot.size() != normalizedTokensShortAnnot.size()) { String firstNameLong = (String) normalizedTokensLongAnnot.get(0).getFeatures() .get(TOKEN_STRING_FEATURE_NAME); String firstNameShort = (String) normalizedTokensShortAnnot.get(0).getFeatures() .get(TOKEN_STRING_FEATURE_NAME); String lastNameLong = (String) normalizedTokensLongAnnot.get(normalizedTokensLongAnnot.size() - 1) .getFeatures().get(TOKEN_STRING_FEATURE_NAME); String lastNameShort = (String) normalizedTokensShortAnnot .get(normalizedTokensShortAnnot.size() - 1).getFeatures().get(TOKEN_STRING_FEATURE_NAME); if (rules.get(1).value(firstNameLong, firstNameShort) && (rules.get(1).value(lastNameLong, lastNameShort))) { // Must have a match on first and last name for this non-match rule to take effect when the number of tokens differs if (detectBadMiddleTokens(tokensLongAnnot) || detectBadMiddleTokens(tokensShortAnnot)) { // Exclude the William (Bill) H. Gates vs. William H. Gates case and the // Cynthia Morgan de Rothschild vs. Cynthia de Rothschild case if (DEBUG && log.isDebugEnabled()) { log.debug("noMatchRule2Name did not non-match because of bad middle tokens " + s1 + "(id: " + longAnnot.getId() + ") to " + s2 + "(id: " + shortAnnot.getId() + ")"); } return false; } else { // Covers the George W. Bush vs George H. W. Bush and George Walker Bush vs. George Herbert Walker Bush cases retval = true; } } } else { for (int i = 1; i < normalizedTokensLongAnnot.size() - 1; i++) { String s1_middle = (String) normalizedTokensLongAnnot.get(i).getFeatures() .get(TOKEN_STRING_FEATURE_NAME); String s2_middle = (String) normalizedTokensShortAnnot.get(i).getFeatures() .get(TOKEN_STRING_FEATURE_NAME); if (!caseSensitive) { s1_middle = s1_middle.toLowerCase(); s2_middle = s2_middle.toLowerCase(); } // log.debug("noMatchRule2 comparing substring " + s1_middle + " to " + s2_middle); if (!(rules.get(1).value(s1_middle, s2_middle) || OrthoMatcherHelper.initialMatch(s1_middle, s2_middle))) { // We found a mismatching middle name retval = true; break; } } } if (retval && log.isDebugEnabled() && DEBUG) { log.debug("noMatchRule2Name non-matched " + s1 + "(id: " + longAnnot.getId() + ") to " + s2 + "(id: " + shortAnnot.getId() + ")"); } return retval; } // if (normalizedTokensLongAnnot.size()>2 && normalizedTokensShortAnnot.size()>2) return false; }//noMatchRule2 @CreoleParameter(comment = "The URL to the definition file", defaultValue = "resources/othomatcher/listsNM.def", suffixes = "def") public void setDefinitionFileURL(java.net.URL definitionFileURL) { this.definitionFileURL = definitionFileURL; } public java.net.URL getDefinitionFileURL() { return definitionFileURL; } @CreoleParameter(comment = "The encoding used for reading the definition file", defaultValue = "UTF-8") public void setEncoding(String encoding) { this.encoding = encoding; } public String getEncoding() { return encoding; } public Double getMinimumNicknameLikelihood() { return minimumNicknameLikelihood; } @CreoleParameter(comment = "Minimum likelihood that a name is a nickname", defaultValue = "0.50") public void setMinimumNicknameLikelihood(Double 
minimumNicknameLikelihood) { this.minimumNicknameLikelihood = minimumNicknameLikelihood; } /** * @return the highPrecisionOrgs */ public Boolean getHighPrecisionOrgs() { return highPrecisionOrgs; } /** * @param highPrecisionOrgs the highPrecisionOrgs to set */ @Optional @CreoleParameter(comment = "Use very safe features for matching orgs, such as ACME = ACME, Inc.", defaultValue = "false") public void setHighPrecisionOrgs(Boolean highPrecisionOrgs) { this.highPrecisionOrgs = highPrecisionOrgs; } public void setOrthography(AnnotationOrthography orthography) { this.orthoAnnotation = orthography; } public AnnotationOrthography getOrthography() { return orthoAnnotation; } static Pattern punctPat = Pattern.compile("[\\p{Punct}]+"); // The UTF characters are right and left double and single curly quotes static Pattern badMiddleTokens = Pattern.compile("[\u201c\u201d\u2018\u2019\'\\(\\)\"]+|^de$|^von$"); }
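
The class above is a GATE processing resource, so it is normally run as the final step of an ANNIE pipeline rather than called directly. The sketch below shows one way to drive it from GATE Embedded and then read back the coreference chains it stores under the "matches" document feature. This is a minimal sketch, not part of the original source: it assumes a classic GATE 8-era installation whose ANNIE plugin ships the saved application ANNIE_with_defaults.gapp (which already includes the ANNIE OrthoMatcher as its last component); the demo class name, sample text and printed output are purely illustrative.

import java.io.File;
import java.util.List;
import java.util.Map;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Corpus;
import gate.CorpusController;
import gate.Document;
import gate.Factory;
import gate.Gate;
import gate.creole.ANNIEConstants;
import gate.util.persistence.PersistenceManager;

/** Hypothetical driver class: runs ANNIE (including the OrthoMatcher) and prints the chains. */
public class OrthoMatcherDemo {

  public static void main(String[] args) throws Exception {
    Gate.init(); // assumes gate.home / GATE_HOME is configured as usual

    // Load the saved ANNIE application from the plugin directory.
    CorpusController annie = (CorpusController) PersistenceManager.loadObjectFromFile(
        new File(new File(Gate.getPluginsHome(), ANNIEConstants.PLUGIN_DIR),
                 ANNIEConstants.DEFAULT_FILE));

    Document doc = Factory.newDocument(
        "David Jones met David Smith at IBM Corp. Later, David left IBM.");
    Corpus corpus = Factory.newCorpus("orthomatcher-demo");
    corpus.add(doc);
    annie.setCorpus(corpus);
    annie.execute();

    // The OrthoMatcher writes one entry per annotation set into the "matches" feature:
    // annotation set name -> list of coreference chains (each chain is a list of annotation ids).
    @SuppressWarnings("unchecked")
    Map<String, List<List<Integer>>> matches = (Map<String, List<List<Integer>>>)
        doc.getFeatures().get(ANNIEConstants.DOCUMENT_COREF_FEATURE_NAME);

    if (matches != null) {
      for (Map.Entry<String, List<List<Integer>>> entry : matches.entrySet()) {
        // a null key means the chains live in the default annotation set
        AnnotationSet annots = (entry.getKey() == null)
            ? doc.getAnnotations() : doc.getAnnotations(entry.getKey());
        for (List<Integer> chain : entry.getValue()) {
          StringBuilder sb = new StringBuilder("Chain:");
          for (Integer id : chain) {
            Annotation a = annots.get(id);
            sb.append(" \"").append(gate.Utils.stringFor(doc, a)).append('"');
          }
          System.out.println(sb);
        }
      }
    }

    Factory.deleteResource(doc);
    Factory.deleteResource(annie);
  }
}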