org.languagetool.rules.uk.TokenAgreementNounVerbRule.java Source code

Introduction

Here is the source code for org.languagetool.rules.uk.TokenAgreementNounVerbRule.java
Source

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2013 Andriy Rysin
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules.uk;

import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.ResourceBundle;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.rules.Categories;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.tagging.uk.PosTagHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A rule that checks if noun and verb agree
 * 
 * @author Andriy Rysin
 * @since 3.6
 */
public class TokenAgreementNounVerbRule extends Rule {

    private static Logger logger = LoggerFactory.getLogger(TokenAgreementNounVerbRule.class);

    private static final Pattern VERB_INFLECTION_PATTERN = Pattern.compile(":([mfnps])(:([123])?|$)");
    private static final Pattern NOUN_INFLECTION_PATTERN = Pattern
            .compile("(?::((?:[iu]n)?anim))?:([mfnps]):(v_naz)");
    private static final Pattern NOUN_PERSON_PATTERN = Pattern.compile(":([123])");

    public TokenAgreementNounVerbRule(ResourceBundle messages) throws IOException {
        super.setCategory(Categories.MISC.getCategory(messages));
        //    setDefaultOff();
    }

    @Override
    public final String getId() {
        return "UK_NOUN_VERB_INFLECTION_AGREEMENT";
    }

    @Override
    public String getDescription() {
        return "?   ?  , ?  ?";
    }

    public String getShort() {
        return "?   ?";
    }

    /**
     * Indicates if the rule is case-sensitive. 
     * @return true if the rule is case-sensitive, false otherwise.
     */
    public boolean isCaseSensitive() {
        return false;
    }

    @Override
    public final RuleMatch[] match(AnalyzedSentence sentence) {
        List<RuleMatch> ruleMatches = new ArrayList<>();
        AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();

        List<AnalyzedToken> nounTokenReadings = new ArrayList<>();
        AnalyzedTokenReadings nounAnalyzedTokenReadings = null;

        for (int i = 1; i < tokens.length; i++) {
            AnalyzedTokenReadings tokenReadings = tokens[i];

            String posTag0 = tokenReadings.getAnalyzedToken(0).getPOSTag();

            //TODO: skip conj . 

            if (posTag0 == null) {
                nounTokenReadings.clear();
                continue;
            }

            if (nounTokenReadings.isEmpty()) {
                // no need to start checking on last token or if no noun
                if (i == tokens.length - 1)
                    continue;

                if (!PosTagHelper.hasPosTag(tokenReadings, "noun.*:v_naz.*"))
                    continue;

                for (AnalyzedToken token : tokenReadings) {
                    String nounPosTag = token.getPOSTag();

                    if (nounPosTag == null) { // can happen for words with \u0301 or \u00AD
                        continue;
                    }

                    //          if( nounPosTag.startsWith("<") ) {
                    //            nounTokenReadings.clear();
                    //            break;
                    //          }

                    if (nounPosTag.startsWith("noun") && nounPosTag.contains("v_naz")) {
                        nounTokenReadings.add(token);
                        nounAnalyzedTokenReadings = tokenReadings;
                    }
                    //          else if ( nounPosTag.equals(JLanguageTool.SENTENCE_END_TAGNAME) ) {
                    //            continue;
                    //          }
                    else {
                        nounTokenReadings.clear();
                        break;
                    }
                }

                continue;
            }

            // see if we get a following verb
            //       System.err.println("Check for verb: " + tokenReadings);

            List<AnalyzedToken> verbTokenReadings = new ArrayList<>();
            for (AnalyzedToken token : tokenReadings) {
                String verbPosTag = token.getPOSTag();

                if (verbPosTag == null) { // can happen for words with \u0301 or \u00AD
                    continue;
                }

                if (verbPosTag.startsWith("<")) {
                    verbTokenReadings.clear();
                    break;
                }

                if (verbPosTag.startsWith("verb")) {

                    verbTokenReadings.add(token);
                } else if (verbPosTag.equals(JLanguageTool.SENTENCE_END_TAGNAME)) {
                    continue;
                } else {
                    verbTokenReadings.clear();
                    break;
                }
            }

            // no slave token - restart

            if (verbTokenReadings.isEmpty()) {
                nounTokenReadings.clear();
                continue;
            }

            logger.debug("=== Checking\n\t{}\n\t{}", nounTokenReadings, verbTokenReadings);

            // perform the check

            List<Inflection> masterInflections = getNounInflections(nounTokenReadings);

            List<Inflection> slaveInflections = getVerbInflections(verbTokenReadings);

            logger.debug("\t\t{}\n\t{}", masterInflections, slaveInflections);

            if (Collections.disjoint(masterInflections, slaveInflections)) {
                if (TokenAgreementNounVerbExceptionHelper.isException(tokens, i, masterInflections,
                        slaveInflections, nounTokenReadings, verbTokenReadings)) {
                    nounTokenReadings.clear();
                    break;
                }

                if (logger.isDebugEnabled()) {
                    logger.debug(MessageFormat.format("=== Found noun/verb mismatch\n\t{0}\n\t{1}",
                            nounAnalyzedTokenReadings.getToken() + ": " + masterInflections + " // "
                                    + nounAnalyzedTokenReadings,
                            verbTokenReadings.get(0).getToken() + ": " + slaveInflections + " // "
                                    + verbTokenReadings));
                }

                String msg = String.format(
                        "?    ?: \"%s\" (%s)  \"%s\" (%s)",
                        nounTokenReadings.get(0).getToken(), formatInflections(masterInflections, true),
                        verbTokenReadings.get(0).getToken(), formatInflections(slaveInflections, false));
                RuleMatch potentialRuleMatch = new RuleMatch(this, sentence,
                        nounAnalyzedTokenReadings.getStartPos(), tokenReadings.getEndPos(), msg, getShort());
                ruleMatches.add(potentialRuleMatch);
            }

            nounTokenReadings.clear();
        }

        return toRuleMatchArray(ruleMatches);
    }

    private static String formatInflections(List<Inflection> inflections, boolean noun) {

        Collections.sort(inflections);

        List<String> list = new ArrayList<>();

        for (Inflection inflection : inflections) {
            String str = "";
            if (inflection.gender != null) {
                str = PosTagHelper.GENDER_MAP.get(inflection.gender);
            } else {
                if (inflection.person != null) {
                    str = PosTagHelper.PERSON_MAP.get(inflection.person);
                }
                if (inflection.plural != null) {
                    if (str.length() > 0) {
                        str += " ";
                    }
                    str += PosTagHelper.GENDER_MAP.get(inflection.plural);
                }
            }
            list.add(str);
        }

        LinkedHashSet<String> uniqeList = new LinkedHashSet<>(list);

        return StringUtils.join(uniqeList, ", ");
    }

    static List<Inflection> getVerbInflections(List<AnalyzedToken> nounTokenReadings) {
        List<Inflection> verbGenders = new ArrayList<>();
        for (AnalyzedToken token : nounTokenReadings) {
            String posTag = token.getPOSTag();

            if (posTag == null || !posTag.startsWith("verb"))
                continue;

            if (posTag.contains(":inf")) {
                verbGenders.add(new Inflection("i", null));
                continue;
            }

            if (posTag.contains(":impers")) {
                verbGenders.add(new Inflection("o", null));
                continue;
            }

            Matcher matcher = VERB_INFLECTION_PATTERN.matcher(posTag);
            matcher.find();

            String gen = matcher.group(1);
            String person = matcher.group(3);

            verbGenders.add(new Inflection(gen, person));
        }
        //    System.err.println("verbInfl: " + verbGenders);
        return verbGenders;
    }

    static List<Inflection> getNounInflections(List<AnalyzedToken> nounTokenReadings) {
        List<Inflection> slaveInflections = new ArrayList<>();
        for (AnalyzedToken token : nounTokenReadings) {
            String posTag2 = token.getPOSTag();
            if (posTag2 == null)
                continue;

            Matcher matcher = NOUN_INFLECTION_PATTERN.matcher(posTag2);
            if (!matcher.find()) {
                //           System.err.println("Failed to find slave inflection tag in " + posTag2 + " for " + nounTokenReadings);
                continue;
            }
            String gen = matcher.group(2);

            Matcher matcherPerson = NOUN_PERSON_PATTERN.matcher(posTag2);
            String person = matcherPerson.find() ? matcherPerson.group(1) : null;

            slaveInflections.add(new Inflection(gen, person));
        }
        //    System.err.println("nounInfl: " + slaveInflections);
        return slaveInflections;
    }

    static boolean inflectionsOverlap(List<AnalyzedToken> verbTokenReadings,
            List<AnalyzedToken> nounTokenReadings) {
        return !Collections.disjoint(getVerbInflections(verbTokenReadings), getNounInflections(nounTokenReadings));
    }

    static class Inflection implements Comparable<Inflection> {
        final String gender;
        final String plural;
        final String person;

        Inflection(String gender, String person) {
            if (gender.equals("s") || gender.equals("p")) {
                this.gender = null;
                this.plural = gender;
            } else if (gender.equals("i")) {
                this.gender = gender;
                this.plural = gender;
            } else {
                this.gender = gender;
                this.plural = "s";
            }
            this.person = person;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj)
                return true;
            if (obj == null)
                return false;
            if (getClass() != obj.getClass())
                return false;

            Inflection other = (Inflection) obj;

            if (person != null && other.person != null) {
                if (!person.equals(other.person))
                    return false;
            }

            if (gender != null && other.gender != null) {

                // infinitive matches all for now, otherwise too many false positives
                // e.g.    
                if (gender.equals("i") || other.gender.equals("i"))
                    return true;

                if (!gender.equals(other.gender))
                    return false;
            }

            return plural.equals(other.plural);
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + ((gender == null) ? 0 : gender.hashCode());
            result = prime * result + ((plural == null) ? 0 : plural.hashCode());
            result = prime * result + ((person == null) ? 0 : person.hashCode());
            return result;
        }

        @Override
        public String toString() {
            return "Gender: " + gender + "/" + plural + "/" + person;
        }

        @Override
        public int compareTo(Inflection o) {
            Integer thisOrder = gender != null ? InflectionHelper.GEN_ORDER.get(gender) : 0;
            Integer otherOrder = o.gender != null ? InflectionHelper.GEN_ORDER.get(o.gender) : 0;

            int compared = thisOrder.compareTo(otherOrder);
            //      if( compared != 0 )
            return compared;

            //      compared = VIDM_ORDER.get(_case).compareTo(VIDM_ORDER.get(o._case));
            //      return compared;
        }

    }

}