org.languagetool.rules.uk.TokenAgreementAdjNounRule.java Source code

Introduction

Here is the source code for org.languagetool.rules.uk.TokenAgreementAdjNounRule.java.
Source

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2013 Andriy Rysin
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules.uk;

import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.ResourceBundle;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.language.Ukrainian;
import org.languagetool.rules.Categories;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.uk.InflectionHelper.Inflection;
import org.languagetool.synthesis.Synthesizer;
import org.languagetool.tagging.uk.PosTagHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A rule that checks if an adjective and the noun that immediately follows it
 * agree on gender and inflection (case).
 *
 * <p>The sentence is scanned token by token. A token whose readings are all
 * adjectives is remembered; the next token, if all its readings are nouns, is
 * compared against it. When the two sets of inflections have nothing in common
 * (and no exception applies) a {@link RuleMatch} spanning both tokens is
 * reported, with synthesized replacement suggestions where available.
 *
 * <p>NOTE(review): several string literals in this file (messages, lemma lists,
 * parts of regular expressions) contain only {@code ?} and blanks; they look
 * like non-ASCII text that was lost to an encoding problem. They are kept
 * exactly as found and should be restored from a clean copy of the file.
 *
 * @author Andriy Rysin
 */
public class TokenAgreementAdjNounRule extends Rule {
    private static final Logger logger = LoggerFactory.getLogger(TokenAgreementAdjNounRule.class);

    // Not referenced inside this class; package-private, so presumably used by helper classes - verify.
    static final Pattern ADJ_INFLECTION_PATTERN = Pattern.compile(":([mfnp]):(v_...)(:r(in)?anim)?");
    static final Pattern NOUN_INFLECTION_PATTERN = Pattern.compile("((?:[iu]n)?anim):([mfnps]):(v_...)");

    // Compiled once instead of on every reported mismatch.
    // NOTE(review): the alternatives end right after '-'; the suffix letters appear to have been lost.
    private static final Pattern NUMERIC_SUFFIX_PATTERN = Pattern.compile(".*([23]-|[02-9]-|[0-9]-)");

    private final Ukrainian ukrainian = new Ukrainian();

    /**
     * Creates the rule and assigns it to the MISC category.
     *
     * @param messages resource bundle used to resolve the category
     * @throws IOException declared for callers; nothing in this constructor body throws it directly
     */
    public TokenAgreementAdjNounRule(ResourceBundle messages) throws IOException {
        super.setCategory(Categories.MISC.getCategory(messages));
    }

    /**
     * @return the fixed identifier of this rule
     */
    @Override
    public final String getId() {
        return "UK_ADJ_NOUN_INFLECTION_AGREEMENT";
    }

    /**
     * @return the description of this rule
     */
    @Override
    public String getDescription() {
        // NOTE(review): literal looks mis-encoded; kept as found.
        return "? ,   ?   ";
    }

    /**
     * @return the short message attached to every match reported by this rule
     */
    public String getShort() {
        // NOTE(review): literal looks mis-encoded; kept as found.
        return "?   ";
    }

    /**
     * Scans the sentence for an adjective token directly followed by a noun token
     * whose inflections do not intersect, and reports each such pair.
     *
     * @param sentence the analyzed sentence to check
     * @return the matches found, converted with {@code toRuleMatchArray}
     * @throws RuntimeException wrapping an {@link IOException} raised by the synthesizer
     */
    @Override
    public final RuleMatch[] match(AnalyzedSentence sentence) {
        List<RuleMatch> ruleMatches = new ArrayList<>();
        AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();

        // Readings of the pending adjective; empty means we are still looking for one.
        List<AnalyzedToken> adjTokenReadings = new ArrayList<>();
        AnalyzedTokenReadings adjAnalyzedTokenReadings = null;

        // tokens[0] is skipped (presumably the sentence-start token - verify)
        for (int i = 1; i < tokens.length; i++) {
            AnalyzedTokenReadings tokenReadings = tokens[i];

            String posTag0 = tokenReadings.getAnalyzedToken(0).getPOSTag();

            if (posTag0 == null) {
                // untagged token: drop any pending adjective and move on
                adjTokenReadings.clear();
                continue;
            }

            // grab initial adjective inflections

            if (adjTokenReadings.isEmpty()) {

                // no need to start checking on last token or if no noun
                if (i == tokens.length - 1) {
                    continue;
                }

                //TODO: nv still can be wrong if :np/:ns is present so it's not much gain for lots of work
                if (PosTagHelper.hasPosTagPart(tokens[i], PosTagHelper.NO_VIDMINOK_SUBSTR)
                        //TODO: turn back on when we can handle pron
                        || PosTagHelper.hasPosTagPart(tokens[i], "&pron")
                        || PosTagHelper.hasPosTagPart(tokens[i], "<")) {
                    continue;
                }

                // the following token must have a noun reading and none of the excluded tag parts
                if (!PosTagHelper.hasPosTagPart(tokens[i + 1], "noun:")
                        || PosTagHelper.hasPosTagPart(tokens[i + 1], PosTagHelper.NO_VIDMINOK_SUBSTR)
                        || PosTagHelper.hasPosTagPart(tokens[i + 1], "&pron")
                        || PosTagHelper.hasPosTagPart(tokens[i + 1], "<")) {
                    continue;
                }

                //TODO: TEMP?
                // NOTE(review): the lemma lists below look mis-encoded (empty strings / "?"); kept as found.
                // NOTE(review): 'break' leaves the token loop, so the rest of the sentence is not
                // checked once one of these lemmas is seen - confirm this is intended.
                if (LemmaHelper.hasLemma(tokens[i],
                        Arrays.asList("", "", "?",
                                "?", ""),
                        ":p:")
                        || LemmaHelper.hasLemma(tokens[i],
                                Arrays.asList("", "", "",
                                        "", "", ""),
                                ":n:")
                        || LemmaHelper.hasLemma(tokens[i], Arrays.asList(""), ":f:")) {
                    adjTokenReadings.clear();
                    break;
                }

                // accept the token only if every tagged reading is an adjective
                for (AnalyzedToken token : tokenReadings) {
                    String adjPosTag = token.getPOSTag();

                    if (adjPosTag == null) { // can happen for words with \u0301 or \u00AD
                        continue;
                    }

                    if (adjPosTag.startsWith("adj")) {
                        adjTokenReadings.add(token);
                        adjAnalyzedTokenReadings = tokenReadings;
                    } else {
                        adjTokenReadings.clear();
                        break;
                    }
                }

                continue;
            }

            // An adjective is pending: collect the noun readings of the current token.
            List<AnalyzedToken> slaveTokenReadings = new ArrayList<>();

            for (AnalyzedToken token : tokenReadings) {
                String nounPosTag = token.getPOSTag();

                if (nounPosTag == null) { // can happen for words with \u0301 or \u00AD
                    continue;
                }

                if (nounPosTag.startsWith("noun") && !nounPosTag.contains(PosTagHelper.NO_VIDMINOK_SUBSTR)) {
                    slaveTokenReadings.add(token);
                } else if (nounPosTag.equals(JLanguageTool.SENTENCE_END_TAGNAME)
                        || nounPosTag.equals(JLanguageTool.PARAGRAPH_END_TAGNAME)) {
                    // sentence/paragraph end markers do not disqualify the token
                    continue;
                } else {
                    // any other reading disqualifies the whole token
                    slaveTokenReadings.clear();
                    break;
                }
            }

            // no slave token - restart

            if (slaveTokenReadings.isEmpty()) {
                adjTokenReadings.clear();
                continue;
            }

            logger.debug("=== Checking:\n\t{}\n\t{}", adjTokenReadings, slaveTokenReadings);

            // perform the check

            List<InflectionHelper.Inflection> masterInflections = InflectionHelper
                    .getAdjInflections(adjTokenReadings);

            List<InflectionHelper.Inflection> slaveInflections = InflectionHelper
                    .getNounInflections(slaveTokenReadings, "v_zna:var");

            if (Collections.disjoint(masterInflections, slaveInflections)) {

                if (TokenAgreementAdjNounExceptionHelper.isException(tokens, i, masterInflections, slaveInflections,
                        adjTokenReadings, slaveTokenReadings)) {
                    adjTokenReadings.clear();
                    continue;
                }

                // The guard avoids building the argument strings when debug logging is off.
                // Both the adjective and the noun are logged (the noun line used to be dropped
                // because the format string had no placeholder for it).
                if (logger.isDebugEnabled()) {
                    logger.debug("=== Found:\n\t{}\n\t{}",
                            adjAnalyzedTokenReadings.getToken() + ": " + masterInflections + " // "
                                    + adjAnalyzedTokenReadings,
                            slaveTokenReadings.get(0).getToken() + ": " + slaveInflections + " // "
                                    + slaveTokenReadings);
                }

                // formatInflections sorts the lists it receives in place, so both inflection
                // lists are sorted from here on (this affects the order of suggestions below).
                // NOTE(review): message literals look mis-encoded; kept as found.
                String msg = String.format(
                        " :     : \"%s\": [%s]  \"%s\": [%s]",
                        adjTokenReadings.get(0).getToken(), formatInflections(masterInflections, true),
                        slaveTokenReadings.get(0).getToken(), formatInflections(slaveInflections, false));

                // NOTE(review): ".*[]" is an unterminated character class as written and will throw
                // PatternSyntaxException when this operand is evaluated; the class members appear to
                // have been lost. Likewise startsWith("") below is always true. Kept as found.
                if (PosTagHelper.hasPosTagPart(adjTokenReadings, ":m:v_rod")
                        && tokens[i].getToken().matches(".*[]")
                        && PosTagHelper.hasPosTag(slaveTokenReadings, "noun.*?:m:v_dav.*")) {
                    msg += ". ,     ..  ? -/- ? -/-? ( ?   ?? )?";
                } else if (adjAnalyzedTokenReadings.getToken().contains("-")
                        && NUMERIC_SUFFIX_PATTERN.matcher(adjAnalyzedTokenReadings.getToken()).matches()) {
                    msg += ". ,    ? ?? ? ??";
                } else if (adjAnalyzedTokenReadings.getToken().startsWith("")
                        // TODO: && tag(adjAnalyzedTokenReadings.getToken().substring(2)) has adjp
                        && PosTagHelper.hasPosTag(slaveTokenReadings, "noun.*?:v_oru.*")) {
                    msg += ". ,    ? ?";
                }

                // the match covers the adjective through the end of the noun
                RuleMatch potentialRuleMatch = new RuleMatch(this, sentence, adjAnalyzedTokenReadings.getStartPos(),
                        tokenReadings.getEndPos(), msg, getShort());

                Synthesizer ukrainianSynthesizer = ukrainian.getSynthesizer();
                List<String> suggestions = new ArrayList<>();

                try {

                    // First: keep the adjective as written and re-inflect the noun to each adjective inflection.
                    for (Inflection adjInflection : masterInflections) {
                        String genderTag = ":" + adjInflection.gender + ":";
                        String vidmTag = adjInflection._case;

                        if (!adjInflection._case.equals("v_kly") && (adjInflection.gender.equals("p")
                                || PosTagHelper.hasPosTagPart(slaveTokenReadings, genderTag))) {
                            for (AnalyzedToken nounToken : slaveTokenReadings) {

                                // when animacy matters, only use noun readings carrying the same tag
                                if (adjInflection.animMatters()
                                        && !nounToken.getPOSTag().contains(":" + adjInflection.animTag)) {
                                    continue;
                                }

                                String newNounPosTag = nounToken.getPOSTag().replaceFirst(":.:v_...",
                                        genderTag + vidmTag);

                                String[] synthesized = ukrainianSynthesizer.synthesize(nounToken, newNounPosTag,
                                        false);

                                for (String s : synthesized) {
                                    String suggestion = adjAnalyzedTokenReadings.getToken() + " " + s;
                                    if (!suggestions.contains(suggestion)) {
                                        suggestions.add(suggestion);
                                    }
                                }
                            }
                        }
                    }

                    // Second: keep the noun as written and re-inflect the adjective to each noun inflection.
                    for (Inflection nounInflection : slaveInflections) {
                        String genderTag = ":" + nounInflection.gender + ":";
                        String vidmTag = nounInflection._case;

                        if (nounInflection.animMatters()) {
                            vidmTag += ":r" + nounInflection.animTag;
                        }

                        for (AnalyzedToken adjToken : adjTokenReadings) {
                            String newAdjTag = adjToken.getPOSTag().replaceFirst(":.:v_...(:r(in)?anim)?",
                                    genderTag + vidmTag);

                            String[] synthesized = ukrainianSynthesizer.synthesize(adjToken, newAdjTag, false);

                            for (String s : synthesized) {
                                String suggestion = s + " " + tokenReadings.getToken();
                                if (!suggestions.contains(suggestion)) {
                                    suggestions.add(suggestion);
                                }
                            }
                        }

                    }

                } catch (IOException e) {
                    throw new RuntimeException(e);
                }

                if (!suggestions.isEmpty()) {
                    potentialRuleMatch.setSuggestedReplacements(suggestions);
                }

                ruleMatches.add(potentialRuleMatch);
            }

            // pair handled (match or not): start looking for the next adjective
            adjTokenReadings.clear();
        }

        return toRuleMatchArray(ruleMatches);
    }

    /**
     * Renders inflections as text grouped by gender, in the form
     * {@code gender: case, case, gender: case}.
     *
     * <p>The list passed in is sorted in place; callers observe the new order.
     *
     * @param inflections inflections to render; sorted as a side effect
     * @param adj         when {@code true}, an animacy marker is appended to each case
     *                    whose inflection has a non-null {@code animTag}
     * @return the rendered text; empty if {@code inflections} is empty
     */
    private static String formatInflections(List<Inflection> inflections, boolean adj) {

        Collections.sort(inflections);

        // LinkedHashMap keeps genders in the order they first appear in the sorted list
        Map<String, List<String>> map = new LinkedHashMap<>();

        for (Inflection inflection : inflections) {
            String caseStr = PosTagHelper.VIDMINKY_MAP.get(inflection._case);
            if (adj && inflection.animTag != null) {
                // NOTE(review): both markers read "?." - looks mis-encoded; kept as found.
                caseStr += " (" + (inflection.animTag.equals("anim") ? "?." : "?.") + ")";
            }
            map.computeIfAbsent(inflection.gender, key -> new ArrayList<>()).add(caseStr);
        }

        List<String> list = new ArrayList<>();
        for (Entry<String, List<String>> entry : map.entrySet()) {
            String genderStr = PosTagHelper.GENDER_MAP.get(entry.getKey());

            List<String> caseValues = entry.getValue();

            list.add(genderStr + ": " + StringUtils.join(caseValues, ", "));
        }

        return StringUtils.join(list, ", ");
    }

}