// org.languagetool.rules.uk.TokenAgreementRule.java - source code
//
// Introduction: this file contains the source code for
// org.languagetool.rules.uk.TokenAgreementRule.java
//
// Source:

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2013 Andriy Rysin
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules.uk;

import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.ResourceBundle;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.jetbrains.annotations.Nullable;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.language.Ukrainian;
import org.languagetool.rules.Category;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.synthesis.Synthesizer;
import org.languagetool.tagging.uk.IPOSTag;
import org.languagetool.tagging.uk.PosTagHelper;

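/**
 * A rule that checks whether tokens in a sentence agree in inflection: a token
 * whose POS tag requires a grammatical case (a {@code :rv_*} marker) must be
 * followed by a token that has a reading in one of the required cases.
 * 
 * @author Andriy Rysin
 */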
public class TokenAgreementRule extends Rule {
    // POS-tag fragments used by this rule ("vidminok" is the Ukrainian word for grammatical case).

    /** Fragment found in the POS tag of a reading that carries a case, e.g. {@code :v_naz}. */
    private static final String VIDMINOK_SUBSTR = ":v_";

    /** Fragment found in the POS tag of a token that requires a case from the token after it. */
    private static final String REQUIRE_VIDMINOK_SUBSTR = ":rv_";

    /** Fragment that makes a reading acceptable without any case comparison. */
    private static final String NO_VIDMINOK_SUBSTR = ":nv";

    /** Captures the case name (group 1, e.g. {@code v_naz}) from a POS tag. */
    private static final Pattern VIDMINOK_REGEX = Pattern.compile(":(v_[a-z]+)");

    /** Captures the required case name (group 1, without the leading {@code r}) from a POS tag. */
    private static final Pattern REQUIRE_VIDMINOK_REGEX = Pattern.compile(":r(v_[a-z]+)");

    /**
     * Tokens that suppress the check when they follow a capitalized word.
     * NOTE(review): the literals appear to have lost their original non-ASCII text.
     */
    private static final Set<String> STREETS = new HashSet<>(Arrays.asList("??", "?", ""));

    /** Language instance that supplies the synthesizer used for replacement suggestions. */
    private final Ukrainian ukrainian = new Ukrainian();

    public TokenAgreementRule(final ResourceBundle messages) throws IOException {
        super.setCategory(new Category(messages.getString("category_misc")));
    }

    /**
     * Returns the identifier of this rule.
     *
     * @return the constant {@code "UK_TOKEN_AGREEMENT"}
     */
    @Override
    public final String getId() {
        return "UK_TOKEN_AGREEMENT";
    }

    /**
     * Returns the description of this rule.
     * NOTE(review): the literal appears to have lost its original non-ASCII text
     * (only '?' placeholders and spaces remain) - restore it from upstream.
     *
     * @return the description string
     */
    @Override
    public String getDescription() {
        return "? ?  ";
    }

    /**
     * Returns the short message passed to every {@link RuleMatch} created by this rule.
     * NOTE(review): the literal appears to have lost its original non-ASCII text.
     *
     * @return the short message string
     */
    public String getShort() {
        return "? ?  ";
    }

    /**
     * Indicates if the rule is case-sensitive.
     *
     * @return always {@code false} in this implementation
     */
    public boolean isCaseSensitive() {
        return false;
    }

    /**
     * Checks case agreement between a case-requiring token and the token after it.
     * <p>
     * The loop remembers the most recent single-reading token whose POS tag contains
     * {@code :rv_} (kept in {@code reqTokenReadings}). For the next token that is not
     * skipped, the case names taken from the {@code :rv_*} markers are compared with the
     * case tags of that token's readings. When none agrees and none of the hard-coded
     * exceptions applies, a {@link RuleMatch} is recorded for that token.
     * <p>
     * NOTE(review): many string literals and comments in this method appear to have lost
     * their original non-ASCII text (they are now empty or '?' placeholders). They are left
     * untouched here; restore them from upstream before relying on this logic.
     *
     * @param text the analyzed sentence; only its non-whitespace tokens are inspected
     * @return the collected matches, converted with {@code toRuleMatchArray}
     */
    @Override
    public final RuleMatch[] match(final AnalyzedSentence text) {
        List<RuleMatch> ruleMatches = new ArrayList<>();
        AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
        // true while the loop is between an opening and a closing multiword tag
        boolean insideMultiword = false;

        // the pending case-requiring token, or null when nothing is pending
        AnalyzedTokenReadings reqTokenReadings = null;
        for (int i = 0; i < tokens.length; i++) {
            AnalyzedTokenReadings tokenReadings = tokens[i];

            // only the first reading's tag drives the decisions below
            String posTag = tokenReadings.getAnalyzedToken(0).getPOSTag();

            //TODO: skip conj (rest of the original note was lost to encoding)

            // untagged, unknown and sentence-start tokens cancel any pending requirement
            if (posTag == null || posTag.contains(IPOSTag.unknown.getText())
                    || posTag.equals(JLanguageTool.SENTENCE_START_TAGNAME)) {
                reqTokenReadings = null;
                continue;
            }

            // first token is always SENT_START
            // A single upper-case letter preceded by whitespace, whose previous token is
            // not ':' or '-', also cancels the pending requirement
            // (presumably an initial or abbreviation - TODO confirm).
            String thisToken = tokenReadings.getToken();
            if (i > 1 && thisToken.length() == 1 && Character.isUpperCase(thisToken.charAt(0))
                    && tokenReadings.isWhitespaceBefore() && !tokens[i - 1].getToken().matches("[:-]")) {
                reqTokenReadings = null;
                continue;
            }

            // Multiword handling: a reading whose tag starts with "<" marks a multiword
            // span; a tag starting with "</" ends it.
            AnalyzedToken multiwordReqToken = getMultiwordToken(tokenReadings);
            if (multiwordReqToken != null) {
                String mwPosTag = multiwordReqToken.getPOSTag();
                if (mwPosTag.startsWith("</")) {
                    insideMultiword = false;
                } else {
                    insideMultiword = true;
                }

                // a closing multiword tag that itself requires a case becomes the pending token
                if (mwPosTag.startsWith("</") && mwPosTag.contains(REQUIRE_VIDMINOK_SUBSTR)) {
                    // NOTE(review): this assignment has no effect - posTag is a loop-local
                    // that is re-read at the top of the next iteration.
                    posTag = multiwordReqToken.getPOSTag();
                    reqTokenReadings = tokenReadings;
                    continue;
                } else {
                    // multiwords tagged "adv" or "insert" keep the pending requirement alive
                    if (!mwPosTag.contains("adv") && !mwPosTag.contains("insert")) {
                        reqTokenReadings = null;
                    }
                    continue;
                }
            }

            if (insideMultiword) {
                continue;
            }

            String token = tokenReadings.getAnalyzedToken(0).getToken();
            // An unambiguous (single-reading) token that requires a case becomes the
            // pending token, unless one of the literal-based exceptions below applies.
            if (posTag.contains(REQUIRE_VIDMINOK_SUBSTR) && tokenReadings.getReadingsLength() == 1) {
                String prep = token;

                if (prep.equals("") && reverseSearch(tokens, i, "")) // TODO: move to disambiguator
                    continue;

                if (prep.equalsIgnoreCase(""))
                    continue;

                if ((prep.equalsIgnoreCase("") || prep.equalsIgnoreCase(""))
                        && tokens.length > i + 1
                        && tokens[i + 1].getAnalyzedToken(0).getToken().equalsIgnoreCase("?")) {
                    reqTokenReadings = null;
                    continue;
                }

                reqTokenReadings = tokenReadings;
                continue;
            }

            if (reqTokenReadings == null)
                continue;

            // Do actual check

            // case names (e.g. "v_naz") of which the current token must offer at least one
            ArrayList<String> posTagsToFind = new ArrayList<>();
            String reqPosTag = reqTokenReadings.getAnalyzedToken(0).getPOSTag();
            // lemma of the pending token (presumably a preposition, going by the name - TODO confirm)
            String prep = reqTokenReadings.getAnalyzedToken(0).getLemma();

            //      AnalyzedToken multiwordToken = getMultiwordToken(tokenReadings);
            //      if( multiwordToken != null ) {
            //        reqTokenReadings = null;
            //        continue;
            //      }

            //TODO: for numerics only v_naz
            // for two specific lemmas "v_naz" is additionally accepted
            if (prep.equalsIgnoreCase("")) { //&& tokenReadings.getAnalyzedToken(0).getPOSTag().equals(IPOSTag.numr) ) { 
                posTagsToFind.add("v_naz");
            } else if (prep.equalsIgnoreCase("?")) {
                posTagsToFind.add("v_naz");
            }

            // collect every case named by a ":rv_*" marker in the pending token's tag
            Matcher matcher = REQUIRE_VIDMINOK_REGEX.matcher(reqPosTag);
            while (matcher.find()) {
                posTagsToFind.add(matcher.group(1));
            }

            // if any reading of the current token matches IPOSTag.numr, "v_naz" is accepted too
            for (AnalyzedToken readingToken : tokenReadings) {
                if (IPOSTag.numr.match(readingToken.getPOSTag())) {
                    posTagsToFind.add("v_naz"); // TODO: only if noun is following?
                    break;
                }
            }

            //      System.out.println("For " + tokenReadings + " to match " + posTagsToFind + " of " + reqTokenReadings.getToken());
            if (!getReadingWithVidmPosTag(posTagsToFind, tokenReadings)) {
                // Exceptions follow. A bare 'continue' keeps the requirement pending for
                // the next token; setting reqTokenReadings to null first drops it.
                if (isTokenToSkip(tokenReadings))
                    continue;

                //        if( isTokenToIgnore(tokenReadings) ) {
                //          reqTokenReadings = null;
                //          continue;
                //        }

                //TODO: only for a subset of lemmas, or decide by verb (original examples lost to encoding)
                if (prep.equalsIgnoreCase("") || prep.equalsIgnoreCase("") || prep.equals("")
                        || prep.equals("")) {
                    if (PosTagHelper.hasPosTag(tokenReadings, ".*p:v_naz[^&]*")) { // but not &pron:
                        reqTokenReadings = null;
                        continue;
                    }
                }

                // capitalized noun whose first-reading tag matches noun:.:v_rod
                // (original examples lost to encoding)
                if (prep.equalsIgnoreCase("") && Character.isUpperCase(token.charAt(0))
                        && posTag.matches("noun:.:v_rod.*")) {
                    reqTokenReadings = null;
                    continue;
                }

                if (prep.equalsIgnoreCase("")) {
                    if (token.equals("")) {
                        reqTokenReadings = null;
                        continue;
                    }
                }

                // fixed lemma/token pairs that are accepted as they are
                if (prep.equalsIgnoreCase("")) {
                    if (token.equalsIgnoreCase("") || token.equals("") || token.equals("")
                            || token.equals("")) {
                        reqTokenReadings = null;
                        continue;
                    }
                } else if (prep.equalsIgnoreCase("")) {
                    if (token.equalsIgnoreCase("?") || token.equals("")
                            || token.equals("")) {
                        reqTokenReadings = null;
                        continue;
                    }
                }

                // exceptions that look at the following token
                if (tokens.length > i + 1) {
                    //      if( tokens.length > i+1 && Character.isUpperCase(tokenReadings.getAnalyzedToken(0).getToken().charAt(0))
                    //        && hasRequiredPosTag(Arrays.asList("v_naz"), tokenReadings)
                    //        && Character.isUpperCase(tokens[i+1].getAnalyzedToken(0).getToken().charAt(0)) )
                    //          continue; // (original examples lost to encoding)

                    // capitalized word followed by one of the STREETS tokens
                    if (isCapitalized(token) && STREETS.contains(tokens[i + 1].getAnalyzedToken(0).getToken())) {
                        reqTokenReadings = null;
                        continue;
                    }

                    // specific tokens followed by a token that IPOSTag.isNum accepts
                    if (IPOSTag.isNum(tokens[i + 1].getAnalyzedToken(0).getPOSTag())
                            && (token.equals("?") || token.equals("?")
                                    || token.equals("") || token.equals("?"))) {
                        reqTokenReadings = null;
                        continue;
                    }

                    // noun reading tagged v_oru followed by a token with an "adjp" tag:
                    // keep the requirement pending (original example lost to encoding)
                    if (PosTagHelper.hasPosTag(tokenReadings, "noun:.:v_oru.*")
                            && tokens[i + 1].hasPartialPosTag("adjp")) {
                        continue;
                    }

                    // plural noun tagged v_naz or v_rod followed by a token that IPOSTag.isNum accepts
                    if ((prep.equalsIgnoreCase("") || prep.equalsIgnoreCase(""))
                            && (posTag.startsWith("noun:p:v_naz") || posTag.startsWith("noun:p:v_rod"))
                            && IPOSTag.isNum(tokens[i + 1].getAnalyzedToken(0).getPOSTag())) {
                        reqTokenReadings = null;
                        continue;
                    }

                    // fixed token pairs (token + prefix of the next token); requirement stays pending
                    if ((token.equals("") || token.equals("") || token.equals(""))
                            && tokens[i + 1].getAnalyzedToken(0).getToken().startsWith("")) {
                        continue;
                    }
                    if ((token.equals("?") || token.equals("") || token.equals(""))
                            && tokens[i + 1].getAnalyzedToken(0).getToken().startsWith("")) {
                        continue;
                    }
                    if ((token.equals("?") || token.equals("?"))
                            && tokens[i + 1].getAnalyzedToken(0).getToken().startsWith("")) {
                        continue;
                    }

                    if (prep.equalsIgnoreCase("") && token.equals("?")
                            && tokens[i + 1].getAnalyzedToken(0).getToken().equals("??")) {
                        reqTokenReadings = null;
                        continue;
                    }

                    // current token tagged ":abbr" and followed by a specific token
                    if (tokens[i + 1].getAnalyzedToken(0).getToken().equals("")
                            && tokens[i].getAnalyzedToken(0).getPOSTag().contains(":abbr")) {
                        reqTokenReadings = null;
                        continue;
                    }

                    if (tokens.length > i + 2) {
                        // Adjective tagged v_rod followed by a noun tagged v_rod of the same
                        // gender: skip the noun as well and keep the requirement pending
                        // (original example lost to encoding).
                        if (/*prep.equalsIgnoreCase("") &&*/ posTag.matches("adj.*:[mfn]:v_rod.*")) {
                            String gender = PosTagHelper.getGender(posTag);
                            // NOTE(review): a null gender is not handled; it would be
                            // concatenated into the regex below as the text "null".
                            if (gender == null) {
                                //                System.err.println("unknown gender for " + token);
                            }

                            if (PosTagHelper.hasPosTag(tokens[i + 1], "noun.*:" + gender + ":v_rod.*")) {
                                // advance past the noun; the loop's own i++ then moves to the token after it
                                i += 1;
                                continue;
                            }
                        }

                        if ((token.equals("") || token.equals("")
                                || token.equals("") || token.equals(""))
                                && tokens[i + 1].getAnalyzedToken(0).getToken().equals("")) {
                            //          reqTokenReadings = null;
                            continue;
                        }
                        //            // (original example lost to encoding)
                        //            if (prep.equalsIgnoreCase("") && posTag.matches("adj.*:[mfn]:v_rod.*")) {
                        //              String gender = PosTagHelper.getGender(posTag);
                        //              if ( hasPosTag(tokens[i+1], "noun.*:"+gender+":v_rod.*")) {
                        //                i+=1;
                        //                continue;
                        //              }
                        //            }
                    }
                }

                // no exception applied: report the disagreement
                RuleMatch potentialRuleMatch = createRuleMatch(tokenReadings, reqTokenReadings, posTagsToFind);
                ruleMatches.add(potentialRuleMatch);
            }

            // the requirement is consumed by the token just checked
            reqTokenReadings = null;
        }

        return toRuleMatchArray(ruleMatches);
    }

    /**
     * Tells whether a token starts with an upper-case character that is directly
     * followed by a lower-case character.
     *
     * @param token the token text to inspect
     * @return {@code true} only for tokens of at least two characters whose first
     *         character is upper-case and whose second character is lower-case
     */
    private static boolean isCapitalized(String token) {
        if (token.length() <= 1) {
            return false;
        }
        final char first = token.charAt(0);
        final char second = token.charAt(1);
        return Character.isUpperCase(first) && Character.isLowerCase(second);
    }

    /**
     * Looks backwards from {@code pos} for a token equal (ignoring case) to the given text.
     * At most the three tokens directly before {@code pos} are examined, and only the
     * first reading of each is compared.
     *
     * @param tokens the sentence tokens
     * @param pos    index of the current token; the search starts at {@code pos - 1}
     * @param string the token text to look for
     * @return {@code true} if one of the examined tokens matches
     */
    private boolean reverseSearch(AnalyzedTokenReadings[] tokens, int pos, String string) {
        final int oldest = Math.max(0, pos - 3);
        int idx = pos - 1;
        while (idx >= oldest) {
            final String candidate = tokens[idx].getAnalyzedToken(0).getToken();
            if (candidate.equalsIgnoreCase(string)) {
                return true;
            }
            idx--;
        }
        return false;
    }

    /**
     * Looks forwards from {@code pos} for a token equal (ignoring case) to the given text.
     * At most {@code maxSkip} tokens after {@code pos} are examined, and only the first
     * reading of each is compared.
     *
     * @param tokens  the sentence tokens
     * @param pos     index of the current token; the search starts at {@code pos + 1}
     * @param string  the token text to look for
     * @param maxSkip how many tokens after {@code pos} may be examined
     * @return {@code true} if one of the examined tokens matches
     */
    private boolean forwardSearch(AnalyzedTokenReadings[] tokens, int pos, String string, int maxSkip) {
        final int last = Math.min(tokens.length - 1, pos + maxSkip);
        for (int idx = pos + 1; idx <= last; idx++) {
            final String candidate = tokens[idx].getAnalyzedToken(0).getToken();
            if (candidate.equalsIgnoreCase(string)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Decides whether a token should be passed over by the agreement check.
     * A token qualifies when any of its readings matches {@code IPOSTag.adv}, has a tag
     * for which {@code IPOSTag.contains(tag, "adv>")} holds, or matches {@code IPOSTag.insert}.
     *
     * @param tokenReadings the readings of the token under inspection
     * @return {@code true} if at least one reading qualifies
     */
    private boolean isTokenToSkip(AnalyzedTokenReadings tokenReadings) {
        for (AnalyzedToken reading : tokenReadings) {
            final String tag = reading.getPOSTag();
            final boolean adverbLike = IPOSTag.adv.match(tag) || IPOSTag.contains(tag, "adv>");
            if (adverbLike || IPOSTag.insert.match(tag)) {
                return true;
            }
        }
        return false;
    }

    //  private boolean isTokenToIgnore(AnalyzedTokenReadings tokenReadings) {
    //    for(AnalyzedToken token: tokenReadings) {
    //      if( token.getPOSTag().contains("abbr") )
    //        return true;
    //    }
    //    return false;
    //  }

    /**
     * Tells whether the token is acceptable with respect to the required cases.
     * <p>
     * The token is accepted when: its only reading has no POS tag; any reading's tag
     * contains {@code :nv}; any reading's case tag ({@code :v_*}) contains one of the
     * required case names; or no reading carries a case tag at all (the POS dictionary
     * is not complete, so such tokens are not reported).
     *
     * @param posTagsToFind case names (e.g. {@code v_naz}) of which one must be present
     * @param tokenReadings the readings of the token under inspection
     * @return {@code false} only if at least one reading has a case tag and none of the
     *         case-tagged readings offers a required case
     */
    private boolean getReadingWithVidmPosTag(Collection<String> posTagsToFind,
            AnalyzedTokenReadings tokenReadings) {
        boolean noCaseTagSeen = true;

        for (AnalyzedToken reading : tokenReadings) {
            final String tag = reading.getPOSTag();

            if (tag == null) {
                // an untagged sole reading cannot be judged, so it is accepted
                if (tokenReadings.getReadingsLength() == 1) {
                    return true;
                }
            } else if (tag.contains(NO_VIDMINOK_SUBSTR)) {
                return true;
            } else if (tag.contains(VIDMINOK_SUBSTR)) {
                noCaseTagSeen = false;

                for (String wanted : posTagsToFind) {
                    if (tag.contains(wanted)) {
                        return true;
                    }
                }
            }
        }

        return noCaseTagSeen;
    }

    /**
     * Builds the match reported for a token whose case does not agree with the pending
     * case-requiring token, including the message text and replacement suggestions.
     * <p>
     * Suggestions are synthesized from the token's first reading by replacing the case
     * part of its POS tag with an alternation of the required cases. Two extra branches
     * add further suggestions: one for a specific token text and one for a specific
     * requiring token combined with a reading whose tag matches
     * {@code .*:v_naz.*:anim.*}.
     * <p>
     * NOTE(review): several string literals here (message text, compared tokens) appear
     * to have lost their original non-ASCII text and are reproduced unchanged.
     *
     * @param tokenReadings    the token that fails the agreement check; the tag of its
     *                         first reading must be non-null
     * @param reqTokenReadings the token that requires the case
     * @param posTagsToFind    the required case names (e.g. {@code v_naz})
     * @return the rule match covering {@code tokenReadings}
     * @throws RuntimeException wrapping an {@link IOException} raised by the synthesizer
     */
    private RuleMatch createRuleMatch(AnalyzedTokenReadings tokenReadings, AnalyzedTokenReadings reqTokenReadings,
            List<String> posTagsToFind) {
        String tokenString = tokenReadings.getToken();

        Synthesizer ukrainianSynthesizer = ukrainian.getSynthesizer();

        List<String> suggestions = new ArrayList<>();
        String oldPosTag = tokenReadings.getAnalyzedToken(0).getPOSTag();
        // regex alternation of all acceptable cases, e.g. ":(v_rod|v_zna)"
        String requiredPostTagsRegEx = ":(" + StringUtils.join(posTagsToFind, "|") + ")";
        String posTag = oldPosTag.replaceFirst(":v_[a-z]+", requiredPostTagsRegEx);

        try {
            String[] synthesized = ukrainianSynthesizer.synthesize(tokenReadings.getAnalyzedToken(0), posTag, true);
            suggestions.addAll(Arrays.asList(synthesized));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        // human-readable names of the required cases
        List<String> reqVidminkyNames = new ArrayList<>();
        for (String vidm : posTagsToFind) {
            reqVidminkyNames.add(PosTagHelper.VIDMINKY_MAP.get(vidm));
        }

        // human-readable names of the cases actually found, without duplicates
        List<String> foundVidminkyNames = new ArrayList<>();
        for (AnalyzedToken token : tokenReadings) {
            String posTag2 = token.getPOSTag();
            if (posTag2 != null && posTag2.contains(VIDMINOK_SUBSTR)) {
                String vidmName = PosTagHelper.VIDMINKY_MAP
                        .get(posTag2.replaceFirst("^.*" + VIDMINOK_REGEX + ".*$", "$1"));
                if (foundVidminkyNames.contains(vidmName)) {
                    // a repeated case name is listed again only for a ":p:" reading, with a suffix
                    if (posTag2.contains(":p:")) {
                        vidmName = vidmName + " (.)";
                        foundVidminkyNames.add(vidmName);
                    }
                    // else skip dup
                } else {
                    foundVidminkyNames.add(vidmName);
                }
            }
        }

        String msg = MessageFormat.format(
                " {0}   : {1},  : {2}",
                reqTokenReadings.getToken(), StringUtils.join(reqVidminkyNames, ", "),
                StringUtils.join(foundVidminkyNames, ", "));

        if (tokenString.equals("")) {
            msg += ".    ?  ?";
            try {
                String newYihPostag = "adj:p" + requiredPostTagsRegEx + ".*";
                String[] synthesized = ukrainianSynthesizer.synthesize(
                        new AnalyzedToken("", "adj:m:v_naz:&pron:pos", ""), newYihPostag, true);
                suggestions.addAll(Arrays.asList(synthesized));
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        } else if (reqTokenReadings.getToken().equalsIgnoreCase("")) {
            for (AnalyzedToken token : tokenReadings.getReadings()) {
                String posTag2 = token.getPOSTag();
                // Readings may carry no POS tag; guard against null exactly as the loop
                // above does, instead of failing with a NullPointerException.
                if (posTag2 != null && posTag2.matches(".*:v_naz.*:anim.*")) {
                    msg += ".          ?";
                    try {
                        String newPostag = posTag2.replace("v_naz", "v_kly");
                        String[] synthesized = ukrainianSynthesizer.synthesize(token, newPostag, false);
                        for (String string : synthesized) {
                            if (!string.equals(token.getToken()) && !suggestions.contains(string)) {
                                suggestions.add(string);
                            }
                        }
                        // only the first matching reading contributes suggestions
                        break;
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }
            }

        }

        RuleMatch potentialRuleMatch = new RuleMatch(this, tokenReadings.getStartPos(), tokenReadings.getEndPos(),
                msg, getShort());

        potentialRuleMatch.setSuggestedReplacements(suggestions);

        return potentialRuleMatch;
    }

    /**
     * Finds the first reading whose POS tag starts with {@code "<"}, which this rule
     * treats as a multiword marker.
     *
     * @param analyzedTokenReadings the readings of the token under inspection
     * @return the first such reading, or {@code null} if there is none
     */
    @Nullable
    private static AnalyzedToken getMultiwordToken(AnalyzedTokenReadings analyzedTokenReadings) {
        for (AnalyzedToken candidate : analyzedTokenReadings) {
            final String tag = candidate.getPOSTag();
            if (tag == null || !tag.startsWith("<")) {
                continue;
            }
            return candidate;
        }
        return null;
    }

    /**
     * Does nothing: the body is empty, and {@code match} keeps its working state in
     * local variables.
     */
    @Override
    public void reset() {
    }

}