org.languagetool.rules.AbstractCompoundRule.java Source code

Introduction

Here is the source code for org.languagetool.rules.AbstractCompoundRule.java
Source

/* LanguageTool, a natural language style checker
 * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.ResourceBundle;
import java.util.concurrent.ArrayBlockingQueue;

import org.apache.commons.lang3.StringUtils;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.tools.StringTools;

/**
 * Checks that compounds (if in the list) are not written as separate words.
 * 
 * @author Daniel Naber, Marcin Miłkowski (refactoring)
 */
public abstract class AbstractCompoundRule extends Rule {

    /**
     * Capacity of the sliding window of previous tokens; the token loop in
     * {@link #match(AnalyzedSentence)} is also extended by {@code MAX_TERMS - 1}
     * padding steps so that the window can slide past the end of the sentence.
     */
    static final int MAX_TERMS = 5;

    private final String withHyphenMessage;
    private final String withoutHyphenMessage;
    private final String withOrWithoutHyphenMessage;
    private final String shortDesc;

    /**
     * When {@code true} and the first token of the window carries the sentence
     * start tag, the string is uncapitalized before it is compared to the
     * entries in {@link CompoundRuleData}.
     */
    protected boolean sentenceStartsWithUpperCase = false;

    /**
     * Creates the rule without a short message (the short message is {@code null}).
     *
     * @since 3.0
     */
    public AbstractCompoundRule(ResourceBundle messages, String withHyphenMessage, String withoutHyphenMessage,
            String withOrWithoutHyphenMessage) throws IOException {
        this(messages, withHyphenMessage, withoutHyphenMessage, withOrWithoutHyphenMessage, null);
    }

    /**
     * Creates the rule, assigns it to the MISC category and marks its issue
     * type as misspelling.
     *
     * @param messages bundle used to resolve the category
     * @param withHyphenMessage message used when only the hyphenated form is suggested
     * @param withoutHyphenMessage message used when only the merged form is suggested
     * @param withOrWithoutHyphenMessage message used when both forms (or none) are suggested
     * @param shortMessage short description passed on to every created match, may be {@code null}
     * @since 3.0
     */
    public AbstractCompoundRule(ResourceBundle messages, String withHyphenMessage, String withoutHyphenMessage,
            String withOrWithoutHyphenMessage, String shortMessage) throws IOException {
        super.setCategory(Categories.MISC.getCategory(messages));
        this.shortDesc = shortMessage;
        this.withOrWithoutHyphenMessage = withOrWithoutHyphenMessage;
        this.withoutHyphenMessage = withoutHyphenMessage;
        this.withHyphenMessage = withHyphenMessage;
        setLocQualityIssueType(ITSIssueType.Misspelling);
    }

    @Override
    public abstract String getId();

    @Override
    public abstract String getDescription();

    /** Always reports a context estimate of 1. */
    @Override
    public int estimateContextForSureMatch() {
        return 1;
    }

    /**
     * Supplies the compound lists this rule checks against.
     *
     * @since 3.0
     */
    protected abstract CompoundRuleData getCompoundRuleData();

    /**
     * Flag to indicate if the hyphen is ignored in the text entered by the user.
     * When this returns {@code true}, every hyphen in a multi-word string is
     * treated like a space before the lookup. Return {@code false} if the rule
     * should offer suggestions for words like [ro] "câte-și-trei" (with hyphen),
     * not only for "câte și trei" (with spaces); in that case only a
     * free-standing " - " is collapsed and a lone "-" part is skipped when
     * building suggestions.
     * This is only available for languages with hyphen as a word separator (i.e. not
     * available for English, available for Romanian). See Language.getWordTokenizer()
     */
    public boolean isHyphenIgnored() {
        return true;
    }

    /**
     * Slides a window of at most {@link #MAX_TERMS} tokens over the sentence and
     * reports the longest string starting at the window head that is listed as
     * an incorrect compound. Immunized tokens are neither checked nor added to
     * the window.
     */
    @Override
    public RuleMatch[] match(AnalyzedSentence sentence) {
        AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
        List<RuleMatch> ruleMatches = new ArrayList<>();
        Queue<AnalyzedTokenReadings> window = new ArrayBlockingQueue<>(MAX_TERMS);
        RuleMatch lastMatch = null;

        // run past the end of the token array so matches ending on the last token are found, too
        int steps = tokens.length + MAX_TERMS - 1;
        for (int pos = 0; pos < steps; pos++) {
            AnalyzedTokenReadings current = pos < tokens.length
                    ? tokens[pos]
                    : new AnalyzedTokenReadings(new AnalyzedToken("", "", null), window.peek().getStartPos());
            if (pos == 0) {
                addToQueue(current, window);
                continue;
            }
            if (current.isImmunized()) {
                continue;
            }

            RuleMatch found = findLongestMatch(sentence, window);
            if (found != null) {
                // a match starting where the previous one started is a duplicate: remember it, don't report it
                boolean sameStart = lastMatch != null && lastMatch.getFromPos() == found.getFromPos();
                lastMatch = found;
                if (!sameStart) {
                    ruleMatches.add(found);
                }
            }
            addToQueue(current, window);
        }
        return toRuleMatchArray(ruleMatches);
    }

    /**
     * Looks for the longest string in the current window that is a known
     * incorrect compound and turns it into a match.
     *
     * @return the match for the longest hit, or {@code null} if nothing in the window is listed
     */
    private RuleMatch findLongestMatch(AnalyzedSentence sentence, Queue<AnalyzedTokenReadings> window) {
        AnalyzedTokenReadings windowHead = window.peek();
        List<String> candidates = new ArrayList<>();
        List<String> spellings = new ArrayList<>(); // original upper/lowercase spelling
        Map<String, AnalyzedTokenReadings> lastTokenOf = getStringToTokenMap(window, candidates, spellings);

        // longest candidates are at the end of the list, so walk it backwards
        for (int idx = candidates.size() - 1; idx >= 0; idx--) {
            String candidate = candidates.get(idx);
            if (!getCompoundRuleData().getIncorrectCompounds().contains(candidate)) {
                continue;
            }
            List<String> suggestions = new ArrayList<>();
            String message = fillSuggestions(candidate, spellings.get(idx), spellings, suggestions);
            RuleMatch ruleMatch = new RuleMatch(this, sentence, windowHead.getStartPos(),
                    lastTokenOf.get(candidate).getEndPos(), message, shortDesc);
            ruleMatch.setSuggestedReplacements(suggestions);
            return ruleMatch;
        }
        return null;
    }

    /**
     * Collects the replacement suggestions for one incorrect compound and picks
     * the message that goes with them.
     *
     * @param candidate normalized string that was found in the incorrect-compound list
     * @param spelling the same string in its original upper/lowercase spelling
     * @param allSpellings original spellings of all strings built from the current window
     * @param suggestions output list, receives the suggestions
     * @return the message to show for the match
     */
    private String fillSuggestions(String candidate, String spelling, List<String> allSpellings,
            List<String> suggestions) {
        String hyphenated = spelling.replace(' ', '-');
        String message = null;
        if (!getCompoundRuleData().getNoDashSuggestion().contains(candidate)) {
            suggestions.add(hyphenated);
            message = withHyphenMessage;
        }
        if (isNotAllUppercase(spelling)
                && !getCompoundRuleData().getOnlyDashSuggestion().contains(candidate)) {
            // true as soon as any lower-case entry equals one of the window's original spellings
            boolean lowerCaseMidWords = getCompoundRuleData().getNoDashLowerCaseSuggestion().stream()
                    .anyMatch(allSpellings::contains);
            suggestions.add(mergeCompound(spelling, lowerCaseMidWords));
            message = withoutHyphenMessage;
        }

        String[] words = candidate.split(" ");
        boolean singleCharFirstWord = words.length > 0 && words[0].length() == 1;
        if (singleCharFirstWord) {
            // a one-character first part only ever gets the hyphenated suggestion
            suggestions.clear();
            suggestions.add(hyphenated);
            return withHyphenMessage;
        }
        if (suggestions.isEmpty() || suggestions.size() == 2) { // isEmpty shouldn't happen
            return withOrWithoutHyphenMessage;
        }
        return message;
    }

    /**
     * Builds every string that starts at the head of the window and spans at
     * least two tokens, shortest first.
     *
     * @param prevTokens the current window
     * @param stringsToCheck output list, receives the normalized strings used for the lookup
     * @param origStringsToCheck output list, receives the trimmed strings as written in the text
     * @return maps each normalized string to the first window token at which it was produced
     */
    private Map<String, AnalyzedTokenReadings> getStringToTokenMap(Queue<AnalyzedTokenReadings> prevTokens,
            List<String> stringsToCheck, List<String> origStringsToCheck) {
        Map<String, AnalyzedTokenReadings> stringToToken = new HashMap<>();
        StringBuilder joined = new StringBuilder();
        boolean atHead = true;
        boolean headIsSentenceStart = false;
        for (AnalyzedTokenReadings readings : prevTokens) {
            joined.append(' ').append(readings.getToken());
            if (atHead) {
                // the head token alone is never a candidate, it only tells us whether we are at a sentence start
                headIsSentenceStart = readings.hasPosTag(JLanguageTool.SENTENCE_START_TAGNAME);
                atHead = false;
                continue;
            }
            String written = joined.toString();
            String key = normalize(written);
            if (sentenceStartsWithUpperCase && headIsSentenceStart) {
                key = StringUtils.uncapitalize(key);
            }
            stringsToCheck.add(key);
            origStringsToCheck.add(written.trim());
            // keep the earliest token for a key, later tokens may produce the same key again
            stringToToken.putIfAbsent(key, readings);
        }
        return stringToToken;
    }

    /**
     * Trims the string and, if it contains both a hyphen and a space, rewrites
     * the hyphens according to {@link #isHyphenIgnored()}.
     */
    private String normalize(String inStr) {
        String trimmed = inStr.trim();
        boolean hasHyphen = trimmed.indexOf('-') >= 0;
        boolean hasSpace = trimmed.indexOf(' ') >= 0;
        if (!hasHyphen || !hasSpace) {
            return trimmed;
        }
        // e.g. "E-Mail Adresse" -> "E Mail Adresse" so the error can be detected
        return isHyphenIgnored() ? trimmed.replace('-', ' ') : trimmed.replace(" - ", " ");
    }

    /**
     * @return {@code false} as soon as one space-separated part of {@code str}
     *         is all uppercase, {@code true} otherwise
     */
    private boolean isNotAllUppercase(String str) {
        for (String word : str.split(" ")) {
            boolean bareHyphen = !isHyphenIgnored() && "-".equals(word);
            if (bareHyphen) {
                continue; // do not treat '-' as an upper-case word
            }
            if (StringTools.isAllUppercase(word)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Concatenates the space-separated parts of {@code str} into one word.
     *
     * @param uncapitalizeMidWords if {@code true}, every part but the first is uncapitalized
     */
    private String mergeCompound(String str, boolean uncapitalizeMidWords) {
        String[] words = str.split(" ");
        StringBuilder merged = new StringBuilder();
        for (int idx = 0; idx < words.length; idx++) {
            String word = words[idx];
            boolean bareHyphen = !isHyphenIgnored() && "-".equals(word);
            if (bareHyphen) {
                continue;
            }
            boolean keepAsIs = idx == 0 || !uncapitalizeMidWords;
            merged.append(keepAsIs ? word : StringUtils.uncapitalize(word));
        }
        return merged.toString();
    }

    /**
     * Appends the token to the window; if the window is full, its head is
     * dropped first.
     */
    private void addToQueue(AnalyzedTokenReadings token, Queue<AnalyzedTokenReadings> prevTokens) {
        if (prevTokens.offer(token)) {
            return;
        }
        prevTokens.poll();
        prevTokens.offer(token);
    }

}