org.languagetool.rules.de.CompoundCoherencyRule.java Source code

Introduction

Here is the source code for org.languagetool.rules.de.CompoundCoherencyRule.java
Source

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2017 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules.de;

import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.Nullable;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.rules.*;

import java.io.IOException;
import java.util.*;

/**
 * Check consistent spelling for German compounds. It's a style issue
 * if a compound is written once with and once without hyphen in the same text.
 * E.g. "Zahnärzteverband" and "Zahnärzte-Verband"
 * @author Daniel Naber
 */
public class CompoundCoherencyRule extends TextLevelRule {

    /**
     * Creates the rule, assigns it to the style category and registers one
     * wrong/fixed example pair.
     *
     * @param messages resource bundle passed on to the category lookup
     */
    public CompoundCoherencyRule(ResourceBundle messages) {
        super.setCategory(Categories.STYLE.getCategory(messages));
        addExamplePair(Example.wrong(
                "Ein Helpdesk gliedert sich in verschiedene Level. Die Qualit\u00e4t des <marker>Help-Desks</marker> ist wichtig."),
                Example.fixed(
                        "Ein Helpdesk gliedert sich in verschiedene Level. Die Qualit\u00e4t des <marker>Helpdesks</marker> ist wichtig."));
    }

    @Override
    public String getId() {
        return "DE_COMPOUND_COHERENCY";
    }

    @Override
    public String getDescription() {
        return "Einheitliche Schreibung bei Komposita (mit oder ohne Bindestrich)";
    }

    /**
     * Walks over all tokens of all sentences and reports every token whose spelling
     * differs only by hyphenation from the first spelling seen for the same
     * normalized (lower-cased, hyphen-free) form.
     *
     * @param sentences the analyzed sentences of the text, in order
     * @return the matches found, possibly an empty array
     * @throws IOException declared by the overridden method
     */
    @Override
    public RuleMatch[] match(List<AnalyzedSentence> sentences) throws IOException {
        List<RuleMatch> ruleMatches = new ArrayList<>();
        // normalized form -> spelling(s) recorded when that form was first encountered
        Map<String, List<String>> normToTextOccurrences = new HashMap<>();
        int pos = 0;  // offset of the current sentence, accumulated from the sentence text lengths
        for (AnalyzedSentence sentence : sentences) {
            AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
            for (AnalyzedTokenReadings atr : tokens) {
                String token = atr.getToken();
                if (token.isEmpty()) {
                    continue;
                }
                String lemmaOrNull = getLemma(atr);
                // The whole implementation could be simpler, but this way we also catch cases where
                // the word (and thus its lemma) isn't known.
                String lemma = lemmaOrNull != null ? lemmaOrNull : token;
                // Locale.ROOT keeps the normalization independent of the JVM's default locale.
                String normToken = lemma.replace("-", "").toLowerCase(Locale.ROOT);
                if (StringUtils.isNumeric(normToken)) {
                    // avoid messages about "2-3" and "23" both being used;
                    // only this token is skipped, the rest of the sentence is still checked
                    continue;
                }
                List<String> textOcc = normToTextOccurrences.get(normToken);
                if (textOcc == null) {
                    List<String> firstSpelling = new ArrayList<>();
                    firstSpelling.add(lemma);
                    normToTextOccurrences.put(normToken, firstSpelling);
                    continue;
                }
                if (textOcc.stream().anyMatch(f -> f.equalsIgnoreCase(lemma))) {
                    continue;  // same spelling as recorded before, nothing to report
                }
                String other = textOcc.get(0);
                if (containsHyphenInside(other) || containsHyphenInside(token)) {
                    String msg = "Uneinheitliche Verwendung von Bindestrichen. Der Text enth\u00e4lt sowohl '"
                            + token + "' als auch '" + other + "'.";
                    RuleMatch ruleMatch = new RuleMatch(this, sentence, pos + atr.getStartPos(),
                            pos + atr.getEndPos(), msg);
                    if (token.replace("-", "").equalsIgnoreCase(other.replace("-", ""))) {
                        // might be different inflected forms, so only suggest if really just the hyphen is different:
                        ruleMatch.setSuggestedReplacement(other);
                    }
                    ruleMatches.add(ruleMatch);
                }
            }
            pos += sentence.getText().length();
        }
        return toRuleMatchArray(ruleMatches);
    }

    /**
     * Tells whether the given text contains a hyphen and neither starts nor ends with one.
     */
    private boolean containsHyphenInside(String token) {
        return token.contains("-") && !token.startsWith("-") && !token.endsWith("-");
    }

    /**
     * Returns the lemma of the first reading if {@code hasSameLemmas()} is true and
     * there is at least one reading, otherwise {@code null}. If the token contains a
     * hyphen but the lemma does not, the hyphen(s) are re-inserted into the lemma.
     *
     * @param atr the token readings to take the lemma from
     * @return the (possibly re-hyphenated) lemma, or {@code null} if none is available
     */
    @Nullable
    private String getLemma(AnalyzedTokenReadings atr) {
        if (!atr.hasSameLemmas() || atr.getReadingsLength() <= 0) {
            return null;
        }
        String lemma = atr.getReadings().get(0).getLemma();
        if (lemma == null) {
            return null;
        }
        // our analysis gets lemma "Jugendfoto" for "Jugend-Fotos", we 'fix' that here
        String token = atr.getToken();
        if (!lemma.contains("-") && token.contains("-")) {
            return restoreHyphens(lemma, token);
        }
        return lemma;
    }

    /**
     * Copies {@code lemma} while walking {@code token} in parallel, inserting a hyphen
     * wherever the token has one. The lemma character following an inserted hyphen is
     * upper-cased if the token character after the hyphen is upper-case. Copying stops
     * when either string is exhausted.
     */
    private static String restoreHyphens(String lemma, String token) {
        StringBuilder lemmaBuilder = new StringBuilder();
        for (int lemmaPos = 0, tokenPos = 0;
             lemmaPos < lemma.length() && tokenPos < token.length();
             lemmaPos++, tokenPos++) {
            char lemmaChar = lemma.charAt(lemmaPos);
            char tokenChar = token.charAt(tokenPos);
            if (lemmaChar == tokenChar) {
                lemmaBuilder.append(lemmaChar);
            } else if (tokenChar == '-') {
                tokenPos++; // skip hyphen
                lemmaBuilder.append('-');
                // guard on tokenPos: the token may end right after the hyphen
                if (tokenPos < token.length() && Character.isUpperCase(token.charAt(tokenPos))) {
                    lemmaBuilder.append(Character.toUpperCase(lemmaChar));
                } else {
                    lemmaBuilder.append(lemmaChar);
                }
            } else {
                // token and lemma differ here (e.g. an inflected form); keep the lemma's
                // character so the lemma is not silently shortened
                lemmaBuilder.append(lemmaChar);
            }
        }
        return lemmaBuilder.toString();
    }

    /**
     * Always returns -1.
     * NOTE(review): presumably this tells the framework that the rule needs the whole
     * text rather than single paragraphs - confirm against TextLevelRule.
     */
    @Override
    public int minToCheckParagraph() {
        return -1;
    }

}