org.languagetool.rules.spelling.hunspell.CompoundAwareHunspellRule.java Source code

Introduction

Here is the source code for org.languagetool.rules.spelling.hunspell.CompoundAwareHunspellRule.java, a LanguageTool rule that combines Hunspell and Morfologik spell checking to handle compound words.

Source

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2012 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules.spelling.hunspell;

import org.apache.commons.lang3.StringUtils;
import org.languagetool.Language;
import org.languagetool.UserConfig;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.rules.spelling.morfologik.MorfologikMultiSpeller;
import org.languagetool.tokenizers.CompoundWordTokenizer;
import org.languagetool.tools.StringTools;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ResourceBundle;
import java.util.stream.IntStream;

/**
 * A spell checker that combines Hunspell and Morfologik spell checking
 * to support compound words and offer fast suggestions for some misspelled
 * compound words.
 */
public abstract class CompoundAwareHunspellRule extends HunspellRule {

    private static final int MAX_SUGGESTIONS = 20;

    private final CompoundWordTokenizer compoundSplitter;
    private final MorfologikMultiSpeller morfoSpeller;

    protected abstract void filterForLanguage(List<String> suggestions);

    public CompoundAwareHunspellRule(ResourceBundle messages, Language language,
            CompoundWordTokenizer compoundSplitter, MorfologikMultiSpeller morfoSpeller, UserConfig userConfig) {
        this(messages, language, compoundSplitter, morfoSpeller, userConfig, Collections.emptyList());
    }

    /**
     * @since 4.3
     */
    public CompoundAwareHunspellRule(ResourceBundle messages, Language language,
            CompoundWordTokenizer compoundSplitter, MorfologikMultiSpeller morfoSpeller, UserConfig userConfig,
            List<Language> altLanguages) {
        this(messages, language, compoundSplitter, morfoSpeller, userConfig, altLanguages, null);
    }

    public CompoundAwareHunspellRule(ResourceBundle messages, Language language,
            CompoundWordTokenizer compoundSplitter, MorfologikMultiSpeller morfoSpeller, UserConfig userConfig,
            List<Language> altLanguages, LanguageModel languageModel) {
        super(messages, language, userConfig, altLanguages, languageModel);
        this.compoundSplitter = compoundSplitter;
        this.morfoSpeller = morfoSpeller;
    }

    /**
 * As a Hunspell-based approach is too slow, we use Morfologik to create suggestions. Since this
 * won't work for compounds that are not in the dictionary, we also split the word and get suggestions
 * for the compound parts. In the end, all candidates are filtered against Hunspell again (which
 * supports compounds).
     */
    @Override
    public List<String> getSuggestions(String word) throws IOException {
        if (needsInit) {
            init();
        }
        //System.out.println("Computing suggestions for " + word);
        List<String> candidates = getCandidates(word);
        List<String> simpleSuggestions = getCorrectWords(candidates);
        //System.out.println("simpleSuggestions: " + simpleSuggestions);

        List<String> noSplitSuggestions = morfoSpeller.getSuggestions(word); // after getCorrectWords() so spelling.txt is considered
        handleWordEndPunctuation(".", word, noSplitSuggestions);
        handleWordEndPunctuation("...", word, noSplitSuggestions);
        List<String> noSplitLowercaseSuggestions = new ArrayList<>();
        if (StringTools.startsWithUppercase(word) && !StringTools.isAllUppercase(word)) {
            // almost all words can be uppercase because they can appear at the start of a sentence:
            noSplitLowercaseSuggestions = morfoSpeller.getSuggestions(word.toLowerCase());
        }
        //System.out.println("noSplitSuggestions: " + noSplitSuggestions);
        //System.out.println("noSplitLcSuggestions: " + noSplitLowercaseSuggestions);
        // We don't know about the quality of the results here, so mix both lists together,
        // taking elements from both lists on a rotating basis:
        List<String> suggestions = new ArrayList<>();
        int max = IntStream
                .of(simpleSuggestions.size(), noSplitSuggestions.size(), noSplitLowercaseSuggestions.size()).max()
                .orElse(0);
        for (int i = 0; i < max; i++) {
            if (i < noSplitSuggestions.size()) {
                suggestions.add(noSplitSuggestions.get(i));
            }
            if (i < noSplitLowercaseSuggestions.size()) {
                suggestions.add(StringTools.uppercaseFirstChar(noSplitLowercaseSuggestions.get(i)));
            }
            // put these behind suggestions by Morfologik, often low-quality / made-up words
            if (i < simpleSuggestions.size()) {
                suggestions.add(simpleSuggestions.get(i));
            }
        }
        //System.out.println("suggestions (mixed from simpleSuggestions, noSplitSuggestions, noSplitLowerCaseSuggestions): " + suggestions);

        filterDupes(suggestions);
        filterForLanguage(suggestions);

        List<String> sortedSuggestions = sortSuggestionByQuality(word, suggestions);
        //System.out.println("sortSuggestionByQuality(): " + sortedSuggestions);
        // This is probably the right place to sort suggestions by probability:
        //SuggestionSorter sorter = new SuggestionSorter(new LuceneLanguageModel(new File("/home/dnaber/data/google-ngram-index/de")));
        //sortedSuggestions = sorter.sortSuggestions(sortedSuggestions);
        //System.out.println();
        return sortedSuggestions.subList(0, Math.min(MAX_SUGGESTIONS, sortedSuggestions.size()));
    }

    private void handleWordEndPunctuation(String punct, String word, List<String> noSplitSuggestions) {
        if (word.endsWith(punct)) {
            // e.g. "informationnen." - the dot is a word char in hunspell, so it needs special treatment here
            List<String> tmp = morfoSpeller.getSuggestions(word.substring(0, word.length() - punct.length()));
            for (String s : tmp) {
                noSplitSuggestions.add(s + punct);
            }
        }
    }

    /**
     * Find potential corrections - it's okay if some of these are not valid words;
     * the list will be filtered against the spell checker before being returned to the user.
     */
    protected List<String> getCandidates(String word) {
        return compoundSplitter.tokenize(word);
    }

    protected List<String> getCandidates(List<String> parts) {
        int partCount = 0;
        List<String> candidates = new ArrayList<>();
        for (String part : parts) {
            if (hunspellDict.misspelled(part)) {
                // assume noun, so use uppercase:
                boolean doUpperCase = partCount > 0 && !StringTools.startsWithUppercase(part);
                List<String> suggestions = morfoSpeller
                        .getSuggestions(doUpperCase ? StringTools.uppercaseFirstChar(part) : part);
                if (suggestions.isEmpty()) {
                    suggestions = morfoSpeller
                            .getSuggestions(doUpperCase ? StringTools.lowercaseFirstChar(part) : part);
                }
                boolean appendS = false;
                if (doUpperCase && part.endsWith("s")) { // maybe infix-s as in "Dampfschiffahrtskapitän" -> "Dampfschifffahrtskapitän"
                    suggestions.addAll(morfoSpeller.getSuggestions(StringUtils.removeEnd(part, "s")));
                    appendS = true;
                }
                for (String suggestion : suggestions) {
                    List<String> partsCopy = new ArrayList<>(parts);
                    if (appendS) {
                        suggestion += "s";
                    }
                    if (partCount > 0 && parts.get(partCount).startsWith("-")
                            && parts.get(partCount).length() > 1) {
                        partsCopy.set(partCount, "-" + StringTools.uppercaseFirstChar(suggestion.substring(1)));
                    } else if (partCount > 0 && !parts.get(partCount - 1).endsWith("-")) {
                        partsCopy.set(partCount, suggestion.toLowerCase());
                    } else {
                        partsCopy.set(partCount, suggestion);
                    }
                    String candidate = String.join("", partsCopy);
                    if (!isMisspelled(candidate)) {
                        candidates.add(candidate);
                    }
                    // Arbeidszimmer -> Arbeitszimmer:
                    if (partCount < parts.size() - 1 && part.endsWith("s") && suggestion.endsWith("-")) {
                        partsCopy.set(partCount, suggestion.substring(0, suggestion.length() - 1));
                        String infixCandidate = String.join("", partsCopy);
                        if (!isMisspelled(infixCandidate)) {
                            candidates.add(infixCandidate);
                        }
                    }
                }
            }
            // What if there are no misspelled parts, as for Arbeitamt = Arbeit+Amt?
            // -> morfologik must be extended to return similar words even for known words
            // But GermanSpellerRule.getCandidates() has a solution for the cases with infix "s".
            partCount++;
        }
        return candidates;
    }

    // avoid over-accepting words, as the Morfologik approach above might construct
    // compound words with parts that are correct but the compound is not correct (e.g. "Arbeit + Amt = Arbeitamt"):
    private List<String> getCorrectWords(List<String> wordsOrPhrases) {
        List<String> result = new ArrayList<>();
        for (String wordOrPhrase : wordsOrPhrases) {
            // this might be a phrase like "aufgrund dessen", so it needs to be split: 
            String[] words = tokenizeText(wordOrPhrase);
            boolean wordIsOkay = true;
            for (String word : words) {
                if (hunspellDict.misspelled(word)) {
                    wordIsOkay = false;
                    break;
                }
            }
            if (wordIsOkay) {
                result.add(wordOrPhrase);
            }
        }
        return result;
    }

}
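
Example

The heart of getSuggestions() above is the merge step: the no-split Morfologik suggestions, the lowercase no-split suggestions, and the compound-split candidates are interleaved on a rotating basis, de-duplicated, and capped at MAX_SUGGESTIONS. The stand-alone sketch below reproduces just that merge outside of LanguageTool. The class name RoundRobinMergeDemo, the mix() method, and the sample words are hypothetical; a LinkedHashSet stands in for LanguageTool's filterDupes(), and the real code additionally applies the language-specific filterForLanguage() and sortSuggestionByQuality() before capping the list. It assumes Java 9+ for List.of().

import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;

public class RoundRobinMergeDemo {

    private static final int MAX_SUGGESTIONS = 20;

    // Interleave the three lists on a rotating basis: for each index i, take the
    // i-th no-split suggestion, then the i-th lowercase no-split suggestion, then
    // the i-th compound-split candidate (the latter are often lower quality, so
    // they come last within each round). Then de-duplicate and cap the result.
    static List<String> mix(List<String> noSplit, List<String> noSplitLowercase, List<String> splitCandidates) {
        List<String> mixed = new ArrayList<>();
        int max = Math.max(noSplit.size(), Math.max(noSplitLowercase.size(), splitCandidates.size()));
        for (int i = 0; i < max; i++) {
            if (i < noSplit.size()) {
                mixed.add(noSplit.get(i));
            }
            if (i < noSplitLowercase.size()) {
                // like the original code, re-uppercase the first letter of the lowercase suggestion
                String s = noSplitLowercase.get(i);
                mixed.add(Character.toUpperCase(s.charAt(0)) + s.substring(1));
            }
            if (i < splitCandidates.size()) {
                mixed.add(splitCandidates.get(i));
            }
        }
        // de-duplicate while keeping insertion order, then cap the result:
        List<String> deduped = new ArrayList<>(new LinkedHashSet<>(mixed));
        return deduped.subList(0, Math.min(MAX_SUGGESTIONS, deduped.size()));
    }

    public static void main(String[] args) {
        List<String> noSplit = List.of("Schifffahrt", "Schiffart");
        List<String> noSplitLowercase = List.of("schifffahrt");
        List<String> splitCandidates = List.of("Schiff Fahrt");
        // prints [Schifffahrt, Schiff Fahrt, Schiffart]
        System.out.println(mix(noSplit, noSplitLowercase, splitCandidates));
    }
}

The rotating merge keeps good no-split suggestions from being crowded out by a long list of constructed compound candidates before the result is capped, which is why the three lists are interleaved rather than concatenated.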