org.languagetool.rules.spelling.SpellingCheckRule.java Source code

Introduction

Here is the source code for org.languagetool.rules.spelling.SpellingCheckRule.java
Source

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2012 Marcin Milkowski (http://www.languagetool.org)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules.spelling;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.jetbrains.annotations.Nullable;
import org.languagetool.*;
import org.languagetool.languagemodel.BaseLanguageModel;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.rules.ITSIssueType;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.SuggestedReplacement;
import org.languagetool.rules.patterns.PatternToken;
import org.languagetool.rules.patterns.PatternTokenBuilder;
import org.languagetool.rules.spelling.suggestions.SuggestionsOrderer;
import org.languagetool.rules.spelling.suggestions.SuggestionsOrdererFeatureExtractor;
import org.languagetool.rules.spelling.suggestions.SuggestionsRanker;
import org.languagetool.tagging.disambiguation.rules.DisambiguationPatternRule;
import org.languagetool.tokenizers.WordTokenizer;
import org.languagetool.tools.StringTools;

import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;

/**
 * An abstract rule for spellchecking rules.
 *
 * @author Marcin Miłkowski
 */
public abstract class SpellingCheckRule extends Rule {

    /**
     * The string {@code LanguageTool}. Offered as an additional top suggestion
     * for the spellings {@code Languagetool} and {@code languagetool}.
     * @since 2.3
     */
    public static final String LANGUAGETOOL = "LanguageTool";
    /**
     * The string {@code LanguageTooler}. Offered as an additional top suggestion
     * for the spellings {@code Languagetooler} and {@code languagetooler}.
     * @since 4.4
     */
    public static final String LANGUAGETOOLER = "LanguageTooler";

    /** The language given to the constructor; used for resource file names, tokenizing and lowercasing. */
    protected final Language language;

    /**
     * Optional language model, allows e.g. better suggestions when set.
     * Intended for rules from {@code Language.getRelevantLanguageModelCapableRules}.
     * @since 4.5
     */
    @Nullable
    @Experimental
    protected LanguageModel languageModel;
    /** Loader used by {@code init()} to read the ignore, spelling and prohibit word lists. */
    protected final CachingWordListLoader wordListLoader = new CachingWordListLoader();

    // Resource paths; the getters below prefix them with the language's short code.
    private static final String SPELLING_IGNORE_FILE = "/hunspell/ignore.txt";
    private static final String SPELLING_FILE = "/hunspell/spelling.txt";
    private static final String SPELLING_PROHIBIT_FILE = "/hunspell/prohibit.txt";
    // Value returned by getLanguageVariantSpellingFileName() in this base class: no variant file.
    private static final String SPELLING_FILE_VARIANT = null;
    // Orders strings by length; used to pick the longest ignored word that prefixes a token.
    private static final Comparator<String> STRING_LENGTH_COMPARATOR = Comparator.comparingInt(String::length);

    // User settings as given to the constructor, may be null.
    private final UserConfig userConfig;
    // Single words accepted without consulting the dictionary (user words, ignore.txt, spelling.txt, addIgnoreTokens).
    private final Set<String> wordsToBeIgnored = new HashSet<>();
    // Words reported as errors even if the dictionary accepts them (filled via addProhibitedWords).
    private final Set<String> wordsToBeProhibited = new HashSet<>();
    // Dictionary-based spelling rules of the alternative languages, collected once in the constructor.
    private final List<RuleWithLanguage> altRules;

    // wordsToBeIgnored grouped by first character (as a one-char string); rebuilt by updateIgnoredWordDictionary().
    private Map<String, Set<String>> wordsToBeIgnoredDictionary = new HashMap<>();
    // Same grouping, but built from the lowercased words.
    private Map<String, Set<String>> wordsToBeIgnoredDictionaryIgnoreCase = new HashMap<>();

    // Anti-patterns for multi-word entries; extended by addIgnoreWords(), replaced by acceptPhrases().
    private List<DisambiguationPatternRule> antiPatterns = new ArrayList<>();
    // If false, ignoreWord(String) never ignores anything.
    private boolean considerIgnoreWords = true;
    // If true, the lowercased form of a word is also looked up in wordsToBeIgnored.
    private boolean convertsCase = false;
    /** If greater than 0, words up to this length (inclusive) are ignored by {@code ignoreWord}. */
    protected int ignoreWordsWithLength = 0;

    /**
     * Creates a spelling rule without alternative languages and without a language model.
     * Delegates to the four-argument constructor with an empty list of alternative languages.
     *
     * @param messages resource bundle handed to the {@link Rule} super constructor
     * @param language the language stored in {@link #language}
     * @param userConfig user settings, may be {@code null}
     */
    public SpellingCheckRule(ResourceBundle messages, Language language, UserConfig userConfig) {
        this(messages, language, userConfig, Collections.emptyList());
    }

    /**
     * Creates a spelling rule without a language model.
     * Delegates to the five-argument constructor, passing {@code null} as the language model.
     *
     * @param messages resource bundle handed to the {@link Rule} super constructor
     * @param language the language stored in {@link #language}
     * @param userConfig user settings, may be {@code null}
     * @param altLanguages languages whose dictionary-based spelling rules are collected as alternative rules
     * @since 4.4
     */
    public SpellingCheckRule(ResourceBundle messages, Language language, UserConfig userConfig,
            List<Language> altLanguages) {
        this(messages, language, userConfig, altLanguages, null);
    }

    /**
     * Creates a spelling rule. Stores the given arguments, adds the user's accepted words
     * (if a user configuration is given) to the words to be ignored, collects the spelling
     * rules of the alternative languages and marks the rule as a misspelling rule.
     *
     * @param messages resource bundle handed to the {@link Rule} super constructor
     * @param language the language stored in {@link #language}
     * @param userConfig user settings, may be {@code null}
     * @param altLanguages languages whose dictionary-based spelling rules are collected as alternative rules
     * @param languageModel optional language model, may be {@code null}
     * @since 4.5
     */
    @Experimental
    public SpellingCheckRule(ResourceBundle messages, Language language, UserConfig userConfig,
            List<Language> altLanguages, @Nullable LanguageModel languageModel) {
        super(messages);
        this.language = language;
        this.userConfig = userConfig;
        this.languageModel = languageModel;
        if (userConfig != null) {
            // the user's accepted words join the set of words to be ignored
            wordsToBeIgnored.addAll(userConfig.getAcceptedWords());
        }
        // must stay after the userConfig assignment: the helper reads that field
        this.altRules = getAlternativeLangSpellingRules(altLanguages);
        setLocQualityIssueType(ITSIssueType.Misspelling);
    }

    /**
     * Attaches spelling suggestions to a rule match, re-ranking them if a usable orderer is given.
     * <ul>
     *   <li>No orderer, or its ML model is not available: user candidates followed by default
     *       candidates are set as plain replacements.</li>
     *   <li>{@link SuggestionsRanker}: only the default candidates are ranked; user candidates are
     *       put in front of them unranked. If ranking yields nothing, the match is left untouched.</li>
     *   <li>{@link SuggestionsOrdererFeatureExtractor}: features are computed for the default
     *       candidates; user candidates are not supported.</li>
     *   <li>Any other orderer: user and default candidates are ordered separately and concatenated.</li>
     * </ul>
     *
     * @param word misspelled word that suggestions should be generated for
     * @param userCandidates candidates from personal dictionary
     * @param candidates candidates from default dictionary
     * @param orderer model to rank suggestions / extract features, or null
     * @param match rule match to add suggestions to
     * @throws IllegalStateException if a feature extractor is used together with user candidates
     */
    protected static void addSuggestionsToRuleMatch(String word, List<String> userCandidates,
            List<String> candidates, @Nullable SuggestionsOrderer orderer, RuleMatch match) {
        AnalyzedSentence sentence = match.getSentence();
        int startPos = match.getFromPos();

        boolean canRerank = orderer != null && orderer.isMlAvailable();
        if (!canRerank) {
            // no reranking: personal dictionary first, then the default dictionary
            List<String> plainSuggestions = new ArrayList<>(userCandidates);
            plainSuggestions.addAll(candidates);
            match.setSuggestedReplacements(plainSuggestions);
            return;
        }

        if (orderer instanceof SuggestionsRanker) {
            // words from the user dictionary are not ranked (hard to ensure performance on
            // unknown words); they are only prepended to the ranked default suggestions
            SuggestionsRanker ranker = (SuggestionsRanker) orderer;
            List<SuggestedReplacement> ranked = ranker.orderSuggestions(candidates, word, sentence, startPos);
            if (ranked.isEmpty()) {
                // could not rank for some reason
                return;
            }
            if (userCandidates.isEmpty()) {
                match.setAutoCorrect(ranker.shouldAutoCorrect(ranked));
                match.setSuggestedReplacementObjects(ranked);
                return;
            }
            List<SuggestedReplacement> merged = new ArrayList<>();
            // replacements built from a plain string carry no confidence value set by the ranker
            userCandidates.forEach(userWord -> merged.add(new SuggestedReplacement(userWord)));
            merged.addAll(ranked);
            match.setSuggestedReplacementObjects(merged);
            // no auto correct when words from personal dictionaries are included
            match.setAutoCorrect(false);
        } else if (orderer instanceof SuggestionsOrdererFeatureExtractor) {
            // user suggestions are disabled here; open problem: how to merge match features
            // when ranking default and user suggestions separately?
            if (!userCandidates.isEmpty()) {
                throw new IllegalStateException(
                        "SuggestionsOrdererFeatureExtractor does not support suggestions from personal dictionaries at the moment.");
            }
            SuggestionsOrdererFeatureExtractor extractor = (SuggestionsOrdererFeatureExtractor) orderer;
            Pair<List<SuggestedReplacement>, SortedMap<String, Float>> result = extractor
                    .computeFeatures(candidates, word, sentence, startPos);
            match.setSuggestedReplacementObjects(result.getLeft());
            match.setFeatures(result.getRight());
        } else {
            // generic orderer: order both candidate lists on their own, user words first
            List<SuggestedReplacement> merged = new ArrayList<>(
                    orderer.orderSuggestions(userCandidates, word, sentence, startPos));
            merged.addAll(orderer.orderSuggestions(candidates, word, sentence, startPos));
            match.setSuggestedReplacementObjects(merged);
        }
    }

    // Re-declared as abstract: every concrete spelling rule has to supply its own implementation.
    @Override
    public abstract String getId();

    // Re-declared as abstract: every concrete spelling rule has to supply its own implementation.
    @Override
    public abstract String getDescription();

    // Re-declared as abstract: the actual spell checking is implemented by the concrete subclasses.
    @Override
    public abstract RuleMatch[] match(AnalyzedSentence sentence) throws IOException;

    /**
     * Always {@code true} for this class. {@code getAlternativeLangSpellingRules} uses this
     * flag to pick the spelling rules out of a language's rules.
     */
    @Override
    public boolean isDictionaryBasedSpellingRule() {
        return true;
    }

    /**
     * Add the given words to the list of words to be ignored during spell check.
     * You might want to use {@link #acceptPhrases(List)} instead, as only that
     * can also deal with phrases.
     *
     * @param tokens single words to be ignored from now on
     */
    public void addIgnoreTokens(List<String> tokens) {
        wordsToBeIgnored.addAll(tokens);
        // keep the first-character lookup maps in sync with the changed set
        updateIgnoredWordDictionary();
    }

    /**
     * (Re)creates the two lookup maps of all words to be ignored: a word's first character
     * (as a one-character string) is the key, the value is the set of all ignored words
     * starting with that character. One map holds the words as they are, the other one
     * their lowercased forms.
     * <p>
     * {@code null} and empty entries are skipped: they have no first character, so
     * {@code substring(0, 1)} would throw and break every later update of the maps.
     */
    private void updateIgnoredWordDictionary() {
        wordsToBeIgnoredDictionary = wordsToBeIgnored.stream()
                .filter(s -> s != null && !s.isEmpty())
                .collect(Collectors.groupingBy(s -> s.substring(0, 1), Collectors.toSet()));
        // lowercasing deliberately uses the same (default-locale) call as the lookup in
        // startsWithIgnoredWord so that keys and probes are produced the same way
        wordsToBeIgnoredDictionaryIgnoreCase = wordsToBeIgnored.stream()
                .filter(s -> s != null && !s.isEmpty())
                .map(String::toLowerCase)
                .collect(Collectors.groupingBy(s -> s.substring(0, 1), Collectors.toSet()));
    }

    /**
     * Set whether the list of words to be explicitly ignored (set with {@link #addIgnoreTokens(List)}) is considered at all.
     *
     * @param considerIgnoreWords if {@code false}, {@code ignoreWord(String)} returns {@code false} for every word
     */
    public void setConsiderIgnoreWords(boolean considerIgnoreWords) {
        this.considerIgnoreWords = considerIgnoreWords;
    }

    /**
     * Returns extra suggestions that are to be placed in front of the regular ones (the rule
     * may still re-order suggestions afterwards). Only return words that are known to be
     * spelled correctly, they are not checked again before being shown to the user.
     * <p>
     * This implementation proposes the proper spelling of {@code LanguageTool} and
     * {@code LanguageTooler} unless it is already among the given suggestions.
     *
     * @param suggestions the suggestions found so far
     * @param word the misspelled word
     * @return a new list with the additional suggestions, possibly empty
     */
    protected List<String> getAdditionalTopSuggestions(List<String> suggestions, String word) throws IOException {
        List<String> extraSuggestions = new ArrayList<>();
        boolean isLanguageToolTypo = "Languagetool".equals(word) || "languagetool".equals(word);
        if (isLanguageToolTypo && !suggestions.contains(LANGUAGETOOL)) {
            extraSuggestions.add(LANGUAGETOOL);
        }
        boolean isLanguageToolerTypo = "Languagetooler".equals(word) || "languagetooler".equals(word);
        if (isLanguageToolerTypo && !suggestions.contains(LANGUAGETOOLER)) {
            extraSuggestions.add(LANGUAGETOOLER);
        }
        return extraSuggestions;
    }

    /**
     * Get additional suggestions added after other suggestions (note the rule may choose to
     * re-order the suggestions anyway).
     *
     * @param suggestions the suggestions found so far
     * @param word the misspelled word
     * @return an empty list in this base implementation
     */
    protected List<String> getAdditionalSuggestions(List<String> suggestions, String word) {
        return Collections.emptyList();
    }

    /**
     * Returns true iff the token at the given position should be ignored by the spell checker.
     * The token texts are extracted and handed to {@link #ignoreWord(List, int)}.
     *
     * @param tokens the tokens of the sentence
     * @param idx index of the token in question
     */
    protected boolean ignoreToken(AnalyzedTokenReadings[] tokens, int idx) throws IOException {
        List<String> tokenTexts = Arrays.stream(tokens)
                .map(AnalyzedTokenReadings::getToken)
                .collect(Collectors.toCollection(ArrayList::new));
        return ignoreWord(tokenTexts, idx);
    }

    /**
     * Returns true iff the word should be ignored by the spell checker.
     * If possible, use {@link #ignoreToken(AnalyzedTokenReadings[], int)} instead.
     * <p>
     * A trailing dot is cut off before the lookup (e.g. word at end of sentence), unless
     * the word including the dot is itself one of the words to be ignored.
     *
     * @param word the word to look up
     */
    protected boolean ignoreWord(String word) throws IOException {
        if (!considerIgnoreWords) {
            return false;
        }
        String lookupWord = word;
        boolean stripTrailingDot = word.endsWith(".") && !wordsToBeIgnored.contains(word);
        if (stripTrailingDot) {
            lookupWord = word.substring(0, word.length() - 1);
        }
        return isIgnoredNoCase(lookupWord);
    }

    /**
     * Checks a single word against the words to be ignored: as given, additionally in its
     * lowercased form (language locale) if case conversion is enabled, and finally against
     * the length limit {@code ignoreWordsWithLength} if that is greater than 0.
     */
    private boolean isIgnoredNoCase(String word) {
        if (wordsToBeIgnored.contains(word)) {
            return true;
        }
        if (convertsCase && wordsToBeIgnored.contains(word.toLowerCase(language.getLocale()))) {
            return true;
        }
        return ignoreWordsWithLength > 0 && word.length() <= ignoreWordsWithLength;
    }

    /**
     * Returns true iff the word at the given position should be ignored by the spell checker.
     * If possible, use {@link #ignoreToken(AnalyzedTokenReadings[], int)} instead.
     * This implementation only looks at the word at {@code idx}; the other words are not used here.
     *
     * @param words the words of the sentence
     * @param idx index of the word in question
     * @since 2.6
     */
    protected boolean ignoreWord(List<String> words, int idx) throws IOException {
        return ignoreWord(words.get(idx));
    }

    /**
     * Used to determine whether the dictionary will use case conversions for
     * spell checking. In this class the flag makes the ignore-word lookup
     * also try the lowercased form of a word.
     * @param convertsCase if true, then conversions are used.
     * @since 2.5
     */
    public void setConvertsCase(boolean convertsCase) {
        this.convertsCase = convertsCase;
    }

    /**
     * Whether the token is a URL, as decided by {@link WordTokenizer#isUrl(String)}.
     */
    protected boolean isUrl(String token) {
        return WordTokenizer.isUrl(token);
    }

    /**
     * Whether the token is an email address, as decided by {@link WordTokenizer#isEMail(String)}.
     */
    protected boolean isEMail(String token) {
        return WordTokenizer.isEMail(token);
    }

    /**
     * Removes duplicates from the given list in place. The first occurrence of each
     * word is kept, so the relative order of the remaining words does not change.
     *
     * @param words the list to clean up; it needs to support removal only if it contains duplicates
     */
    protected void filterDupes(List<String> words) {
        Set<String> alreadySeen = new HashSet<>();
        for (Iterator<String> it = words.iterator(); it.hasNext();) {
            // add() returns false when the word was encountered before
            if (!alreadySeen.add(it.next())) {
                it.remove();
            }
        }
    }

    /**
     * Loads the word lists of this rule: the entries of the ignore file and, if a spelling
     * file name is set, of the spelling file are registered via {@link #addIgnoreWords(String)};
     * afterwards the lookup maps for ignored words are rebuilt. Finally the lines of the
     * prohibit file are expanded with {@link #expandLine(String)} and registered as prohibited words.
     */
    protected void init() throws IOException {
        wordListLoader.loadWords(getIgnoreFileName()).forEach(this::addIgnoreWords);
        if (getSpellingFileName() != null) {
            wordListLoader.loadWords(getSpellingFileName()).forEach(this::addIgnoreWords);
        }
        // both lists are in wordsToBeIgnored now, refresh the first-character maps once
        updateIgnoredWordDictionary();
        wordListLoader.loadWords(getProhibitFileName())
                .forEach(prohibitedLine -> addProhibitedWords(expandLine(prohibitedLine)));
    }

    /**
     * Get the name of the ignore file, which lists words to be accepted, even
     * when the spell checker would not accept them. Unlike with {@link #getSpellingFileName()}
     * the words in this file will not be used for creating suggestions for misspelled words.
     *
     * @return the language's short code followed by {@code /hunspell/ignore.txt}
     * @since 2.7
     */
    protected String getIgnoreFileName() {
        return language.getShortCode() + SPELLING_IGNORE_FILE;
    }

    /**
     * Get the name of the spelling file, which lists words to be accepted
     * and used for suggestions, even when the spell checker would not accept them.
     * {@code init()} skips loading this file if an override returns {@code null}.
     *
     * @return the language's short code followed by {@code /hunspell/spelling.txt}
     * @since 2.9, public since 3.5
     */
    public String getSpellingFileName() {
        return language.getShortCode() + SPELLING_FILE;
    }

    /**
     * Get the name of the spelling file for a language variant (e.g., en-US or de-AT),
     * which lists words to be accepted and used for suggestions, even when the spell
     * checker would not accept them.
     *
     * @return {@code null} in this base implementation
     * @since 4.3
     */
    public String getLanguageVariantSpellingFileName() {
        return SPELLING_FILE_VARIANT;
    }

    /**
     * Get the name of the prohibit file, which lists words not to be accepted, even
     * when the spell checker would accept them.
     *
     * @return the language's short code followed by {@code /hunspell/prohibit.txt}
     * @since 2.8
     */
    protected String getProhibitFileName() {
        return language.getShortCode() + SPELLING_PROHIBIT_FILE;
    }

    /**
     * Whether the word is prohibited, i.e. whether it should be marked as a spelling
     * error even if the spell checker would accept it. (This is useful to improve our spell
     * checker without waiting for the upstream checker to be updated.)
     * The lookup is an exact match against the registered prohibited words.
     *
     * @param word the word to look up
     * @since 2.8
     */
    protected boolean isProhibited(String word) {
        return wordsToBeProhibited.contains(word);
    }

    /**
     * Cleans up a list of suggestions in place: prohibited words are dropped first,
     * then duplicates are removed.
     *
     * @param suggestions modifiable list of suggestions
     * @since 2.8
     */
    protected void filterSuggestions(List<String> suggestions) {
        suggestions.removeIf(this::isProhibited);
        filterDupes(suggestions);
    }

    /**
     * Registers one entry of the word list as to be ignored by the spell checker.
     * A single word goes into the set of words to be ignored. An entry consisting of
     * several words (separated by " ") is turned into a {@link DisambiguationPatternRule}
     * in which each word serves as a case-sensitive and non-inflected {@link PatternToken},
     * so that the entire multi-word entry is ignored.
     *
     * @param line the line as read from {@code spelling.txt}.
     * @since 2.9, signature modified in 3.9
     */
    protected void addIgnoreWords(String line) {
        if (!line.contains(" ")) {
            wordsToBeIgnored.add(line);
            return;
        }
        // whitespace-only tokens produced by the tokenizer are not part of the pattern
        List<PatternToken> phraseTokens = language.getWordTokenizer().tokenize(line).stream()
                .filter(token -> !token.trim().isEmpty())
                .map(token -> new PatternToken(token, true, false, false))
                .collect(Collectors.toCollection(ArrayList::new));
        DisambiguationPatternRule ignoreSpellingRule = new DisambiguationPatternRule("INTERNAL_ANTIPATTERN",
                "(no description)", language, phraseTokens, null, null,
                DisambiguationPatternRule.DisambiguatorAction.IGNORE_SPELLING);
        antiPatterns.add(ignoreSpellingRule);
    }

    /**
     * Registers the given words as prohibited, see {@link #isProhibited(String)}.
     *
     * @param words list of words to be prohibited.
     * @since 4.2
     */
    protected void addProhibitedWords(List<String> words) {
        wordsToBeProhibited.addAll(words);
    }

    /**
     * Expand suffixes in a line. By default, the line is not expanded.
     * Implementations might e.g. turn {@code bicycle/S} into {@code [bicycle, bicycles]}.
     *
     * @param line a line of a word list
     * @return in this base implementation an immutable single-element list containing {@code line}
     * @since 3.0
     */
    protected List<String> expandLine(String line) {
        return Collections.singletonList(line);
    }

    /**
     * Collects the dictionary-based spelling rules of the given languages. For every language
     * its relevant rules and its relevant language-model-capable rules are requested, and
     * those that report {@link Rule#isDictionaryBasedSpellingRule()} are kept.
     *
     * @param alternativeLanguages the languages to collect spelling rules from
     * @return the spelling rules, each paired with the language it belongs to
     * @throws RuntimeException if requesting the rules of a language fails with an {@link IOException}
     */
    protected List<RuleWithLanguage> getAlternativeLangSpellingRules(List<Language> alternativeLanguages) {
        List<RuleWithLanguage> result = new ArrayList<>();
        for (Language altLanguage : alternativeLanguages) {
            List<Rule> languageRules = new ArrayList<>();
            try {
                languageRules.addAll(
                        altLanguage.getRelevantRules(messages, userConfig, null, Collections.emptyList()));
                languageRules.addAll(altLanguage.getRelevantLanguageModelCapableRules(messages, null, userConfig,
                        null, Collections.emptyList()));
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            languageRules.stream()
                    .filter(Rule::isDictionaryBasedSpellingRule)
                    .map(spellingRule -> new RuleWithLanguage(spellingRule, altLanguage))
                    .forEach(result::add);
        }
        return result;
    }

    /**
     * Finds the first alternative language whose spelling rule accepts the given word. The word
     * is wrapped into a minimal sentence (sentence start token plus the word) and checked by each
     * alternative rule; a rule accepts the word if it produces no match. If a rule rejects a
     * word ending in a dot, the word without that dot is tried as well.
     *
     * @param word the word to check
     * @return the language that accepts the word, or {@code null} if there is none or the
     *         word has at most two characters
     */
    protected Language acceptedInAlternativeLanguage(String word) throws IOException {
        if (word.length() <= 2) {
            // it's strange if single characters are suddenly considered English
            return null;
        }
        boolean endsWithDot = word.endsWith(".");
        for (RuleWithLanguage altRule : altRules) {
            AnalyzedTokenReadings sentenceStart = new AnalyzedTokenReadings(
                    new AnalyzedToken("", JLanguageTool.SENTENCE_START_TAGNAME, null), 0);
            AnalyzedTokenReadings wordReadings = new AnalyzedTokenReadings(new AnalyzedToken(word, null, null), 0);
            AnalyzedSentence oneWordSentence = new AnalyzedSentence(
                    new AnalyzedTokenReadings[] { sentenceStart, wordReadings });
            if (altRule.getRule().match(oneWordSentence).length == 0) {
                return altRule.getLanguage();
            }
            if (endsWithDot) {
                // retry without the trailing dot
                Language languageWithoutDot = acceptedInAlternativeLanguage(word.substring(0, word.length() - 1));
                if (languageWithoutDot != null) {
                    return languageWithoutDot;
                }
            }
        }
        return null;
    }

    /**
     * Accept (case-sensitively, unless at the start of a sentence) the given phrases even though they
     * are not in the built-in dictionary.
     * Use this to avoid false alarms on e.g. names and technical terms. Unlike {@link #addIgnoreTokens(List)}
     * this can deal with phrases. A way to call this is like this:
     * <code>rule.acceptPhrases(Arrays.asList("duodenal atresia"))</code>
     * This way, checking would not create an error for "duodenal atresia", but it would still
     * create an error for "duodenal" or "atresia" if they appear on their own.
     * <p>
     * NOTE(review): the anti-patterns built here replace the ones already stored in this rule
     * (including those created for multi-word ignore entries) instead of being added to them;
     * confirm that this is intended.
     *
     * @param phrases phrases whose words are separated by single spaces
     * @since 3.3
     */
    public void acceptPhrases(List<String> phrases) {
        List<List<PatternToken>> phrasePatterns = new ArrayList<>();
        for (String phrase : phrases) {
            String[] parts = phrase.split(" ");
            // only the first word decides whether an uppercase sentence-start variant is needed
            boolean startsLowercase = parts.length > 0
                    && !StringTools.uppercaseFirstChar(parts[0]).equals(parts[0]);
            List<PatternToken> exactTokens = new ArrayList<>();
            for (String part : parts) {
                exactTokens.add(new PatternTokenBuilder().csToken(part).build());
            }
            phrasePatterns.add(exactTokens);
            if (startsLowercase) {
                phrasePatterns.add(getTokensForSentenceStart(parts));
            }
        }
        this.antiPatterns = makeAntiPatterns(phrasePatterns, language);
    }

    /**
     * Builds the pattern for a phrase at the beginning of a sentence: a sentence start token,
     * the first word with its first character uppercased, then the remaining words unchanged.
     * All words are matched case-sensitively.
     */
    private List<PatternToken> getTokensForSentenceStart(String[] parts) {
        List<PatternToken> sentenceStartTokens = new ArrayList<>();
        for (int idx = 0; idx < parts.length; idx++) {
            if (idx > 0) {
                sentenceStartTokens.add(new PatternTokenBuilder().csToken(parts[idx]).build());
                continue;
            }
            // at sentence start, we also need to accept a phrase that starts with an uppercase char:
            String uppercasedFirstWord = StringTools.uppercaseFirstChar(parts[idx]);
            sentenceStartTokens
                    .add(new PatternTokenBuilder().posRegex(JLanguageTool.SENTENCE_START_TAGNAME).build());
            sentenceStartTokens.add(new PatternTokenBuilder().csToken(uppercasedFirstWord).build());
        }
        return sentenceStartTokens;
    }

    /**
     * Returns the anti-patterns of this rule: the list that {@code addIgnoreWords} appends
     * multi-word entries to and that {@code acceptPhrases} replaces. The internal list is
     * returned directly, not a copy.
     */
    @Override
    public List<DisambiguationPatternRule> getAntiPatterns() {
        return antiPatterns;
    }

    /**
     * Checks whether a <code>word</code> starts with an ignored word.
     * Note that a minimum <code>word</code>-length of 4 characters is expected.
     * (This is for better performance. Moreover, such short words are most likely contained in the dictionary.)
     * <p>
     * Only ignored words sharing the first character with <code>word</code> are inspected. For the
     * case-insensitive check both sides are compared in their lowercased form.
     * @param word - entire word
     * @param caseSensitive - determines whether the check is case-sensitive
     * @return length of the ignored word (i.e., return value is 0, if the word does not start with an ignored word).
     * If there are several matches from the set of ignored words, the length of the longest matching word is returned.
     * @since 3.5
     */
    protected int startsWithIgnoredWord(String word, boolean caseSensitive) {
        if (word.length() < 4) {
            return 0;
        }
        String probe = caseSensitive ? word : word.toLowerCase();
        Map<String, Set<String>> wordsByFirstChar = caseSensitive
                ? wordsToBeIgnoredDictionary
                : wordsToBeIgnoredDictionaryIgnoreCase;
        Set<String> sameFirstChar = wordsByFirstChar.get(probe.substring(0, 1));
        if (sameFirstChar == null) {
            return 0;
        }
        return sameFirstChar.stream()
                .filter(probe::startsWith)
                .max(STRING_LENGTH_COMPARATOR)
                .map(String::length)
                .orElse(0);
    }

    /**
     * Re-orders suggestions by combining their frequency in the language model with their
     * Levenshtein distance to the misspelled word (experimental, work in progress). The score
     * of a suggestion is its distance divided by its share of the summed frequencies; lower
     * scores come first. Suggestions the language model has never seen are sorted to the end.
     * <p>
     * The suggestions are returned unchanged if no {@link BaseLanguageModel} is set, if the
     * list is empty, or if the language model knows none of the suggestions.
     *
     * @param suggestions the suggestions to re-order, not modified
     * @param word the misspelled word
     * @return the given list, or a new list with the re-ordered suggestions
     */
    @Experimental
    protected List<String> reorderSuggestions(List<String> suggestions, String word) {
        // WORK IN PROGRESS
        if (!(languageModel instanceof BaseLanguageModel) || suggestions.isEmpty()) {
            // also covers languageModel == null; other implementations offer no getCount()
            return suggestions;
        }
        BaseLanguageModel lm = (BaseLanguageModel) languageModel;
        List<Integer> levenshteinDistances = suggestions.stream()
                .map(suggestion -> StringUtils.getLevenshteinDistance(word, suggestion))
                .collect(Collectors.toList());
        List<Long> frequencies = suggestions.stream().map(lm::getCount).collect(Collectors.toList());
        long frequenciesSum = frequencies.stream().mapToLong(Long::longValue).sum();
        if (frequenciesSum <= 0) {
            // no frequency information at all: normalizing would divide by zero
            return suggestions;
        }
        List<Float> normalizedFrequencies = frequencies.stream().map(f -> (float) f / frequenciesSum)
                .collect(Collectors.toList());
        // debug output of the experimental implementation
        System.out.println("frequencies: " + frequencies + " / normalized: " + normalizedFrequencies);

        List<Pair<String, Float>> scoredSuggestions = new ArrayList<>(suggestions.size());
        for (int i = 0; i < suggestions.size(); i++) {
            float normalizedFrequency = normalizedFrequencies.get(i);
            // unseen words get the worst score instead of Infinity * 0 = NaN
            float score = normalizedFrequency > 0
                    ? (1f / normalizedFrequency) * levenshteinDistances.get(i)
                    : Float.POSITIVE_INFINITY;
            scoredSuggestions.add(Pair.of(suggestions.get(i), score));
        }
        scoredSuggestions.sort(Comparator.comparing(Pair::getRight));

        // there may be fewer than five suggestions
        int previewSize = Math.min(5, suggestions.size());
        System.out.println("Before reordering: " + suggestions.subList(0, previewSize) + " / After: "
                + scoredSuggestions.subList(0, previewSize));

        return scoredSuggestions.stream().map(Pair::getLeft).collect(Collectors.toList());
    }
}