org.languagetool.rules.spelling.morfologik.MorfologikMultiSpeller.java Source code

Java tutorial

Introduction

Here is the source code for org.languagetool.rules.spelling.morfologik.MorfologikMultiSpeller.java

Source

/* LanguageTool, a natural language style checker
 * Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules.spelling.morfologik;

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import morfologik.fsa.FSA;
import morfologik.fsa.builders.FSABuilder;
import morfologik.fsa.builders.CFSA2Serializer;
import morfologik.stemming.Dictionary;

import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.languagetool.Experimental;
import org.languagetool.JLanguageTool;
import org.languagetool.UserConfig;
import org.languagetool.rules.spelling.SpellingCheckRule;

import java.io.*;
import java.util.*;
import java.util.concurrent.TimeUnit;

import static java.nio.charset.StandardCharsets.*;

/**
 * Morfologik speller that merges results from binary (.dict) and plain text (.txt) dictionaries.
 *
 * @since 2.9
 */
public class MorfologikMultiSpeller {

    private static final LoadingCache<BufferedReaderWithSource, List<byte[]>> dictCache = CacheBuilder.newBuilder()
            //.maximumSize(0)
            .expireAfterWrite(10, TimeUnit.MINUTES)
            .build(new CacheLoader<BufferedReaderWithSource, List<byte[]>>() {
                @Override
                public List<byte[]> load(@NotNull BufferedReaderWithSource reader) throws IOException {
                    List<byte[]> lines = getLines(reader.reader);
                    if (reader.languageVariantReader != null) {
                        lines.addAll(getLines(reader.languageVariantReader));
                        lines.add(SpellingCheckRule.LANGUAGETOOL.getBytes()); // adding here so it's also used for suggestions
                    }
                    return lines;
                }
            });
    private static final Map<String, Dictionary> dicPathToDict = new HashMap<>();

    private final List<MorfologikSpeller> spellers;
    private final List<MorfologikSpeller> defaultDictSpellers;
    private final List<MorfologikSpeller> userDictSpellers;
    private final boolean convertsCase;

    public MorfologikMultiSpeller(String binaryDictPath, String plainTextPath, String languageVariantPlainTextPath,
            int maxEditDistance) throws IOException {
        this(binaryDictPath, plainTextPath, languageVariantPlainTextPath, null, maxEditDistance);
    }

    /**
     * @param binaryDictPath path in classpath to a {@code .dict} binary Morfologik file
     * @param plainTextPath path in classpath to a plain text {@code .txt} file (like spelling.txt)
     * @param maxEditDistance maximum edit distance for accepting suggestions
     * @since 4.2
     */
    @Experimental
    public MorfologikMultiSpeller(String binaryDictPath, String plainTextPath, String languageVariantPlainTextPath,
            UserConfig userConfig, int maxEditDistance) throws IOException {
        this(binaryDictPath,
                plainTextPath != null
                        ? new BufferedReader(new InputStreamReader(
                                JLanguageTool.getDataBroker().getFromResourceDirAsStream(plainTextPath), UTF_8))
                        : null,
                plainTextPath,
                languageVariantPlainTextPath == null ? null
                        : new BufferedReader(new InputStreamReader(JLanguageTool.getDataBroker()
                                .getFromResourceDirAsStream(languageVariantPlainTextPath), UTF_8)),
                languageVariantPlainTextPath,
                userConfig != null ? userConfig.getAcceptedWords() : Collections.emptyList(), maxEditDistance);
        if (plainTextPath != null && (!plainTextPath.endsWith(".txt")
                || (languageVariantPlainTextPath != null && !languageVariantPlainTextPath.endsWith(".txt")))) {
            throw new IllegalArgumentException(
                    "Unsupported dictionary, plain text file needs to have suffix .txt: " + plainTextPath);
        }
    }

    /**
     * @param binaryDictPath path in classpath to a {@code .dict} binary Morfologik file
     * @param plainTextReader reader with to a plain text {@code .txt} file (like from spelling.txt)
     * @param maxEditDistance maximum edit distance for accepting suggestions
     * @since 3.0
     */
    public MorfologikMultiSpeller(String binaryDictPath, BufferedReader plainTextReader, String plainTextReaderPath,
            BufferedReader languageVariantPlainTextReader, String languageVariantPlainTextPath,
            List<String> userWords, int maxEditDistance) throws IOException {
        MorfologikSpeller speller = getBinaryDict(binaryDictPath, maxEditDistance);
        List<MorfologikSpeller> spellers = new ArrayList<>();
        MorfologikSpeller userDictSpeller = getUserDictSpellerOrNull(userWords, binaryDictPath, maxEditDistance);
        if (userDictSpeller != null) {
            // add this first, as otherwise suggestions from user's won dictionary might drown in the mass of other suggestions
            spellers.add(userDictSpeller);
            userDictSpellers = Collections.singletonList(userDictSpeller);
        } else {
            userDictSpellers = Collections.emptyList();
        }
        spellers.add(speller);
        convertsCase = speller.convertsCase();
        if (plainTextReader != null) {
            MorfologikSpeller plainTextSpeller = getPlainTextDictSpellerOrNull(plainTextReader, plainTextReaderPath,
                    languageVariantPlainTextReader, languageVariantPlainTextPath, binaryDictPath, maxEditDistance);
            if (plainTextSpeller != null) {
                spellers.add(plainTextSpeller);
                defaultDictSpellers = Arrays.asList(speller, plainTextSpeller);
            } else {
                defaultDictSpellers = Collections.singletonList(speller);
            }
        } else {
            defaultDictSpellers = Collections.singletonList(speller);
        }
        this.spellers = Collections.unmodifiableList(spellers);
    }

    private MorfologikSpeller getUserDictSpellerOrNull(List<String> userWords, String dictPath, int maxEditDistance)
            throws IOException {
        if (userWords.isEmpty()) {
            return null;
        }
        List<byte[]> byteLines = new ArrayList<>();
        for (String line : userWords) {
            byteLines.add(line.getBytes(UTF_8));
        }
        Dictionary dictionary = getDictionary(byteLines, dictPath, dictPath.replace(".dict", ".info"), false);
        return new MorfologikSpeller(dictionary, maxEditDistance);
    }

    private MorfologikSpeller getBinaryDict(String binaryDictPath, int maxEditDistance) {
        if (binaryDictPath.endsWith(".dict")) {
            return new MorfologikSpeller(binaryDictPath, maxEditDistance);
        } else {
            throw new RuntimeException(
                    "Unsupported dictionary, binary Morfologik file needs to have suffix .dict: " + binaryDictPath);
        }
    }

    @Nullable
    private MorfologikSpeller getPlainTextDictSpellerOrNull(BufferedReader plainTextReader,
            String plainTextReaderPath, BufferedReader languageVariantPlainTextReader,
            String languageVariantPlainTextPath, String dictPath, int maxEditDistance) throws IOException {
        List<byte[]> lines = dictCache.getUnchecked(new BufferedReaderWithSource(plainTextReader,
                plainTextReaderPath, languageVariantPlainTextReader, languageVariantPlainTextPath));
        if (lines.isEmpty()) {
            return null;
        }
        Dictionary dictionary = getDictionary(lines, plainTextReaderPath, dictPath.replace(".dict", ".info"), true);
        return new MorfologikSpeller(dictionary, maxEditDistance);
    }

    private static List<byte[]> getLines(BufferedReader br) throws IOException {
        List<byte[]> lines = new ArrayList<>();
        String line;
        while ((line = br.readLine()) != null) {
            if (!line.startsWith("#")) {
                lines.add(StringUtils.substringBefore(line, "#").trim().getBytes(UTF_8));
            }
        }
        return lines;
    }

    private Dictionary getDictionary(List<byte[]> lines, String dictPath, String infoPath, boolean allowCache)
            throws IOException {
        String cacheKey = dictPath + "|" + infoPath;
        Dictionary dictFromCache = dicPathToDict.get(cacheKey);
        if (allowCache && dictFromCache != null) {
            return dictFromCache;
        } else {
            // Creating the dictionary at runtime can easily take 50ms for spelling.txt files
            // that are ~50KB. We don't want that overhead for every check of a short sentence,
            // so we cache the result:
            List<byte[]> linesCopy = new ArrayList<>(lines);
            Collections.sort(linesCopy, FSABuilder.LEXICAL_ORDERING);
            FSA fsa = FSABuilder.build(linesCopy);
            ByteArrayOutputStream fsaOutStream = new CFSA2Serializer().serialize(fsa, new ByteArrayOutputStream());
            ByteArrayInputStream fsaInStream = new ByteArrayInputStream(fsaOutStream.toByteArray());
            Dictionary dict = Dictionary.read(fsaInStream,
                    JLanguageTool.getDataBroker().getFromResourceDirAsStream(infoPath));
            dicPathToDict.put(cacheKey, dict);
            return dict;
        }
    }

    /**
     * Accept the word if at least one of the dictionaries accepts it as not misspelled.
     */
    public boolean isMisspelled(String word) {
        for (MorfologikSpeller speller : spellers) {
            if (!speller.isMisspelled(word)) {
                return false;
            }
        }
        return true;
    }

    @NotNull
    private List<String> getSuggestionsFromSpellers(String word, List<MorfologikSpeller> spellerList) {
        List<String> result = new ArrayList<>();
        for (MorfologikSpeller speller : spellerList) {
            List<String> suggestions = speller.getSuggestions(word);
            for (String suggestion : suggestions) {
                if (!result.contains(suggestion) && !suggestion.equals(word)) {
                    result.add(suggestion);
                }
            }
        }
        return result;
    }

    /**
     * The suggestions from all dictionaries (without duplicates).
     */
    public List<String> getSuggestions(String word) {
        return getSuggestionsFromSpellers(word, spellers);
    }

    /**
     * @since 4.5
     * @param word misspelled word
     * @return suggestions from users personal dictionary
     */
    @Experimental
    public List<String> getSuggestionsFromUserDicts(String word) {
        return getSuggestionsFromSpellers(word, userDictSpellers);
    }

    /**
     * @since 4.5
     * @param word misspelled word
     * @return suggestions from built-in dictionaries
     */
    @Experimental
    public List<String> getSuggestionsFromDefaultDicts(String word) {
        return getSuggestionsFromSpellers(word, defaultDictSpellers);
    }

    /**
     * Determines whether the dictionary uses case conversions.
     * @return True when the speller uses spell conversions.
     * @since 2.5
     */
    public boolean convertsCase() {
        return convertsCase;
    }

    static class BufferedReaderWithSource {
        private BufferedReader reader;
        private String readerPath;
        private BufferedReader languageVariantReader;
        private String languageVariantPath;

        BufferedReaderWithSource(BufferedReader reader, String readerPath, BufferedReader languageVariantReader,
                String languageVariantPath) {
            this.reader = Objects.requireNonNull(reader);
            this.readerPath = Objects.requireNonNull(readerPath);
            this.languageVariantReader = languageVariantReader;
            this.languageVariantPath = languageVariantPath;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            } else if (o == null || getClass() != o.getClass()) {
                return false;
            }
            BufferedReaderWithSource that = (BufferedReaderWithSource) o;
            return Objects.equals(readerPath, that.readerPath)
                    && Objects.equals(languageVariantPath, that.languageVariantPath);
        }

        @Override
        public int hashCode() {
            return Objects.hash(readerPath, languageVariantPath);
        }
    }
}