org.languagetool.language.LanguageIdentifier.java Source code

Introduction

Here is the source code for org.languagetool.language.LanguageIdentifier.java
Source

/* LanguageTool, a natural language style checker
 * Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.language;

import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.*;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.jetbrains.annotations.Nullable;
import org.languagetool.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.logging.Level;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Identify the language of a text. Note that some languages might never be
 * detected because they are close to another language. Language variants like
 * en-US or en-GB are not detected, the result will be {@code en} for those.
 * By default, only the first 1000 characters of a text are considered.
 * Email signatures that use {@code \n-- \n} as a delimiter are ignored.
 *
 * @since 2.9
 */
public class LanguageIdentifier {

    private static final Logger logger = LoggerFactory.getLogger(LanguageIdentifier.class);
    // passed to the n-gram detector builder as its minimal confidence:
    private static final double MINIMAL_CONFIDENCE = 0.9;
    // passed as the last command line argument to fasttext's 'predict-prob'
    // (presumably the number of top predictions to print per input - TODO confirm):
    private static final int K_HIGHEST_SCORES = 5;
    // passed to the n-gram detector builder's shortTextAlgorithm() setting:
    private static final int SHORT_ALGO_THRESHOLD = 50;
    // texts shorter than this will *only* consider preferred languages (if set):
    private static final int CONSIDER_ONLY_PREFERRED_THRESHOLD = 50;
    // matches an email signature: the delimiter "\n-- \n" and everything after it (DOTALL):
    private static final Pattern SIGNATURE = Pattern.compile("\n-- \n.*", Pattern.DOTALL);

    // ast and gl often prevent the correct detection of Spanish (as they are quite similar
    // to Spanish, I assume), so we disable them for now. See LanguageDetectionEval.java:
    private static final List<String> ignoreLangCodes = Arrays.asList("ast", "gl");

    // languages that we offer profiles for as they are not yet supported by language-detector:
    private static final List<String> externalLangCodes = Arrays.asList("eo");
    // fall back to checking against list of common words if fasttext probability is lower than this:
    // NOTE(review): the trailing numbers look like evaluation results for each candidate value - confirm
    private static final float THRESHOLD = 0.9f; // 7.656
    //private static final float THRESHOLD = 0.95f;   // 7.39
    //private static final float THRESHOLD = 0.975f;  // 7.228 
    //private static final float THRESHOLD = 1.0f;    // 7.0

    // n-gram based detector, used whenever fasttext is not enabled:
    private final LanguageDetector languageDetector;
    // cleans the input (URLs, minority scripts, email signatures) before detection:
    private final TextObjectFactory textObjectFactory;
    // maximum number of leading characters of a text that are handed to the detectors:
    private final int maxLength;

    // set to true by enableFasttext(), reset to false if fasttext fails during detection:
    private boolean fasttextEnabled = false;
    // the external fasttext process and UTF-8 reader/writer on its output/input streams:
    private Process fasttextProcess;
    private BufferedReader fasttextIn;
    private BufferedWriter fasttextOut;

    /**
     * Creates an identifier that considers at most the first 1000 characters of a text.
     */
    public LanguageIdentifier() {
        this(1000);
    }

    /**
     * @param maxLength the maximum number of characters that will be considered - can help
     *                  with performance. Don't use values below 100, as this would decrease
     *                  accuracy.
     * @throws IllegalArgumentException if {@code maxLength} is less than 10
     * @since 4.2
     */
    public LanguageIdentifier(int maxLength) {
        if (maxLength < 10) {
            throw new IllegalArgumentException(
                    "maxLength must be >= 10 (but values > 100 are recommended): " + maxLength);
        }
        this.maxLength = maxLength;
        try {
            List<LanguageProfile> profiles = loadProfiles(getLanguageCodes());
            languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
                    .minimalConfidence(MINIMAL_CONFIDENCE).shortTextAlgorithm(SHORT_ALGO_THRESHOLD)
                    .withProfiles(profiles).build();
            textObjectFactory = new TextObjectFactoryBuilder().maxTextLength(10000)
                    .withTextFilter(UrlTextFilter.getInstance())
                    .withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3))
                    .withTextFilter(new RemoveEMailSignatureFilter()).build();
        } catch (IOException e) {
            throw new RuntimeException("Could not set up language identifier", e);
        }
    }

    /**
     * Starts an external fasttext process and switches detection over to it.
     * Does nothing if either argument is {@code null}.
     *
     * @param fasttextBinary the fasttext executable
     * @param fasttextModel the model file handed to the executable
     * @throws RuntimeException if the process cannot be started; fasttext stays disabled then
     */
    public void enableFasttext(File fasttextBinary, File fasttextModel) {
        if (fasttextBinary == null || fasttextModel == null) {
            return;
        }
        try {
            startFasttext(fasttextModel, fasttextBinary);
        } catch (IOException e) {
            fasttextEnabled = false;
            throw new RuntimeException("Could not start fasttext process for language identification @ "
                    + fasttextBinary + " with model @ " + fasttextModel, e);
        }
        logger.info("Started fasttext process for language identification: Binary " + fasttextBinary
                + " with model @ " + fasttextModel);
        fasttextEnabled = true;
    }

    /**
     * Collects the short codes of all languages whose profiles are to be loaded by the
     * profile reader. Variants, codes in {@code ignoreLangCodes} and codes in
     * {@code externalLangCodes} are left out.
     *
     * @return the language codes, in the order the languages are listed by {@code Languages.get()}
     */
    private static List<String> getLanguageCodes() {
        List<String> codes = new ArrayList<>();
        for (Language language : Languages.get()) {
            String code = language.getShortCode();
            if (language.isVariant() || ignoreLangCodes.contains(code) || externalLangCodes.contains(code)) {
                continue;
            }
            if ("zh".equals(code)) {
                // Chinese is requested as two regional profiles instead of plain "zh":
                Collections.addAll(codes, "zh-CN", "zh-TW");
            } else {
                codes.add(code);
            }
        }
        return codes;
    }

    /**
     * Reads the profiles for the given codes and appends the profiles that ship with this
     * project for the codes in {@code externalLangCodes}, as far as their resource exists.
     *
     * @param langCodes codes of the profiles to read via the profile reader
     * @return the loaded profiles
     * @throws IOException if a profile cannot be read
     */
    private List<LanguageProfile> loadProfiles(List<String> langCodes) throws IOException {
        List<LanguageProfile> loaded = new LanguageProfileReader().read(langCodes);
        for (String code : externalLangCodes) {
            String resourcePath = "/" + code + "/" + code + ".profile";
            if (!JLanguageTool.getDataBroker().resourceExists(resourcePath)) {
                // not all languages are always available
                continue;
            }
            try (InputStream in = JLanguageTool.getDataBroker().getFromResourceDirAsStream(resourcePath)) {
                LanguageProfile externalProfile = new LanguageProfileReader().read(in);
                loaded.add(externalProfile);
            }
        }
        return loaded;
    }

    /**
     * Detects the language of a text without any no-op or preferred languages.
     *
     * @param text the text to analyze
     * @return language or {@code null} if language could not be identified
     */
    @Nullable
    public Language detectLanguage(String text) {
        DetectedLanguage detected = detectLanguage(text, Collections.emptyList(), Collections.emptyList());
        return detected != null ? detected.getDetectedLanguage() : null;
    }

    /**
     * Like {@link #detectLanguage(String)}, but hands back the complete detection result
     * instead of only the language.
     *
     * @param text the text to analyze
     * @return detection result or {@code null} if language could not be identified
     */
    @Nullable
    @Experimental
    DetectedLanguage detectLanguageWithDetails(String text) {
        return detectLanguage(text, Collections.emptyList(), Collections.emptyList());
    }

    /**
     * Detects the language of a text. If fasttext has been enabled, its scores are used; when
     * its best score is below {@code THRESHOLD}, per-language common word counts are added to
     * the scores first. If fasttext is not enabled, or fails during this call (it is disabled
     * for all further calls then), the n-gram detector is used instead. Fasttext and the n-gram
     * detector only see the cleaned first {@code maxLength} characters of the text.
     *
     * @param text the text to analyze
     * @param noopLangsTmp list of codes that are detected but will lead to the NoopLanguage that has no rules
     * @param preferredLangsTmp language codes without variants; in fasttext mode, texts shorter than
     *                          {@code CONSIDER_ONLY_PREFERRED_THRESHOLD} characters are only matched
     *                          against these languages (if the list is not empty)
     * @return language or {@code null} if language could not be identified
     * @throws IllegalArgumentException if a preferred language code contains {@code -}
     * @throws NullPointerException if one of the lists is {@code null}
     * @since 4.4 (new parameter noopLangs, changed return type to DetectedLanguage)
     */
    @Nullable
    public DetectedLanguage detectLanguage(String text, List<String> noopLangsTmp, List<String> preferredLangsTmp) {
        Objects.requireNonNull(noopLangsTmp);
        Objects.requireNonNull(preferredLangsTmp);
        // Chrome sends 'nn' (Nynorsk) or 'nb' (Bokmal), but fasttext detects 'no', so we have to map, and
        // Bokmal seems to be the standard variant:
        List<String> noopLangs = noopLangsTmp.stream()
                .map(code -> code.equals("nb") ? "no" : code)
                .collect(Collectors.toList());
        List<String> preferredLangs = preferredLangsTmp.stream()
                .map(code -> code.equals("nb") ? "no" : code)
                .collect(Collectors.toList());
        for (String code : preferredLangs) {
            if (code.contains("-")) {
                throw new IllegalArgumentException(
                        "preferredLanguages may only contain language codes without variants (e.g. 'en', but not 'en-US'): "
                                + preferredLangs + ". Use 'preferredVariants' to specify variants");
            }
        }
        String truncated;
        if (text.length() > maxLength) {
            truncated = text.substring(0, maxLength);
        } else {
            truncated = text;
        }
        String filtered = textObjectFactory.forText(truncated).toString();
        // used by the browser add-on to filter HTML etc. (_ignoreText() in validator.js)
        String cleaned = filtered.replaceAll("\uFEFF+", " ");
        Map.Entry<String, Double> result = null;
        if (fasttextEnabled) {
            try {
                Map<String, Double> scores = runFasttext(cleaned, noopLangs);
                result = getHighestScoringResult(scores);
                if (result.getValue().floatValue() < THRESHOLD) {
                    Map<Language, Integer> knownWordCounts = new CommonWords().getKnownWordsPerLanguage(text);
                    for (Map.Entry<Language, Integer> wordCount : knownWordCounts.entrySet()) {
                        // adding the plain word count to the score looks arbitrary, but gave best
                        // results with evaluation (LanguageDetectionMinLengthEval):
                        scores.merge(wordCount.getKey().getShortCode(),
                                Double.valueOf(wordCount.getValue()), Double::sum);
                    }
                    result = getHighestScoringResult(scores);
                }
                boolean shortInput = text.length() < CONSIDER_ONLY_PREFERRED_THRESHOLD;
                if (shortInput && !preferredLangs.isEmpty()) {
                    scores.keySet().removeIf(code -> !preferredLangs.contains(code));
                    result = getHighestScoringResult(scores);
                }
                // Calculate a trivial confidence value because fasttext's confidence is often
                // wrong for short text (e.g. 0.99 for a test that's misclassified). Don't
                // use 1.0 because we can never be totally sure...
                double newScore = 0.99 / (30.0 / Math.min(text.length(), 30));
                result = new AbstractMap.SimpleImmutableEntry<>(result.getKey(), newScore);
            } catch (Exception e) {
                fasttextEnabled = false;
                String description = String.format("Fasttext disabled, failed on '%s': %s", text,
                        ExceptionUtils.getStackTrace(e));
                RuleLoggerMessage msg = new RuleErrorNotification(this.getClass().getSimpleName(), "-", description);
                RuleLoggerManager.getInstance().log(msg, Level.WARNING);
                fasttextProcess.destroy();
            }
        }
        // deliberately not an 'else': the flag may have been reset by the catch block above
        if (!fasttextEnabled) {
            result = detectLanguageCode(cleaned);
            if (!noopLangs.isEmpty()) {
                logger.warn("Cannot consider noopLanguages because not in fastText mode: " + noopLangs);
            }
        }
        if (result == null || result.getKey() == null || !canLanguageBeDetected(result.getKey(), noopLangs)) {
            return null;
        }
        Language language = Languages.getLanguageForShortCode(result.getKey(), noopLangs);
        return new DetectedLanguage(null, language, result.getValue().floatValue());
    }

    /**
     * Tells whether a detected code may be reported as a result.
     *
     * @param langCode the detected language code
     * @param additionalLanguageCodes codes that are accepted in addition to the supported languages
     * @return {@code true} if the code is a supported language or one of the additional codes
     */
    private boolean canLanguageBeDetected(String langCode, List<String> additionalLanguageCodes) {
        if (Languages.isLanguageSupported(langCode)) {
            return true;
        }
        return additionalLanguageCodes.contains(langCode);
    }

    /**
     * Launches fasttext in {@code predict-prob} mode reading from standard input ({@code -})
     * and connects UTF-8 reader and writer to the process.
     *
     * @param modelPath the fasttext model file
     * @param binaryPath the fasttext executable
     * @throws IOException if the process cannot be started
     */
    private void startFasttext(File modelPath, File binaryPath) throws IOException {
        List<String> command = Arrays.asList(binaryPath.getPath(), "predict-prob", modelPath.getPath(), "-",
                String.valueOf(K_HIGHEST_SCORES));
        fasttextProcess = new ProcessBuilder(command).start();
        InputStream processOutput = fasttextProcess.getInputStream();
        fasttextIn = new BufferedReader(new InputStreamReader(processOutput, StandardCharsets.UTF_8));
        OutputStream processInput = fasttextProcess.getOutputStream();
        fasttextOut = new BufferedWriter(new OutputStreamWriter(processInput, StandardCharsets.UTF_8));
    }

    /**
     * Picks the entry with the highest score. On equal scores the entry that comes first in
     * the map's iteration order is kept.
     *
     * @param probs scores by language code
     * @return the best entry; its key is {@code null} and its value {@code -1} if no score
     *         is greater than {@code -1} (e.g. for an empty map)
     */
    private Map.Entry<String, Double> getHighestScoringResult(Map<String, Double> probs) {
        Map.Entry<String, Double> best = new AbstractMap.SimpleImmutableEntry<>(null, -1.0);
        for (Map.Entry<String, Double> candidate : probs.entrySet()) {
            if (candidate.getValue() > best.getValue()) {
                best = new AbstractMap.SimpleImmutableEntry<>(candidate);
            }
        }
        return best;
    }

    /**
     * Sends one line of text to the fasttext process and parses the line it answers with.
     * The answer is expected to consist of space-separated label/probability pairs; the
     * language code is the part of the label after the last {@code __}.
     *
     * @param text the text to classify; line feeds are replaced by spaces so that it is sent as one line
     * @param additionalLanguageCodes codes that are accepted in addition to the supported languages
     * @return probabilities by language code, limited to codes accepted by {@code canLanguageBeDetected}
     * @throws IOException if communicating with the process fails or the process has closed its output
     * @throws RuntimeException if the answer does not consist of label/probability pairs
     */
    private Map<String, Double> runFasttext(String text, List<String> additionalLanguageCodes) throws IOException {
        String joined = text.replace("\n", " ");
        String buffer;
        // request and response must not interleave with those of other threads:
        synchronized (this) {
            fasttextOut.write(joined);
            fasttextOut.newLine();
            fasttextOut.flush();
            buffer = fasttextIn.readLine();
        }
        if (buffer == null) {
            // readLine() returns null at end of stream, i.e. the process is gone; fail with a
            // clear message instead of a NullPointerException in split() below
            throw new IOException("fasttext process returned no output (end of stream reached)");
        }
        String[] values = buffer.split(" ");
        if (values.length % 2 != 0) {
            throw new RuntimeException("Error while parsing fasttext output: " + buffer);
        }
        Map<String, Double> probabilities = new HashMap<>();
        for (int i = 0; i < values.length; i += 2) {
            String lang = values[i];
            String langCode = lang.substring(lang.lastIndexOf("__") + 2);
            double probValue = Double.parseDouble(values[i + 1]);
            if (canLanguageBeDetected(langCode, additionalLanguageCodes)) {
                probabilities.put(langCode, probValue);
            }
        }
        return probabilities;
    }

    /**
     * Runs the n-gram detector and takes the first entry of the list it reports.
     *
     * @param text the text to analyze
     * @return language code with its probability, or {@code null} if language could not be identified
     */
    @Nullable
    private Map.Entry<String, Double> detectLanguageCode(String text) {
        List<com.optimaize.langdetect.DetectedLanguage> candidates = languageDetector.getProbabilities(text);
        if (candidates.isEmpty()) {
            return null;
        }
        com.optimaize.langdetect.DetectedLanguage first = candidates.get(0);
        String code = first.getLocale().getLanguage();
        double probability = first.getProbability();
        return new AbstractMap.SimpleImmutableEntry<>(code, probability);
    }

    /**
     * Text filter that cuts off an email signature, i.e. the first {@code \n-- \n}
     * delimiter and everything that follows it.
     */
    class RemoveEMailSignatureFilter implements TextFilter {
        @Override
        public String filter(CharSequence text) {
            java.util.regex.Matcher signatureMatcher = SIGNATURE.matcher(text.toString());
            return signatureMatcher.replaceFirst("");
        }
    }
}