org.languagetool.dev.dumpcheck.SentenceSourceChecker.java Source code

Introduction

Here is the source code for org.languagetool.dev.dumpcheck.SentenceSourceChecker.java
Source

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2013 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.dev.dumpcheck;

import org.apache.commons.cli.*;
import org.apache.commons.lang3.StringUtils;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.MultiThreadedJLanguageTool;
import org.languagetool.rules.CategoryId;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.*;
import java.util.regex.Pattern;

/**
 * Checks texts from one or more {@link SentenceSource}s.
 *
 * <p>Command-line tool: it parses its options, configures a
 * {@link MultiThreadedJLanguageTool} for the requested language, checks every
 * sentence delivered by the given input files and passes the rule matches to a
 * {@link ResultHandler} (a {@link DatabaseHandler} if {@code --db-properties}
 * is given, a {@link StdoutHandler} otherwise).
 * @since 2.4
 */
public class SentenceSourceChecker {

    private SentenceSourceChecker() {
        // no public constructor
    }

    /**
     * Command-line entry point. Prints usage and exits with status 1 if the
     * arguments cannot be parsed.
     *
     * @param args command-line arguments, see {@link #ensureCorrectUsageOrExit(String[])}
     * @throws IOException if a given properties file does not exist, is a directory,
     *                     or cannot be read, or if checking fails with an I/O error
     * @throws IllegalArgumentException if a numeric option is not a valid integer
     */
    public static void main(String[] args) throws IOException {
        SentenceSourceChecker prg = new SentenceSourceChecker();
        CommandLine commandLine = ensureCorrectUsageOrExit(args);
        File propFile = commandLine.hasOption('d') ? getExistingFile(commandLine.getOptionValue('d')) : null;
        String languageCode = commandLine.getOptionValue('l');
        Set<String> disabledRuleIds = new HashSet<>();
        if (commandLine.hasOption("rule-properties")) {
            File disabledRulesPropFile = getExistingFile(commandLine.getOptionValue("rule-properties"));
            Properties disabledRules = new Properties();
            try (FileInputStream stream = new FileInputStream(disabledRulesPropFile)) {
                disabledRules.load(stream);
            }
            // rules listed under "all" apply to every language, the others only to their language:
            addDisabledRules("all", disabledRuleIds, disabledRules);
            addDisabledRules(languageCode, disabledRuleIds, disabledRules);
        }
        int maxSentences = getIntOption(commandLine, "max-sentences");
        int maxErrors = getIntOption(commandLine, "max-errors");
        String[] ruleIds = commandLine.hasOption('r') ? splitIds(commandLine.getOptionValue('r')) : null;
        String[] categoryIds = commandLine.hasOption("also-enable-categories")
                ? splitIds(commandLine.getOptionValue("also-enable-categories"))
                : null;
        String[] fileNames = commandLine.getOptionValues('f');
        File languageModelDir = getOptionalFile(commandLine, "languagemodel");
        File word2vecModelDir = getOptionalFile(commandLine, "word2vecmodel");
        File neuralNetworkModelDir = getOptionalFile(commandLine, "neuralnetworkmodel");
        Pattern filter = commandLine.hasOption("filter") ? Pattern.compile(commandLine.getOptionValue("filter"))
                : null;
        prg.run(propFile, disabledRuleIds, languageCode, Arrays.asList(fileNames), ruleIds, categoryIds,
                maxSentences, maxErrors, languageModelDir, word2vecModelDir, neuralNetworkModelDir, filter);
    }

    /**
     * Returns the file at {@code path}, making sure it exists and is not a directory.
     *
     * @throws IOException if the path does not exist or denotes a directory
     */
    private static File getExistingFile(String path) throws IOException {
        File file = new File(path);
        if (!file.exists() || file.isDirectory()) {
            throw new IOException("File not found or isn't a file: " + file.getAbsolutePath());
        }
        return file;
    }

    /**
     * Returns the value of the given long option as a {@link File}, or {@code null}
     * if the option was not specified.
     */
    private static File getOptionalFile(CommandLine commandLine, String optionName) {
        return commandLine.hasOption(optionName) ? new File(commandLine.getOptionValue(optionName)) : null;
    }

    /**
     * Returns the integer value of the given long option, or 0 if the option was not specified.
     *
     * @throws IllegalArgumentException if the value is not a valid integer
     */
    private static int getIntOption(CommandLine commandLine, String optionName) {
        String value = commandLine.getOptionValue(optionName, "0");
        try {
            return Integer.parseInt(value.trim());
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
                    "Value of option --" + optionName + " must be an integer, but was '" + value + "'", e);
        }
    }

    /**
     * Splits a comma-separated list into its ids, removing surrounding whitespace
     * and skipping empty entries, so that e.g. {@code "A, B,"} yields {@code A} and {@code B}.
     */
    private static String[] splitIds(String commaSeparatedIds) {
        List<String> ids = new ArrayList<>();
        for (String part : commaSeparatedIds.split(",")) {
            String id = part.trim();
            if (!id.isEmpty()) {
                ids.add(id);
            }
        }
        return ids.toArray(new String[0]);
    }

    /**
     * Adds the rule ids listed in {@code disabledRules} under the key {@code languageCode}
     * (a comma-separated list) to {@code disabledRuleIds}. Does nothing if there is no such key.
     */
    private static void addDisabledRules(String languageCode, Set<String> disabledRuleIds,
            Properties disabledRules) {
        String disabledRulesString = disabledRules.getProperty(languageCode);
        if (disabledRulesString != null) {
            disabledRuleIds.addAll(Arrays.asList(splitIds(disabledRulesString)));
        }
    }

    /**
     * Parses the command line. On a parse error, prints the error and the usage
     * help and terminates the JVM with exit status 1.
     *
     * @param args the raw command-line arguments
     * @return the parsed command line
     */
    private static CommandLine ensureCorrectUsageOrExit(String[] args) {
        Options options = new Options();
        options.addOption(Option.builder("l").longOpt("language").argName("code").hasArg()
                .desc("language code like 'en' or 'de'").required().build());
        options.addOption(Option.builder("d").longOpt("db-properties").argName("file").hasArg()
                .desc("A file to set database access properties. If not set, the output will be written to STDOUT. "
                        + "The file needs to set the properties dbUrl ('jdbc:...'), dbUser, and dbPassword. "
                        + "It can optionally define the batchSize for insert statements, which defaults to 1.")
                .build());
        options.addOption(Option.builder().longOpt("rule-properties").argName("file").hasArg().desc(
                "A file to set rules which should be disabled per language (e.g. en=RULE1,RULE2 or all=RULE3,RULE4)")
                .build());
        options.addOption(Option.builder("r").longOpt("rule-ids").argName("id").hasArg()
                .desc("comma-separated list of rule-ids to activate").build());
        options.addOption(Option.builder().longOpt("also-enable-categories").argName("categories").hasArg()
                .desc("comma-separated list of categories to activate, additionally to rules activated anyway")
                .build());
        options.addOption(Option.builder("f").longOpt("file").argName("file").hasArg().desc(
                "an unpacked Wikipedia XML dump; (must be named *.xml, dumps are available from http://dumps.wikimedia.org/backup-index.html) "
                        + "or a Tatoeba CSV file filtered to contain only one language (must be named tatoeba-*). You can specify this option more than once.")
                .required().build());
        options.addOption(Option.builder().longOpt("max-sentences").argName("number").hasArg()
                .desc("maximum number of sentences to check").build());
        options.addOption(Option.builder().longOpt("max-errors").argName("number").hasArg()
                .desc("maximum number of errors, stop when finding more").build());
        options.addOption(Option.builder().longOpt("languagemodel").argName("indexDir").hasArg()
                .desc("directory with a '3grams' sub directory that contains an ngram index").build());
        // main() reads this option, so it must be registered here - otherwise the parser rejects it:
        options.addOption(Option.builder().longOpt("word2vecmodel").argName("baseDir").hasArg()
                .desc("base directory for word2vec models").build());
        options.addOption(Option.builder().longOpt("neuralnetworkmodel").argName("baseDir").hasArg()
                .desc("base directory for saved neural network models").build());
        options.addOption(Option.builder().longOpt("filter").argName("regex").hasArg()
                .desc("Consider only sentences that contain this regular expression (for speed up)").build());
        try {
            CommandLineParser parser = new DefaultParser();
            return parser.parse(options, args);
        } catch (ParseException e) {
            System.err.println("Error: " + e.getMessage());
            HelpFormatter formatter = new HelpFormatter();
            formatter.setWidth(80);
            formatter.setSyntaxPrefix("Usage: ");
            formatter.printHelp(
                    SentenceSourceChecker.class.getSimpleName() + " [OPTION]... --file <file> --language <code>",
                    options);
            System.exit(1);
        }
        // only needed to satisfy the compiler, System.exit() does not return:
        throw new IllegalStateException("Unreachable: System.exit(1) should have terminated the JVM");
    }

    /**
     * Configures LanguageTool and checks all sentences from the given files.
     *
     * @param propFile database properties file, or {@code null} to write results to STDOUT
     * @param disabledRules ids of rules to disable; only applied if {@code ruleIds} is {@code null}
     * @param langCode short code of the language to check
     * @param fileNames input files
     * @param ruleIds if not {@code null}, only these rules are enabled
     * @param additionalCategoryIds categories whose rules get enabled additionally, may be {@code null}
     * @param maxSentences sentence limit passed to the result handler, values &lt;= 0 are reported as "no limit"
     * @param maxErrors error limit passed to the result handler, values &lt;= 0 are reported as "no limit"
     * @param languageModelDir ngram language model directory, or {@code null}
     * @param word2vecModelDir word2vec model directory, or {@code null}
     * @param neuralNetworkModelDir neural network model directory, or {@code null}
     * @param filter if not {@code null}, passed to the sentence source to restrict the checked sentences
     * @throws IOException declared for I/O errors raised by the LanguageTool setup or the sentence source
     */
    private void run(File propFile, Set<String> disabledRules, String langCode, List<String> fileNames,
            String[] ruleIds, String[] additionalCategoryIds, int maxSentences, int maxErrors,
            File languageModelDir, File word2vecModelDir, File neuralNetworkModelDir, Pattern filter)
            throws IOException {
        Language lang = Languages.getLanguageForShortCode(langCode);
        MultiThreadedJLanguageTool languageTool = new MultiThreadedJLanguageTool(lang);
        languageTool.setCleanOverlappingMatches(false);
        if (languageModelDir != null) {
            languageTool.activateLanguageModelRules(languageModelDir);
        }
        if (word2vecModelDir != null) {
            languageTool.activateWord2VecModelRules(word2vecModelDir);
        }
        if (neuralNetworkModelDir != null) {
            languageTool.activateNeuralNetworkRules(neuralNetworkModelDir);
        }
        if (ruleIds != null) {
            enableOnlySpecifiedRules(ruleIds, languageTool);
        } else {
            applyRuleDeactivation(languageTool, disabledRules);
        }
        if (filter != null) {
            System.out.println(
                    "*** NOTE: only sentences that match regular expression '" + filter + "' will be checked");
        }
        activateAdditionalCategories(additionalCategoryIds, languageTool);
        disableSpellingRules(languageTool);
        System.out.println("Working on: " + StringUtils.join(fileNames, ", "));
        System.out.println("Sentence limit: " + (maxSentences > 0 ? maxSentences : "no limit"));
        System.out.println("Error limit: " + (maxErrors > 0 ? maxErrors : "no limit"));
        //System.out.println("Version: " + JLanguageTool.VERSION + " (" + JLanguageTool.BUILD_DATE + ")");

        ResultHandler resultHandler = null;
        int ruleMatchCount = 0;
        int sentenceCount = 0;
        try {
            if (propFile != null) {
                resultHandler = new DatabaseHandler(propFile, maxSentences, maxErrors);
            } else {
                //resultHandler = new CompactStdoutHandler(maxSentences, maxErrors);
                resultHandler = new StdoutHandler(maxSentences, maxErrors);
            }
            MixingSentenceSource mixingSource = MixingSentenceSource.create(fileNames, lang, filter);
            while (mixingSource.hasNext()) {
                Sentence sentence = mixingSource.next();
                try {
                    List<RuleMatch> matches = languageTool.check(sentence.getText());
                    resultHandler.handleResult(sentence, matches, lang);
                    sentenceCount++;
                    if (sentenceCount % 5000 == 0) {
                        System.err.printf("%s sentences checked...\n",
                                NumberFormat.getNumberInstance(Locale.US).format(sentenceCount));
                    }
                    ruleMatchCount += matches.size();
                } catch (DocumentLimitReachedException | ErrorLimitReachedException e) {
                    // limits are a regular way to stop, handled by the outer catch:
                    throw e;
                } catch (Exception e) {
                    throw new RuntimeException(
                            "Check failed on sentence: " + StringUtils.abbreviate(sentence.getText(), 250), e);
                }
            }
        } catch (DocumentLimitReachedException | ErrorLimitReachedException e) {
            System.out.println(getClass().getSimpleName() + ": " + e);
        } finally {
            languageTool.shutdown();
            if (resultHandler != null) {
                // avoid printing 'NaN' if not a single sentence was checked:
                float matchesPerSentence = sentenceCount > 0 ? (float) ruleMatchCount / sentenceCount : 0.0f;
                // the language is passed as an argument so it cannot be misread as a format specifier:
                System.out.printf("%s: %d total matches\n", lang, ruleMatchCount);
                System.out.printf("%s: %.2f rule matches per sentence\n", lang, matchesPerSentence);
                try {
                    resultHandler.close();
                } catch (Exception e) {
                    // best effort: a failure to close must not hide an exception from the check itself
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Disables all rules, then enables exactly the given ones. Prints a warning
     * for each given id that does not belong to any known rule.
     */
    private void enableOnlySpecifiedRules(String[] ruleIds, JLanguageTool languageTool) {
        for (Rule rule : languageTool.getAllRules()) {
            languageTool.disableRule(rule.getId());
        }
        for (String ruleId : ruleIds) {
            languageTool.enableRule(ruleId);
        }
        warnOnNonExistingRuleIds(ruleIds, languageTool);
        System.out.println("Only these rules are enabled: " + Arrays.toString(ruleIds));
    }

    /**
     * Prints a warning to STDOUT for every given rule id that is not the id of
     * any rule returned by {@link JLanguageTool#getAllRules()}.
     */
    private void warnOnNonExistingRuleIds(String[] ruleIds, JLanguageTool languageTool) {
        // collect the known ids once instead of scanning all rules for every id:
        Set<String> knownRuleIds = new HashSet<>();
        for (Rule rule : languageTool.getAllRules()) {
            knownRuleIds.add(rule.getId());
        }
        for (String ruleId : ruleIds) {
            if (!knownRuleIds.contains(ruleId)) {
                System.out.println("WARNING: Could not find rule '" + ruleId + "'");
            }
        }
    }

    /**
     * Disables the given rules and prints the resulting set of disabled rules.
     */
    private void applyRuleDeactivation(JLanguageTool languageTool, Set<String> disabledRules) {
        // disabled via config file, usually to avoid too many false alarms:
        for (String disabledRuleId : disabledRules) {
            languageTool.disableRule(disabledRuleId);
        }
        System.out.println("These rules are disabled: " + languageTool.getDisabledRules());
    }

    /**
     * Enables every rule whose category id equals one of the given category ids.
     * Does nothing if {@code additionalCategoryIds} is {@code null}.
     */
    private void activateAdditionalCategories(String[] additionalCategoryIds, JLanguageTool languageTool) {
        if (additionalCategoryIds == null) {
            return;
        }
        List<Rule> allRules = languageTool.getAllRules();
        for (String categoryId : additionalCategoryIds) {
            for (Rule rule : allRules) {
                CategoryId id = rule.getCategory().getId();
                if (id != null && id.toString().equals(categoryId)) {
                    System.out.println("Activating " + rule.getId() + " in category " + categoryId);
                    languageTool.enableRule(rule.getId());
                }
            }
        }
    }

    /**
     * Disables all currently active rules that report themselves as
     * dictionary-based spelling rules.
     */
    private void disableSpellingRules(JLanguageTool languageTool) {
        List<Rule> allActiveRules = languageTool.getAllActiveRules();
        for (Rule rule : allActiveRules) {
            if (rule.isDictionaryBasedSpellingRule()) {
                languageTool.disableRule(rule.getId());
            }
        }
        System.out.println("All spelling rules are disabled");
    }

}