org.languagetool.dev.wikipedia.CheckWikipediaDump.java Source code

Java tutorial

Introduction

Here is the source code for org.languagetool.dev.wikipedia.CheckWikipediaDump.java

Source

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.dev.wikipedia;

import org.apache.commons.cli.*;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.MultiThreadedJLanguageTool;
import org.languagetool.dev.dumpcheck.ArticleLimitReachedException;
import org.languagetool.dev.dumpcheck.ErrorLimitReachedException;
import org.languagetool.rules.Rule;
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.TimeUnit;

/**
 * Command-line tool that checks the articles of an unpacked Wikipedia XML dump
 * (download "pages-articles.xml.bz2" from http://download.wikimedia.org/backup-index.html
 * and unpack it). The dump date is taken from the file name, so the file must be named
 * like {@code ??wiki-yyyymmdd-pages-articles.xml}, e.g. {@code dewiki-20130115-pages-articles.xml}.
 * <p>
 * The result is stored in a database if a database properties file is given
 * ({@code --db-properties}), otherwise it is written to STDOUT.
 *
 * @author Daniel Naber
 * @deprecated use {@link org.languagetool.dev.dumpcheck.SentenceSourceChecker} instead (deprecated since 2.4)
 */
@Deprecated
public class CheckWikipediaDump {

    private CheckWikipediaDump() {
        // no public constructor
    }

    /**
     * Parses the command line, collects the rule configuration and runs the check.
     *
     * @param args command-line arguments, see {@link #ensureCorrectUsageOrExit(String[])}
     * @throws IOException if one of the given files is missing or the dump file name is malformed
     * @throws SAXException if the XML dump cannot be parsed
     * @throws ParserConfigurationException if no SAX parser can be created
     * @throws IllegalArgumentException if {@code --max-articles} or {@code --max-errors} is not a number
     */
    public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException {
        System.err.println("*** Note: this class has been deprecated - please use option 'check-data' instead");
        final CheckWikipediaDump prg = new CheckWikipediaDump();
        final CommandLine commandLine = ensureCorrectUsageOrExit(args);
        File propFile = null;
        if (commandLine.hasOption('d')) {
            propFile = requireExistingFile(commandLine.getOptionValue('d'));
        }
        final String languageCode = commandLine.getOptionValue('l');
        final Set<String> disabledRuleIds = new HashSet<>();
        if (commandLine.hasOption("rule-properties")) {
            final File disabledRulesPropFile = requireExistingFile(commandLine.getOptionValue("rule-properties"));
            final Properties disabledRules = new Properties();
            try (FileInputStream stream = new FileInputStream(disabledRulesPropFile)) {
                disabledRules.load(stream);
                // rules listed under "all" apply to every language, the others only to the given one:
                addDisabledRules("all", disabledRuleIds, disabledRules);
                addDisabledRules(languageCode, disabledRuleIds, disabledRules);
            }
        }
        final int maxArticles = parseIntOption(commandLine, "max-articles");
        final int maxErrors = parseIntOption(commandLine, "max-errors");
        String[] ruleIds = null;
        if (commandLine.hasOption('r')) {
            final List<String> ids = splitIds(commandLine.getOptionValue('r'));
            ruleIds = ids.toArray(new String[ids.size()]);
        }
        prg.run(propFile, disabledRuleIds, languageCode, commandLine.getOptionValue('f'), ruleIds, maxArticles,
                maxErrors);
    }

    /**
     * Returns the file for the given path after checking that it exists and is not a directory.
     *
     * @throws IOException if the path does not exist or denotes a directory
     */
    private static File requireExistingFile(String path) throws IOException {
        final File file = new File(path);
        if (!file.exists() || file.isDirectory()) {
            throw new IOException("File not found or isn't a file: " + file.getAbsolutePath());
        }
        return file;
    }

    /**
     * Reads the numeric value of a long option, using 0 if the option is not set.
     *
     * @throws IllegalArgumentException if the value is not a valid integer; the message names the option
     */
    private static int parseIntOption(CommandLine commandLine, String longOpt) {
        final String value = commandLine.getOptionValue(longOpt, "0");
        try {
            return Integer.parseInt(value.trim());
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("Invalid number for --" + longOpt + ": '" + value + "'", e);
        }
    }

    /**
     * Splits a comma-separated list of rule ids. Whitespace around ids is removed and
     * empty entries are dropped, so that {@code "A, B,"} yields {@code [A, B]}.
     */
    private static List<String> splitIds(String commaSeparatedIds) {
        final List<String> ids = new ArrayList<>();
        for (String part : commaSeparatedIds.split(",")) {
            final String id = part.trim();
            if (!id.isEmpty()) {
                ids.add(id);
            }
        }
        return ids;
    }

    /**
     * Adds the rule ids configured under the key {@code languageCode} to {@code disabledRuleIds}.
     * Does nothing if the properties contain no such key.
     *
     * @param languageCode the property key, i.e. a language code or "all"
     * @param disabledRuleIds the set the ids are added to
     * @param disabledRules properties mapping a key to a comma-separated list of rule ids
     */
    private static void addDisabledRules(String languageCode, Set<String> disabledRuleIds,
            Properties disabledRules) {
        final String disabledRulesString = disabledRules.getProperty(languageCode);
        if (disabledRulesString != null) {
            disabledRuleIds.addAll(splitIds(disabledRulesString));
        }
    }

    /**
     * Parses the command line. If it is invalid, an error message and the usage
     * are printed and the JVM is terminated with exit code 1.
     *
     * @param args the command-line arguments
     * @return the parsed command line
     */
    @SuppressWarnings("AccessStaticViaInstance")
    private static CommandLine ensureCorrectUsageOrExit(String[] args) {
        Options options = new Options();
        options.addOption(OptionBuilder.withLongOpt("language").withArgName("code").hasArg()
                .withDescription("language code like 'en' or 'de'").isRequired().create("l"));
        options.addOption(OptionBuilder.withLongOpt("db-properties").withArgName("file").hasArg().withDescription(
                "A file to set database access properties. If not set, the output will be written to STDOUT. "
                        + "The file needs to set dbDriver (fully qualified driver class), dbUrl ('jdbc:...'), dbUser, and dbPassword.")
                .create("d"));
        options.addOption(OptionBuilder.withLongOpt("rule-properties").withArgName("file").hasArg().withDescription(
                "A file to set rules which should be disabled per language (e.g. en=RULE1,RULE2 or all=RULE3,RULE4)")
                .create());
        options.addOption(OptionBuilder.withLongOpt("rule-ids").withArgName("id").hasArg()
                .withDescription("comma-separated list of rule-ids to activate").create("r"));
        options.addOption(OptionBuilder.withLongOpt("file").withArgName("xmlfile").hasArg().withDescription(
                "an unpacked Wikipedia XML dump; dumps are available from http://dumps.wikimedia.org/backup-index.html")
                .isRequired().create("f"));
        options.addOption(OptionBuilder.withLongOpt("max-articles").withArgName("number").hasArg()
                .withDescription("maximum number of articles to check").create());
        options.addOption(OptionBuilder.withLongOpt("max-errors").withArgName("number").hasArg()
                .withDescription("maximum number of errors, stop when finding more").create());
        try {
            CommandLineParser parser = new GnuParser();
            return parser.parse(options, args);
        } catch (org.apache.commons.cli.ParseException e) {
            System.err.println("Error: " + e.getMessage());
            HelpFormatter formatter = new HelpFormatter();
            formatter.setWidth(80);
            formatter.setSyntaxPrefix("Usage: ");
            formatter.printHelp(
                    CheckWikipediaDump.class.getSimpleName() + " [OPTION]... --file <xmlfile> --language <code>",
                    options);
            System.exit(1);
        }
        // only needed to satisfy the compiler, System.exit() above does not return:
        return null;
    }

    /**
     * Checks the given dump file and prints a summary of the matches found.
     *
     * @param propFile database properties file, or {@code null} to write the results to STDOUT
     * @param disabledRules ids of rules to disable; only used if {@code ruleIds} is {@code null}
     * @param langCode code of the language to check, like 'en' or 'de'
     * @param xmlFileName path of the unpacked Wikipedia XML dump
     * @param ruleIds ids of the only rules to enable, or {@code null} to use the default rules
     * @param maxArticles maximum number of articles to check, values &lt;= 0 are reported as "no limit"
     * @param maxErrors maximum number of errors, values &lt;= 0 are reported as "no limit"
     * @throws IOException if the dump file is missing or its name does not contain a valid date
     */
    private void run(File propFile, Set<String> disabledRules, String langCode, String xmlFileName,
            String[] ruleIds, int maxArticles, int maxErrors)
            throws IOException, SAXException, ParserConfigurationException {
        //final long startTime = System.currentTimeMillis();
        final File file = new File(xmlFileName);
        if (!file.exists() || !file.isFile()) {
            throw new IOException("File doesn't exist or isn't a file: " + xmlFileName);
        }
        final Language lang = Language.getLanguageForShortName(langCode);
        final JLanguageTool languageTool = new MultiThreadedJLanguageTool(lang);
        languageTool.activateDefaultPatternRules();
        if (ruleIds != null) {
            enableSpecifiedRules(ruleIds, languageTool);
        } else {
            applyRuleDeactivation(languageTool, disabledRules);
        }
        // spelling rules are always switched off, even if they were requested via ruleIds:
        disableSpellingRules(languageTool);
        final Date dumpDate = getDumpFileDate(file);
        System.out.println("Dump date: " + dumpDate + ", language: " + langCode);
        System.out.println("Article limit: " + (maxArticles > 0 ? maxArticles : "no limit"));
        System.out.println("Error limit: " + (maxErrors > 0 ? maxErrors : "no limit"));
        BaseWikipediaDumpHandler xmlHandler = null;
        try {
            if (propFile != null) {
                xmlHandler = new DatabaseDumpHandler(languageTool, dumpDate, langCode, propFile, lang);
            } else {
                xmlHandler = new OutputDumpHandler(languageTool, dumpDate, langCode, lang);
            }
            xmlHandler.setMaximumArticles(maxArticles);
            xmlHandler.setMaximumErrors(maxErrors);
            final SAXParserFactory factory = SAXParserFactory.newInstance();
            final SAXParser saxParser = factory.newSAXParser();
            saxParser.parse(file, xmlHandler);
        } catch (ErrorLimitReachedException | ArticleLimitReachedException e) {
            // reaching a limit is a regular way to stop, so just report it:
            System.out.println(e);
        } finally {
            if (xmlHandler != null) {
                try {
                    printStatistics(lang, xmlHandler);
                    //System.out.printf(lang + ": %s total runtime\n", getRunTime(startTime));
                } finally {
                    // close the handler even if printing the statistics fails:
                    xmlHandler.close();
                }
            }
        }
    }

    /**
     * Prints the total number of rule matches and the average number of matches per article.
     * The average is reported as 0 if no article has been counted, avoiding a NaN/Infinity output.
     */
    private void printStatistics(Language lang, BaseWikipediaDumpHandler xmlHandler) {
        final float matchesPerDoc = xmlHandler.getArticleCount() > 0
                ? (float) xmlHandler.getRuleMatchCount() / xmlHandler.getArticleCount()
                : 0f;
        // the language is passed as an argument so that it cannot be misread as a format specifier:
        System.out.printf("%s: %d total matches\n", lang, xmlHandler.getRuleMatchCount());
        System.out.printf("%s: %.2f rule matches per document\n", lang, matchesPerDoc);
    }

    /**
     * Disables all rules and then enables the ones with the given ids. A warning
     * is printed for every id that does not belong to any rule.
     *
     * @param ruleIds ids of the rules to enable
     * @param languageTool the instance to configure
     */
    private void enableSpecifiedRules(String[] ruleIds, JLanguageTool languageTool) {
        for (Rule rule : languageTool.getAllRules()) {
            languageTool.disableRule(rule.getId());
        }
        for (String ruleId : ruleIds) {
            languageTool.enableRule(ruleId);
        }
        // NOTE(review): presumably enableDefaultOffRule() only lifts the "off by default" flag
        // while the rules disabled above stay disabled, so that requested rules which are off
        // by default can run - confirm against JLanguageTool before changing the order here.
        for (Rule rule : languageTool.getAllRules()) {
            if (rule.isDefaultOff()) {
                languageTool.enableDefaultOffRule(rule.getId());
            }
        }
        // collect the known ids once instead of scanning all rules for every requested id:
        final Set<String> knownRuleIds = new HashSet<>();
        for (Rule rule : languageTool.getAllRules()) {
            knownRuleIds.add(rule.getId());
        }
        for (String ruleId : ruleIds) {
            if (!knownRuleIds.contains(ruleId)) {
                System.out.println("WARNING: Could not find rule '" + ruleId + "'");
            }
        }
        System.out.println("Only these rules are enabled: " + Arrays.toString(ruleIds));
    }

    /**
     * Disables the rules with the given ids and prints the resulting list of disabled rules.
     *
     * @param languageTool the instance to configure
     * @param disabledRules ids of the rules to disable
     */
    private void applyRuleDeactivation(JLanguageTool languageTool, Set<String> disabledRules) {
        // disabled via config file, usually to avoid too many false alarms:
        for (String disabledRuleId : disabledRules) {
            languageTool.disableRule(disabledRuleId);
        }
        System.out.println("These rules are disabled: " + languageTool.getDisabledRules());
    }

    /**
     * Disables every currently active rule that reports itself as a dictionary-based spelling rule.
     *
     * @param languageTool the instance to configure
     */
    private void disableSpellingRules(JLanguageTool languageTool) {
        final List<Rule> allActiveRules = languageTool.getAllActiveRules();
        for (Rule rule : allActiveRules) {
            if (rule.isDictionaryBasedSpellingRule()) {
                languageTool.disableRule(rule.getId());
            }
        }
        System.out.println("All spelling rules are disabled");
    }

    /**
     * Extracts the dump date from the name of the dump file, i.e. from the second
     * of its parts separated by "-".
     *
     * @param file the dump file, named like {@code ??wiki-yyyymmdd-pages-articles.xml}
     * @return the date encoded in the file name
     * @throws IOException if the name has fewer than three "-"-separated parts or
     *         its second part is not a valid date in yyyymmdd format
     */
    private Date getDumpFileDate(File file) throws IOException {
        final String filename = file.getName();
        final String[] parts = filename.split("-");
        if (parts.length < 3) {
            throw new IOException("Unexpected filename format: " + file.getName()
                    + ", must be like ??wiki-????????-pages-articles.xml");
        }
        final SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
        // reject impossible dates like 20139999 instead of silently rolling them over:
        sdf.setLenient(false);
        try {
            return sdf.parse(parts[1]);
        } catch (ParseException e) {
            throw new IOException("Unexpected date format '" + parts[1] + "', must be yyyymmdd", e);
        }
    }

    /**
     * Formats the time elapsed since {@code startTime} as "minutes:seconds", both
     * with at least two digits. Only referenced by the commented-out runtime report in
     * {@code run} at the moment.
     *
     * @param startTime start time in milliseconds, as returned by {@link System#currentTimeMillis()}
     */
    private String getRunTime(long startTime) {
        final long runtime = System.currentTimeMillis() - startTime;
        final long minutes = TimeUnit.MILLISECONDS.toMinutes(runtime);
        // seconds that remain after subtracting the full minutes:
        final long seconds = TimeUnit.MILLISECONDS.toSeconds(runtime) - TimeUnit.MINUTES.toSeconds(minutes);
        return String.format("%02d:%02d", minutes, seconds);
    }

}