org.languagetool.dev.bigdata.AutomaticProhibitedCompoundRuleEvaluator.java Source code

Introduction

Here is the source code for org.languagetool.dev.bigdata.AutomaticProhibitedCompoundRuleEvaluator.java
Source

/*
 *  LanguageTool, a natural language style checker
 *  * Copyright (C) 2018 Fabian Richter
 *  *
 *  * This library is free software; you can redistribute it and/or
 *  * modify it under the terms of the GNU Lesser General Public
 *  * License as published by the Free Software Foundation; either
 *  * version 2.1 of the License, or (at your option) any later version.
 *  *
 *  * This library is distributed in the hope that it will be useful,
 *  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  * Lesser General Public License for more details.
 *  *
 *  * You should have received a copy of the GNU Lesser General Public
 *  * License along with this library; if not, write to the Free Software
 *  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 *  * USA
 *
 */
package org.languagetool.dev.bigdata;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.store.FSDirectory;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.languagemodel.LuceneLanguageModel;
import org.languagetool.rules.ConfusionPair;
import org.languagetool.rules.ConfusionSetLoader;

import java.io.*;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Automatically runs {@link ProhibitedCompoundRuleEvaluator} on candidate word pairs.
 * <p>
 * For every pair read from the candidates file, example sentences are looked up in a
 * Lucene sentence index, written to a temporary file, and handed to the evaluator.
 * Results that reach {@link #MIN_PRECISION} and {@link #MIN_RECALL} are printed to stdout.
 * Pairs that are already part of the language's {@code confusion_sets.txt} are skipped.
 * @since 3.2
 */
@SuppressWarnings({ "resource", "CallToPrintStackTrace" })
class AutomaticProhibitedCompoundRuleEvaluator {

    private static final String LANGUAGE = "de";
    /** Upper bound for the number of index hits fetched per word and passed to the evaluator. */
    private static final int MAX_EXAMPLES = 1000;
    /** A word needs strictly more than this many example sentences, otherwise the pair is skipped. */
    private static final int MIN_EXAMPLES = 50;
    // Only one factor is evaluated for now; the original list of candidates was
    // 100, 1_000, 10_000, 100_000, 1_000_000 and 10_000_000 in addition to 10.
    private static final List<Long> EVAL_FACTORS = Arrays.asList(10L);
    private static final float MIN_PRECISION = 0.95f;
    private static final float MIN_RECALL = 0.1f;
    private static final String LUCENE_CONTENT_FIELD = "fieldLowercase";
    private static final String TEMP_FILE_NAME = "example-sentences.txt";

    private final IndexSearcher searcher;
    private final Map<String, List<ConfusionPair>> knownSets;
    /** Pairs (as {@code "a/b"}) that were evaluated successfully during this run. */
    private final Set<String> finishedPairs = new HashSet<>();

    /** Number of pairs skipped because they are already in the active confusion sets. */
    private int ignored = 0;

    /**
     * Opens the sentence index and loads the known confusion pairs for {@link #LANGUAGE}.
     *
     * @param luceneIndexDir directory of the Lucene index that contains the example sentences
     * @throws IOException if the index or the confusion set resource cannot be read
     */
    AutomaticProhibitedCompoundRuleEvaluator(File luceneIndexDir) throws IOException {
        DirectoryReader reader = DirectoryReader.open(FSDirectory.open(luceneIndexDir.toPath()));
        searcher = new IndexSearcher(reader);
        // the stream is only needed while loading, so release it right afterwards
        try (InputStream confusionSetStream = JLanguageTool.getDataBroker()
                .getFromResourceDirAsStream("/" + LANGUAGE + "/confusion_sets.txt")) {
            knownSets = new ConfusionSetLoader().loadConfusionPairs(confusionSetStream);
        }
    }

    /**
     * Evaluates every candidate line. Lines containing {@code #} are skipped; every other line
     * must consist of exactly two semicolon-separated words.
     *
     * @param lines    the candidate lines, one pair per line
     * @param indexDir directory passed to {@link LuceneLanguageModel}
     * @throws IOException if a line is malformed or reading/writing example sentences fails
     */
    private void run(List<String> lines, File indexDir) throws IOException {
        Language language = Languages.getLanguageForShortCode(LANGUAGE);
        LanguageModel lm = new LuceneLanguageModel(indexDir);
        ProhibitedCompoundRuleEvaluator evaluator = new ProhibitedCompoundRuleEvaluator(language, lm);
        int totalLines = lines.size();
        int lineCount = 0;
        for (String line : lines) {
            lineCount++;
            if (line.contains("#")) {
                System.out.println("Ignoring: " + line);
                continue;
            }
            String[] parts = line.split(";\\s*");
            if (parts.length != 2) {
                throw new IOException("Expected semicolon-separated input: " + line);
            }
            try {
                // exactly two parts are guaranteed here, so there is a single pair to evaluate
                runOnPair(evaluator, line, lineCount, totalLines, removeComment(parts[0]),
                        removeComment(parts[1]));
            } catch (RuntimeException e) {
                // best effort: a failing pair must not abort the remaining lines
                e.printStackTrace();
            }
        }
        System.out.println("Done. Ignored items because they are already known: " + ignored);
    }

    /** Strips everything from the first {@code |} onwards and trims the remainder. */
    private String removeComment(String str) {
        return str.replaceFirst("\\|.*", "").trim();
    }

    /**
     * Evaluates one word pair unless it was already finished in this run or is already part of
     * the known confusion sets, and prints the results that pass the quality thresholds.
     *
     * @param evaluator  the evaluator to run
     * @param line       the original input line, used for progress output only
     * @param lineCount  1-based number of the current line, used for progress output only
     * @param totalLines total number of input lines, used for progress output only
     * @param part1      first word of the pair
     * @param part2      second word of the pair
     * @throws IOException if the example sentences cannot be looked up or written
     */
    private void runOnPair(ProhibitedCompoundRuleEvaluator evaluator, String line, int lineCount, int totalLines,
            String part1, String part2) throws IOException {
        String pairName = part1 + "/" + part2;
        if (finishedPairs.contains(pairName) || finishedPairs.contains(part2 + "/" + part1)) {
            System.out.println("Ignoring: " + pairName + ", finished before");
            return;
        }
        if (isKnownPair(part1, part2)) {
            System.out.println("Ignoring: " + pairName + ", in active confusion sets already");
            ignored++;
            return;
        }
        System.out.println("Working on: " + line + " (" + lineCount + " of " + totalLines + ")");
        try {
            File sentencesFile = writeExampleSentencesToTempFile(new String[] { part1, part2 });
            List<String> input = Arrays.asList(sentencesFile.getAbsolutePath());
            Map<Long, RuleEvalResult> results = evaluator.run(input, part1, part2, MAX_EXAMPLES, EVAL_FACTORS);
            Map<Long, RuleEvalResult> bestResults = findBestFactor(results);
            if (bestResults.isEmpty()) {
                System.out.println("No good result found for " + pairName);
            } else {
                for (RuleEvalResult result : bestResults.values()) {
                    System.out.println("=> " + result.getSummary());
                }
            }
            finishedPairs.add(pairName);
        } catch (TooFewExamples e) {
            System.out.println("Skipping " + pairName + ", too few examples: " + e.getMessage());
        }
    }

    /**
     * Checks whether one of the confusion pairs registered under {@code part1} contains both words.
     */
    private boolean isKnownPair(String part1, String part2) {
        // direct lookup instead of scanning every entry of the map for a matching key
        List<ConfusionPair> pairs = knownSets.getOrDefault(part1, Collections.emptyList());
        List<String> wanted = Arrays.asList(part1, part2);
        for (ConfusionPair pair : pairs) {
            Set<String> terms = pair.getTerms().stream().map(l -> l.getString())
                    .collect(Collectors.toSet());
            if (terms.containsAll(wanted)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Keeps only the results whose precision and recall reach {@link #MIN_PRECISION} and
     * {@link #MIN_RECALL}.
     *
     * @param results evaluation results keyed by factor
     * @return a new map with the acceptable results, possibly empty
     */
    private Map<Long, RuleEvalResult> findBestFactor(Map<Long, RuleEvalResult> results) {
        Map<Long, RuleEvalResult> filteredResults = new HashMap<>();
        for (Map.Entry<Long, RuleEvalResult> entry : results.entrySet()) {
            RuleEvalResult result = entry.getValue();
            if (result.getPrecision() >= MIN_PRECISION && result.getRecall() >= MIN_RECALL) {
                filteredResults.put(entry.getKey(), result);
            }
        }
        return filteredResults;
    }

    /**
     * Writes the example sentences for all given words into one file in the system temp directory.
     * The file is overwritten on every call.
     *
     * @param words the words to find example sentences for
     * @return the file containing the sentences, one per line
     * @throws IOException    if searching the index or writing the file fails
     * @throws TooFewExamples if a word has {@link #MIN_EXAMPLES} or fewer example sentences
     */
    private File writeExampleSentencesToTempFile(String[] words) throws IOException {
        // resolve the temp directory from the JVM instead of hard-coding "/tmp"
        File tempFile = new File(System.getProperty("java.io.tmpdir"), TEMP_FILE_NAME);
        int count = 0;
        try (FileWriter fw = new FileWriter(tempFile)) {
            for (String word : words) {
                int tmpCount = findExampleSentences(word, fw);
                if (tmpCount <= MIN_EXAMPLES) {
                    throw new TooFewExamples(word, tmpCount);
                }
                count += tmpCount;
            }
            System.out.println(count + " example sentences written to " + tempFile);
        }
        return tempFile;
    }

    /**
     * Searches the sentence index with a regular expression that matches terms ending in
     * {@code word} or starting with the capitalized {@code word}, and writes each distinct
     * sentence found to {@code fw}.
     *
     * @param word the word to search for
     * @param fw   the writer receiving one sentence per line
     * @return the number of distinct sentences written
     * @throws IOException if searching or writing fails
     */
    private int findExampleSentences(String word, FileWriter fw) throws IOException {
        Term term = new Term(LUCENE_CONTENT_FIELD, ".+" + word + "|" + StringUtils.capitalize(word) + ".+");
        long t1 = System.currentTimeMillis();
        // the search itself is limited to MAX_EXAMPLES hits, so no further cut-off is needed below
        TopDocs topDocs = searcher.search(new RegexpQuery(term), MAX_EXAMPLES);
        long t2 = System.currentTimeMillis();
        int count = 0;
        Set<String> foundSentences = new HashSet<>();
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            String sentence = searcher.doc(scoreDoc.doc).get(LUCENE_CONTENT_FIELD);
            // add() returns false for duplicates, so each sentence is written only once
            if (foundSentences.add(sentence)) {
                fw.write(sentence + "\n");
                count++;
            }
        }
        long t3 = System.currentTimeMillis();
        long searchTime = t2 - t1;
        long iterateTime = t3 - t2;
        System.out.println(
                "Found " + count + " examples for " + word + " (" + searchTime + "ms, " + iterateTime + "ms)");
        return count;
    }

    /**
     * Command line entry point.
     *
     * @param args the candidates file, the example sentence index directory and the ngram directory
     * @throws IOException if reading the input or accessing an index fails
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 3) {
            System.out.println("Usage: " + AutomaticProhibitedCompoundRuleEvaluator.class.getSimpleName()
                    + " <confusionPairCandidates> <exampleSentenceIndexDir> <ngramDir>");
            System.out.println(
                    "   <confusionPairCandidates> is a semicolon-separated list of words (one pair per line)");
            System.out.println("   <exampleSentenceIndexDir> is a Lucene index created by TextIndexCreator");
            System.exit(1);
        }
        List<String> lines;
        // close the candidates file as soon as it has been read
        try (InputStream candidates = new FileInputStream(args[0])) {
            lines = IOUtils.readLines(candidates, "utf-8");
        }
        AutomaticProhibitedCompoundRuleEvaluator eval = new AutomaticProhibitedCompoundRuleEvaluator(
                new File(args[1]));
        eval.run(lines, new File(args[2]));
    }

    /**
     * Signals that too few example sentences were found for a word.
     * Declared static so that the exception does not keep a reference to the evaluator instance.
     */
    static class TooFewExamples extends RuntimeException {
        private final String word;
        private final int exampleCount;

        TooFewExamples(String word, int exampleCount) {
            this.word = word;
            this.exampleCount = exampleCount;
        }

        @Override
        public String getMessage() {
            return exampleCount + " matches for " + word;
        }
    }
}