org.languagetool.dev.wordsimilarity.SimilarWordFinder.java Source code

Java tutorial

Introduction

Here is the source code for org.languagetool.dev.wordsimilarity.SimilarWordFinder.java

Source

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.dev.wordsimilarity;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.*;

class SimilarWordFinder {

    private static final int MAX_DIST = 1;
    private static final KeyboardDistance keyDistance = new GermanQwertzKeyboardDistance();
    //private static final KeyboardDistance keyDistance = new QwertyKeyboardDistance();

    private KnownPairs knownPairs = new KnownPairs();

    private void createIndex(List<String> words, File indexDir) throws IOException {
        FSDirectory dir = FSDirectory.open(indexDir.toPath());
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(new StandardAnalyzer());
        System.out.println("Creating index...");
        int docs = 0;
        try (IndexWriter writer = new IndexWriter(dir, indexWriterConfig)) {
            for (String word : words) {
                Document doc = new Document();
                doc.add(new TextField("word", word, Field.Store.YES));
                writer.addDocument(doc);
                docs++;
            }
        }
        System.out.println("Index created: " + docs + " docs");
    }

    private void findSimilarWords(File indexDir, List<String> words) throws IOException {
        FSDirectory dir = FSDirectory.open(indexDir.toPath());
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            for (String word : words) {
                findSimilarWordsTo(reader, searcher, word);
            }
        }
    }

    private void findSimilarWordsTo(DirectoryReader reader, IndexSearcher searcher, String word)
            throws IOException {
        FuzzyQuery query = new FuzzyQuery(new Term("word", word), 2); // a missing char counts as a distance of 2
        TopDocs topDocs = searcher.search(query, 10);
        //System.out.println(topDocs.totalHits + " hits for " + word);
        List<SimWord> simWords = findSimilarWordsFor(reader, word, topDocs);
        //System.out.println(word + " -> " + String.join(", ", simWords));
        for (SimWord simWord : simWords) {
            if (word.length() == simWord.word.length()) {
                int firstDiffPos = getDiffPos(simWord.word.toLowerCase(), word.toLowerCase());
                try {
                    float dist = keyDistance.getDistance(word.charAt(firstDiffPos),
                            simWord.word.charAt(firstDiffPos));
                    System.out.println(dist + "; " + word + "; " + simWord);
                } catch (Exception e) {
                    System.err.println("Could not get distance between '" + word + "' and '" + simWord + "':");
                    e.printStackTrace();
                }
            } else {
                // TODO: these need to be handled, too
                //System.out.println("-; " + word + "; " + simWord.word);
            }
        }
    }

    private void findSimilarWords(File indexDir) throws IOException {
        FSDirectory dir = FSDirectory.open(indexDir.toPath());
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            for (int i = 0; i < reader.maxDoc(); i++) {
                Document doc = reader.document(i);
                String word = doc.get("word");
                //System.out.println(word);
                findSimilarWordsTo(reader, searcher, word);
            }
        }
    }

    private List<SimWord> findSimilarWordsFor(DirectoryReader reader, String word, TopDocs topDocs)
            throws IOException {
        List<SimWord> result = new ArrayList<>();
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            String simWord = reader.document(scoreDoc.doc).get("word");
            //System.out.println(" sim: " + simWord);
            if (!simWord.equalsIgnoreCase(word) && !knownPairs.contains(simWord, word)) {
                int firstDiffPos = getDiffPos(simWord.toLowerCase(), word.toLowerCase());
                int limit = Math.min(word.length(), simWord.length()) - 1;
                if (firstDiffPos > limit) {
                    //System.out.println("FILTERED: " + word + " -> " + simWord + " [" + firstDiffPos + " <= " + limit + "]");
                } else {
                    int dist = StringUtils.getLevenshteinDistance(word, simWord);
                    if (dist <= MAX_DIST) {
                        //System.out.println(word + " -> " + simWord + " [" + firstDiffPos + "]");
                        result.add(new SimWord(simWord, dist));
                    }
                }
                knownPairs.add(simWord, word);
            }
        }
        return result;
    }

    private int getDiffPos(String s1, String s2) {
        int i;
        for (i = 0; i < s1.length(); i++) {
            if (i >= s2.length()) {
                return i;
            }
            if (s1.charAt(i) != s2.charAt(i)) {
                return i;
            }
        }
        return i;
    }

    class SimWord {
        private String word;
        private int levenshteinDistance;

        SimWord(String word, int levenshteinDistance) {
            this.word = word;
            this.levenshteinDistance = levenshteinDistance;
        }

        @Override
        public String toString() {
            return word;
        }
    }

    class KnownPairs {
        private Set<String> set = new HashSet<>();

        boolean contains(String word1, String word2) {
            return set.contains(getKey(word1, word2));
        }

        void add(String word1, String word2) {
            set.add(getKey(word1, word2));
        }

        String getKey(String word1, String word2) {
            if (word1.compareTo(word2) < 0) {
                return word1 + ";" + word2;
            } else {
                return word2 + ";" + word1;
            }
        }
    }

    public static void main(String[] args) throws IOException {
        SimilarWordFinder simWordFinder = new SimilarWordFinder();
        System.out.println("Using key distance: " + keyDistance.getClass());
        if (args.length == 1) {
            File indexDir = new File(args[0]);
            simWordFinder.findSimilarWords(indexDir);
        } else if (args.length == 2) {
            File indexDir = new File(args[1]);
            String[] words = args[0].split(",");
            simWordFinder.findSimilarWords(indexDir, Arrays.asList(words));
        } else if (args.length == 3) {
            List<String> words = FileUtils.readLines(new File(args[1]), "utf-8");
            File indexDir = new File(args[2]);
            Files.deleteIfExists(indexDir.toPath());
            simWordFinder.createIndex(words, indexDir);
        } else {
            System.out.println(
                    "Usage 1: " + SimilarWordFinder.class.getSimpleName() + " --index <wordFile> <indexFile>");
            System.out.println("Usage 2: " + SimilarWordFinder.class.getSimpleName()
                    + " <words> <indexDir> (as created with usage 1)");
            System.out.println("             <indexDir> as created with usage 1");
            System.out.println(
                    "             <words> a comma-separated list of words to search similar words for (no spaces)");
            System.out.println("Usage 3: " + SimilarWordFinder.class.getSimpleName() + " <indexDir>");
            System.exit(1);
        }
    }

}