bzh.plantkelt.motscroises.Dictionnary.java Source code

Java tutorial

Introduction

Here is the source code for bzh.plantkelt.motscroises.Dictionnary.java

Source

/*
 *  This file is part of MotsCroisés.
 *
 *  MotsCroisés is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  MotsCroisés is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with MotsCroisés. If not, see <http://www.gnu.org/licenses/>.
 */
package bzh.plantkelt.motscroises;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Multimaps;

/**
 * Word list loaded from a text file and indexed by partial letter patterns.
 *
 * <p>
 * A pattern is a string of the same length as the words it matches, in which
 * known letters appear at their position and every unknown position is the
 * blank character {@code '_'} (for example {@code "_A__E"}). For each loaded
 * word, every pattern made of 1 up to a length-dependent maximum number of
 * known letters is stored in a multimap pointing back to the word, so that
 * {@link #getWords(String)} is a direct lookup for sparse patterns and a
 * lookup followed by a linear filter for denser ones.
 *
 * <p>
 * Instances are fully built by the constructor; no method modifies the index
 * afterwards.
 */
public class Dictionnary {

    /** Character standing for an unknown letter in a pattern. */
    private static final char BLANK = '_';

    /**
     * Maximum number of known letters in an indexed pattern, by word length
     * (the array index is the word/pattern length). Its size also bounds the
     * word length the dictionary accepts.
     */
    private static final int[] MAX_LETTER_COUNT_IN_PATTERN = { 0, 1, 2, 3, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2, 2, 2, 1, 1,
            1, 1 };

    /** Combining marks left behind by NFD normalization; compiled once, used for every line. */
    private static final Pattern DIACRITICAL_MARKS = Pattern.compile("[\\p{InCombiningDiacriticalMarks}]");

    private final Set<String> words;
    private final long totalLetterCount;
    private final ListMultimap<String, String> patterns;
    private final AlphabetFreq alphabetFrequencies;
    private final long[] lengthFrequencies;

    /**
     * Loads the word list from a file (one word per line) and builds the
     * pattern index and the letter / word-length statistics.
     *
     * <p>
     * Each line is stripped of accents, upper-cased and stripped of hyphens; it
     * is kept only if the result is made of letters only and its length is
     * between 2 and {@code maxWordLen} inclusive.
     *
     * @param filename   path of the word list file
     * @param maxWordLen maximum length of the words to keep; must be lower
     *                   than the size of the pattern letter-count table
     * @throws IllegalArgumentException if {@code maxWordLen} is too large, or
     *                                  if the file cannot be opened (the
     *                                  {@link IOException} is the cause)
     */
    public Dictionnary(String filename, int maxWordLen) {
        if (maxWordLen >= MAX_LETTER_COUNT_IN_PATTERN.length) {
            // The largest accepted value is the last valid index of the table.
            throw new IllegalArgumentException(
                    "maxWordLen is too large, maximum is " + (MAX_LETTER_COUNT_IN_PATTERN.length - 1));
        }

        // Load words
        // NOTE(review): progress messages go through Logger.error / Logger.warn;
        // presumably chosen for visibility -- confirm the intended level.
        Logger.error("Loading dictionnary '" + filename + "'...");
        words = loadWords(filename, maxWordLen);
        Logger.error("Loaded " + words.size() + " words.");

        // Index words
        Logger.error("Indexing words...");
        patterns = ArrayListMultimap.create(100000, 10);
        alphabetFrequencies = new AlphabetFreq();
        lengthFrequencies = new long[maxWordLen + 1]; // zero-initialized by the JVM
        long letterCount = 0;
        int n = 0;
        for (String word : words) {
            int len = word.length();
            letterCount += len;
            lengthFrequencies[len]++;
            indexWord(word);

            n++;
            if (n % 100000 == 0) {
                Logger.warn(n + " words indexed");
            }
        }
        totalLetterCount = letterCount;
    }

    /**
     * Reads the word file and returns the set of normalized words to keep.
     *
     * @param filename   path of the word list file
     * @param maxWordLen maximum (inclusive) length of a kept word
     * @return the normalized words, without duplicates
     * @throws IllegalArgumentException if the file cannot be opened
     */
    private static Set<String> loadWords(String filename, int maxWordLen) {
        Set<String> loaded = new HashSet<>(500000);
        // Files.lines keeps the file open until the stream is closed.
        try (Stream<String> lines = Files.lines(Paths.get(filename))) {
            lines.forEach(line -> {
                // Locale.ROOT: upper-casing must not depend on the machine locale
                // (e.g. a Turkish locale maps 'i' to a dotted capital I).
                String word = stripAccents(line).toUpperCase(Locale.ROOT).replace("-", "");
                int len = word.length();
                if (isAlpha(word) && len > 1 && len <= maxWordLen) {
                    loaded.add(word);
                }
            });
        } catch (IOException e) {
            throw new IllegalArgumentException(e);
        }
        return loaded;
    }

    /**
     * Counts the letters of a word and registers all its patterns in the index.
     *
     * <p>
     * Patterns are generated by increasing number of known letters (all
     * 1-letter patterns first, then 2-letter ones, and so on), up to the
     * maximum configured for the word length. Loaded words have at least two
     * letters, for which that maximum is always at least 1.
     */
    private void indexWord(String word) {
        char[] letters = word.toCharArray();
        for (char letter : letters) {
            alphabetFrequencies.inc(letter);
        }

        char[] pattern = new char[letters.length];
        Arrays.fill(pattern, BLANK);
        int maxLetters = MAX_LETTER_COUNT_IN_PATTERN[letters.length];
        for (int letterCount = 1; letterCount <= maxLetters; letterCount++) {
            indexPatterns(word, letters, pattern, 0, letterCount);
        }
    }

    /**
     * Registers every pattern obtained by revealing exactly {@code remaining}
     * more letters of the word at positions {@code start} and beyond, on top of
     * the letters already revealed in {@code pattern}.
     *
     * <p>
     * {@code pattern} is a scratch buffer: it is modified during the call and
     * restored to its incoming state before returning.
     */
    private void indexPatterns(String word, char[] letters, char[] pattern, int start, int remaining) {
        // Stop early enough to leave room for the letters still to be revealed.
        int last = letters.length - remaining;
        for (int i = start; i <= last; i++) {
            pattern[i] = letters[i];
            if (remaining == 1) {
                patterns.put(new String(pattern), word);
            } else {
                indexPatterns(word, letters, pattern, i + 1, remaining - 1);
            }
            pattern[i] = BLANK;
        }
    }

    /**
     * Prints statistics to standard output: the most common patterns with their
     * word count, the frequency of each letter from A to Z, the number of words
     * per length, and the total number of patterns and of pattern entries.
     *
     * @param maxPatterns maximum number of patterns to list; zero or a negative
     *                    value lists none
     */
    public void dump(int maxPatterns) {

        // Sort patterns by frequency, most common first
        List<Map.Entry<String, List<String>>> entries = new ArrayList<>(Multimaps.asMap(patterns).entrySet());
        entries.sort((e1, e2) -> Integer.compare(e2.getValue().size(), e1.getValue().size()));
        System.out.println(maxPatterns + " most common patterns:");
        // Print at most maxPatterns entries, as announced by the header line.
        int shown = Math.min(Math.max(maxPatterns, 0), entries.size());
        for (Map.Entry<String, List<String>> entry : entries.subList(0, shown)) {
            System.out.println(String.format("%20s : %8d", entry.getKey(), entry.getValue().size()));
        }

        System.out.println("Letters frequencies:");
        for (char letter = 'A'; letter <= 'Z'; letter++) {
            System.out.println(letter + ": " + alphabetFrequencies.freq(letter));
        }

        System.out.println("Word length frequencies:");
        for (int i = 1; i < lengthFrequencies.length; i++) {
            System.out.println(i + ": " + lengthFrequencies[i]);
        }

        System.out.println("Number of patterns: " + patterns.keySet().size());
        System.out.println("Number of patterns entries: " + patterns.size());
    }

    /**
     * Returns the words matching a pattern.
     *
     * <p>
     * If the pattern has no more known letters than what is indexed for its
     * length, the result is a direct index lookup. Otherwise the letters with
     * the highest frequency are blanked out until the pattern is sparse enough
     * to be looked up, and the candidates are then filtered against the removed
     * letters.
     *
     * <p>
     * A pattern without any known letter yields an empty list, as such patterns
     * are not indexed.
     *
     * @param pattern the pattern, with {@code '_'} at unknown positions
     * @return the matching words, possibly empty. In the direct lookup case this
     *         is the list returned by the underlying multimap (a view of the
     *         index, per Guava's {@code Multimap.get}); callers should treat
     *         the result as read-only.
     */
    public List<String> getWords(String pattern) {
        // Collect the positions of the known letters
        int patternLen = pattern.length();
        List<Integer> letterIndexes = new ArrayList<>(patternLen);
        for (int i = 0; i < patternLen; i++) {
            if (pattern.charAt(i) != BLANK) {
                letterIndexes.add(i);
            }
        }
        int maxLetters = patternLen < MAX_LETTER_COUNT_IN_PATTERN.length ? MAX_LETTER_COUNT_IN_PATTERN[patternLen]
                : 1;
        if (letterIndexes.size() <= maxLetters) {
            // The pattern is OK, use it right away
            return patterns.get(pattern);
        }

        // Too many letters: blank out the most common ones, which are the
        // least selective, and keep the rarest ones for the index lookup.
        letterIndexes.sort((i1, i2) -> {
            long freq1 = alphabetFrequencies.freq(pattern.charAt(i1));
            long freq2 = alphabetFrequencies.freq(pattern.charAt(i2));
            return Long.compare(freq2, freq1);
        });
        int nLettersToRemove = letterIndexes.size() - maxLetters;
        int[] indexesToCheck = new int[nLettersToRemove];
        char[] reducedChars = pattern.toCharArray();
        for (int i = 0; i < nLettersToRemove; i++) {
            int letterIndex = letterIndexes.get(i);
            reducedChars[letterIndex] = BLANK;
            indexesToCheck[i] = letterIndex;
        }
        String reducedPattern = new String(reducedChars);

        // Linear scan of the candidates, keeping only those that also
        // match the letters that were blanked out.
        List<String> potentials = patterns.get(reducedPattern);
        List<String> retval = new ArrayList<>(potentials.size());
        for (String potential : potentials) {
            boolean fits = true;
            for (int i : indexesToCheck) {
                if (potential.charAt(i) != pattern.charAt(i)) {
                    fits = false;
                    break;
                }
            }
            if (fits) {
                retval.add(potential);
            }
        }
        return retval;
    }

    /**
     * Returns the share of a letter among all the letters of the loaded words.
     *
     * @param letter the letter to look up
     * @return the letter count divided by the total letter count, or 0 when no
     *         word was loaded
     */
    public double letterFrequency(char letter) {
        if (totalLetterCount == 0) {
            // Empty dictionary: avoid a 0/0 division yielding NaN.
            return 0.;
        }
        return 1. * alphabetFrequencies.freq(letter) / totalLetterCount;
    }

    /**
     * Tells whether a string is made of letters only (true for an empty string).
     *
     * NOTE(review): Character.isLetter also accepts letters outside A-Z (for
     * instance ligatures, which accent stripping leaves untouched), whereas
     * dump() only reports 'A' to 'Z' -- confirm AlphabetFreq copes with them.
     */
    private static boolean isAlpha(String name) {
        for (int i = 0; i < name.length(); i++) {
            if (!Character.isLetter(name.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Removes accents by decomposing characters (NFD) and dropping the
     * combining diacritical marks.
     */
    private static String stripAccents(String s) {
        String decomposed = Normalizer.normalize(s, Normalizer.Form.NFD);
        return DIACRITICAL_MARKS.matcher(decomposed).replaceAll("");
    }
}