modelinspector.collectors.MostFrequentWordsCollector.java Source code

Introduction

Here is the source code for modelinspector.collectors.MostFrequentWordsCollector.java
Source

/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package modelinspector.collectors;

import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang3.StringUtils;

/**
 * Collector that checks how well the collected values cover the most frequent words of a
 * language.
 * <p>
 * A frequency-sorted word list is read from a file; each word is assigned a rank equal to the
 * (1-based) number of the line on which it first appears. Values passed to
 * {@link #collect(String)} whose rank is at most the cutoff are remembered, and
 * {@link #getResult()} reports the words within the cutoff that were <em>not</em> observed.
 * <p>
 * Loaded word lists are kept in a static cache shared by all instances. The cache is a plain
 * {@link HashMap} without synchronization.
 */
public class MostFrequentWordsCollector extends CollectorBase<Set<String>> {
    /** Cache of loaded rank tables, keyed by language, case-sensitivity and source file. */
    private static final Map<String, Object2IntOpenHashMap<String>> wordLists = new HashMap<>();

    /** Words seen via {@link #collect(String)} whose rank is within the cutoff. */
    private final Set<String> result = new HashSet<>();
    /** Maps a word to its rank; words not in the list map to {@link Integer#MAX_VALUE}. */
    private final Object2IntOpenHashMap<String> wordList;
    private final int cutoff;
    private final boolean caseSensitive;
    private final Locale language;

    /**
     * Creates a collector, loading the word list from the given file unless an equivalent list
     * is already cached.
     *
     * @param aLanguage
     *            language code used to build the {@link Locale} for lower-casing.
     * @param aCutoff
     *            highest rank (inclusive) that still counts as a "most frequent" word.
     * @param aCaseSensitive
     *            if {@code false}, list entries and collected values are lower-cased before
     *            being compared.
     * @param aFile
     *            path of the word list file: one entry per line, fields separated by tabs, the
     *            word in the first field, lines sorted by descending frequency.
     * @param aEncoding
     *            character encoding of the file.
     * @throws RuntimeException
     *             if the file cannot be read (the {@link IOException} is attached as cause).
     */
    public MostFrequentWordsCollector(String aLanguage, int aCutoff, boolean aCaseSensitive, String aFile,
            String aEncoding) {
        cutoff = aCutoff;
        // This assignment was missing before, so the collector always behaved as caseless even
        // when the rank table had been loaded with its original casing.
        caseSensitive = aCaseSensitive;
        language = new Locale(aLanguage);

        setShowSample(true);

        // The file is part of the key so that two different lists for the same language do not
        // silently share one cached table.
        String key = aLanguage + "-" + aCaseSensitive + "-" + aFile;

        Object2IntOpenHashMap<String> ranks = wordLists.get(key);
        if (ranks == null) {
            ranks = loadRanks(aFile, aEncoding, aCaseSensitive, language);
            wordLists.put(key, ranks);
        }
        wordList = ranks;
    }

    /**
     * Reads the frequency-sorted word list and returns a table mapping each word to the number
     * of the line on which it first occurs.
     */
    private static Object2IntOpenHashMap<String> loadRanks(String aFile, String aEncoding,
            boolean aCaseSensitive, Locale aLocale) {
        Object2IntOpenHashMap<String> ranks = new Object2IntOpenHashMap<>();
        // The file read is sorted by frequency
        try (InputStream is = new FileInputStream(aFile)) {
            LineIterator lines = IOUtils.lineIterator(is, aEncoding);
            int rank = 1;
            while (lines.hasNext()) {
                String[] fields = lines.nextLine().split("\t");
                String word = aCaseSensitive ? fields[0] : fields[0].toLowerCase(aLocale);
                // Record the word and its rank - since a word may appear in different
                // frequencies with different POSes, we need to make sure we don't overwrite
                // a frequent POS with an infrequent one. We only consider the most frequent
                // POS for a word.
                if (!ranks.containsKey(word)) {
                    ranks.put(word, rank);
                }
                rank++;
            }
        } catch (IOException e) {
            throw new RuntimeException("Unable to read word list from [" + aFile + "]", e);
        }
        // Unknown words get the largest possible rank so they fall outside any ordinary cutoff.
        ranks.defaultReturnValue(Integer.MAX_VALUE);
        return ranks;
    }

    /**
     * Returns the display name, containing the cutoff left-padded to five characters and a
     * "caseless" marker when the collector is not case-sensitive.
     */
    @Override
    public String getName() {
        return "Coverage: Most frequent words (" + StringUtils.leftPad(String.valueOf(cutoff), 5) + ") "
                + (caseSensitive ? "" : "caseless");
    }

    /**
     * Records the value if its rank in the word list is within the cutoff. The value is
     * lower-cased first unless the collector is case-sensitive.
     */
    @Override
    public void collect(String aValue) {
        String v = caseSensitive ? aValue : aValue.toLowerCase(language);
        if (wordList.getInt(v) <= cutoff) {
            result.add(v);
        }
    }

    /**
     * Returns the words ranked within the cutoff that have not been collected so far.
     *
     * @return a newly created set of the missing frequent words.
     */
    @Override
    public Set<String> getResult() {
        Set<String> missing = new HashSet<>();

        // Collect the top-X words first
        for (Entry<String, Integer> e : wordList.entrySet()) {
            if (e.getValue() <= cutoff) {
                missing.add(e.getKey());
            }
        }

        // Remove all the observed ones
        missing.removeAll(result);

        return missing;
    }
}