// Java tutorial
/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package modelinspector.collectors;

import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang3.StringUtils;

/**
 * Collector that checks observed values against the top-ranked words of a word list file.
 * <p>
 * The word list is read line by line; the first tab-separated field of each line is the word and
 * the (1-based) line number of its first occurrence is its rank. Values passed to
 * {@link #collect(String)} whose rank is at most the cutoff are remembered, and
 * {@link #getResult()} returns the words with a rank within the cutoff that were <em>not</em>
 * observed.
 */
public class MostFrequentWordsCollector extends CollectorBase<Set<String>> {
    /**
     * Loaded word lists, keyed by {@code <language>-<caseSensitive>} and shared by all instances.
     * NOTE(review): the key includes neither the file name nor the encoding, so a later instance
     * created with the same language and case setting but a different file reuses the list that
     * was loaded first - confirm that this is intended.
     */
    private static final Map<String, Object2IntOpenHashMap<String>> wordLists = new HashMap<>();

    /** Words with a rank within the cutoff that have been seen by {@link #collect(String)}. */
    private final Set<String> result = new HashSet<>();

    /** Maps a word to its rank; words not in the list map to {@link Integer#MAX_VALUE}. */
    private final Object2IntOpenHashMap<String> wordList;

    private final int cutoff;
    private final boolean caseSensitive;
    private final Locale language;

    /**
     * Creates the collector, loading the word list unless a list for the same language and case
     * setting has already been loaded.
     *
     * @param aLanguage
     *            language code; used to build the locale for lower-casing and as part of the
     *            cache key.
     * @param aCutoff
     *            highest rank that is still considered.
     * @param aCaseSensitive
     *            if {@code false}, words from the list and collected values are lower-cased
     *            before they are compared.
     * @param aFile
     *            path of the word list file; the code relies on it being sorted by frequency.
     * @param aEncoding
     *            character encoding of the word list file.
     * @throws RuntimeException
     *             wrapping the {@link IOException} if the word list cannot be read.
     */
    public MostFrequentWordsCollector(String aLanguage, int aCutoff, boolean aCaseSensitive,
            String aFile, String aEncoding) {
        cutoff = aCutoff;
        // This assignment was missing before, which left the field at its default (false), so
        // collect() and getName() always behaved as if the collector were caseless.
        caseSensitive = aCaseSensitive;
        language = new Locale(aLanguage);
        String key = aLanguage + "-" + aCaseSensitive;

        setShowSample(true);

        Object2IntOpenHashMap<String> ranks = wordLists.get(key);
        if (ranks == null) {
            ranks = loadWordList(aFile, aEncoding, aCaseSensitive, language);
            wordLists.put(key, ranks);
        }
        wordList = ranks;
    }

    /**
     * Reads the word list file and maps every word to the line number of its first occurrence.
     */
    private static Object2IntOpenHashMap<String> loadWordList(String aFile, String aEncoding,
            boolean aCaseSensitive, Locale aLanguage) {
        Object2IntOpenHashMap<String> ranks = new Object2IntOpenHashMap<>();

        // The file read is sorted by frequency
        try (InputStream is = new FileInputStream(aFile)) {
            LineIterator lines = IOUtils.lineIterator(is, aEncoding);
            int rank = 1;
            while (lines.hasNext()) {
                String[] fields = lines.nextLine().split("\t");
                String word = aCaseSensitive ? fields[0] : fields[0].toLowerCase(aLanguage);

                // Record the word and its rank - since a word may appear in different
                // frequencies with different POSes, we need to make sure we don't overwrite
                // a frequent POS with an infrequent one. We only consider the most frequent
                // POS for a word.
                if (!ranks.containsKey(word)) {
                    ranks.put(word, rank);
                }
                rank++;
            }
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }

        // Unknown words get the largest possible rank so that they fall outside the cutoff.
        ranks.defaultReturnValue(Integer.MAX_VALUE);
        return ranks;
    }

    /**
     * @return the display name, containing the cutoff left-padded to five characters and the
     *         suffix "caseless" when the collector is not case-sensitive.
     */
    @Override
    public String getName() {
        return "Coverage: Most frequent words (" + StringUtils.leftPad(String.valueOf(cutoff), 5)
                + ") " + (caseSensitive ? "" : "caseless");
    }

    /**
     * Remembers the given value if its rank in the word list is within the cutoff.
     *
     * @param aValue
     *            the observed value; lower-cased first unless the collector is case-sensitive.
     */
    @Override
    public void collect(String aValue) {
        String v = caseSensitive ? aValue : aValue.toLowerCase(language);
        if (wordList.getInt(v) <= cutoff) {
            result.add(v);
        }
    }

    /**
     * @return a new set with the words whose rank is within the cutoff and that have not been
     *         collected so far.
     */
    @Override
    public Set<String> getResult() {
        Set<String> missing = new HashSet<>();

        // Collect the top-X words first
        for (Entry<String, Integer> e : wordList.entrySet()) {
            if (e.getValue() <= cutoff) {
                missing.add(e.getKey());
            }
        }

        // Remove all the observed ones
        missing.removeAll(result);

        return missing;
    }
}