Java tutorial
/* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universitt Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package modelinspector.collectors; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.HashSet; import java.util.Locale; import java.util.Set; import org.apache.commons.io.IOUtils; import org.apache.commons.io.LineIterator; public class WordlistMatchCollector extends CollectorBase<String> { private String name; private Set<String> baseVocabulary; private int originalBaseVocabularySize; private Set<String> observedVocabulary = new HashSet<>(); boolean caseSensitive; int cutoff; private Locale language; public WordlistMatchCollector(String aName, String aLanguage, boolean aCaseSensitive, int aCutoff, String aFile, String aEncoding) { name = aName; baseVocabulary = new HashSet<>(); caseSensitive = aCaseSensitive; language = new Locale(aLanguage); cutoff = aCutoff; try (InputStream is = new FileInputStream(aFile)) { LineIterator i = IOUtils.lineIterator(is, aEncoding); while (i.hasNext()) { String[] fields = i.nextLine().split("\t"); if (fields.length > 1 && aCutoff > 0) { if (Integer.valueOf(fields[1]) < aCutoff) { continue; } } String word = aCaseSensitive ? fields[0] : fields[0].toLowerCase(language); baseVocabulary.add(word); } } catch (IOException e) { throw new RuntimeException(e); } originalBaseVocabularySize = baseVocabulary.size(); } @Override public String getName() { return name + (caseSensitive ? "" : " caseless") + (cutoff > 0 ? (" co=" + cutoff) : ""); } @Override public void collect(String aValue) { String value = caseSensitive ? aValue : aValue.toLowerCase(language); if (baseVocabulary.remove(value)) { observedVocabulary.add(value); } } @Override public String getResult() { return String.format("%10d %10d (%.2f%% match)", observedVocabulary.size(), originalBaseVocabularySize, ((float) observedVocabulary.size() / originalBaseVocabularySize) * 100.0f); } }