modelinspector.collectors.WordlistMatchCollector.java Source code

Introduction

Here is the source code for modelinspector.collectors.WordlistMatchCollector.java
Source

/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package modelinspector.collectors;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;

/**
 * Collector that measures how much of a reference word list is covered by the values passed to
 * {@link #collect(String)}.
 *
 * <p>The word list is read once in the constructor from a tab-separated file: the first field of
 * each line is the word, and an optional second field is an integer count that is compared against
 * the cutoff. Every collected value that is found in the word list is moved from the set of
 * not-yet-seen words to the set of observed words, so each word is counted at most once.
 */
public class WordlistMatchCollector extends CollectorBase<String> {
    private final String name;
    /** Words from the list that have not been observed yet; shrinks as values are collected. */
    private final Set<String> baseVocabulary;
    /** Number of distinct words loaded from the list, captured before any value is collected. */
    private final int originalBaseVocabularySize;
    /** Words from the list that have been seen by {@link #collect(String)}. */
    private final Set<String> observedVocabulary = new HashSet<>();
    boolean caseSensitive;
    int cutoff;
    private final Locale language;

    /**
     * Loads the word list and prepares the collector.
     *
     * @param aName
     *            base name reported by {@link #getName()}.
     * @param aLanguage
     *            language code used to build the {@link Locale} for lower-casing.
     * @param aCaseSensitive
     *            if {@code false}, list entries and collected values are lower-cased before being
     *            compared.
     * @param aCutoff
     *            if greater than zero, entries whose second field is smaller than this value are
     *            skipped; entries without a second field are always kept.
     * @param aFile
     *            path of the tab-separated word list file.
     * @param aEncoding
     *            character encoding of the word list file.
     * @throws RuntimeException
     *             if the file cannot be read.
     * @throws NumberFormatException
     *             if a second field that has to be compared against the cutoff is not an integer.
     */
    public WordlistMatchCollector(String aName, String aLanguage, boolean aCaseSensitive, int aCutoff, String aFile,
            String aEncoding) {
        name = aName;
        baseVocabulary = new HashSet<>();
        caseSensitive = aCaseSensitive;
        language = new Locale(aLanguage);
        cutoff = aCutoff;

        try (InputStream is = new FileInputStream(aFile)) {
            LineIterator i = IOUtils.lineIterator(is, aEncoding);
            while (i.hasNext()) {
                String[] fields = i.nextLine().split("\t");
                // String.split drops trailing empty strings, so a line consisting only of tabs
                // yields a zero-length array, while a blank line yields a single empty string.
                // Neither carries a word, so both are skipped instead of crashing or adding "".
                if (fields.length == 0 || fields[0].isEmpty()) {
                    continue;
                }
                if (aCutoff > 0 && fields.length > 1 && Integer.parseInt(fields[1].trim()) < aCutoff) {
                    continue;
                }
                baseVocabulary.add(normalize(fields[0]));
            }
        } catch (IOException e) {
            throw new RuntimeException("Unable to read word list [" + aFile + "]", e);
        }
        originalBaseVocabularySize = baseVocabulary.size();
    }

    /**
     * Returns the configured name, followed by {@code " caseless"} when matching is not
     * case-sensitive and by {@code " co=<cutoff>"} when a positive cutoff is set.
     */
    @Override
    public String getName() {
        return name + (caseSensitive ? "" : " caseless") + (cutoff > 0 ? (" co=" + cutoff) : "");
    }

    /**
     * Records the given value as observed if it is part of the word list and has not been seen
     * before. {@code null} values are ignored regardless of the case-sensitivity setting.
     *
     * @param aValue
     *            the value to look up in the word list; may be {@code null}.
     */
    @Override
    public void collect(String aValue) {
        if (aValue == null) {
            return;
        }
        String value = normalize(aValue);
        // Removing on first match keeps the two sets disjoint and counts each word only once.
        if (baseVocabulary.remove(value)) {
            observedVocabulary.add(value);
        }
    }

    /**
     * Returns the number of observed words, the number of words originally loaded, and the
     * percentage of the list that was observed. An empty word list is reported as 0.00% rather
     * than as a division by zero.
     */
    @Override
    public String getResult() {
        int observed = observedVocabulary.size();
        float percentage = originalBaseVocabularySize == 0 ? 0.0f
                : ((float) observed / originalBaseVocabularySize) * 100.0f;
        return String.format("%10d %10d (%.2f%% match)", observed, originalBaseVocabularySize, percentage);
    }

    /** Lower-cases the value using the configured locale unless matching is case-sensitive. */
    private String normalize(String aValue) {
        return caseSensitive ? aValue : aValue.toLowerCase(language);
    }
}