de.tudarmstadt.ukp.experiments.argumentation.comments.pipeline.VocabularyCollector.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.experiments.argumentation.comments.pipeline.VocabularyCollector.java
Source

/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.tudarmstadt.ukp.experiments.argumentation.comments.pipeline;

import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import org.apache.commons.io.IOUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasConsumer_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectOutputStream;
import java.net.URL;
import java.util.*;
import java.util.regex.Pattern;

/**
 * Consumer that counts how often each purely alphabetic token (surface form or lemma)
 * occurs across all processed CASes and, once the collection is complete, drops entries
 * rarer than {@link #PARAM_MINIMAL_OCCURRENCE} and serializes the remaining
 * {@code Map<String, Integer>} to {@link #PARAM_MODEL_LOCATION} using Java serialization.
 *
 * @author Ivan Habernal
 */
public class VocabularyCollector extends JCasConsumer_ImplBase {
    /**
     * File for storing vocabulary
     */
    public static final String PARAM_MODEL_LOCATION = "modelLocation";

    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
    File modelLocation;

    /**
     * Entries that occur fewer times than this value are removed before serialization.
     */
    public static final String PARAM_MINIMAL_OCCURRENCE = "minimalOccurrence";
    @ConfigurationParameter(name = PARAM_MINIMAL_OCCURRENCE, mandatory = false)
    int minimalOccurrence;

    /**
     * If {@code true}, the token's lemma is counted; otherwise its covered text.
     */
    public static final String PARAM_USE_LEMMA = "useLemma";
    @ConfigurationParameter(name = PARAM_USE_LEMMA, mandatory = true, defaultValue = "false")
    boolean useLemma;

    /**
     * Location of the stopword list (one entry per line); only read if
     * {@link #PARAM_IGNORE_STOPWORDS} is enabled.
     */
    public static final String PARAM_STOPWORDS_LIST = "stopwordsList";
    @ConfigurationParameter(name = PARAM_STOPWORDS_LIST, mandatory = false, defaultValue = "classpath:/stopwords_en.txt")
    String stopwordsList;

    /**
     * If {@code true}, entries contained in the stopword list are not counted.
     */
    public static final String PARAM_IGNORE_STOPWORDS = "ignoreStopwords";
    @ConfigurationParameter(name = PARAM_IGNORE_STOPWORDS, mandatory = false, defaultValue = "false")
    boolean ignoreStopwords;

    /**
     * Matches entries that consist of alphabetic characters only; compiled once instead of
     * on every token.
     */
    private static final Pattern WORD_PATTERN = Pattern.compile("\\p{Alpha}+");

    /** Entry -> number of occurrences seen so far. */
    Map<String, Integer> vocabulary = new HashMap<>();

    /** Stopwords loaded in {@link #initialize(UimaContext)}; empty unless stopword filtering is on. */
    Set<String> stopwords = new HashSet<>();

    /**
     * Validates the configuration and, if stopword filtering is enabled, loads the stopword list.
     *
     * @param context the UIMA context
     * @throws ResourceInitializationException if the minimal occurrence is negative, no model
     *                                         location is configured, or the stopword list
     *                                         cannot be read
     */
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);

        if (minimalOccurrence < 0) {
            throw new ResourceInitializationException(new IllegalArgumentException(
                    "Minimal occurrence must be a non-negative integer, got: "
                            + minimalOccurrence));
        }

        // The vocabulary is always written to this file at the end of processing, so fail
        // here instead of with a NullPointerException after the whole collection was consumed.
        if (modelLocation == null) {
            throw new ResourceInitializationException(new IllegalArgumentException(
                    "Parameter '" + PARAM_MODEL_LOCATION + "' must be set"));
        }

        if (ignoreStopwords) {
            try {
                URL url = ResourceUtils.resolveLocation(stopwordsList);
                // close the stream deterministically and do not rely on the platform charset
                try (InputStream stream = url.openStream()) {
                    stopwords.addAll(IOUtils.readLines(stream, "UTF-8"));
                }
            } catch (IOException e) {
                throw new ResourceInitializationException(e);
            }
        }
    }

    /**
     * Counts every alphabetic, non-filtered token entry of the given CAS.
     *
     * @param aJCas the CAS whose {@link Token} annotations are counted
     * @throws AnalysisEngineProcessException declared by the overridden method
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        for (Token token : JCasUtil.select(aJCas, Token.class)) {
            String entry = entryFor(token);

            // only words
            if (entry == null || !WORD_PATTERN.matcher(entry).matches()) {
                continue;
            }

            // and filter stopwords, if required
            if (ignoreStopwords && stopwords.contains(entry)) {
                continue;
            }

            Integer count = vocabulary.get(entry);
            vocabulary.put(entry, count == null ? 1 : count + 1);
        }
    }

    /**
     * Returns the string to be counted for a token: its lemma value when lemmas are requested
     * and available, otherwise its covered text.
     */
    private String entryFor(Token token) {
        if (useLemma && token.getLemma() != null) {
            String lemma = token.getLemma().getValue();
            if (lemma != null) {
                return lemma;
            }
        }
        // surface form; also the fallback for tokens without a lemma annotation
        return token.getCoveredText();
    }

    /**
     * Removes entries occurring fewer than {@code minimalOccurrence} times and serializes the
     * remaining vocabulary map to the model location.
     *
     * @throws AnalysisEngineProcessException if writing the model file fails
     */
    @Override
    public void collectionProcessComplete() throws AnalysisEngineProcessException {
        super.collectionProcessComplete();

        getLogger().info("Original vocabulary size: " + this.vocabulary.size());

        // remove all with low occurrence
        Iterator<Map.Entry<String, Integer>> iterator = vocabulary.entrySet().iterator();
        while (iterator.hasNext()) {
            if (iterator.next().getValue() < this.minimalOccurrence) {
                iterator.remove();
            }
        }

        getLogger().info("Filtered vocabulary size: " + this.vocabulary.size());

        // serialize to file; try-with-resources flushes and closes both streams
        try (FileOutputStream fos = new FileOutputStream(this.modelLocation);
                ObjectOutputStream oos = new ObjectOutputStream(fos)) {
            oos.writeObject(vocabulary);
        } catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }
}