Java tutorial
/* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universitt Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.experiments.argumentation.comments.pipeline; import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import org.apache.commons.io.IOUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.component.JCasConsumer_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectOutputStream; import java.net.URL; import java.util.*; /** * @author Ivan Habernal */ public class VocabularyCollector extends JCasConsumer_ImplBase { /** * File for storing vocabulary */ public static final String PARAM_MODEL_LOCATION = "modelLocation"; @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) File modelLocation; public static final String PARAM_MINIMAL_OCCURRENCE = "minimalOccurrence"; @ConfigurationParameter(name = PARAM_MINIMAL_OCCURRENCE, mandatory = false) int minimalOccurrence; public static final String PARAM_USE_LEMMA = "useLemma"; @ConfigurationParameter(name = PARAM_USE_LEMMA, mandatory = true, defaultValue = "false") boolean useLemma; public static final String PARAM_STOPWORDS_LIST = "stopwordsList"; @ConfigurationParameter(name = PARAM_STOPWORDS_LIST, mandatory = false, defaultValue = "classpath:/stopwords_en.txt") String stopwordsList; public static final String PARAM_IGNORE_STOPWORDS = "ignoreStopwords"; @ConfigurationParameter(name = PARAM_IGNORE_STOPWORDS, mandatory = false, defaultValue = "false") boolean ignoreStopwords; Map<String, Integer> vocabulary = new HashMap<>(); Set<String> stopwords = new HashSet<>(); @Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); if (minimalOccurrence < 0) { throw new ResourceInitializationException( new IllegalArgumentException("Minimal occurrence must be positive integer")); } try { if (ignoreStopwords) { URL url = ResourceUtils.resolveLocation(stopwordsList); stopwords.addAll(IOUtils.readLines(url.openStream())); } } catch (IOException e) { throw new ResourceInitializationException(e); } } @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { for (Token t : JCasUtil.select(aJCas, Token.class)) { String entry = useLemma ? t.getCoveredText() : t.getLemma().getValue(); // only words if (entry.matches("\\p{Alpha}+")) { // and filter stopwords, if required if (!ignoreStopwords || (ignoreStopwords && !stopwords.contains(entry))) { if (!vocabulary.containsKey(entry)) { vocabulary.put(entry, 0); } vocabulary.put(entry, vocabulary.get(entry) + 1); } } } } @Override public void collectionProcessComplete() throws AnalysisEngineProcessException { super.collectionProcessComplete(); getLogger().info("Original vocabulary size: " + this.vocabulary.size()); // remove all with low occurrence Iterator<Map.Entry<String, Integer>> iterator = vocabulary.entrySet().iterator(); while (iterator.hasNext()) { Map.Entry<String, Integer> next = iterator.next(); // remove if (next.getValue() < this.minimalOccurrence) { iterator.remove(); } } getLogger().info("Filtered vocabulary size: " + this.vocabulary.size()); // serialize to file try { ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(this.modelLocation)); oos.writeObject(vocabulary); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } } }