// Java tutorial
/*
 * Copyright 2015
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.argumentation.sequence.feature.lexical;

import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathFactory;
import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable;
import de.tudarmstadt.ukp.dkpro.tc.api.exception.TextClassificationException;
import de.tudarmstadt.ukp.dkpro.tc.features.ngram.util.SkipNgramStringListIterable;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;

import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;

import static org.apache.uima.fit.util.JCasUtil.*;

/**
 * Utility methods for extracting (skip-)n-gram frequency distributions over token-level
 * annotations (e.g. {@link Token}) from a {@link JCas}, with optional lower-casing and
 * stopword filtering.
 */
public class LemmaNGramUtils
{
    /**
     * This is the character for joining strings for pair n-grams.
     */
    public static final String N_GRAM_GLUE = "_";

    /**
     * The annotation type n-grams are extracted over. Switch to {@code Lemma.class} to
     * extract lemma n-grams instead of token n-grams.
     */
    public static final Class<? extends Annotation> tokenClass = Token.class;
    // public static final Class<? extends Annotation> tokenClass = Lemma.class;

    private LemmaNGramUtils()
    {
        // utility class; not meant to be instantiated
    }

    /**
     * Extracts n-grams from the text covered by {@code focusAnnotation}. If the focus
     * annotation contains {@link Sentence} annotations, n-grams are extracted sentence-wise
     * (so they never cross sentence boundaries); otherwise they are extracted over all
     * covered tokens at once.
     *
     * @param jcas the CAS to read annotations from
     * @param focusAnnotation the annotation whose covered tokens are processed
     * @param lowerCaseNGrams whether each n-gram is lower-cased before counting
     * @param filterPartialMatches see {@link #passesNgramFilter(List, Set, boolean)}
     * @param minN minimum n-gram length
     * @param maxN maximum n-gram length
     * @param stopwords the set of stopwords used for filtering
     * @return frequency distribution of all n-grams that pass the stopword filter
     */
    public static FrequencyDistribution<String> getAnnotationNGrams(JCas jcas,
            Annotation focusAnnotation, boolean lowerCaseNGrams, boolean filterPartialMatches,
            int minN, int maxN, Set<String> stopwords)
    {
        FrequencyDistribution<String> result = new FrequencyDistribution<>();

        // If the focusAnnotation contains sentence annotations, extract the n-grams
        // sentence-wise; if not, extract them from all tokens in the focusAnnotation
        if (!JCasUtil.selectCovered(jcas, Sentence.class, focusAnnotation).isEmpty()) {
            for (Sentence s : selectCovered(jcas, Sentence.class, focusAnnotation)) {
                countNGrams(result,
                        new NGramStringListIterable(toText(selectCovered(tokenClass, s)), minN,
                                maxN),
                        lowerCaseNGrams, filterPartialMatches, stopwords);
            }
        }
        // FIXME the focus annotation branch doesn't make much sense
        else {
            countNGrams(result,
                    new NGramStringListIterable(
                            toText(selectCovered(tokenClass, focusAnnotation)), minN, maxN),
                    lowerCaseNGrams, filterPartialMatches, stopwords);
        }

        return result;
    }

    /**
     * Counts every n-gram produced by {@code nGrams} into {@code result}, applying optional
     * lower-casing and the stopword filter. Shared by all extraction methods in this class
     * (previously this loop was copy-pasted at each call site).
     */
    private static void countNGrams(FrequencyDistribution<String> result,
            Iterable<List<String>> nGrams, boolean lowerCaseNGrams, boolean filterPartialMatches,
            Set<String> stopwords)
    {
        for (List<String> nGram : nGrams) {
            if (lowerCaseNGrams) {
                nGram = lower(nGram);
            }

            if (passesNgramFilter(nGram, stopwords, filterPartialMatches)) {
                result.inc(StringUtils.join(nGram, N_GRAM_GLUE));
            }
        }
    }

    /**
     * Returns document ngrams over any annotation type that extends Annotation.
     * Intended use is Lemma, Stem, etc. N-grams are extracted sentence-wise.
     *
     * @param jcas the CAS to read annotations from
     * @param lowerCaseNGrams whether each n-gram is lower-cased before counting
     * @param filterPartialMatches see {@link #passesNgramFilter(List, Set, boolean)}
     * @param minN minimum n-gram length
     * @param maxN maximum n-gram length
     * @param stopwords the set of stopwords used for filtering
     * @param annotationClass the annotation type whose string values form the n-grams
     * @return frequency distribution of all n-grams that pass the stopword filter
     * @throws TextClassificationException if the feature path cannot be resolved
     */
    public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas,
            boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN,
            Set<String> stopwords, Class<? extends Annotation> annotationClass)
        throws TextClassificationException
    {
        FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<>();
        for (Sentence s : select(jcas, Sentence.class)) {
            List<String> strings = valuesToText(jcas, s, annotationClass.getName());
            countNGrams(documentNgrams, new NGramStringListIterable(strings, minN, maxN),
                    lowerCaseNGrams, filterPartialMatches, stopwords);
        }
        return documentNgrams;
    }

    /**
     * An ngram (represented by the list of tokens) does not pass the stopword filter:
     * a) filterPartialMatches=true - if it contains any stopwords
     * b) filterPartialMatches=false - if it entirely consists of stopwords
     *
     * @param tokenList The list of tokens in a single ngram
     * @param stopwords The set of stopwords used for filtering
     * @param filterPartialMatches Whether ngrams where only parts are stopwords should also
     *            be filtered. For example, "United States of America" would be filtered, as
     *            it contains the stopword "of".
     * @return Whether the ngram (represented by the list of tokens) passes the stopword
     *         filter or not.
     */
    public static boolean passesNgramFilter(List<String> tokenList, Set<String> stopwords,
            boolean filterPartialMatches)
    {
        // Count the non-stopword tokens; no need to materialize the filtered list.
        int nonStopwords = 0;
        for (String token : tokenList) {
            if (!stopwords.contains(token)) {
                nonStopwords++;
            }
        }

        if (filterPartialMatches) {
            // every token must be a non-stopword
            return nonStopwords == tokenList.size();
        }
        else {
            // at least one token must be a non-stopword
            return nonStopwords > 0;
        }
    }

    /**
     * Extracts skip n-grams sentence-wise over the {@link #tokenClass} annotations of the
     * document, with optional lower-casing and stopword filtering.
     *
     * @param jcas the CAS to read annotations from
     * @param lowerCaseNGrams whether each n-gram is lower-cased before counting
     * @param filterPartialMatches see {@link #passesNgramFilter(List, Set, boolean)}
     * @param minN minimum n-gram length
     * @param maxN maximum n-gram length
     * @param skipN maximum number of skipped tokens within a skip n-gram
     * @param stopwords the set of stopwords used for filtering
     * @return frequency distribution of all skip n-grams that pass the stopword filter
     */
    public static FrequencyDistribution<String> getDocumentSkipNgrams(JCas jcas,
            boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, int skipN,
            Set<String> stopwords)
    {
        FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<>();
        for (Sentence s : select(jcas, Sentence.class)) {
            countNGrams(documentNgrams,
                    new SkipNgramStringListIterable(toText(selectCovered(tokenClass, s)), minN,
                            maxN, skipN),
                    lowerCaseNGrams, filterPartialMatches, stopwords);
        }
        return documentNgrams;
    }

    /**
     * Returns a new list with every token lower-cased.
     * NOTE(review): uses the default-locale {@link String#toLowerCase()}, so results depend
     * on the JVM locale (e.g. Turkish dotless i) — kept as-is to preserve behavior.
     *
     * @param ngram the tokens of one n-gram
     * @return a fresh list of lower-cased tokens; the input list is not modified
     */
    public static List<String> lower(List<String> ngram)
    {
        List<String> newNgram = new ArrayList<>();
        for (String token : ngram) {
            newNgram.add(token.toLowerCase());
        }
        return newNgram;
    }

    /**
     * Collects the feature-path string values of all {@code annotationClassName} annotations
     * lying completely within the boundaries of sentence {@code s}.
     * (The previously declared type parameter {@code <T extends Annotation>} was unused and
     * has been removed.)
     *
     * @param jcas the CAS to read annotations from
     * @param s the sentence delimiting the annotations to collect
     * @param annotationClassName fully qualified name of the annotation type
     * @return the string values of the covered annotations, in selection order
     * @throws TextClassificationException if the feature path cannot be resolved
     */
    public static List<String> valuesToText(JCas jcas, Sentence s, String annotationClassName)
        throws TextClassificationException
    {
        List<String> texts = new ArrayList<>();
        try {
            for (Entry<AnnotationFS, String> entry : FeaturePathFactory.select(jcas.getCas(),
                    annotationClassName)) {
                // keep only annotations fully contained in the sentence span
                if (entry.getKey().getBegin() >= s.getBegin()
                        && entry.getKey().getEnd() <= s.getEnd()) {
                    texts.add(entry.getValue());
                }
            }
        }
        catch (FeaturePathException e) {
            throw new TextClassificationException(e);
        }
        return texts;
    }
}