// Java tutorial
/*
 * Copyright 2015
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.argumentation.sequence.feature.lexical;

import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathFactory;
import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable;
import de.tudarmstadt.ukp.dkpro.tc.api.exception.TextClassificationException;
import de.tudarmstadt.ukp.dkpro.tc.features.ngram.util.SkipNgramStringListIterable;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;

import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;

import static org.apache.uima.fit.util.JCasUtil.*;

/**
 * Utility methods for extracting (skip-)n-gram frequency distributions over token-level
 * annotations (e.g. {@link Token}) from a {@link JCas}, with optional lower-casing and
 * stopword filtering.
 */
public class LemmaNGramUtils
{
    /**
     * This is the character for joining strings for pair n-grams.
     */
    public static final String N_GRAM_GLUE = "_";

    /**
     * The annotation type n-grams are extracted over. Switch to {@code Lemma.class} to
     * extract lemma n-grams instead of token n-grams.
     */
    public static final Class<? extends Annotation> tokenClass = Token.class;
    // public static final Class<? extends Annotation> tokenClass = Lemma.class;

    private LemmaNGramUtils()
    {
        // utility class; not meant to be instantiated
    }

    /**
     * Extracts n-grams from the text covered by {@code focusAnnotation}. If the focus
     * annotation contains {@link Sentence} annotations, n-grams are extracted sentence-wise
     * (so they never cross sentence boundaries); otherwise they are extracted over all
     * covered tokens at once.
     *
     * @param jcas the CAS to read annotations from
     * @param focusAnnotation the annotation whose covered tokens are processed
     * @param lowerCaseNGrams whether each n-gram is lower-cased before counting
     * @param filterPartialMatches see {@link #passesNgramFilter(List, Set, boolean)}
     * @param minN minimum n-gram length
     * @param maxN maximum n-gram length
     * @param stopwords the set of stopwords used for filtering
     * @return frequency distribution of all n-grams that pass the stopword filter
     */
    public static FrequencyDistribution<String> getAnnotationNGrams(JCas jcas,
            Annotation focusAnnotation, boolean lowerCaseNGrams, boolean filterPartialMatches,
            int minN, int maxN, Set<String> stopwords)
    {
        FrequencyDistribution<String> result = new FrequencyDistribution<>();

        // If the focusAnnotation contains sentence annotations, extract the n-grams
        // sentence-wise; if not, extract them from all tokens in the focusAnnotation
        if (!JCasUtil.selectCovered(jcas, Sentence.class, focusAnnotation).isEmpty()) {
            for (Sentence s : selectCovered(jcas, Sentence.class, focusAnnotation)) {
                countNGrams(result,
                        new NGramStringListIterable(toText(selectCovered(tokenClass, s)), minN,
                                maxN),
                        lowerCaseNGrams, filterPartialMatches, stopwords);
            }
        }
        // FIXME the focus annotation branch doesn't make much sense
        else {
            countNGrams(result,
                    new NGramStringListIterable(
                            toText(selectCovered(tokenClass, focusAnnotation)), minN, maxN),
                    lowerCaseNGrams, filterPartialMatches, stopwords);
        }

        return result;
    }

    /**
     * Counts every n-gram produced by {@code nGrams} into {@code result}, applying optional
     * lower-casing and the stopword filter. Shared by all extraction methods in this class
     * (previously this loop was copy-pasted at each call site).
     */
    private static void countNGrams(FrequencyDistribution<String> result,
            Iterable<List<String>> nGrams, boolean lowerCaseNGrams, boolean filterPartialMatches,
            Set<String> stopwords)
    {
        for (List<String> nGram : nGrams) {
            if (lowerCaseNGrams) {
                nGram = lower(nGram);
            }

            if (passesNgramFilter(nGram, stopwords, filterPartialMatches)) {
                result.inc(StringUtils.join(nGram, N_GRAM_GLUE));
            }
        }
    }

    /**
     * Returns document ngrams over any annotation type that extends Annotation.
     * Intended use is Lemma, Stem, etc. N-grams are extracted sentence-wise.
     *
     * @param jcas the CAS to read annotations from
     * @param lowerCaseNGrams whether each n-gram is lower-cased before counting
     * @param filterPartialMatches see {@link #passesNgramFilter(List, Set, boolean)}
     * @param minN minimum n-gram length
     * @param maxN maximum n-gram length
     * @param stopwords the set of stopwords used for filtering
     * @param annotationClass the annotation type whose string values form the n-grams
     * @return frequency distribution of all n-grams that pass the stopword filter
     * @throws TextClassificationException if the feature path cannot be resolved
     */
    public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas,
            boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN,
            Set<String> stopwords, Class<? extends Annotation> annotationClass)
        throws TextClassificationException
    {
        FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<>();
        for (Sentence s : select(jcas, Sentence.class)) {
            List<String> strings = valuesToText(jcas, s, annotationClass.getName());
            countNGrams(documentNgrams, new NGramStringListIterable(strings, minN, maxN),
                    lowerCaseNGrams, filterPartialMatches, stopwords);
        }
        return documentNgrams;
    }

    /**
     * An ngram (represented by the list of tokens) does not pass the stopword filter:
     * a) filterPartialMatches=true - if it contains any stopwords
     * b) filterPartialMatches=false - if it entirely consists of stopwords
     *
     * @param tokenList The list of tokens in a single ngram
     * @param stopwords The set of stopwords used for filtering
     * @param filterPartialMatches Whether ngrams where only parts are stopwords should also
     *            be filtered. For example, "United States of America" would be filtered, as
     *            it contains the stopword "of".
     * @return Whether the ngram (represented by the list of tokens) passes the stopword
     *         filter or not.
     */
    public static boolean passesNgramFilter(List<String> tokenList, Set<String> stopwords,
            boolean filterPartialMatches)
    {
        // Count the non-stopword tokens; no need to materialize the filtered list.
        int nonStopwords = 0;
        for (String token : tokenList) {
            if (!stopwords.contains(token)) {
                nonStopwords++;
            }
        }

        if (filterPartialMatches) {
            // every token must be a non-stopword
            return nonStopwords == tokenList.size();
        }
        else {
            // at least one token must be a non-stopword
            return nonStopwords > 0;
        }
    }

    /**
     * Extracts skip n-grams sentence-wise over the {@link #tokenClass} annotations of the
     * document, with optional lower-casing and stopword filtering.
     *
     * @param jcas the CAS to read annotations from
     * @param lowerCaseNGrams whether each n-gram is lower-cased before counting
     * @param filterPartialMatches see {@link #passesNgramFilter(List, Set, boolean)}
     * @param minN minimum n-gram length
     * @param maxN maximum n-gram length
     * @param skipN maximum number of skipped tokens within a skip n-gram
     * @param stopwords the set of stopwords used for filtering
     * @return frequency distribution of all skip n-grams that pass the stopword filter
     */
    public static FrequencyDistribution<String> getDocumentSkipNgrams(JCas jcas,
            boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, int skipN,
            Set<String> stopwords)
    {
        FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<>();
        for (Sentence s : select(jcas, Sentence.class)) {
            countNGrams(documentNgrams,
                    new SkipNgramStringListIterable(toText(selectCovered(tokenClass, s)), minN,
                            maxN, skipN),
                    lowerCaseNGrams, filterPartialMatches, stopwords);
        }
        return documentNgrams;
    }

    /**
     * Returns a new list with every token lower-cased.
     * NOTE(review): uses the default-locale {@link String#toLowerCase()}, so results depend
     * on the JVM locale (e.g. Turkish dotless i) — kept as-is to preserve behavior.
     *
     * @param ngram the tokens of one n-gram
     * @return a fresh list of lower-cased tokens; the input list is not modified
     */
    public static List<String> lower(List<String> ngram)
    {
        List<String> newNgram = new ArrayList<>();
        for (String token : ngram) {
            newNgram.add(token.toLowerCase());
        }
        return newNgram;
    }

    /**
     * Collects the feature-path string values of all {@code annotationClassName} annotations
     * lying completely within the boundaries of sentence {@code s}.
     * (The previously declared type parameter {@code <T extends Annotation>} was unused and
     * has been removed.)
     *
     * @param jcas the CAS to read annotations from
     * @param s the sentence delimiting the annotations to collect
     * @param annotationClassName fully qualified name of the annotation type
     * @return the string values of the covered annotations, in selection order
     * @throws TextClassificationException if the feature path cannot be resolved
     */
    public static List<String> valuesToText(JCas jcas, Sentence s, String annotationClassName)
        throws TextClassificationException
    {
        List<String> texts = new ArrayList<>();
        try {
            for (Entry<AnnotationFS, String> entry : FeaturePathFactory.select(jcas.getCas(),
                    annotationClassName)) {
                // keep only annotations fully contained in the sentence span
                if (entry.getKey().getBegin() >= s.getBegin()
                        && entry.getKey().getEnd() <= s.getEnd()) {
                    texts.add(entry.getValue());
                }
            }
        }
        catch (FeaturePathException e) {
            throw new TextClassificationException(e);
        }
        return texts;
    }
}