de.tudarmstadt.ukp.dkpro.core.frequency.phrasedetection.FrequencyCounter.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.core.frequency.phrasedetection.FrequencyCounter.java

Source

/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 * <p>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.frequency.phrasedetection;

import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.PhraseSequenceGenerator;
import de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.StringSequenceGenerator;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import org.apache.commons.collections4.Bag;
import org.apache.commons.collections4.bag.HashBag;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;
import java.util.stream.Stream;

/**
 * Count unigrams and bigrams in a collection.
 */
public class FrequencyCounter extends JCasFileWriter_ImplBase {
    /**
     * When concatenating multiple tokens, this string is inserted between them.
     */
    static final String BIGRAM_SEPARATOR = " ";
    /**
     * Columns (i.e. tokens and counts) are separated by this character.
     */
    static final String COLUMN_SEPARATOR = "\t";
    /**
     * When hitting a column separator within a token, it is replaced by this token.
     */
    static final String COLUMN_SEP_REPLACEMENT = " ";

    /**
     * This string (a line) will separate unigrams from bigrams in the output file
     **/
    static final String NGRAM_SEPARATOR_LINE = "----------------------------------------------------";
    static final String NEWLINE_REGEX = "\r\n?|\n";

    /**
     * The feature path. Default: tokens.
     */
    public static final String PARAM_FEATURE_PATH = "featurePath";
    @ConfigurationParameter(name = PARAM_FEATURE_PATH, mandatory = false)
    private String featurePath;
    private static final String DEFAULT_FEATURE_PATH = Token.class.getCanonicalName();

    /**
     * Set this parameter if bigrams should only be counted when occurring within a covering type, e.g. sentences.
     */
    public static final String PARAM_COVERING_TYPE = "coveringType";
    @ConfigurationParameter(name = PARAM_COVERING_TYPE, mandatory = false)
    private String coveringType;

    /**
     * If true, all tokens are lowercased.
     */
    public static final String PARAM_LOWERCASE = "lowercase";
    @ConfigurationParameter(name = PARAM_LOWERCASE, mandatory = true, defaultValue = "false")
    private boolean lowercase;

    /**
     * Tokens occurring fewer times than this value are omitted. Default: 5.
     */
    public static final String PARAM_MIN_COUNT = "minCount";
    @ConfigurationParameter(name = PARAM_MIN_COUNT, mandatory = true, defaultValue = "5")
    private int minCount;

    /**
     * If true, sort output by count (descending order).
     */
    public static final String PARAM_SORT_BY_COUNT = "sortByCount";
    @ConfigurationParameter(name = PARAM_SORT_BY_COUNT, mandatory = true, defaultValue = "false")
    private boolean sortByCount;

    /**
     * If true, sort output alphabetically.
     */
    public static final String PARAM_SORT_BY_ALPHABET = "sortByAlphabet";
    @ConfigurationParameter(name = PARAM_SORT_BY_ALPHABET, mandatory = true, defaultValue = "false")
    private boolean sortByAlphabet;

    public static final String PARAM_STOPWORDS_FILE = "stopwordsFile";
    @ConfigurationParameter(name = PARAM_STOPWORDS_FILE, mandatory = true, defaultValue = "")
    private String stopwordsFile;

    public static final String PARAM_STOPWORDS_REPLACEMENT = "stopwordsReplacement";
    @ConfigurationParameter(name = PARAM_STOPWORDS_REPLACEMENT, mandatory = true, defaultValue = "")
    private String stopwordsReplacement;

    public static final String PARAM_FILTER_REGEX = "filterRegex";
    @ConfigurationParameter(name = PARAM_FILTER_REGEX, mandatory = true, defaultValue = "")
    private String filterRegex;

    public static final String PARAM_REGEX_REPLACEMENT = "regexReplacement";
    @ConfigurationParameter(name = PARAM_REGEX_REPLACEMENT, mandatory = true, defaultValue = "")
    private String regexReplacement;

    private Bag<String> unigrams;
    private Bag<String> bigrams;
    private StringSequenceGenerator sequenceGenerator;

    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        if (sortByAlphabet && sortByCount) {
            throw new ResourceInitializationException(
                    new IllegalArgumentException("Can only sort either by count or alphabetically."));
        }

        unigrams = new HashBag<>();
        bigrams = new HashBag<>();

        /* set feature path to default */
        if (featurePath == null) {
            featurePath = DEFAULT_FEATURE_PATH;
        }

        /* init sequence generator */
        try {
            sequenceGenerator = new PhraseSequenceGenerator.Builder().featurePath(featurePath)
                    .coveringType(coveringType).lowercase(lowercase).stopwordsFile(stopwordsFile)
                    .stopwordsReplacement(stopwordsReplacement).filterRegex(filterRegex)
                    .filterRegexReplacement(regexReplacement).buildStringSequenceGenerator();
        } catch (IOException e) {
            throw new ResourceInitializationException(e);
        }
    }

    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        try {
            /* iterate over sequences (e.g. sentences)*/
            for (String[] sequence : sequenceGenerator.tokenSequences(aJCas)) {
                /* iterate over tokens in sequence */
                for (int i = 0; i < sequence.length; i++) {
                    /* count unigrams */
                    String unigram = sequence[i].replaceAll(COLUMN_SEPARATOR, COLUMN_SEP_REPLACEMENT)
                            .replaceAll(NEWLINE_REGEX, COLUMN_SEP_REPLACEMENT);
                    unigrams.add(unigram);

                    /* count bigrams */
                    if (i + 1 < sequence.length) {
                        String bigram = unigram + BIGRAM_SEPARATOR
                                + sequence[i + 1].replaceAll(COLUMN_SEPARATOR, COLUMN_SEP_REPLACEMENT)
                                        .replaceAll(NEWLINE_REGEX, COLUMN_SEP_REPLACEMENT);
                        bigrams.add(bigram);
                    }
                }
            }
        } catch (FeaturePathException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    @Override
    public void collectionProcessComplete() throws AnalysisEngineProcessException {
        getLogger().info("Vocabulary size: " + unigrams.uniqueSet().size());
        try {
            getLogger().info("Writing frequencies to " + getTargetLocation());
            OutputStream os = CompressionUtils.getOutputStream(new File(getTargetLocation()));

            writeNgrams(os, unigrams);
            os.write((NGRAM_SEPARATOR_LINE + "\n").getBytes());
            writeNgrams(os, bigrams);
            os.close();
        } catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Write counter with counts from a bag to an output stream.
     *
     * @param os      an {@link OutputStream}
     * @param counter a {@link Bag} of string counter
     */
    private void writeNgrams(OutputStream os, Bag<String> counter) {
        /* create token stream */
        Stream<String> stream = counter.uniqueSet().stream().filter(token -> counter.getCount(token) >= minCount);

        /* sort output */
        if (sortByAlphabet) {
            stream = stream.sorted(String::compareTo);
        } else if (sortByCount) {
            stream = stream.sorted((o1, o2) -> -Integer.compare(counter.getCount(o1), counter.getCount(o2)));
        }

        /* write tokens with counts */
        stream.forEach(token -> {
            try {
                os.write((token + COLUMN_SEPARATOR + counter.getCount(token) + "\n").getBytes());
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        });
    }
}