Java tutorial
/*
 * Copyright 2015 Francesco Pontillo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.frapontillo.pulse.crowd.remstopword.simple;

import com.github.frapontillo.pulse.crowd.data.entity.Category;
import com.github.frapontillo.pulse.crowd.data.entity.Message;
import com.github.frapontillo.pulse.crowd.data.entity.Tag;
import com.github.frapontillo.pulse.crowd.data.entity.Token;
import com.github.frapontillo.pulse.crowd.remstopword.StopWordRemover;
import org.apache.commons.io.IOUtils;

import java.util.*;
import java.util.stream.Collectors;

/**
 * Stop word remover plugin that marks tokens, tags and categories as stop words according to
 * a set of stop word files.
 *
 * @author Francesco Pontillo
 */
public class SimpleStopWordRemover extends StopWordRemover<StopWordConfig> {
    public final static String PLUGIN_NAME = "simple";
    private final static String LANG_PLACEHOLDER = "{{LANG}}";

    private final HashMap<String, HashSet<String>> dictionaries;
    private final List<String> punctuation = Arrays.asList(
            ".", ",", ":", ";", "?", "!", "(", ")", "[", "]", "{", "}");

    // maps of dictionaries: key is language, value is list of file names for the specific element
    // (or generic for all)
    private HashMap<String, List<String>> allDictionaries;
    private HashMap<String, List<String>> tokenDictionaries;
    private HashMap<String, List<String>> tagDictionaries;
    private HashMap<String, List<String>> categoryDictionaries;

    public SimpleStopWordRemover() {
        dictionaries = new HashMap<>();
        allDictionaries = new HashMap<>();
        tokenDictionaries = new HashMap<>();
        tagDictionaries = new HashMap<>();
        categoryDictionaries = new HashMap<>();
    }

    @Override
    public String getName() {
        return PLUGIN_NAME;
    }

    @Override
    public StopWordConfig getNewParameter() {
        return new StopWordConfig();
    }

    @Override
    protected boolean isTokenStopWord(String token, String language,
            StopWordConfig stopWordConfig) {
        List<String> tokenDictionariesNames = union(
                getAllDictionaries(stopWordConfig, language),
                getTokenDictionaries(stopWordConfig, language));
        return isStopWord(token, tokenDictionariesNames);
    }

    @Override
    protected boolean isTagStopWord(String tag, String language, StopWordConfig stopWordConfig) {
        List<String> tagDictionariesNames = union(
                getAllDictionaries(stopWordConfig, language),
                getTagDictionaries(stopWordConfig, language));
        return isStopWord(tag, tagDictionariesNames);
    }

    @Override
    protected boolean isCategoryStopWord(String category, String language,
            StopWordConfig stopWordConfig) {
        List<String> categoryDictionariesNames = union(
                getAllDictionaries(stopWordConfig, language),
                getCategoryDictionaries(stopWordConfig, language));
        return isStopWord(category, categoryDictionariesNames);
    }

    @Override
    protected void processMessage(Message message, StopWordConfig stopWordConfig) {
        String language = message.getLanguage();

        // for each element, reset the "stop word" property
        // by looking up the word in the proper dictionary

        // mark tokens
        if (stopWordConfig.mustStopTokens()) {
            List<Token> tokens = message.getTokens();
            if (tokens != null) {
                tokens.forEach(token -> token.setStopWord(
                        isTokenStopWord(token.getText(), language, stopWordConfig)));
            }
        }

        // mark tags
        if (stopWordConfig.mustStopTags()) {
            Set<Tag> tags = message.getTags();
            if (tags != null) {
                tags.forEach(tag -> {
                    tag.setStopWord(isTagStopWord(tag.getText(), language, stopWordConfig));
                    // for each tag, mark its categories
                    if (stopWordConfig.mustStopCategories()) {
                        Set<Category> categories = tag.getCategories();
                        if (categories != null) {
                            categories.forEach(cat -> cat.setStopWord(
                                    isCategoryStopWord(cat.getText(), language, stopWordConfig)));
                        }
                    }
                });
            }
        }
    }

    /**
     * Return the dictionary specified by a file name, building it if it hasn't been built yet.
     * If the dictionary cannot be built (e.g. because the file is missing or empty), no error is
     * thrown and the returned dictionary is simply empty, so no words are removed at all.
     *
     * @param fileName The name of the file referencing the dictionary (should exist in the
     *                 classpath resources).
     *
     * @return A {@link HashSet<String>} containing all of the dictionary terms.
     */
    private HashSet<String> getDictionaryByFileName(String fileName) {
        if (!dictionaries.containsKey(fileName)) {
            HashSet<String> newDictionary = new HashSet<>();
            try {
                List<String> lines = IOUtils.readLines(
                        getClass().getClassLoader().getResourceAsStream(fileName));
                lines = lines.stream().map(String::toLowerCase).collect(Collectors.toList());
                newDictionary.addAll(lines);
            } catch (Exception ignored) {
            }
            dictionaries.put(fileName, newDictionary);
        }
        return dictionaries.get(fileName);
    }

    /**
     * Return a dictionary containing all of the terms contained in the files whose names are
     * included in the input list.
     *
     * @param fileNames A {@link List} of {@link String} representing the files to read.
     *
     * @return A {@link HashSet<String>} containing all of the dictionary terms.
     */
    private HashSet<String> getDictionariesByFileNames(List<String> fileNames) {
        HashSet<String> set = new HashSet<>();
        for (String fileName : fileNames) {
            set.addAll(getDictionaryByFileName(fileName));
        }
        set.addAll(punctuation);
        return set;
    }

    /**
     * Check whether a word is considered a stop word according to the input dictionaries.
     *
     * @param word      The {@link String} to check.
     * @param fileNames The list of dictionary files to be used to check for stop words.
     *
     * @return true if the word is considered a stop word, false otherwise.
     */
    private boolean isStopWord(String word, List<String> fileNames) {
        if (word == null) {
            return true;
        }
        HashSet<String> dict = getDictionariesByFileNames(fileNames);
        return dict.contains(word.toLowerCase());
    }

    private List<String> getAllDictionaries(StopWordConfig stopWordConfig, String language) {
        if (allDictionaries.get(language) == null) {
            List<String> languageDictionaries = stopWordConfig.getDictionaries().getAll();
            allDictionaries.put(language, replaceWithLang(languageDictionaries, language));
        }
        return allDictionaries.get(language);
    }

    private List<String> getTokenDictionaries(StopWordConfig stopWordConfig, String language) {
        if (tokenDictionaries.get(language) == null) {
            List<String> languageDictionaries = stopWordConfig.getDictionaries().getTokens();
            tokenDictionaries.put(language, replaceWithLang(languageDictionaries, language));
        }
        return tokenDictionaries.get(language);
    }

    private List<String> getTagDictionaries(StopWordConfig stopWordConfig, String language) {
        if (tagDictionaries.get(language) == null) {
            List<String> languageDictionaries = stopWordConfig.getDictionaries().getTags();
            tagDictionaries.put(language, replaceWithLang(languageDictionaries, language));
        }
        return tagDictionaries.get(language);
    }

    private List<String> getCategoryDictionaries(StopWordConfig stopWordConfig, String language) {
        if (categoryDictionaries.get(language) == null) {
            List<String> languageDictionaries = stopWordConfig.getDictionaries().getCategories();
            categoryDictionaries.put(language, replaceWithLang(languageDictionaries, language));
        }
        return categoryDictionaries.get(language);
    }

    private List<String> union(List<String>... lists) {
        List<String> resulting = new ArrayList<>();
        for (List<String> list : lists) {
            resulting.addAll(list);
        }
        return resulting;
    }

    private List<String> replaceWithLang(List<String> elems, String language) {
        List<String> normalized;
        if (elems != null) {
            normalized = new ArrayList<>(elems.size());
            for (String dict : elems) {
                if (dict.contains(LANG_PLACEHOLDER)) {
                    dict = dict.replace(LANG_PLACEHOLDER, language);
                }
                normalized.add(dict);
            }
        } else {
            normalized = new ArrayList<>(0);
        }
        return normalized;
    }
}
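To make the class's behaviour easier to follow, here is a minimal standalone sketch of the same lookup technique: resolving the {{LANG}} placeholder as replaceWithLang does, merging the resolved files into one lower-cased set as getDictionariesByFileNames does, and doing the case-insensitive membership test of isStopWord. The StopWordLookupDemo class name and the stopwords-en.txt resource name are hypothetical examples, not part of the plugin or its configuration.

import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.Collectors;

/** Minimal, self-contained sketch of the stop word lookup used by SimpleStopWordRemover. */
public class StopWordLookupDemo {
    private static final String LANG_PLACEHOLDER = "{{LANG}}";

    public static void main(String[] args) {
        // hypothetical dictionary name: "stopwords-en.txt" is expected on the classpath
        List<String> configured = Collections.singletonList("stopwords-" + LANG_PLACEHOLDER + ".txt");
        String language = "en";

        // resolve the {{LANG}} placeholder, mirroring replaceWithLang
        List<String> resolved = configured.stream()
                .map(name -> name.replace(LANG_PLACEHOLDER, language))
                .collect(Collectors.toList());

        // load all resolved files into one lower-cased set, mirroring getDictionariesByFileNames
        Set<String> dictionary = new HashSet<>();
        for (String fileName : resolved) {
            InputStream in = StopWordLookupDemo.class.getClassLoader().getResourceAsStream(fileName);
            if (in == null) {
                continue; // a missing file simply contributes no words
            }
            new Scanner(in, StandardCharsets.UTF_8.name()).useDelimiter("\\R")
                    .forEachRemaining(line -> dictionary.add(line.trim().toLowerCase()));
        }

        // a word is a stop word when its lower-cased form is in the merged dictionary
        String word = "The";
        System.out.println(word + " is a stop word: " + dictionary.contains(word.toLowerCase()));
    }
}

The plugin itself applies this lookup per message element: processMessage walks the message's tokens, tags and tag categories (as enabled by StopWordConfig) and flags each one via setStopWord, while the per-language dictionary maps cache the resolved file name lists so the placeholder substitution happens only once per language.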