com.quui.chat.Preprocessor.java Source code

Introduction

Here is the source code for com.quui.chat.Preprocessor.java
Source

/**
 * Project "com.quui.chat.core" (C) 2006 Fabian Steeg This library is free
 * software; you can redistribute it and/or modify it under the terms of the GNU
 * Lesser General Public License as published by the Free Software Foundation;
 * either version 2.1 of the License, or (at your option) any later version.
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details. You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package com.quui.chat;

import java.util.StringTokenizer;
import java.util.Vector;

import org.apache.commons.lang.StringUtils;

import com.quui.chat.mind.wn.WNLookup;

/**
 * A simple preprocessor for chat input. It tokenizes a message, optionally
 * stems every token via WordNet and drops stopwords. It also offers static
 * helpers to strip a nick from a message and to detect non-ASCII characters.
 * @author Fabian Steeg (fsteeg)
 */
public class Preprocessor {
    /** Characters that separate tokens in {@link #preProcess(String)}. */
    private static final String DELIMITERS = " .!?,;:^\"$%&/\\()[]#'+*<>|\t-";

    /** Minimum length a nick part and a message token need to be compared. */
    private static final int MIN_COMPARE_LENGTH = 4;

    /** Words that are filtered out while preprocessing; never null. */
    private final Vector<String> stopwords;

    /** Whether tokens are stemmed via WordNet before stopword filtering. */
    private final boolean isWordNetEnabled;

    /**
     * @param isWordnetEnabled If true this preprocessor uses wordnet for
     *            stemming
     * @param stopwords The words to ignore, will be filtered while
     *            preprocessing; null is treated as "no stopwords"
     */
    public Preprocessor(boolean isWordnetEnabled, Vector<String> stopwords) {
        this.isWordNetEnabled = isWordnetEnabled;
        // A missing list means "filter nothing" instead of failing later in
        // preProcess. A given list is kept by reference, not copied.
        this.stopwords = stopwords != null ? stopwords : new Vector<String>();
    }

    /**
     * Tokenizes the user input, then every word that's not in the
     * stopwords-list is stemmed (if WordNet is enabled) and these are
     * returned. Tokens whose trimmed length is below two characters are
     * dropped as well.
     * @param s The input to answer to.
     * @return Returns those words (stemmed) that will be processed, in the
     *         order they occur in the input
     * @throws NullPointerException if s is null
     */
    public Vector<String> preProcess(String s) {
        if (s == null) {
            throw new NullPointerException("Word to process is null.");
        }
        StringTokenizer tokenizer = new StringTokenizer(s, DELIMITERS);
        Vector<String> result = new Vector<String>();
        while (tokenizer.hasMoreTokens()) {
            String stem = stem(tokenizer.nextToken());
            if (!this.stopwords.contains(stem) && stem.trim().length() > 1) {
                result.add(stem);
            }
        }
        return result;
    }

    /**
     * Stems a single token via WordNet if enabled.
     * @param token The token to stem
     * @return The stem, or the token itself if WordNet is disabled or the
     *         lookup yields nothing (null or the empty string)
     */
    private String stem(String token) {
        if (!this.isWordNetEnabled) {
            return token;
        }
        String stem = WNLookup.getStaticStem(token);
        // The null check is defensive; the empty string means "no stem".
        if (stem == null || stem.equals("")) {
            return token;
        }
        return stem;
    }

    /**
     * Cleans all occurrences and almost-occurrences (Levenshtein Distance
     * below 2) of nick in message. The nick is split at every non-letter and
     * each part is compared on its own; only parts and message tokens longer
     * than three characters are compared at all.
     * <p>
     * Note that the message is lower-cased and split at every character that
     * is not a letter, '?', '!' or an apostrophe. The result consists of the
     * remaining non-empty tokens joined by single spaces.
     * @param message The message to clean
     * @param nick The nick to clean from the message
     * @return The cleaned message
     * @throws NullPointerException if message or nick is null
     */
    public static String clean(String message, String nick) {
        if (message == null) {
            throw new NullPointerException("Message to clean is null.");
        }
        if (nick == null) {
            throw new NullPointerException("Nick to clean is null.");
        }
        String[] toks = message.toLowerCase().split("[^?!'\\p{L}]");
        String[] nickToks = nick.toLowerCase().split("[^\\p{L}]");
        StringBuilder result = new StringBuilder();
        for (String tok : toks) {
            // Empty tokens stem from adjacent separators in the message
            if (tok.length() == 0 || resemblesNick(tok, nickToks)) {
                continue;
            }
            if (result.length() > 0) {
                result.append(' ');
            }
            result.append(tok);
        }
        return result.toString();
    }

    /**
     * Checks if a message token is (almost) equal to any part of the nick.
     * @param tok The lower-cased message token
     * @param nickToks The lower-cased parts of the nick
     * @return true if the token should be cut out of the message
     */
    private static boolean resemblesNick(String tok, String[] nickToks) {
        // Short tokens are never compared, a distance of 1 would match too
        // many unrelated short words.
        if (tok.length() < MIN_COMPARE_LENGTH) {
            return false;
        }
        for (String nickTok : nickToks) {
            if (nickTok.length() < MIN_COMPARE_LENGTH) {
                continue;
            }
            int dist = StringUtils.getLevenshteinDistance(tok, nickTok);
            if (dist < 2) {
                Log.logger
                        .debug("Cutting out, L-Dist zw. " + tok + " und " + nickTok + " ist: " + dist);
                return true;
            }
        }
        return false;
    }

    /**
     * @param message the string to check for non ascii
     * @return true if the string contains at least one char above 127
     *         (non-ascii), else false
     * @throws NullPointerException if message is null
     */
    public static boolean containsNonAscii(String message) {
        for (int i = 0; i < message.length(); i++) {
            // char is unsigned, so only the upper bound needs checking
            if (message.charAt(i) > 127) {
                return true;
            }
        }
        return false;
    }
}