org.chililog.server.common.TextTokenizer.java Source code

Introduction

Here is the source code for org.chililog.server.common.TextTokenizer.java
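
TextTokenizer is a singleton that extracts unique keywords from free text using the Lucene StandardAnalyzer, additionally splitting domain names, class names and email addresses into their parts. A minimal usage sketch is shown below; it assumes lucene-core 3.0.x and commons-lang are on the classpath, and the TextTokenizerExample class name and sample log line are hypothetical:

import java.util.ArrayList;

import org.chililog.server.common.TextTokenizer;

public class TextTokenizerExample {

    public static void main(String[] args) throws Exception {
        // Extract at most 20 unique keywords from a sample log line
        ArrayList<String> keywords = TextTokenizer.getInstance().tokenize(
                "ERROR com.chililog.server.common.ChiliLogExceptionTest email@address.com", 20);

        for (String keyword : keywords) {
            System.out.println(keyword);
        }
    }
}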

Source

//
// Copyright 2010 Cinch Logic Pty Ltd.
//
// http://www.chililog.com
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package org.chililog.server.common;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class TextTokenizer {

    /**
     * Returns the singleton instance for this class
     */
    public static TextTokenizer getInstance() {
        return SingletonHolder.INSTANCE;
    }

    /**
     * SingletonHolder is loaded on the first execution of TextTokenizer.getInstance() or the first access to
     * SingletonHolder.INSTANCE, not before.
     * 
     * @see <a href="http://en.wikipedia.org/wiki/Singleton_pattern">Singleton pattern</a>
     */
    private static class SingletonHolder {

        public static final TextTokenizer INSTANCE = new TextTokenizer();
    }

    /**
     * Private constructor to enforce use of the singleton instance
     */
    private TextTokenizer() {

    }

    /**
     * <p>
     * Tokenizes text to get keywords
     * </p>
     * <p>
     * We use the Lucene <code>StandardAnalyzer</code> with a bit of spice. We want to break up domain names, class
     * names and email addresses, so we have to do some extra parsing.
     * </p>
     * <p>
     * Lucene parsing:
     * <ul>
     * <li>"email@address.com" = ["email@address", "com"]</li>
     * <li>"com.chililog.server.common.ChiliLogExceptionTest" = ["com.chililog.server.common", "chililogexceptiontest"]</li>
     * </ul>
     * </p>
     * <p>
     * We have not used regular expressions because they are slow. We have implemented this as a singleton so that, in
     * the future, we can allow user customization.
     * </p>
     * 
     * @param text
     *            Text from which to extract keywords
     * @param maxKeywords
     *            Maximum number of keywords to extract. If negative, no limit is applied; if 0, an empty list is
     *            returned.
     * @return Array of keywords
     * @throws IOException
     *             if the token stream cannot be read
     */
    public ArrayList<String> tokenize(String text, long maxKeywords) throws IOException {
        ArrayList<String> tokens = new ArrayList<String>();

        if (StringUtils.isEmpty(text) || maxKeywords == 0) {
            return tokens;
        }

        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        HashMap<String, String> lookup = new HashMap<String, String>();
        TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

        StringBuilder sb = new StringBuilder();

        // addAttribute returns the stream's existing TermAttribute, adding one only if it is missing
        TermAttribute termAttribute = stream.addAttribute(TermAttribute.class);
        try {
            while (stream.incrementToken()) {
                char[] termBuffer = termAttribute.termBuffer();
                int length = termAttribute.termLength();

                boolean doSplit = true;

                // Don't split terms starting with a digit (e.g. decimal numbers or IP addresses)
                if (Character.isDigit(termBuffer[0])) {
                    doSplit = false;
                } else {
                    // Only split terms made up entirely of letters, digits, '.' and '@'
                    for (int j = 0; j < length; j++) {
                        char c = termBuffer[j];
                        if (!Character.isLetterOrDigit(c) && c != '.' && c != '@') {
                            doSplit = false;
                            break;
                        }
                    }
                }

                if (doSplit) {
                    // Break the term on '.' and '@' so that domain names, class names and email
                    // addresses yield their individual parts as keywords
                    sb.setLength(0);
                    for (int i = 0; i < length; i++) {
                        char c = termBuffer[i];
                        if (c == '.' || c == '@') {
                            if (!addToken(tokens, lookup, sb.toString(), maxKeywords)) {
                                return tokens;
                            }
                            sb.setLength(0);
                        } else {
                            sb.append(c);
                        }
                    }

                    // Add the part after the last separator
                    if (!addToken(tokens, lookup, sb.toString(), maxKeywords)) {
                        return tokens;
                    }
                } else {
                    // No splitting, just add the whole term
                    if (!addToken(tokens, lookup, termAttribute.term(), maxKeywords)) {
                        return tokens;
                    }
                }
            }
        } finally {
            // Close the stream on all paths, including the early returns above
            stream.close();
        }

        return tokens;
    }

    /**
     * Adds our token to our collection
     * 
     * @param tokens
     *            collection of tokens
     * @param lookup
     *            lookup hashmap for duplicates
     * @param token
     *            token or term to add to the collection
     * @param maxKeywords
     *            maximum number of keywords
     * @return True if it is OK to keep adding tokens, False if no more tokens should be added
     */
    private boolean addToken(ArrayList<String> tokens, HashMap<String, String> lookup, String token,
            long maxKeywords) {
        if (!StringUtils.isBlank(token) && !lookup.containsKey(token)) {
            tokens.add(token);
            lookup.put(token, null); // the map is only used as a set for duplicate detection
            if (maxKeywords > 0 && tokens.size() >= maxKeywords) {
                return false;
            }
        }
        return true;
    }

}
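
Based on the Lucene tokenization described in the tokenize() javadoc, the splitting behaviour can be illustrated with the sketch below. The expected keyword lists are inferred from the javadoc examples above rather than from an actual test run, and the TextTokenizerSketch class name is hypothetical:

import java.util.ArrayList;
import java.util.Arrays;

import org.chililog.server.common.TextTokenizer;

public class TextTokenizerSketch {

    public static void main(String[] args) throws Exception {
        TextTokenizer tokenizer = TextTokenizer.getInstance();

        // The javadoc says Lucene emits ["email@address", "com"]; splitting on '@' and '.'
        // should then separate the mailbox name from the domain parts
        ArrayList<String> emailKeywords = tokenizer.tokenize("email@address.com", -1);
        System.out.println(emailKeywords.equals(
                Arrays.asList("email", "address", "com"))); // expect true

        // The javadoc says Lucene emits ["com.chililog.server.common", "chililogexceptiontest"];
        // splitting on '.' should then break the package segments apart
        ArrayList<String> classKeywords = tokenizer.tokenize(
                "com.chililog.server.common.ChiliLogExceptionTest", -1);
        System.out.println(classKeywords.equals(
                Arrays.asList("com", "chililog", "server", "common", "chililogexceptiontest"))); // expect true
    }
}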