grails.plugin.searchable.internal.lucene.LuceneUtils.java Source code

Introduction

Here is the source code for grails.plugin.searchable.internal.lucene.LuceneUtils.java
Source

/*
 * Copyright 2007 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package grails.plugin.searchable.internal.lucene;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.compass.core.util.Assert;

/**
 * Static helpers around Lucene analysis and query parsing: turning free text or a query string
 * into its terms, and escaping, stripping or detecting characters listed in
 * {@link #SPECIAL_QUERY_CHARACTERS}.
 *
 * @author Maurice Nicholson
 */
public class LuceneUtils {
    private static final Log LOG = LogFactory.getLog(LuceneUtils.class);

    /**
     * The characters removed by {@link #cleanQuery(String)} and detected by
     * {@link #queryHasSpecialCharacters(String)}.
     */
    public static final String SPECIAL_QUERY_CHARACTERS = "\\+-!():^[]\"{}~*?";

    // Placeholder default field for query parsing when only the term text is returned to the caller
    private static final String TERM_TEXT_DEFAULT_FIELD = "$termsForQueryString_defaultField$";

    /**
     * Returns a list of terms by analysing the given text with Lucene's StandardAnalyzer
     *
     * @param text the text to analyse
     * @return a list of text terms
     */
    public static String[] termsForText(String text) {
        return termsForText(text, (Analyzer) null);
    }

    /**
     * Returns a list of terms by analysing the given text
     *
     * @param text the text to analyse
     * @param analyzerClass the Analyzer class to use, may be null in which case Lucene's StandardAnalyzer is used
     * @return a list of text terms
     * @throws IllegalStateException if the Analyzer class cannot be instantiated
     * @throws IllegalArgumentException if analysing the text fails with an I/O error
     */
    public static String[] termsForText(String text, Class analyzerClass) {
        // Instantiation is done separately so that only instantiation failures are reported as such
        return termsForText(text, newAnalyzer(analyzerClass));
    }

    /**
     * Returns a list of terms by analysing the given text
     *
     * @param text the text to analyse
     * @param analyzer the Analyzer instance to use, may be null in which case Lucene's StandardAnalyzer is used
     * @return a list of text terms
     * @throws IllegalArgumentException if analysing the text fails with an I/O error
     */
    public static String[] termsForText(String text, Analyzer analyzer) {
        Analyzer effectiveAnalyzer = analyzer != null ? analyzer : new StandardAnalyzer();
        try {
            TokenStream stream = effectiveAnalyzer.tokenStream("contents", new StringReader(text));
            try {
                ArrayList<String> terms = new ArrayList<String>();
                // The token returned by each call is handed back in as the reusable instance for the next call
                Token token = new Token();
                while ((token = stream.next(token)) != null) {
                    terms.add(new String(token.termBuffer(), 0, token.termLength()));
                }
                return terms.toArray(new String[terms.size()]);
            } finally {
                // Release the stream (and the reader behind it) even when analysis fails part-way
                stream.close();
            }
        } catch (IOException ex) {
            // Convert to unchecked, keeping the original exception as the cause
            LOG.error("Unable to analyze the given text: " + ex, ex);
            throw new IllegalArgumentException("Unable to analyze the given text: " + ex, ex);
        }
    }

    /**
     * Returns a list of terms by parsing the given query string - special query characters and words (OR/AND) are
     * not included in the returned list
     *
     * @param queryString the query string to parse
     * @param analyzerClass the Analyzer Class instance to instantiate, may be null in which case Lucene's
     * StandardAnalyzer is used
     * @return a list of text terms
     * @throws org.apache.lucene.queryParser.ParseException if the query is invalid
     * @throws IllegalStateException if the Analyzer class cannot be instantiated
     */
    public static String[] termsForQueryString(String queryString, Class analyzerClass) throws ParseException {
        // Parsing happens outside the instantiation try/catch so a ParseException reaches the caller as declared
        return termsForQueryString(queryString, newAnalyzer(analyzerClass));
    }

    /**
     * Returns a list of terms by parsing the given query string - special query characters and words (OR/AND) are
     * not included in the returned list
     *
     * @param queryString the query string to parse
     * @param analyzer the Analyzer instance, may be null in which case Lucene's StandardAnalyzer is used
     * @return a list of text terms
     * @throws org.apache.lucene.queryParser.ParseException if the query is invalid
     */
    public static String[] termsForQueryString(String queryString, Analyzer analyzer) throws ParseException {
        ListNotSet terms = parseTerms(TERM_TEXT_DEFAULT_FIELD, queryString, analyzer);
        String[] termTexts = new String[terms.size()];
        for (int i = 0; i < termTexts.length; i++) {
            termTexts[i] = terms.get(i).text();
        }
        return termTexts;
    }

    /**
     * Returns an array of {@link Term}s by parsing the given query string. Since Lucene's query parser is used,
     * special query characters and words (OR / AND) are not included in the returned terms
     *
     * @param defaultField The default term field, cannot be null
     * @param queryString the query string to parse, cannot be null
     * @param analyzerClass the Class of Analyzer, may be null in which case Lucene's StandardAnalyzer is used
     * @return the Term array (field + term pairs)
     * @throws org.apache.lucene.queryParser.ParseException if the query has invalid syntax
     * @throws IllegalStateException if the Analyzer class cannot be instantiated
     */
    public static Term[] realTermsForQueryString(String defaultField, String queryString, Class analyzerClass)
            throws ParseException {
        // Parsing happens outside the instantiation try/catch so a ParseException reaches the caller as declared
        return realTermsForQueryString(defaultField, queryString, newAnalyzer(analyzerClass));
    }

    /**
     * Returns an array of {@link Term}s by parsing the given query string. Since Lucene's query parser is used,
     * special query characters and words (OR / AND) are not included in the returned terms
     *
     * @param defaultField The default term field, cannot be null
     * @param queryString the query string to parse, cannot be null
     * @param analyzer the Analyzer instance, may be null in which case Lucene's StandardAnalyzer is used
     * @return the Term array (field + term pairs)
     * @throws org.apache.lucene.queryParser.ParseException if the query has invalid syntax
     */
    public static Term[] realTermsForQueryString(String defaultField, String queryString, Analyzer analyzer)
            throws ParseException {
        Assert.notNull(defaultField, "defaultField cannot be null");
        Assert.notNull(queryString, "queryString cannot be null");
        ListNotSet terms = parseTerms(defaultField, queryString, analyzer);
        return terms.toArray(new Term[terms.size()]);
    }

    /**
     * Creates an Analyzer from the given class using its no-argument constructor.
     *
     * @param analyzerClass the Analyzer class, may be null
     * @return a new instance, or null if the class is null (callers then fall back to StandardAnalyzer)
     * @throws IllegalStateException if the class cannot be instantiated or is not an Analyzer
     */
    private static Analyzer newAnalyzer(Class analyzerClass) {
        if (analyzerClass == null) {
            return null;
        }
        try {
            return (Analyzer) analyzerClass.newInstance();
        } catch (Exception ex) {
            // Convert to unchecked, keeping the original exception as the cause
            LOG.error("Failed to create instance of Analyzer class [" + analyzerClass + "]: " + ex, ex);
            throw new IllegalStateException(
                    "Failed to create instance of Analyzer class [" + analyzerClass + "]: " + ex, ex);
        }
    }

    /**
     * Parses the query string and collects the terms of the resulting query, in the order the query
     * reports them and with duplicates retained.
     *
     * @param defaultField the field given to the query parser as its default field
     * @param queryString the query string to parse
     * @param analyzer the Analyzer instance, may be null in which case Lucene's StandardAnalyzer is used
     * @return the collected terms
     * @throws ParseException if the query string cannot be parsed
     */
    private static ListNotSet parseTerms(String defaultField, String queryString, Analyzer analyzer)
            throws ParseException {
        Analyzer effectiveAnalyzer = analyzer != null ? analyzer : new StandardAnalyzer();
        Query query = new QueryParser(defaultField, effectiveAnalyzer).parse(queryString);
        ListNotSet terms = new ListNotSet();
        query.extractTerms(terms);
        return terms;
    }

    // A Set that allows dupes and maintains insertion order, so not really a set :-)
    // It exists because Query.extractTerms is handed a Set here, while callers want every occurrence in order.
    private static class ListNotSet extends ArrayList<Term> implements Set<Term> {
        private static final long serialVersionUID = 1;
    }

    /**
     * Escape special characters in the given string that would otherwise cause a parse exception
     *
     * @param query the query to escape, may be null
     * @return the escaped query, or null if the given query is null
     */
    public static String escapeQuery(String query) {
        // Note we use the Lucene QueryParser instead of the Compass subclass
        // because Groovy does not inherit static methods (?)
        if (query == null) {
            return null;
        }
        return QueryParser.escape(query);
    }

    /**
     * Returns the query string with special characters removed
     *
     * @param query the query to clean, may be null
     * @return the cleaned query, or null if the given query is null
     */
    public static String cleanQuery(String query) {
        // Null-safe, consistent with escapeQuery
        if (query == null) {
            return null;
        }
        StringBuilder cleaned = new StringBuilder(query.length());
        for (int i = 0, count = query.length(); i < count; i++) {
            char c = query.charAt(i);
            // These characters are part of the query syntax, so they are dropped
            if (!isSpecialQueryCharacter(c)) {
                cleaned.append(c);
            }
        }
        return cleaned.toString();
    }

    /**
     * Does the given query string contain special characters, ie, those with
     * special meaning to Lucene's query parser
     *
     * @param query the query, may be null
     * @return true if it contains special characters, false if it does not or if the query is null
     */
    public static boolean queryHasSpecialCharacters(String query) {
        // Null-safe, consistent with escapeQuery: a null query has no characters at all
        if (query == null) {
            return false;
        }
        for (int i = 0, count = query.length(); i < count; i++) {
            if (isSpecialQueryCharacter(query.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    // True if the character is one of SPECIAL_QUERY_CHARACTERS
    private static boolean isSpecialQueryCharacter(char c) {
        return SPECIAL_QUERY_CHARACTERS.indexOf(c) > -1;
    }
}