Java tutorial
/* * Copyright 2007 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package grails.plugin.searchable.internal.lucene; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.Iterator; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.Query; import org.compass.core.util.Assert; /** * Lucene utils * * @author Maurice Nicholson */ public class LuceneUtils { private static final Log LOG = LogFactory.getLog(LuceneUtils.class); public static final String SPECIAL_QUERY_CHARACTERS = "\\+-!():^[]\"{}~*?"; /** * Returns a list of terms by analysing the given text with Lucene's StandardAnalyzer * * @param text the text to analyse * @return a list of text terms */ public static String[] termsForText(String text) { return termsForText(text, (Analyzer) null); } /** * Returns a list of terms by analysing the given text * * @param text the text to analyse * @param analyzerClass the Analyzer class to use, may be null in which case Lucene's StandardAnalyzer is used * @return a list of text terms */ public static String[] termsForText(String text, Class analyzerClass) { if (analyzerClass == null) { return termsForText(text, (Analyzer) null); } try { return termsForText(text, (Analyzer) analyzerClass.newInstance()); } catch (Exception ex) { // Convert to unchecked LOG.error("Failed to create instance of Analyzer class [" + analyzerClass + "]: " + ex, ex); throw new IllegalStateException( "Failed to create instance of Analyzer class [" + analyzerClass + "]: " + ex); } } /** * Returns a list of terms by analysing the given text * * @param text the text to analyse * @param analyzer the Analyzer instance to use, may be null in which case Lucene's StandardAnalyzer is used * @return a list of text terms */ public static String[] termsForText(String text, Analyzer analyzer) { try { if (analyzer == null) { analyzer = new StandardAnalyzer(); } TokenStream stream = analyzer.tokenStream("contents", new StringReader(text)); ArrayList terms = new ArrayList(); Token token = new Token(); while (true) { token = stream.next(token); if (token == null) break; terms.add(new String(token.termBuffer(), 0, token.termLength())); } return (String[]) terms.toArray(new String[terms.size()]); } catch (IOException ex) { // Convert to unchecked LOG.error("Unable to analyze the given text: " + ex, ex); throw new IllegalArgumentException("Unable to analyze the given text: " + ex); } } /** * Returns a list of terms by parsing the given query string - special query characters and words (OR/AND) are * not included in the returned list * * @param queryString the query string to parse * @param analyzerClass the Analyzer Class instance to instantiate, may be null in which case Lucene's * StandardAnalyzer is used * @return a list of text terms */ public static String[] termsForQueryString(String queryString, Class analyzerClass) throws ParseException { if (analyzerClass == null) { return termsForQueryString(queryString, (Analyzer) null); } try { return termsForQueryString(queryString, (Analyzer) analyzerClass.newInstance()); } catch (Exception ex) { // Convert to unchecked LOG.error("Failed to create instance of Analyzer class [" + analyzerClass + "]: " + ex, ex); throw new IllegalStateException( "Failed to create instance of Analyzer class [" + analyzerClass + "]: " + ex); } } /** * Returns a list of terms by parsing the given query string - special query characters and words (OR/AND) are * not included in the returned list * * @param queryString the query string to parse * @param analyzer the Analyzer instance, may be null in which case Lucene's StandardAnalyzer is used * @return a list of text terms * @throws org.apache.lucene.queryParser.ParseException if the query is invalid */ public static String[] termsForQueryString(String queryString, Analyzer analyzer) throws ParseException { if (analyzer == null) { analyzer = new StandardAnalyzer(); } final String defaultField = "$termsForQueryString_defaultField$"; QueryParser queryParser = new QueryParser(defaultField, analyzer); Query query = queryParser.parse(queryString); Set terms = new ListNotSet(); query.extractTerms(terms); String[] termsArray = new String[terms.size()]; int i = 0; for (Iterator iter = terms.iterator(); iter.hasNext();) { termsArray[i++] = ((Term) iter.next()).text(); } return termsArray; } /** * Returns an array of {@link Term}s by parsing the given query string. Since Lucene's query parser is used, * special query characters and words (OR / AND) are not included in the returned terms * * @param defaultField The default term field, cannot be null * @param queryString the query string to parse, cannot be null * @param analyzerClass the Class of Analyzer, may be null in which case Lucene's StandardAnalyzer is used * @return the Term array (field + term pairs) * @throws org.apache.lucene.queryParser.ParseException if the the query has invalid syntax */ public static Term[] realTermsForQueryString(String defaultField, String queryString, Class analyzerClass) throws ParseException { if (analyzerClass == null) { return realTermsForQueryString(defaultField, queryString, (Analyzer) null); } try { return realTermsForQueryString(defaultField, queryString, (Analyzer) analyzerClass.newInstance()); } catch (Exception ex) { // Convert to unchecked LOG.error("Failed to create instance of Analyzer class [" + analyzerClass + "]: " + ex, ex); throw new IllegalStateException( "Failed to create instance of Analyzer class [" + analyzerClass + "]: " + ex); } } /** * Returns an array of {@link Term}s by parsing the given query string. Since Lucene's query parser is used, * special query characters and words (OR / AND) are not included in the returned terms * * @param defaultField The default term field, cannot be null * @param queryString the query string to parse, cannot be null * @param analyzer the Analyzer instance, may be null in which case Lucene's StandardAnalyzer is used * @return the Term array (field + term pairs) * @throws org.apache.lucene.queryParser.ParseException if the the query has invalid syntax */ public static Term[] realTermsForQueryString(String defaultField, String queryString, Analyzer analyzer) throws ParseException { Assert.notNull(defaultField, "defaultField cannot be null"); Assert.notNull(queryString, "queryString cannot be null"); if (analyzer == null) { analyzer = new StandardAnalyzer(); } QueryParser queryParser = new QueryParser(defaultField, analyzer); Query query = queryParser.parse(queryString); Set terms = new ListNotSet(); query.extractTerms(terms); Term[] termsArray = new Term[terms.size()]; int i = 0; for (Iterator iter = terms.iterator(); iter.hasNext();) { Term term = (Term) iter.next(); termsArray[i++] = term; } return termsArray; } // A Set that allows dupes and maintains insertion order, so not really a set :-) private static class ListNotSet extends ArrayList implements Set { private static final long serialVersionUID = 1; } /** * Escape special characters in the given string that would otherwise cause a parse exception * * @param query the query to escape * @return the escaped query */ public static String escapeQuery(String query) { // Note we use the Lucene QueryParser instead of the Compass subclass // because Groovy does not inherit static methods (?) if (query == null) return null; return QueryParser.escape(query); } /** * Returns the query string with special characters removed * * @param query the query to clean * @return the cleaned query */ public static String cleanQuery(String query) { StringBuilder sb = new StringBuilder(); for (int i = 0, count = query.length(); i < count; i++) { char c = query.charAt(i); // These characters are part of the query syntax and must be escaped if (isSpecialQueryCharacter(c)) continue; sb.append(c); } return sb.toString(); } /** * Does the given query string contain special characters, ie, those with * special meaning to Lucene's query parser * @param query the query * @return true if it contains special characters */ public static boolean queryHasSpecialCharacters(String query) { for (int i = 0, count = query.length(); i < count; i++) { char c = query.charAt(i); // These characters are part of the query syntax and must be escaped if (isSpecialQueryCharacter(c)) return true; } return false; } private static boolean isSpecialQueryCharacter(char c) { return SPECIAL_QUERY_CHARACTERS.indexOf(c) > -1; } }