/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.ArrayUtil;
import org.apache.solr.analysis.CharFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.analysis.TokenizerFactory;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.SolrException;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;

import java.io.IOException;
import java.io.StringReader;
import java.util.*;
import java.math.BigInteger;

import org.apache.commons.lang.ArrayUtils;

/**
 * A base class for all analysis request handlers.
 *
 * @version $Id: AnalysisRequestHandlerBase.java 1143785 2011-07-07 11:59:59Z uschindler $
 * @since solr 1.4
 */
public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {

  @Override
  public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
    rsp.add("analysis", doAnalysis(req));
  }

  /**
   * Performs the analysis based on the given solr request and returns the analysis result as a named list.
   *
   * @param req The solr request.
   *
   * @return The analysis result as a named list.
   *
   * @throws Exception When analysis fails.
   */
  protected abstract NamedList doAnalysis(SolrQueryRequest req) throws Exception;
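  // Editor's sketch (not part of the original source): the minimal flow a concrete
  // subclass's doAnalysis() could follow. The field name "text" and the request
  // parameter name are assumptions; the real subclasses (FieldAnalysisRequestHandler,
  // DocumentAnalysisRequestHandler) read their own request parameters.
  private NamedList exampleDoAnalysis(SolrQueryRequest req) {
    FieldType fieldType = req.getSchema().getFieldType("text");  // assumed field name
    String value = req.getParams().get("analysis.fieldvalue");   // assumed parameter name
    AnalysisContext context = new AnalysisContext("text", fieldType, fieldType.getAnalyzer());
    NamedList<Object> result = new SimpleOrderedMap<Object>();
    result.add("text", analyzeValue(value, context));
    return result;
  }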
  /**
   * Analyzes the given value using the given Analyzer.
   *
   * @param value   Value to analyze
   * @param context The {@link AnalysisContext analysis context}.
   *
   * @return NamedList containing the tokens produced by analyzing the given value
   */
  protected NamedList<List<NamedList>> analyzeValue(String value, AnalysisContext context) {

    Analyzer analyzer = context.getAnalyzer();

    if (!(analyzer instanceof TokenizerChain)) {
      // not a Solr TokenizerChain: analyze with the plain Lucene analyzer and
      // report a single stage
      TokenStream tokenStream = null;
      try {
        tokenStream = analyzer.reusableTokenStream(context.getFieldName(), new StringReader(value));
      } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
      }
      NamedList<List<NamedList>> namedList = new NamedList<List<NamedList>>();
      namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
      return namedList;
    }

    TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
    CharFilterFactory[] cfiltfacs = tokenizerChain.getCharFilterFactories();
    TokenizerFactory tfac = tokenizerChain.getTokenizerFactory();
    TokenFilterFactory[] filtfacs = tokenizerChain.getTokenFilterFactories();

    NamedList<List<NamedList>> namedList = new NamedList<List<NamedList>>();

    if (cfiltfacs != null) {
      String source = value;
      for (CharFilterFactory cfiltfac : cfiltfacs) {
        CharStream reader = CharReader.get(new StringReader(source));
        reader = cfiltfac.create(reader);
        source = writeCharStream(namedList, reader);
      }
    }

    TokenStream tokenStream = tfac.create(tokenizerChain.charStream(new StringReader(value)));
    List<AttributeSource> tokens = analyzeTokenStream(tokenStream);

    namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));

    ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokens);

    for (TokenFilterFactory tokenFilterFactory : filtfacs) {
      // remember the position each token had before this stage, so the response
      // can report the full position history
      for (final AttributeSource tok : tokens) {
        tok.getAttribute(TokenTrackingAttribute.class).freezeStage();
      }
      tokenStream = tokenFilterFactory.create(listBasedTokenStream);
      tokens = analyzeTokenStream(tokenStream);
      namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
      listBasedTokenStream = new ListBasedTokenStream(tokens);
    }

    return namedList;
  }

  /**
   * Analyzes the given text using the given analyzer and returns the produced tokens.
   *
   * @param value    The value to analyze.
   * @param analyzer The analyzer to use.
   *
   * @return The produced token list.
   *
   * @deprecated This method is no longer used by Solr.
   * @see #getQueryTokenSet
   */
  @Deprecated
  protected List<AttributeSource> analyzeValue(String value, Analyzer analyzer) {
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(value));
    return analyzeTokenStream(tokenStream);
  }

  /**
   * Analyzes the given text using the given analyzer and returns the set of produced tokens.
   *
   * @param query    The query to analyze.
   * @param analyzer The analyzer to use.
   */
  protected Set<String> getQueryTokenSet(String query, Analyzer analyzer) {
    final Set<String> tokens = new HashSet<String>();
    final TokenStream tokenStream = analyzer.tokenStream("", new StringReader(query));
    final CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    try {
      tokenStream.reset();
      while (tokenStream.incrementToken()) {
        tokens.add(termAtt.toString());
      }
    } catch (IOException ioe) {
      throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
    }
    return tokens;
  }
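  // Editor's illustration (not part of the original class): feeding the query
  // analyzer's tokens into an AnalysisContext so that matching index-time tokens
  // are flagged with "match" by convertTokensToNamedLists. Method and parameter
  // names here are assumptions.
  private NamedList exampleMatchAnalysis(FieldType fieldType, String fieldValue, String query) {
    Set<String> termsToMatch = getQueryTokenSet(query, fieldType.getQueryAnalyzer());
    AnalysisContext context = new AnalysisContext(fieldType, fieldType.getAnalyzer(), termsToMatch);
    return analyzeValue(fieldValue, context);
  }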
  /**
   * Analyzes the given TokenStream, collecting the Tokens it produces.
   *
   * @param tokenStream TokenStream to analyze
   *
   * @return List of tokens produced from the TokenStream
   */
  private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
    final List<AttributeSource> tokens = new ArrayList<AttributeSource>();
    final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
    // for backwards compatibility, add all "common" attributes
    tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(TypeAttribute.class);
    try {
      tokenStream.reset();
      int position = 0;
      while (tokenStream.incrementToken()) {
        position += posIncrAtt.getPositionIncrement();
        trackerAtt.setActPosition(position);
        // clone the full attribute state so the token survives further iteration
        tokens.add(tokenStream.cloneAttributes());
      }
    } catch (IOException ioe) {
      throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
    }

    return tokens;
  }

  // a static mapping of the reflected attribute keys to the names used in Solr 1.4
  static Map<String, String> ATTRIBUTE_MAPPING = Collections.unmodifiableMap(new HashMap<String, String>() {{
    put(OffsetAttribute.class.getName() + "#startOffset", "start");
    put(OffsetAttribute.class.getName() + "#endOffset", "end");
    put(TypeAttribute.class.getName() + "#type", "type");
    put(TokenTrackingAttribute.class.getName() + "#position", "position");
    put(TokenTrackingAttribute.class.getName() + "#positionHistory", "positionHistory");
  }});
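  // Example (editor's note): the reflected key
  // "org.apache.lucene.analysis.tokenattributes.OffsetAttribute#startOffset"
  // is therefore reported under the short name "start" in the response, keeping
  // the output compatible with the Solr 1.4 format.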
  /**
   * Converts the list of Tokens to a list of NamedLists representing the tokens.
   *
   * @param tokenList Tokens to convert
   * @param context   The analysis context
   *
   * @return List of NamedLists containing the relevant information taken from the tokens
   */
  private List<NamedList> convertTokensToNamedLists(final List<AttributeSource> tokenList, AnalysisContext context) {
    final List<NamedList> tokensNamedLists = new ArrayList<NamedList>();
    final FieldType fieldType = context.getFieldType();
    final AttributeSource[] tokens = tokenList.toArray(new AttributeSource[tokenList.size()]);

    // sort the tokens by absolute position
    ArrayUtil.mergeSort(tokens, new Comparator<AttributeSource>() {
      public int compare(AttributeSource a, AttributeSource b) {
        return arrayCompare(
            a.getAttribute(TokenTrackingAttribute.class).getPositions(),
            b.getAttribute(TokenTrackingAttribute.class).getPositions());
      }

      private int arrayCompare(int[] a, int[] b) {
        int p = 0;
        final int stop = Math.min(a.length, b.length);
        while (p < stop) {
          int diff = a[p] - b[p];
          if (diff != 0) return diff;
          p++;
        }
        // One is a prefix of the other, or, they are equal:
        return a.length - b.length;
      }
    });

    for (int i = 0; i < tokens.length; i++) {
      AttributeSource token = tokens[i];
      final NamedList<Object> tokenNamedList = new SimpleOrderedMap<Object>();
      final String rawText = token.addAttribute(CharTermAttribute.class).toString();

      String text = fieldType.indexedToReadable(rawText);
      tokenNamedList.add("text", text);

      if (!text.equals(rawText)) {
        tokenNamedList.add("raw_text", rawText);
      }

      if (context.getTermsToMatch().contains(rawText)) {
        tokenNamedList.add("match", true);
      }

      token.reflectWith(new AttributeReflector() {
        public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
          // leave out position and term
          if (CharTermAttribute.class.isAssignableFrom(attClass))
            return;
          if (PositionIncrementAttribute.class.isAssignableFrom(attClass))
            return;

          String k = attClass.getName() + '#' + key;

          // map keys for "standard attributes":
          if (ATTRIBUTE_MAPPING.containsKey(k)) {
            k = ATTRIBUTE_MAPPING.get(k);
          }

          // TODO: special handling for payloads - move this to ResponseWriter?
          if (value instanceof Payload) {
            // instanceof already guarantees the payload is non-null
            Payload p = (Payload) value;
            BigInteger bi = new BigInteger(p.getData());
            String ret = bi.toString(16);
            if (ret.length() % 2 != 0) {
              // pad with a leading 0 so the hex string has an even length
              ret = "0" + ret;
            }
            value = ret;
          }

          tokenNamedList.add(k, value);
        }
      });

      tokensNamedLists.add(tokenNamedList);
    }

    return tokensNamedLists;
  }

  private String writeCharStream(NamedList out, CharStream input) {
    final int BUFFER_SIZE = 1024;
    char[] buf = new char[BUFFER_SIZE];
    int len = 0;
    StringBuilder sb = new StringBuilder();
    do {
      try {
        len = input.read(buf, 0, BUFFER_SIZE);
      } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
      }
      if (len > 0)
        sb.append(buf, 0, len);
    } while (len != -1); // drain until EOF; read() may legally return fewer chars than requested

    out.add(input.getClass().getName(), sb.toString());
    return sb.toString();
  }

  // ================================================= Inner classes =================================================

  /**
   * TokenStream that iterates over a list of pre-existing Tokens
   * @lucene.internal
   */
  protected final static class ListBasedTokenStream extends TokenStream {
    private final List<AttributeSource> tokens;
    private Iterator<AttributeSource> tokenIterator;

    /**
     * Creates a new ListBasedTokenStream which uses the given tokens as its token source.
     *
     * @param tokens Source of tokens to be used
     */
    ListBasedTokenStream(List<AttributeSource> tokens) {
      this.tokens = tokens;
      tokenIterator = tokens.iterator();
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (tokenIterator.hasNext()) {
        clearAttributes();
        AttributeSource next = tokenIterator.next();
        Iterator<Class<? extends Attribute>> atts = next.getAttributeClassesIterator();
        while (atts.hasNext()) // make sure all att impls in the token exist here
          addAttribute(atts.next());
        next.copyTo(this);
        return true;
      } else {
        return false;
      }
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      tokenIterator = tokens.iterator();
    }
  }
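  // Editor's sketch (not part of the original class): replaying a captured token
  // list through one more filter stage, mirroring what analyzeValue(String,
  // AnalysisContext) does for each TokenFilterFactory in the chain.
  private List<AttributeSource> exampleReplayStage(List<AttributeSource> tokens, TokenFilterFactory factory) {
    TokenStream replayed = factory.create(new ListBasedTokenStream(tokens));
    return analyzeTokenStream(replayed);
  }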
  /** This is an {@link Attribute} used to track the positions of tokens
   * in the analysis chain.
   * @lucene.internal This class is only public for usage by the {@link AttributeSource} API.
   */
  public interface TokenTrackingAttribute extends Attribute {
    void freezeStage();
    void setActPosition(int pos);
    int[] getPositions();
    void reset(int[] basePositions, int position);
  }

  /** Implementation of {@link TokenTrackingAttribute}.
   * @lucene.internal This class is only public for usage by the {@link AttributeSource} API.
   */
  public static final class TokenTrackingAttributeImpl extends AttributeImpl implements TokenTrackingAttribute {
    private int[] basePositions = new int[0];
    private int position = 0;
    private transient int[] cachedPositions = null;

    public void freezeStage() {
      this.basePositions = getPositions();
      this.position = 0;
      this.cachedPositions = null;
    }

    public void setActPosition(int pos) {
      this.position = pos;
      this.cachedPositions = null;
    }

    public int[] getPositions() {
      if (cachedPositions == null) {
        cachedPositions = ArrayUtils.add(basePositions, position);
      }
      return cachedPositions;
    }

    public void reset(int[] basePositions, int position) {
      this.basePositions = basePositions;
      this.position = position;
      this.cachedPositions = null;
    }

    @Override
    public void clear() {
      // we do nothing here, as all attribute values are controlled externally by consumer
    }

    @Override
    public void reflectWith(AttributeReflector reflector) {
      reflector.reflect(TokenTrackingAttribute.class, "position", position);
      // convert to Integer[] array, as only such one can be serialized by ResponseWriters
      reflector.reflect(TokenTrackingAttribute.class, "positionHistory", ArrayUtils.toObject(getPositions()));
    }

    @Override
    public void copyTo(AttributeImpl target) {
      final TokenTrackingAttribute t = (TokenTrackingAttribute) target;
      t.reset(basePositions, position);
    }
  }
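  // Worked example (editor's note): a token emitted by the tokenizer at position 2
  // reports getPositions() == [2]. analyzeValue() then calls freezeStage(), which
  // moves that value into basePositions; if the next filter stage emits the token
  // at position 1, getPositions() == [2, 1], one entry per analysis stage. This is
  // the array reflectWith() reports as "positionHistory".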
  /**
   * Serves as the context of an analysis process. This context contains the field name,
   * the field type, the analyzer and the terms that should be marked as matches.
   */
  protected static class AnalysisContext {

    private final String fieldName;
    private final FieldType fieldType;
    private final Analyzer analyzer;
    private final Set<String> termsToMatch;

    /**
     * Constructs a new AnalysisContext with a given field type, analyzer and
     * termsToMatch. By default the field name in this context will be
     * {@code null}. During the analysis process, the produced tokens will
     * be compared to the terms in the {@code termsToMatch} set. When found,
     * these tokens will be marked as a match.
     *
     * @param fieldType    The type of the field the analysis is performed on.
     * @param analyzer     The analyzer to be used.
     * @param termsToMatch Holds all the terms that should match during the
     *                     analysis process.
     */
    public AnalysisContext(FieldType fieldType, Analyzer analyzer, Set<String> termsToMatch) {
      this(null, fieldType, analyzer, termsToMatch);
    }

    /**
     * Constructs an AnalysisContext with a given field name, field type
     * and analyzer. By default this context will hold no terms to match.
     *
     * @param fieldName The name of the field the analysis is performed on
     *                  (may be {@code null}).
     * @param fieldType The type of the field the analysis is performed on.
     * @param analyzer  The analyzer to be used during the analysis process.
     */
    public AnalysisContext(String fieldName, FieldType fieldType, Analyzer analyzer) {
      this(fieldName, fieldType, analyzer, Collections.<String>emptySet());
    }

    /**
     * Constructs a new AnalysisContext with a given field name, field type,
     * analyzer and termsToMatch. During the analysis process, the produced
     * tokens will be compared to the terms in the {@code termsToMatch} set.
     * When found, these tokens will be marked as a match.
     *
     * @param fieldName    The name of the field the analysis is performed on
     *                     (may be {@code null}).
     * @param fieldType    The type of the field the analysis is performed on.
     * @param analyzer     The analyzer to be used.
     * @param termsToMatch Holds all the terms that should match during the
     *                     analysis process.
     */
    public AnalysisContext(String fieldName, FieldType fieldType, Analyzer analyzer, Set<String> termsToMatch) {
      this.fieldName = fieldName;
      this.fieldType = fieldType;
      this.analyzer = analyzer;
      this.termsToMatch = termsToMatch;
    }

    public String getFieldName() {
      return fieldName;
    }

    public FieldType getFieldType() {
      return fieldType;
    }

    public Analyzer getAnalyzer() {
      return analyzer;
    }

    public Set<String> getTermsToMatch() {
      return termsToMatch;
    }
  }
}