org.apache.lucene.analysis.TokenStream.java Source code

Introduction

Here is the source code for org.apache.lucene.analysis.TokenStream.java.
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis;

import java.io.IOException;
import java.io.Closeable;
import java.lang.reflect.Modifier;

import org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;

/**
 * A <code>TokenStream</code> enumerates the sequence of tokens, either from
 * {@link Field}s of a {@link Document} or from query text.
 * <p>
 * This is an abstract class; concrete subclasses are:
 * <ul>
 * <li>{@link Tokenizer}, a <code>TokenStream</code> whose input is a Reader; and
 * <li>{@link TokenFilter}, a <code>TokenStream</code> whose input is another
 * <code>TokenStream</code>.
 * </ul>
 * <code>TokenStream</code> extends {@link AttributeSource}, which provides
 * access to all of the token {@link Attribute}s for the <code>TokenStream</code>.
 * Note that only one instance per {@link AttributeImpl} is created and reused
 * for every token. This approach reduces object creation and allows local
 * caching of references to the {@link AttributeImpl}s. See
 * {@link #incrementToken()} for further details.
 * <p>
 * <b>The workflow of the new <code>TokenStream</code> API is as follows:</b>
 * <ol>
 * <li>Instantiation of <code>TokenStream</code>/{@link TokenFilter}s which add/get
 * attributes to/from the {@link AttributeSource}.
 * <li>The consumer calls {@link TokenStream#reset()}.
 * <li>The consumer retrieves attributes from the stream and stores local
 * references to all attributes it wants to access.
 * <li>The consumer calls {@link #incrementToken()} until it returns false,
 * consuming the attributes after each call.
 * <li>The consumer calls {@link #end()} so that any end-of-stream operations
 * can be performed.
 * <li>The consumer calls {@link #close()} to release any resource when finished
 * using the <code>TokenStream</code>.
 * </ol>
 * To make sure that filters and consumers know which attributes are available,
 * the attributes must be added during instantiation. Filters and consumers are
 * not required to check for availability of attributes in
 * {@link #incrementToken()}.
 * <p>
 * You can find some example code for the new API in the analysis package level
 * Javadoc.
 * <p>
 * Sometimes it is desirable to capture the current state of a <code>TokenStream</code>,
 * e.g., for buffering purposes (see {@link CachingTokenFilter},
 * TeeSinkTokenFilter). For this use case,
 * {@link AttributeSource#captureState} and {@link AttributeSource#restoreState}
 * can be used.
 * <p>The {@code TokenStream}-API in Lucene is based on the decorator pattern.
 * Therefore all non-abstract subclasses must be final or have at least a final
 * implementation of {@link #incrementToken}! This is checked when Java
 * assertions are enabled.
 */
public abstract class TokenStream extends AttributeSource implements Closeable {

    /** The {@link AttributeFactory} that TokenStreams should use unless told otherwise. */
    public static final AttributeFactory DEFAULT_TOKEN_ATTRIBUTE_FACTORY =
            AttributeFactory.getStaticImplementation(
                    AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, PackedTokenAttributeImpl.class);

    /**
     * Creates a TokenStream backed by {@link #DEFAULT_TOKEN_ATTRIBUTE_FACTORY}.
     */
    protected TokenStream() {
        super(DEFAULT_TOKEN_ATTRIBUTE_FACTORY);
        assert assertFinal();
    }

    /**
     * Creates a TokenStream that uses the same attributes as {@code input}.
     */
    protected TokenStream(AttributeSource input) {
        super(input);
        assert assertFinal();
    }

    /**
     * Creates a TokenStream whose new {@link Attribute} instances are produced by
     * the given {@link AttributeFactory}.
     */
    protected TokenStream(AttributeFactory factory) {
        super(factory);
        assert assertFinal();
    }

    /**
     * Assertion-time check that the runtime class honours the "final" contract:
     * the class itself, or at least its {@code incrementToken()} method, must be
     * final. The check is skipped when assertions are not desired for the
     * runtime class.
     *
     * @return {@code true} when the check passes or is skipped; {@code false}
     *         when {@code incrementToken()} cannot be looked up reflectively,
     *         which makes the calling {@code assert} fail
     */
    private boolean assertFinal() {
        final Class<?> implClass = getClass();
        if (implClass.desiredAssertionStatus()) {
            try {
                assert satisfiesFinalContract(
                        implClass) : "TokenStream implementation classes or at least their incrementToken() implementation must be final";
            } catch (NoSuchMethodException lookupFailure) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether {@code implClass} is anonymous, is declared final or private,
     * or exposes a final public {@code incrementToken()} method. The conditions
     * are evaluated in that order and stop at the first one that holds.
     *
     * @throws NoSuchMethodException if the reflective lookup of
     *         {@code incrementToken()} is reached and finds no such public method
     */
    private static boolean satisfiesFinalContract(Class<?> implClass) throws NoSuchMethodException {
        if (implClass.isAnonymousClass()) {
            return true;
        }
        final int finalOrPrivate = Modifier.FINAL | Modifier.PRIVATE;
        if ((implClass.getModifiers() & finalOrPrivate) != 0) {
            return true;
        }
        final int methodModifiers = implClass.getMethod("incrementToken").getModifiers();
        return Modifier.isFinal(methodModifiers);
    }

    /**
     * Advances the stream to the next token. Consumers (i.e., {@link IndexWriter})
     * call this repeatedly; implementations must update the relevant
     * {@link AttributeImpl}s so that they describe the next token.
     * <p>
     * Once this method has returned, the producer must not assume anything about
     * the attribute values: the caller is free to modify them. A producer that
     * needs the state on a later call can take a copy with {@link #captureState}.
     * <p>
     * Because this method runs for every token of a document, it should be
     * implemented efficiently. In particular, look up the {@link AttributeImpl}s
     * the stream works with once, during instantiation, rather than calling
     * {@link #addAttribute(Class)} or {@link #getAttribute(Class)} here.
     * <p>
     * Attributes must be added during instantiation so that filters and consumers
     * know which ones are available; they are not required to check for
     * availability of attributes in {@link #incrementToken()}.
     *
     * @return false for end of stream; true otherwise
     */
    public abstract boolean incrementToken() throws IOException;

    /**
     * Called by the consumer once the last token has been consumed, i.e. after
     * {@link #incrementToken()} returned <code>false</code> (using the new
     * <code>TokenStream</code> API). Streams implementing the old API should
     * upgrade to use this feature.
     * <p>
     * Use it for end-of-stream work such as setting the final offset of the
     * stream. That offset can differ from the offset of the last token, e.g.
     * when one or more whitespace characters followed the last token but a
     * WhitespaceTokenizer was used.
     * <p>
     * It is also the place to apply skipped positions (such as those removed by
     * a stopfilter) to the position increment, or to adjust any other attribute
     * whose end-of-stream value may be important.
     * <p>
     * If you override this method, always call {@code super.end()}.
     *
     * @throws IOException If an I/O error occurs
     */
    public void end() throws IOException {
        // LUCENE-3849: don't consume dirty atts
        endAttributes();
    }

    /**
     * Called by a consumer before it starts consuming tokens through
     * {@link #incrementToken()}.
     * <p>
     * Puts this stream back into a clean state. Stateful implementations must
     * implement this method so that they can be reused, just as if they had
     * been created fresh.
     * <p>
     * If you override this method, always call {@code super.reset()}, otherwise
     * some internal state will not be correctly reset (e.g., {@link Tokenizer}
     * will throw {@link IllegalStateException} on further usage).
     */
    public void reset() throws IOException {
        // nothing to reset at this level
    }

    /**
     * Releases resources associated with this stream.
     * <p>
     * If you override this method, always call {@code super.close()}, otherwise
     * some internal state will not be correctly reset (e.g., {@link Tokenizer}
     * will throw {@link IllegalStateException} on reuse).
     */
    @Override
    public void close() throws IOException {
        // nothing to release at this level
    }

}