org.apache.lucene.analysis.CharacterUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.lucene.analysis.CharacterUtils.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis;

import java.io.IOException;
import java.io.Reader;

/**
 * Utility class to write tokenizers or token filters.
 * <p>
 * All methods are static; the class cannot be instantiated.
 * </p>
 * @lucene.internal
 */
public final class CharacterUtils {

    private CharacterUtils() {
    } // no instantiation

    /**
     * Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
     * of the given bufferSize.
     * 
     * @param bufferSize
     *          the internal char buffer size, must be <code>&gt;= 2</code>
     * @return a new {@link CharacterBuffer} instance.
     * @throws IllegalArgumentException
     *           if <code>bufferSize</code> is less than 2
     */
    public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
        if (bufferSize < 2) {
            throw new IllegalArgumentException("buffersize must be >= 2");
        }
        return new CharacterBuffer(new char[bufferSize], 0, 0);
    }

    /**
     * Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting 
     * at the given offset. The conversion is done in place.
     * @param buffer the char buffer to lowercase
     * @param offset the offset to start at, must be <code>&gt;= 0</code>
     * @param limit the max char in the buffer to lower case (exclusive)
     */
    public static void toLowerCase(final char[] buffer, final int offset, final int limit) {
        assert buffer.length >= limit;
        // The offset must lie inside the buffer: 0 <= offset <= buffer.length.
        assert 0 <= offset && offset <= buffer.length;
        for (int i = offset; i < limit;) {
            // Character.toChars returns the number of chars it wrote, which is how far to advance.
            i += Character.toChars(Character.toLowerCase(Character.codePointAt(buffer, i, limit)), buffer, i);
        }
    }

    /**
     * Converts each unicode codepoint to UpperCase via {@link Character#toUpperCase(int)} starting 
     * at the given offset. The conversion is done in place.
     * @param buffer the char buffer to UPPERCASE
     * @param offset the offset to start at, must be <code>&gt;= 0</code>
     * @param limit the max char in the buffer to upper case (exclusive)
     */
    public static void toUpperCase(final char[] buffer, final int offset, final int limit) {
        assert buffer.length >= limit;
        // The offset must lie inside the buffer: 0 <= offset <= buffer.length.
        assert 0 <= offset && offset <= buffer.length;
        for (int i = offset; i < limit;) {
            // Character.toChars returns the number of chars it wrote, which is how far to advance.
            i += Character.toChars(Character.toUpperCase(Character.codePointAt(buffer, i, limit)), buffer, i);
        }
    }

    /** Converts a sequence of Java characters to a sequence of unicode code points.
     *  @param src the chars to convert
     *  @param srcOff index of the first char to read in <code>src</code>
     *  @param srcLen number of chars to read, must be <code>&gt;= 0</code>
     *  @param dest the array receiving the code points
     *  @param destOff index of the first code point to write in <code>dest</code>
     *  @return the number of code points written to the destination buffer
     *  @throws IllegalArgumentException if <code>srcLen</code> is negative */
    public static int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
        if (srcLen < 0) {
            throw new IllegalArgumentException("srcLen must be >= 0");
        }
        int codePointCount = 0;
        for (int i = 0; i < srcLen;) {
            // The limit argument keeps codePointAt from reading past the requested range.
            final int cp = Character.codePointAt(src, srcOff + i, srcOff + srcLen);
            final int charCount = Character.charCount(cp);
            dest[destOff + codePointCount++] = cp;
            i += charCount;
        }
        return codePointCount;
    }

    /** Converts a sequence of unicode code points to a sequence of Java characters.
     *  @param src the code points to convert
     *  @param srcOff index of the first code point to read in <code>src</code>
     *  @param srcLen number of code points to read, must be <code>&gt;= 0</code>
     *  @param dest the array receiving the chars
     *  @param destOff index of the first char to write in <code>dest</code>
     *  @return the number of chars written to the destination buffer
     *  @throws IllegalArgumentException if <code>srcLen</code> is negative */
    public static int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
        if (srcLen < 0) {
            throw new IllegalArgumentException("srcLen must be >= 0");
        }
        int written = 0;
        for (int i = 0; i < srcLen; ++i) {
            written += Character.toChars(src[srcOff + i], dest, destOff + written);
        }
        return written;
    }

    /**
     * Fills the {@link CharacterBuffer} with characters read from the given
     * reader {@link Reader}. This method tries to read <code>numChars</code>
     * characters into the {@link CharacterBuffer}, each call to fill will start
     * filling the buffer from offset <code>0</code> up to <code>numChars</code>.
     * In case code points can span across 2 java characters, this method may
     * only fill <code>numChars - 1</code> characters in order not to split in
     * the middle of a surrogate pair, even if there are remaining characters in
     * the {@link Reader}.
     * <p>
     * This method guarantees
     * that the given {@link CharacterBuffer} will never contain a high surrogate
     * character as the last element in the buffer unless it is the last available
     * character in the reader. In other words, high and low surrogate pairs will
     * always be preserved across buffer borders.
     * </p>
     * <p>
     * A return value of <code>false</code> means that this method call exhausted
     * the reader, but there may be some chars which have been read, which can be
     * verified by checking whether <code>buffer.getLength() &gt; 0</code>.
     * </p>
     * 
     * @param buffer
     *          the buffer to fill.
     * @param reader
     *          the reader to read characters from.
     * @param numChars
     *          the number of chars to read, must be <code>&gt;= 2</code> and
     *          <code>&lt;=</code> the buffer size
     * @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer
     * @throws IllegalArgumentException
     *           if <code>numChars</code> is out of range
     * @throws IOException
     *           if the reader throws an {@link IOException}.
     */
    public static boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException {
        assert buffer.buffer.length >= 2;
        if (numChars < 2 || numChars > buffer.buffer.length) {
            throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
        }
        final char[] charBuffer = buffer.buffer;
        buffer.offset = 0;
        final int offset;

        // Install the previously saved ending high surrogate:
        if (buffer.lastTrailingHighSurrogate != 0) {
            charBuffer[0] = buffer.lastTrailingHighSurrogate;
            buffer.lastTrailingHighSurrogate = 0;
            offset = 1;
        } else {
            offset = 0;
        }

        final int read = readFully(reader, charBuffer, offset, numChars - offset);

        buffer.length = offset + read;
        final boolean result = buffer.length == numChars;
        if (buffer.length < numChars) {
            // We failed to fill the buffer. Even if the last char is a high
            // surrogate, there is nothing we can do
            return result;
        }

        // The buffer is full: hold back a trailing high surrogate so that the
        // next call can place it in front of its low surrogate.
        if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
            buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
        }
        return result;
    }

    /** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
    public static boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
        return fill(buffer, reader, buffer.buffer.length);
    }

    /**
     * Reads from the reader until <code>len</code> chars have been stored or
     * the reader reports end of stream (<code>-1</code>).
     *
     * @return the number of chars actually read, between 0 and <code>len</code>
     */
    static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
        int read = 0;
        while (read < len) {
            final int r = reader.read(dest, offset + read, len - read);
            if (r == -1) {
                break;
            }
            read += r;
        }
        return read;
    }

    /**
     * A simple IO buffer to use with
     * {@link CharacterUtils#fill(CharacterBuffer, Reader)}.
     */
    public static final class CharacterBuffer {

        private final char[] buffer;
        private int offset;
        private int length;
        // NOTE: not private so outer class can access without
        // $access methods:
        // Holds a high surrogate cut off at the end of the previous fill, or 0 if none.
        char lastTrailingHighSurrogate;

        CharacterBuffer(char[] buffer, int offset, int length) {
            this.buffer = buffer;
            this.offset = offset;
            this.length = length;
        }

        /**
         * Returns the internal buffer
         * 
         * @return the buffer
         */
        public char[] getBuffer() {
            return buffer;
        }

        /**
         * Returns the data offset in the internal buffer.
         * 
         * @return the offset
         */
        public int getOffset() {
            return offset;
        }

        /**
         * Return the length of the data in the internal buffer starting at
         * {@link #getOffset()}
         * 
         * @return the length
         */
        public int getLength() {
            return length;
        }

        /**
         * Resets the CharacterBuffer. All internals are reset to its default
         * values.
         */
        public void reset() {
            offset = 0;
            length = 0;
            lastTrailingHighSurrogate = 0;
        }
    }

}