com.norconex.commons.lang.io.TextReader.java Source code

Java tutorial

Introduction

Here is the source code for com.norconex.commons.lang.io.TextReader.java

Source

/* Copyright 2015 Norconex Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.norconex.commons.lang.io;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;

/**
 * Reads text from a character stream in chunks, splitting it wisely whenever
 * the text is larger than the maximum read size.  It first tries to split
 * after the last paragraph found in the characters read.  If there is no
 * paragraph, it tries to split after the last sentence.  If no sentence
 * can be detected, it splits after the last word.  If no word delimiter is
 * found, it returns all it could read up to the maximum read size.
 * @author Pascal Essiembre
 * @since 1.6.0
 */
public class TextReader extends Reader {

    /** Default maximum number of characters read at once (64 * FileUtils.ONE_KB). */
    public static final int DEFAULT_MAX_READ_SIZE = (int) (FileUtils.ONE_KB * 64);

    private static final int PATTERN_FLAGS = Pattern.MULTILINE | Pattern.DOTALL | Pattern.UNICODE_CHARACTER_CLASS;

    // Patterns are immutable and thread-safe: compile them once for all instances.

    /** A whitespace run holding at least two line-break characters. */
    private static final Pattern PARAGRAPH_DELIMITER_PATTERN = Pattern.compile(
            "(\\p{javaWhitespace}*[\\n\\r]\\p{javaWhitespace}*" + "[\\n\\r]\\p{javaWhitespace}*)", PATTERN_FLAGS);

    /**
     * Group 1 spans the LAST complete sentence (the greedy leading ".*"
     * pushes it as far right as possible), from the delimiter that precedes
     * it up to and including the delimiter that terminates it.
     */
    private static final Pattern SENTENCE_PATTERN = Pattern.compile(
            ".*((^\\p{javaWhitespace}*|[?!\\.]\\p{javaWhitespace}+)"
            + "([^\\p{javaWhitespace}].+?)([?!\\.]\\p{javaWhitespace}+|\\n))", PATTERN_FLAGS);

    /** A run of one or more whitespace characters. */
    private static final Pattern WORD_DELIMITER_PATTERN = Pattern.compile("(\\p{javaWhitespace}+)", PATTERN_FLAGS);

    private final BufferedReader reader;
    private final int maxReadSize;
    private final boolean removeTrailingDelimiter;

    /** Characters already pulled from the reader but not yet handed to the caller. */
    private final StringBuilder buffer = new StringBuilder();

    /**
     * Creates a new text reader, reading up to {@link #DEFAULT_MAX_READ_SIZE}
     * characters at a time when {@link #readText()} is called.
     * @param reader a Reader
     */
    public TextReader(Reader reader) {
        this(reader, DEFAULT_MAX_READ_SIZE);
    }

    /**
     * Constructor.  Trailing delimiters are kept in the returned text.
     * @param reader a Reader
     * @param maxReadSize maximum to read at once with {@link #readText()}.
     * @throws IllegalArgumentException if maxReadSize is lower than 1
     */
    public TextReader(Reader reader, int maxReadSize) {
        this(reader, maxReadSize, false);
    }

    /**
     * Constructor.
     * @param reader a Reader
     * @param maxReadSize maximum to read at once with {@link #readText()}.
     * @param removeTrailingDelimiter whether to remove the trailing delimiter
     *        (the whitespace a chunk was split on) from the returned text
     * @throws IllegalArgumentException if maxReadSize is lower than 1
     */
    public TextReader(Reader reader, int maxReadSize, boolean removeTrailingDelimiter) {
        super();
        // A size of zero would make readText() return empty strings forever
        // and a negative one would fail later with an obscure
        // NegativeArraySizeException, so reject both up front.
        if (maxReadSize < 1) {
            throw new IllegalArgumentException("maxReadSize must be greater than zero, but was: " + maxReadSize);
        }
        this.maxReadSize = maxReadSize;
        this.reader = IOUtil.toBufferedReader(reader);
        this.removeTrailingDelimiter = removeTrailingDelimiter;
    }

    /**
     * Reads characters into a portion of an array.  Characters left over by
     * a previous {@link #readText()} call are served first so that no text
     * is skipped when both reading styles are mixed.
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        if (len > 0 && buffer.length() > 0) {
            int count = Math.min(len, buffer.length());
            buffer.getChars(0, count, cbuf, off);
            buffer.delete(0, count);
            return count;
        }
        return reader.read(cbuf, off, len);
    }

    /**
     * Reads the next chunk of text, up to the maximum read size specified.
     * It tries as much as possible to break long text after the last
     * paragraph, sentence or word, before returning.  See class
     * documentation.  When a split occurs, the text following the split point
     * is kept and returned at the beginning of the next chunk.  The delimiter
     * the text was split on is either kept at the end of the returned chunk
     * or dropped altogether, based on the "removeTrailingDelimiter"
     * constructor argument.
     * @return text read, or <code>null</code> when there is no more text
     * @throws IOException problem reading text.
     */
    public String readText() throws IOException {
        // Every split consumes at least one character, so there is always
        // room for at least one more character here.
        char[] text = new char[maxReadSize - buffer.length()];
        int num = reader.read(text);
        if (num == -1) {
            // Stream exhausted: hand out pending text, if any, before
            // signaling the end with null.
            return buffer.length() == 0 ? null : drainBuffer();
        }
        buffer.append(text, 0, num);

        // Peek one character ahead: if we reached the end there is no need
        // to split, return all.
        reader.mark(1);
        boolean endReached = reader.read() == -1;
        reader.reset();
        if (endReached) {
            return drainBuffer();
        }

        String chunk = splitAtLastDelimiter(PARAGRAPH_DELIMITER_PATTERN);
        if (chunk == null) {
            chunk = splitAtLastSentence();
        }
        if (chunk == null) {
            chunk = splitAtLastDelimiter(WORD_DELIMITER_PATTERN);
        }
        if (chunk == null) {
            // No suitable place to split: return everything we have.
            chunk = drainBuffer();
        }
        return chunk;
    }

    @Override
    public void close() throws IOException {
        reader.close();
    }

    /**
     * Splits the buffer on the last occurrence of the given delimiter.
     * @return the chunk, or null if the buffer could not be split that way
     */
    private String splitAtLastDelimiter(Pattern delimiterPattern) {
        Matcher m = delimiterPattern.matcher(buffer);
        int delimiterStart = -1;
        int delimiterEnd = -1;
        while (m.find()) {
            delimiterStart = m.start();
            delimiterEnd = m.end();
        }
        if (delimiterStart == -1) {
            return null;
        }
        return cut(delimiterStart, delimiterEnd);
    }

    /**
     * Splits the buffer after the last complete sentence it holds.
     * @return the chunk, or null if no complete sentence was detected
     */
    private String splitAtLastSentence() {
        Matcher m = SENTENCE_PATTERN.matcher(buffer);
        if (!m.find()) {
            return null;
        }
        int sentenceEnd = m.end(1);
        // The removable delimiter is the whitespace trailing the sentence;
        // its punctuation is part of the text and must be preserved.
        int textEnd = sentenceEnd;
        while (textEnd > 0 && Character.isWhitespace(buffer.charAt(textEnd - 1))) {
            textEnd--;
        }
        return cut(textEnd, sentenceEnd);
    }

    /**
     * Extracts the chunk ending with the given delimiter and removes both
     * the chunk and its delimiter from the buffer, so the delimiter is never
     * found again at the start of the next chunk.
     * @return the chunk, or null (buffer untouched) if it would be empty
     */
    private String cut(int delimiterStart, int delimiterEnd) {
        int chunkEnd = removeTrailingDelimiter ? delimiterStart : delimiterEnd;
        if (chunkEnd <= 0) {
            // Would produce an empty chunk; let the caller try another way.
            return null;
        }
        String chunk = buffer.substring(0, chunkEnd);
        buffer.delete(0, delimiterEnd);
        return chunk;
    }

    /** Returns the whole buffer content and empties the buffer. */
    private String drainBuffer() {
        String all = buffer.toString();
        buffer.setLength(0);
        return all;
    }

}