net.sf.jtmt.tokenizers.ParagraphTokenizer.java Source code

Java tutorial

Introduction

Here is the source code for net.sf.jtmt.tokenizers.ParagraphTokenizer.java

Source

/*
 * Copyright 2012 Nabeel Mukhtar 
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); 
 * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at 
 * 
 *  http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and
 * limitations under the License. 
 * 
 */
package net.sf.jtmt.tokenizers;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.InputStream;
import java.text.BreakIterator;

import org.apache.commons.io.FileUtils;

import com.ibm.icu.text.RuleBasedBreakIterator;

/**
 * Splits a text into consecutive chunks ("paragraphs") at the boundaries
 * reported by an ICU {@link RuleBasedBreakIterator} whose rules are loaded
 * from a classpath resource.
 *
 * <p>Usage: construct the tokenizer, call {@link #setText(String)}, then call
 * {@link #nextParagraph()} repeatedly until it returns {@code null}.
 *
 * <p>Instances keep mutable iteration state and perform no synchronization.
 */
public class ParagraphTokenizer {

    /** Classpath location of the rules used by the no-argument constructor. */
    private static final String DEFAULT_RULES_FILE = "/resources/jtmt/paragraph_break_rules.txt";

    /** Character encoding used to decode the rules resource. */
    private static final String RULES_ENCODING = "UTF-8";

    /** The text currently being tokenized; {@code null} until {@link #setText(String)} is called. */
    private String text;

    /** Offset in {@link #text} where the next paragraph starts. */
    private int index = 0;

    /** The break iterator that locates paragraph boundaries. */
    private final RuleBasedBreakIterator breakIterator;

    /**
     * Creates a tokenizer that uses the default rules resource
     * {@code /resources/jtmt/paragraph_break_rules.txt}.
     *
     * @throws Exception if the rules resource cannot be found, read or compiled
     */
    public ParagraphTokenizer() throws Exception {
        this(DEFAULT_RULES_FILE);
    }

    /**
     * Creates a tokenizer that uses the break rules stored in the given
     * classpath resource.
     *
     * @param rulesfile the resource name of the rules file, resolved with
     *        {@link Class#getResourceAsStream(String)} against this object's class
     * @throws IllegalArgumentException if {@code rulesfile} is {@code null} or
     *         no such resource exists
     * @throws Exception if the resource cannot be read or the rules are rejected
     *         by {@link RuleBasedBreakIterator}
     */
    public ParagraphTokenizer(String rulesfile) throws Exception {
        super();
        this.breakIterator = new RuleBasedBreakIterator(readRules(rulesfile));
    }

    /**
     * Sets the text to tokenize and restarts iteration from its beginning.
     *
     * @param text the new text; NOTE(review): presumably must be non-null, since
     *        it is handed straight to the break iterator - confirm against ICU
     */
    public void setText(String text) {
        this.text = text;
        this.breakIterator.setText(text);
        this.index = 0;
    }

    /**
     * Returns the next paragraph of the current text.
     *
     * <p>The returned string spans from the end of the previously returned
     * paragraph (or offset 0 after {@link #setText(String)}) up to the next
     * boundary reported by the break iterator; it is not trimmed.
     *
     * @return the next paragraph, or {@code null} once the break iterator
     *         reports {@link BreakIterator#DONE}
     */
    public String nextParagraph() {
        int end = breakIterator.next();
        if (end == BreakIterator.DONE) {
            return null;
        }
        String paragraph = text.substring(index, end);
        index = end;
        return paragraph;
    }

    /**
     * Reads the whole rules resource into a string.
     *
     * <p>The resource is read as a stream rather than through
     * {@code new File(getResource(..).getFile())}: the latter only works when
     * the resource is a plain file on disk and breaks for resources packaged
     * inside a JAR or located under a path containing URL-escaped characters
     * (for example spaces).
     *
     * @param rulesfile the resource name of the rules file
     * @return the decoded contents of the resource
     * @throws IllegalArgumentException if {@code rulesfile} is {@code null} or
     *         no such resource exists
     * @throws Exception if reading the resource fails
     */
    private String readRules(String rulesfile) throws Exception {
        if (rulesfile == null) {
            throw new IllegalArgumentException("rulesfile must not be null");
        }
        InputStream in = getClass().getResourceAsStream(rulesfile);
        if (in == null) {
            throw new IllegalArgumentException(
                    "Paragraph break rules resource not found on classpath: " + rulesfile);
        }
        try {
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            byte[] chunk = new byte[4096];
            int read;
            while ((read = in.read(chunk)) != -1) {
                buffer.write(chunk, 0, read);
            }
            return buffer.toString(RULES_ENCODING);
        } finally {
            // Always release the resource stream, even if reading fails.
            in.close();
        }
    }
}