Example usage for java.text BreakIterator DONE

List of usage examples for java.text BreakIterator DONE

Introduction

This page collects usage examples for the constant java.text.BreakIterator.DONE.

Prototype

int DONE

To view the source code for java.text BreakIterator DONE, click Source Link.

Document

DONE is returned by previous(), next(), next(int), preceding(int) and following(int) when either the first or last text boundary has been reached.

Usage

From source file:de.tudarmstadt.lt.lm.service.BreakIteratorStringProvider.java

/**
 * Splits a sentence into tokens along the word boundaries reported by a
 * {@link BreakIterator} created for the locale derived from {@code language_code}.
 *
 * <p>The sentence is trimmed first. Every segment between two consecutive boundaries is
 * passed through {@code StringUtils.trim_and_replace_emptyspace(segment, "_")}, and
 * segments for which that helper yields an empty string are dropped.
 *
 * @param sentence      the text to tokenize; must not be {@code null}
 * @param language_code locale identifier handed to {@code LocaleUtils.toLocale}
 * @return the non-empty tokens in order of appearance
 */
@Override
public List<String> tokenizeSentence_intern(String sentence, String language_code) {
    // The boundaries are computed on the trimmed text, so the tokens have to be cut from
    // that same string. Slicing the untrimmed input would shift every token by the number
    // of leading whitespace characters.
    final String text = sentence.trim();

    List<String> tokens = new ArrayList<String>();
    BreakIterator token_bounds = BreakIterator.getWordInstance(LocaleUtils.toLocale(language_code));
    token_bounds.setText(text);

    int begin_t = token_bounds.first();
    for (int end_t = token_bounds.next(); end_t != BreakIterator.DONE; begin_t = end_t, end_t = token_bounds
            .next()) {
        String token = de.tudarmstadt.lt.utilities.StringUtils
                .trim_and_replace_emptyspace(text.substring(begin_t, end_t), "_");
        if (!token.isEmpty()) { // add token iff token is not empty
            tokens.add(token);
        }
    }
    return tokens;
}

From source file:com.norconex.importer.handler.tagger.impl.TextStatisticsTagger.java

/**
 * Reads the document text line by line and stores character, word, sentence and
 * paragraph statistics in the metadata.
 *
 * <p>Every non-blank line (after trimming) counts as one paragraph. Words are the matches
 * of {@code PATTERN_WORD}; sentences are the segments reported by the default-locale
 * sentence {@link BreakIterator}. All keys are written under {@code "document.stat."},
 * followed by the trimmed {@code fieldName} and a dot when {@code fieldName} is not blank.
 *
 * @param reference document reference (not used by this method)
 * @param input     the text to analyse
 * @param metadata  receives the computed counts and averages
 * @param parsed    whether the document was parsed (not used by this method)
 * @throws ImporterHandlerException declared by the overridden method
 */
@Override
protected void tagTextDocument(String reference, Reader input, ImporterMetadata metadata, boolean parsed)
        throws ImporterHandlerException {
    long charCount = 0;
    long wordCharCount = 0;
    long wordCount = 0;
    long sentenceCount = 0;
    long sentenceCharCount = 0;
    long paragraphCount = 0;

    // A single iterator is enough: setText() re-targets it for each line, so there is no
    // need to create a new instance per line.
    BreakIterator boundary = BreakIterator.getSentenceInstance();

    //TODO make this more efficient, by doing all this in one pass.
    LineIterator it = IOUtils.lineIterator(input);
    while (it.hasNext()) {
        String line = it.nextLine().trim();
        if (StringUtils.isBlank(line)) {
            continue;
        }

        // Paragraph
        paragraphCount++;

        // Character
        charCount += line.length();

        // Word
        Matcher matcher = PATTERN_WORD.matcher(line);
        while (matcher.find()) {
            int wordLength = matcher.end() - matcher.start();
            wordCount++;
            wordCharCount += wordLength;
        }

        // Sentence
        boundary.setText(line);
        int start = boundary.first();
        for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
            sentenceCharCount += (end - start);
            sentenceCount++;
        }
    }

    String field = StringUtils.EMPTY;
    if (StringUtils.isNotBlank(fieldName)) {
        field = fieldName.trim() + ".";
    }

    //--- Add fields ---
    metadata.addLong("document.stat." + field + "characterCount", charCount);
    metadata.addLong("document.stat." + field + "wordCount", wordCount);
    metadata.addLong("document.stat." + field + "sentenceCount", sentenceCount);
    metadata.addLong("document.stat." + field + "paragraphCount", paragraphCount);
    metadata.addString("document.stat." + field + "averageWordCharacterCount",
            divide(wordCharCount, wordCount));
    metadata.addString("document.stat." + field + "averageSentenceCharacterCount",
            divide(sentenceCharCount, sentenceCount));
    metadata.addString("document.stat." + field + "averageSentenceWordCount", divide(wordCount, sentenceCount));
    metadata.addString("document.stat." + field + "averageParagraphCharacterCount",
            divide(charCount, paragraphCount));
    metadata.addString("document.stat." + field + "averageParagraphSentenceCount",
            divide(sentenceCount, paragraphCount));
    metadata.addString("document.stat." + field + "averageParagraphWordCount",
            divide(wordCount, paragraphCount));

}

From source file:graphene.util.StringUtils.java

/**
 * Convert a string to a list of strings broken up by end of sentence
 * tokens./*from  w w w.  j  a  v a  2s . com*/
 * 
 * @param input
 * @param locale
 * @return
 */
public static List<String> convertToSentences(final String input, final Locale locale) {
    final BreakIterator iterator = BreakIterator.getSentenceInstance(locale);
    iterator.setText(input);
    final ArrayList<String> sentences = new ArrayList<String>();

    int start = iterator.first();

    for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) {
        sentences.add(input.substring(start, end));
    }
    return sentences;
}

From source file:be.idamediafoundry.sofa.livecycle.dsc.util.AbstractQDoxComponentInfoExtractor.java

/**
 * Extracts the first sentence of a text using the default-locale sentence
 * {@link BreakIterator}.
 *
 * @param text the text to inspect; may be {@code null}
 * @return the trimmed first sentence; the text itself, unchanged, when it is
 *         {@code null} or when the iterator reports no boundary after the start
 */
final protected String getFirstSentence(String text) {
    if (text == null) {
        return text;
    }

    BreakIterator sentences = BreakIterator.getSentenceInstance();
    sentences.setText(text);
    int from = sentences.first();
    int to = sentences.next();

    // DONE means there is no boundary after the start, so return the input as it is.
    return to == BreakIterator.DONE ? text : text.substring(from, to).trim();
}

From source file:com.conversantmedia.mapreduce.tool.RunJob.java

/**
 * Wraps {@code text} at line-break opportunities and appends the resulting lines to
 * {@code lines}.
 *
 * <p>Segments between consecutive boundaries of the default-locale line
 * {@link BreakIterator} are accumulated until adding the next one would exceed
 * {@code maxLength}; the accumulated text is then emitted as one line. A single segment
 * longer than {@code maxLength} is kept intact on a line of its own. One line is always
 * appended for the remaining text, so an empty {@code text} contributes one empty line.
 *
 * @param lines     receives the wrapped lines, in order
 * @param text      the text to wrap; must not be {@code null}
 * @param maxLength the maximum line length in characters, measured including the
 *                  trailing whitespace of each segment
 */
private static void splitLine(List<String> lines, String text, int maxLength) {
    BreakIterator boundary = BreakIterator.getLineInstance();
    boundary.setText(text);

    StringBuilder buffer = new StringBuilder();
    int start = boundary.first();
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String word = text.substring(start, end);
        // Flush only when something has been collected. Without this check, an
        // over-long first segment would emit a spurious empty line.
        if (buffer.length() > 0 && buffer.length() + word.length() > maxLength) {
            lines.add(buffer.toString());
            buffer.setLength(0);
        }
        buffer.append(word);
    }
    lines.add(buffer.toString());
}

From source file:net.sf.jasperreports.engine.fill.SimpleTextLineWrapper.java

/**
 * Measures one line of the current paragraph, starting at {@code paragraphPosition}.
 *
 * <p>The method starts from the index returned by {@code estimateBreakIndex}. If the text
 * up to that index fits in {@code width}, it is extended break by break while it still
 * fits; otherwise it is shortened break by break until it fits. When the estimate is
 * before {@code endLimit} and {@code measureExactMultiline} is set, the work is delegated
 * to {@code measureExactLine} instead.
 *
 * @param width       the available line width
 * @param requireWord when {@code true}, no break is made inside a word: if not even the
 *                    first break fits, nothing is returned
 * @param endLimit    the index that the line must not extend past
 * @return the measured line, or {@code null} when the resulting index is not past
 *         {@code paragraphPosition} (nothing fit)
 */
protected TextLine measureLine(float width, boolean requireWord, int endLimit) {
    // try to guess how much of the text would fit based on the average char width
    int measureIndex = estimateBreakIndex(width, endLimit);

    // if estimating that there's more than a line, check measureExactMultiline
    if (measureIndex < endLimit && measureExactMultiline) {
        return measureExactLine(width, endLimit, requireWord);
    }

    // measure the text
    Rectangle2D bounds = measureParagraphFragment(measureIndex);
    //FIXME fast exit when the height is exceeded

    Rectangle2D measuredBounds = bounds;
    if (bounds.getWidth() <= width) {
        // see if there's more that could fit
        boolean done = false;
        do {
            // DONE doubles as "no further break" when the estimate is already at the limit
            int nextBreakIndex = measureIndex < endLimit ? paragraphBreakIterator.following(measureIndex)
                    : BreakIterator.DONE;
            if (nextBreakIndex == BreakIterator.DONE || nextBreakIndex > endLimit) {
                // the next break is after the limit, we're done
                done = true;
            } else {
                // measure to the next break
                Rectangle2D nextBounds = measureParagraphFragment(nextBreakIndex);
                if (nextBounds.getWidth() <= width) {
                    measuredBounds = nextBounds;
                    measureIndex = nextBreakIndex;
                    // loop
                } else {
                    done = true;
                }
            }
        } while (!done);
    } else {
        // didn't fit, try shorter texts
        boolean done = false;
        do {
            // DONE doubles as "no earlier break" when already at the paragraph start
            int previousBreakIndex = measureIndex > paragraphPosition
                    ? paragraphBreakIterator.preceding(measureIndex)
                    : BreakIterator.DONE;
            if (previousBreakIndex == BreakIterator.DONE || previousBreakIndex <= paragraphPosition) {
                if (requireWord) {
                    // no full word fits, returning empty
                    measureIndex = paragraphPosition;
                } else {
                    // we need to break inside the word.
                    // measuring the exact break index as estimating/guessing might be slower.
                    measureIndex = measureExactLineBreakIndex(width, endLimit, requireWord);
                    measuredBounds = measureParagraphFragment(measureIndex);
                }
                done = true;
            } else {
                measureIndex = previousBreakIndex;
                Rectangle2D prevBounds = measureParagraphFragment(measureIndex);
                if (prevBounds.getWidth() <= width) {
                    // fitted, we're done
                    measuredBounds = prevBounds;
                    done = true;
                }
            }
        } while (!done);
    }

    if (measureIndex <= paragraphPosition) {
        // nothing fit
        return null;
    }
    return toTextLine(measureIndex, measuredBounds);
}

From source file:com.tao.realweb.util.StringUtil.java

/**
 * Converts a line of text into an array of lower case words using a
 * BreakIterator.wordInstance().<p>
 *
 * The characters {@code + / \ # * ) ( &} are removed from every word, and segments that
 * are empty after trimming and removal are skipped. Words are lower-cased with
 * {@link java.util.Locale#ROOT} so the result does not depend on the default locale.<p>
 *
 * This method is under the Jive Open Source Software License and was
 * written by Mark Imbriaco.
 *
 * @param text a String of text to convert into an array of words
 * @return text broken up into an array of lower case words; an empty array when
 *         {@code text} is {@code null} or empty
 */
public static String[] toLowerCaseWordArray(String text) {
    if (text == null || text.length() == 0) {
        return new String[0];
    }

    List<String> wordList = new ArrayList<String>();
    BreakIterator boundary = BreakIterator.getWordInstance();
    boundary.setText(text);
    int start = boundary.first();

    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        // Remove characters that are not needed.
        String tmp = text.substring(start, end).trim()
                .replace("+", "")
                .replace("/", "")
                .replace("\\", "")
                .replace("#", "")
                .replace("*", "")
                .replace(")", "")
                .replace("(", "")
                .replace("&", "");
        if (tmp.length() > 0) {
            // The method name and contract promise lower case, so normalize here.
            wordList.add(tmp.toLowerCase(java.util.Locale.ROOT));
        }
    }
    return wordList.toArray(new String[wordList.size()]);
}

From source file:net.sf.jasperreports.engine.fill.SimpleTextLineWrapper.java

/**
 * Estimates the index at which the current line is likely to break, based on the
 * average character width recorded in {@code fontInfo}.
 *
 * <p>The result is only a starting point for measuring. It is {@code endLimit} when the
 * whole remaining text is estimated to fit; otherwise it is a break position near the
 * estimated fit position.
 *
 * @param width    the available line width
 * @param endLimit the index that the estimate must not exceed
 * @return the estimated break index, at most {@code endLimit}
 */
protected int estimateBreakIndex(float width, int endLimit) {
    double avgCharWidth = fontInfo.charWidthEstimate();
    if ((endLimit - paragraphPosition) * avgCharWidth <= width * FONT_WIDTH_CHECK_FACTOR) {
        // there are chances that the entire text would fit, let's be optimistic
        return endLimit;
    }

    // estimate how many characters would fit
    int charCountEstimate = (int) Math.ceil(width / avgCharWidth);
    int estimateFitPosition = paragraphPosition + charCountEstimate;
    if (estimateFitPosition > endLimit) {
        // estimated that everything would fit
        return endLimit;
    }

    // find the break after the estimate
    int breakAfterEstimatePosition = paragraphBreakIterator.following(estimateFitPosition);
    if (breakAfterEstimatePosition == BreakIterator.DONE || breakAfterEstimatePosition > endLimit) {
        breakAfterEstimatePosition = endLimit;
    }

    int estimateIndex = breakAfterEstimatePosition;
    // if the after break is too far way from the estimate, see if the break before is closer
    if (breakAfterEstimatePosition > estimateFitPosition + NEXT_BREAK_INDEX_THRESHOLD) {
        int breakBeforeEstimatePosition = paragraphBreakIterator.previous();
        // if the break before is closer than the break after, use the break before.
        // The break must exist (!= DONE); testing for == DONE here made this branch
        // unreachable, because DONE can never be greater than paragraphPosition.
        if (breakBeforeEstimatePosition != BreakIterator.DONE && breakBeforeEstimatePosition > paragraphPosition
                && estimateFitPosition - breakBeforeEstimatePosition < breakAfterEstimatePosition
                        - estimateFitPosition) {
            estimateIndex = breakBeforeEstimatePosition;
        }
    }
    return estimateIndex;
}

From source file:net.sf.jasperreports.engine.fill.SimpleTextLineWrapper.java

/**
 * Measures the paragraph text from {@code paragraphPosition} up to the given index,
 * leaving trailing white space out of the measured range.
 *
 * <p>The average character width of the measured range is recorded in {@code fontInfo}.
 *
 * @param measureIndex the exclusive end index of the fragment to measure
 * @return the string bounds of the measured range
 */
protected Rectangle2D measureParagraphFragment(int measureIndex) {
    int fragmentEnd = measureIndex;

    boolean endsInWhitespace = fragmentEnd > paragraphPosition + 1
            && Character.isWhitespace(paragraphText.charAt(fragmentEnd - 1));
    if (endsInWhitespace) {
        // Trailing white space is stripped back to the previous break at most, and at
        // least one character always remains to be measured.
        int floor = paragraphBreakIterator.preceding(fragmentEnd);
        if (floor == BreakIterator.DONE || floor <= paragraphPosition) {
            floor = paragraphPosition + 1;
        }

        char tail;
        do {
            --fragmentEnd;
            tail = paragraphText.charAt(fragmentEnd - 1);
        } while (fragmentEnd > floor && Character.isWhitespace(tail));
    }

    // trailing white space is therefore not part of the advance
    Rectangle2D bounds = fontInfo.fontInfo.font.getStringBounds(paragraphText, paragraphPosition, fragmentEnd,
            context.getFontRenderContext());

    // feed the average character width of this fragment into the font statistics
    fontInfo.recordMeasurement(bounds.getWidth() / (fragmentEnd - paragraphPosition));

    if (logTrace) {
        log.trace("measured to index " + (fragmentEnd + paragraphOffset) + " at width " + bounds.getWidth());
    }

    return bounds;
}

From source file:com.redhat.rcm.version.Cli.java

/**
 * Wraps {@code line} at line-break opportunities and prints each wrapped line, prefixed
 * with {@code indent} and left-justified in a field of {@code max} characters.
 *
 * <p>Segments between consecutive boundaries of the default-locale line
 * {@link BreakIterator} are accumulated until adding the next one would exceed
 * {@code max}. A single segment longer than {@code max} is printed intact on its own
 * line. Nothing is printed for an empty {@code line}.
 *
 * @param line   the text to wrap and print; must not be {@code null}
 * @param indent the prefix written before every output line
 * @param max    the wrap width, also used as the minimum field width of each line
 * @param pw     the writer that receives the output
 */
private static void printTextLine(final String line, final String indent, final int max, final PrintWriter pw) {
    final String fmt = "%s%-" + max + "s\n";

    final List<String> lines = new ArrayList<String>();

    final BreakIterator iter = BreakIterator.getLineInstance();
    iter.setText(line);

    final StringBuilder currentLine = new StringBuilder();
    int start = iter.first();
    for (int end = iter.next(); end != BreakIterator.DONE; start = end, end = iter.next()) {
        final String seg = line.substring(start, end);
        // Flush only when something has been collected. Without this check, an
        // over-long first segment would print a spurious blank line.
        if (currentLine.length() > 0 && currentLine.length() + seg.length() > max) {
            lines.add(currentLine.toString());
            currentLine.setLength(0);
        }

        currentLine.append(seg);
    }

    if (currentLine.length() > 0) {
        lines.add(currentLine.toString());
    }

    for (final String ln : lines) {
        pw.printf(fmt, indent, ln);
    }
}