Example usage for java.text.BreakIterator.next()

Introduction

This page collects example usages of the next() method of java.text.BreakIterator.

Prototype

public abstract int next();

Source Link

Document

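Returns the boundary following the current boundary. If the current boundary is the last text boundary, it returns BreakIterator.DONE and the iterator's position is left unchanged.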

Usage

From source file:Main.java

/**
 * Builds the frame and fills the output area with a sample text in which
 * every character boundary (for the default locale) is marked with "|".
 */
public TextBoundaryFrame() {
    getContentPane().add(new JScrollPane(outputText));

    BreakIterator charBoundaries = BreakIterator.getCharacterInstance(Locale.getDefault());

    String text = "The quick, brown fox jump-ed\n" + "over the lazy \"dog.\" And then...what happened?";
    charBoundaries.setText(text);
    outputText.setText("");

    // Walk consecutive boundary pairs and emit each segment followed by a marker.
    int segmentStart = charBoundaries.first();
    for (int segmentEnd = charBoundaries.next();
            segmentEnd != BreakIterator.DONE;
            segmentEnd = charBoundaries.next()) {
        outputText.append(text.substring(segmentStart, segmentEnd) + "|");
        segmentStart = segmentEnd;
    }

    // Append whatever follows the last boundary that was reported.
    outputText.append(text.substring(segmentStart));
}

From source file:be.idamediafoundry.sofa.livecycle.dsc.util.AbstractQDoxComponentInfoExtractor.java

/**
 * Extracts the first sentence of a text, using a sentence
 * {@link BreakIterator} for the default locale.
 *
 * @param text the text to inspect; may be {@code null}
 * @return the trimmed first sentence; the text itself, unmodified, when the
 *         iterator reports no boundary after the start; {@code null} when
 *         {@code text} is {@code null}
 */
final protected String getFirstSentence(String text) {
    if (text == null) {
        return text;
    }

    BreakIterator sentences = BreakIterator.getSentenceInstance();
    sentences.setText(text);

    int begin = sentences.first();
    int finish = sentences.next();

    return finish == BreakIterator.DONE ? text : text.substring(begin, finish).trim();
}

From source file:com.norconex.importer.handler.tagger.impl.TextStatisticsTagger.java

/**
 * Reads the document line by line and records text statistics in the
 * metadata under keys starting with {@code "document.stat."}.
 *
 * Each non-blank line (after trimming) counts as one paragraph. Characters
 * are the trimmed line lengths, words are the matches of
 * {@code PATTERN_WORD}, and sentences are the segments reported by a
 * default-locale sentence {@link BreakIterator}. Averages are produced by
 * {@code divide(...)} and stored as strings.
 *
 * @param reference document reference (not used by this method)
 * @param input     reader over the document text
 * @param metadata  metadata that receives the statistic fields
 * @param parsed    whether the document was parsed (not used by this method)
 * @throws ImporterHandlerException declared by the overridden method
 */
@Override
protected void tagTextDocument(String reference, Reader input, ImporterMetadata metadata, boolean parsed)
        throws ImporterHandlerException {
    long charCount = 0;
    long wordCharCount = 0;
    long wordCount = 0;
    long sentenceCount = 0;
    long sentenceCharCount = 0;
    long paragraphCount = 0;

    // One iterator serves every line: setText() resets its position, so
    // there is no need to obtain a fresh instance per line.
    BreakIterator boundary = BreakIterator.getSentenceInstance();

    //TODO make this more efficient, by doing all this in one pass.
    LineIterator it = IOUtils.lineIterator(input);
    while (it.hasNext()) {
        String line = it.nextLine().trim();
        if (StringUtils.isBlank(line)) {
            continue;
        }

        // Paragraph
        paragraphCount++;

        // Character
        charCount += line.length();

        // Word
        Matcher matcher = PATTERN_WORD.matcher(line);
        while (matcher.find()) {
            int wordLength = matcher.end() - matcher.start();
            wordCount++;
            wordCharCount += wordLength;
        }

        // Sentence
        boundary.setText(line);
        int start = boundary.first();
        for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
            sentenceCharCount += (end - start);
            sentenceCount++;
        }
    }

    // Optional user-configured qualifier inserted into every key.
    String field = StringUtils.EMPTY;
    if (StringUtils.isNotBlank(fieldName)) {
        field = fieldName.trim() + ".";
    }

    //--- Add fields ---
    metadata.addLong("document.stat." + field + "characterCount", charCount);
    metadata.addLong("document.stat." + field + "wordCount", wordCount);
    metadata.addLong("document.stat." + field + "sentenceCount", sentenceCount);
    metadata.addLong("document.stat." + field + "paragraphCount", paragraphCount);
    metadata.addString("document.stat." + field + "averageWordCharacterCount",
            divide(wordCharCount, wordCount));
    metadata.addString("document.stat." + field + "averageSentenceCharacterCount",
            divide(sentenceCharCount, sentenceCount));
    metadata.addString("document.stat." + field + "averageSentenceWordCount", divide(wordCount, sentenceCount));
    metadata.addString("document.stat." + field + "averageParagraphCharacterCount",
            divide(charCount, paragraphCount));
    metadata.addString("document.stat." + field + "averageParagraphSentenceCount",
            divide(sentenceCount, paragraphCount));
    metadata.addString("document.stat." + field + "averageParagraphWordCount",
            divide(wordCount, paragraphCount));

}

From source file:com.glaf.core.util.StringTools.java

/**
 * Splits a text into words with a default-locale word {@link BreakIterator}
 * and returns them in lower case.
 *
 * Each segment is trimmed, lower-cased and stripped of the characters
 * {@code + / \ # * ) ( &}; segments that end up empty (for example pure
 * whitespace) are dropped.
 *
 * @param text the text to split; may be {@code null}
 * @return the lower-cased words in order of appearance; an empty array when
 *         {@code text} is {@code null} or empty
 */
public static String[] toLowerCaseWordArray(String text) {
    if (text == null || text.length() == 0) {
        return new String[0];
    }

    List<String> wordList = new java.util.ArrayList<String>();
    BreakIterator boundary = BreakIterator.getWordInstance();
    boundary.setText(text);
    int start = 0;

    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        // Lower-casing was missing although the method name promises it;
        // Locale.ROOT keeps the result independent of the JVM's default locale.
        String tmp = text.substring(start, end).trim().toLowerCase(java.util.Locale.ROOT);
        // Remove characters that are not needed.
        tmp = replace(tmp, "+", "");
        tmp = replace(tmp, "/", "");
        tmp = replace(tmp, "\\", "");
        tmp = replace(tmp, "#", "");
        tmp = replace(tmp, "*", "");
        tmp = replace(tmp, ")", "");
        tmp = replace(tmp, "(", "");
        tmp = replace(tmp, "&", "");
        if (tmp.length() > 0) {
            wordList.add(tmp);
        }
    }
    return wordList.toArray(new String[wordList.size()]);
}

From source file:IteratorTest.java

/**
 * Re-splits the text area's content with the break iterator selected by the
 * granularity buttons (character, word, line or sentence) for the chosen
 * locale, and shows the resulting segments in the item list.
 *
 * If none of the granularity buttons is selected, the list is emptied
 * instead of failing with a NullPointerException.
 */
protected void refreshDisplay() {
    Vector<String> items = new Vector<String>();
    String msgText = textArea.getText();
    Locale locale = (Locale) (localeButton.getSelectedItem());

    BreakIterator iterator = null;
    if (charButton.isSelected()) {
        iterator = BreakIterator.getCharacterInstance(locale);
    } else if (wordButton.isSelected()) {
        iterator = BreakIterator.getWordInstance(locale);
    } else if (lineButton.isSelected()) {
        iterator = BreakIterator.getLineInstance(locale);
    } else if (sentButton.isSelected()) {
        iterator = BreakIterator.getSentenceInstance(locale);
    }

    // Without a selected granularity there is nothing to split.
    if (iterator != null) {
        iterator.setText(msgText);
        int startIndex = iterator.first();
        for (int nextIndex = iterator.next(); nextIndex != BreakIterator.DONE; nextIndex = iterator.next()) {
            items.addElement(msgText.substring(startIndex, nextIndex));
            startIndex = nextIndex;
        }
    }
    itemList.setListData(items);
}

From source file:com.redhat.rcm.version.Cli.java

/**
 * Wraps a line of text at line-break opportunities and prints each wrapped
 * line, prefixed with the indent and left-justified in a column of
 * {@code max} characters.
 *
 * A segment that is itself longer than {@code max} is printed on a line of
 * its own; it is never split and no blank line is emitted in front of it.
 *
 * @param line   the text to wrap
 * @param indent prefix written before every output line
 * @param max    maximum number of characters per wrapped line (also the
 *               padded column width)
 * @param pw     destination writer
 */
private static void printTextLine(final String line, final String indent, final int max, final PrintWriter pw) {
    final String fmt = "%s%-" + max + "s\n";

    final List<String> lines = new ArrayList<String>();

    final BreakIterator iter = BreakIterator.getLineInstance();
    iter.setText(line);

    final StringBuilder currentLine = new StringBuilder();
    int start = iter.first();
    for (int end = iter.next(); end != BreakIterator.DONE; start = end, end = iter.next()) {
        final String seg = line.substring(start, end);
        // Flush only a non-empty buffer: an over-long segment arriving while
        // the buffer is empty must not produce a blank output line.
        if (currentLine.length() > 0 && currentLine.length() + seg.length() > max) {
            lines.add(currentLine.toString());
            currentLine.setLength(0);
        }

        currentLine.append(seg);
    }

    if (currentLine.length() > 0) {
        lines.add(currentLine.toString());
    }

    for (final String ln : lines) {
        pw.printf(fmt, indent, ln);
    }
}

From source file:com.redhat.rcm.version.Cli.java

/**
 * Prints a key/value pair, wrapping the value at line-break opportunities.
 * The first output line carries the key; continuation lines use an empty
 * string in the key position so the two-field format keeps its alignment.
 *
 * A value segment longer than {@code valMax} is kept whole on its own line
 * and no longer pushes an empty line in front of it (which previously left
 * the key's line without any value text).
 *
 * @param key    the key, printed on the first line only
 * @param value  the value text to wrap
 * @param fmt    format string taking two arguments (key, value line)
 * @param valMax maximum number of characters per wrapped value line
 * @param pw     destination writer
 */
private static void printKVLine(final String key, final String value, final String fmt, final int valMax,
        final PrintWriter pw) {
    final List<String> lines = new ArrayList<String>();

    final BreakIterator iter = BreakIterator.getLineInstance();
    iter.setText(value);

    final StringBuilder currentLine = new StringBuilder();
    int start = iter.first();
    for (int end = iter.next(); end != BreakIterator.DONE; start = end, end = iter.next()) {
        final String seg = value.substring(start, end);
        // Flush only a non-empty buffer; see the method comment.
        if (currentLine.length() > 0 && currentLine.length() + seg.length() > valMax) {
            lines.add(currentLine.toString());
            currentLine.setLength(0);
        }

        currentLine.append(seg);
    }

    if (currentLine.length() > 0) {
        lines.add(currentLine.toString());
    }

    pw.printf(fmt, key, lines.isEmpty() ? "" : lines.get(0));
    for (int i = 1; i < lines.size(); i++) {
        // blank string to serve for indentation in format with two fields.
        pw.printf(fmt, "", lines.get(i));
    }
}

From source file:Utils.java

/**
 * Wrap multi-line strings (and get the individual lines).
 * /*from  w w  w. j av  a 2 s . c  o  m*/
 * @param original
 *          the original string to wrap
 * @param width
 *          the maximum width of lines
 * @param breakIterator
 *          breaks original to chars, words, sentences, depending on what
 *          instance you provide.
 * @param removeNewLines
 *          if <code>true</code>, any newlines in the original string are
 *          ignored
 * @return the lines after wrapping
 */
public static String[] wrapStringToArray(String original, int width, BreakIterator breakIterator,
        boolean removeNewLines) {
    if (original.length() == 0) {
        return new String[] { original };
    }

    String[] workingSet;

    // substitute original newlines with spaces,
    // remove newlines from head and tail
    if (removeNewLines) {
        original = trimString(original);
        original = original.replace('\n', ' ');
        workingSet = new String[] { original };
    } else {
        StringTokenizer tokens = new StringTokenizer(original, "\n"); // NOI18N
        int len = tokens.countTokens();
        workingSet = new String[len];

        for (int i = 0; i < len; i++) {
            workingSet[i] = tokens.nextToken();
        }
    }

    if (width < 1) {
        width = 1;
    }

    if (original.length() <= width) {
        return workingSet;
    }

    widthcheck: {
        boolean ok = true;

        for (int i = 0; i < workingSet.length; i++) {
            ok = ok && (workingSet[i].length() < width);

            if (!ok) {
                break widthcheck;
            }
        }

        return workingSet;
    }

    java.util.ArrayList<String> lines = new java.util.ArrayList<String>();

    int lineStart = 0; // the position of start of currently processed line in
                       // the original string

    for (int i = 0; i < workingSet.length; i++) {
        if (workingSet[i].length() < width) {
            lines.add(workingSet[i]);
        } else {
            breakIterator.setText(workingSet[i]);

            int nextStart = breakIterator.next();
            int prevStart = 0;

            do {
                while (((nextStart - lineStart) < width) && (nextStart != BreakIterator.DONE)) {
                    prevStart = nextStart;
                    nextStart = breakIterator.next();
                }

                if (nextStart == BreakIterator.DONE) {
                    nextStart = prevStart = workingSet[i].length();
                }

                if (prevStart == 0) {
                    prevStart = nextStart;
                }

                lines.add(workingSet[i].substring(lineStart, prevStart));

                lineStart = prevStart;
                prevStart = 0;
            } while (lineStart < workingSet[i].length());

            lineStart = 0;
        }
    }

    String[] s = new String[lines.size()];

    return (String[]) lines.toArray(s);
}

From source file:com.tao.realweb.util.StringUtil.java

/**
 * Converts a line of text into an array of lower case words using a
 * BreakIterator.wordInstance().<p>
 *
 * This method is under the Jive Open Source Software License and was
 * written by Mark Imbriaco./*  www.  j  av a  2s  .c  o m*/
 *
 * @param text a String of text to convert into an array of words
 * @return text broken up into an array of words.
 */
public static String[] toLowerCaseWordArray(String text) {
    if (text == null || text.length() == 0) {
        return new String[0];
    }

    List<String> wordList = new ArrayList<String>();
    BreakIterator boundary = BreakIterator.getWordInstance();
    boundary.setText(text);
    int start = 0;

    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String tmp = text.substring(start, end).trim();
        // Remove characters that are not needed.
        tmp = replace(tmp, "+", "");
        tmp = replace(tmp, "/", "");
        tmp = replace(tmp, "\\", "");
        tmp = replace(tmp, "#", "");
        tmp = replace(tmp, "*", "");
        tmp = replace(tmp, ")", "");
        tmp = replace(tmp, "(", "");
        tmp = replace(tmp, "&", "");
        if (tmp.length() > 0) {
            wordList.add(tmp);
        }
    }
    return wordList.toArray(new String[wordList.size()]);
}

From source file:org.cloudgraph.examples.test.model.NLPWikiParseTest.java

/**
 * Splits the buffer's content into sentences (US locale rules) and logs each
 * sentence together with the elapsed time measured around its processing
 * step.
 *
 * @param buf the text to split into sentences
 * @throws IOException declared by the signature; nothing in the current body
 *                     throws it
 */
private void parse(StringBuilder buf) throws IOException {
    BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US);

    String text = buf.toString();
    iterator.setText(text);

    int firstIndex = iterator.first();
    for (int lastIndex = iterator.next(); lastIndex != BreakIterator.DONE;
            firstIndex = lastIndex, lastIndex = iterator.next()) {
        String sentence = text.substring(firstIndex, lastIndex);
        long before = System.currentTimeMillis();
        // NOTE: the per-sentence processing call (parse(sentence)) is disabled,
        // so the two timestamps currently bracket no work.
        long after = System.currentTimeMillis();
        log.info("time4: " + String.valueOf(after - before) + ": " + sentence);
    }

}