Example usage for java.text BreakIterator next

Introduction

This page collects example usages of the java.text.BreakIterator.next() method.

Prototype

public abstract int next();

Source Link

Document

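Returns the boundary following the current boundary. If the current boundary is the last text boundary, it returns BreakIterator.DONE and the iterator's current position is unchanged.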

Usage

From source file:org.cloudgraph.examples.test.model.StanfordCoreNLPTest.java

/**
 * Splits the buffered text into sentences and parses each one in turn.
 *
 * Sentence boundaries come from a {@link BreakIterator} sentence instance for
 * {@link Locale#US}. Every sentence is handed to {@code parse(String)} and the
 * elapsed wall-clock time in milliseconds is logged together with the sentence.
 *
 * @param buf the text to split; its full contents are read via {@code toString()}
 * @throws IOException declared for callers; not raised directly by this method body
 */
private void parse(StringBuilder buf) throws IOException {
    String text = buf.toString();
    BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US);
    iterator.setText(text);

    // Walk consecutive boundary pairs: [start, end) is one sentence.
    int start = iterator.first();
    for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) {
        String sentence = text.substring(start, end);
        long before = System.currentTimeMillis();
        parse(sentence);
        long after = System.currentTimeMillis();
        log.info("time4: " + (after - before) + ": " + sentence);
    }
}

From source file:org.jivesoftware.community.util.StringUtils.java

/**
 * Splits a line of text into words using a word-instance {@link BreakIterator}.
 *
 * Each piece between two boundaries is trimmed and then passed through the
 * {@code replace} helper once for each of the symbols plus, slash, backslash,
 * hash, asterisk, both parentheses and ampersand, with an empty replacement.
 * Pieces that are empty afterwards are dropped.
 *
 * NOTE(review): despite the method name, no case conversion is visible in this
 * method body; confirm whether callers or {@code replace} are expected to do it.
 *
 * @param text the text to split; may be null
 * @return the remaining pieces in order; an empty array when text is null or empty
 */
public static String[] toLowerCaseWordArray(String text) {
    if (text == null || text.length() == 0) {
        return new String[0];
    }
    ArrayList<String> wordList = new ArrayList<String>();
    BreakIterator boundary = BreakIterator.getWordInstance();
    boundary.setText(text);
    int start = 0;
    // After setText the iterator sits at the first boundary, so the first next() ends the first piece.
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String tmp = text.substring(start, end).trim();
        tmp = replace(tmp, "+", "");
        tmp = replace(tmp, "/", "");
        tmp = replace(tmp, "\\", "");
        tmp = replace(tmp, "#", "");
        tmp = replace(tmp, "*", "");
        tmp = replace(tmp, ")", "");
        tmp = replace(tmp, "(", "");
        tmp = replace(tmp, "&", "");
        if (tmp.length() > 0) {
            wordList.add(tmp);
        }
    }

    return wordList.toArray(new String[wordList.size()]);
}

From source file:org.jivesoftware.util.StringUtils.java

/**
 * Breaks a line of text into an array of words, using the boundaries reported
 * by {@link BreakIterator#getWordInstance()}.<p>
 *
 * Every piece between two boundaries is trimmed and then run through the
 * {@code replace} helper once per unwanted symbol (plus, slash, backslash,
 * hash, asterisk, closing and opening parenthesis, ampersand) with an empty
 * replacement. Pieces that end up empty are left out of the result.<p>
 *
 * NOTE(review): despite the method name, nothing in this method body changes
 * the case of the text; confirm where lower-casing is expected to happen.<p>
 *
 * This method is under the Jive Open Source Software License and was
 * written by Mark Imbriaco.
 *
 * @param text a String of text to convert into an array of words
 * @return text broken up into an array of words; an empty array when
 *         text is null or empty.
 */
public static String[] toLowerCaseWordArray(String text) {
    if (text == null || text.isEmpty()) {
        return new String[0];
    }

    // Symbols handed to replace(), in the order they are applied.
    final String[] unwanted = { "+", "/", "\\", "#", "*", ")", "(", "&" };

    final List<String> words = new ArrayList<>();
    final BreakIterator boundary = BreakIterator.getWordInstance();
    boundary.setText(text);

    int from = 0;
    int to = boundary.next();
    while (to != BreakIterator.DONE) {
        String piece = text.substring(from, to).trim();
        for (String symbol : unwanted) {
            piece = replace(piece, symbol, "");
        }
        if (!piece.isEmpty()) {
            words.add(piece);
        }
        from = to;
        to = boundary.next();
    }
    return words.toArray(new String[words.size()]);
}

From source file:org.lnicholls.galleon.util.Tools.java

/**
 * Word-wraps text into lines whose measured width does not exceed {@code width}.
 *
 * The text is walked piece by piece using a word-instance {@link BreakIterator}.
 * A piece that, ignoring spaces, is exactly a line terminator ("\n", "\r" or
 * "\r\n") ends the current line, so blank lines in the input are preserved.
 * Otherwise the piece is appended to the current line unless that would make
 * the line wider than {@code width}, in which case the current line is emitted
 * and the piece starts a new one. Emitted lines are trimmed.
 *
 * @param width   maximum line width, in the units reported by {@code metrics}
 * @param metrics font metrics used to measure candidate lines; when null,
 *                every character is counted as 20 units wide
 * @param text    the text to lay out; may be null
 * @return the wrapped lines in order; an empty array when text is null
 */
public static String[] layout(int width, FontMetrics metrics, String text) {
    ArrayList<String> lines = new ArrayList<String>();

    if (text != null) {
        String line = "";
        BreakIterator boundary = BreakIterator.getWordInstance();
        boundary.setText(text);
        int start = boundary.first();
        for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
            String word = text.substring(start, end);
            // Whitespace pieces may carry spaces before the terminator; strip them before comparing.
            String trimmed = word.replace(" ", "");
            String candidate = line + word;
            int metricsWidth = (metrics != null) ? metrics.stringWidth(candidate) : candidate.length() * 20;

            if (trimmed.equals("\n") || trimmed.equals("\r") || trimmed.equals("\r\n")) {
                // Explicit line break: always emit, even when the line is blank.
                lines.add(line.trim());
                line = "";
            } else if (metricsWidth > width) {
                // Wrap. Skip the flush when nothing but whitespace is pending, otherwise a
                // spurious empty line would be emitted (e.g. when the separating space itself
                // overflowed on the previous step, or when the very first word is too wide).
                String finished = line.trim();
                if (finished.length() > 0) {
                    lines.add(finished);
                }
                line = word;
            } else {
                line = candidate;
            }
        }
        if (line.trim().length() > 0) {
            lines.add(line.trim());
        }
    }

    return lines.toArray(new String[0]);
}

From source file:pl.edu.icm.coansys.kwdextraction.RakeExtractor.java

/**
 * Finding words or word sequences separated by stopwords, punctuation marks
 * etc./*from w w  w.  j  a v  a 2 s .  c o m*/
 */
private void extractKeywordCandidates() {

    Map<String, KeywordCandidate> candidatesMap = new HashMap<String, KeywordCandidate>();

    BreakIterator wordIterator = BreakIterator.getWordInstance();

    wordIterator.setText(content);
    int wordStart = wordIterator.first();

    int candidateStart = wordStart;
    String candidateStr = null;
    KeywordCandidate kwdCand = new KeywordCandidate();

    for (int wordEnd = wordIterator
            .next(); wordEnd != BreakIterator.DONE; wordStart = wordEnd, wordEnd = wordIterator.next()) {

        String word = content.substring(wordStart, wordEnd).trim().toLowerCase();
        String alpha = word.replaceAll(ILLEGAL_CHARS, "");

        if (!word.isEmpty()) {

            if (stopwords.get(lang).contains(word) || word.matches("\\W+") || isNum(word)
                    || !word.equals(alpha)) {
                candidateStr = content.substring(candidateStart, wordStart);
            } else {
                kwdCand.addWord(word);
                if (wordEnd == content.length()) {
                    candidateStr = content.substring(candidateStart, wordEnd);
                }
            }
            if (candidateStr != null) {
                candidateStr = candidateStr.trim().toLowerCase().replaceAll(ILLEGAL_CHARS, "")
                        .replaceAll("\\s+", " ");
                if (!candidateStr.isEmpty()) {
                    if (candidatesMap.containsKey(candidateStr)) {
                        candidatesMap.get(candidateStr).incCounter();
                    } else {
                        kwdCand.setKeyword(candidateStr);
                        candidatesMap.put(candidateStr, kwdCand);
                    }
                }
                candidateStr = null;
                candidateStart = wordEnd;
                kwdCand = new KeywordCandidate();
            }
        }
    }

    keywordCandidates = new ArrayList<KeywordCandidate>();
    for (Entry<String, KeywordCandidate> e : candidatesMap.entrySet()) {
        keywordCandidates.add(e.getValue());
    }
}