Example usage for java.text.BreakIterator.next()

List of usage examples for java.text.BreakIterator.next()

Introduction

On this page you can find example usages of java.text.BreakIterator.next().

Prototype

public abstract int next();

Source Link

Document

Returns the boundary following the current boundary.

Usage

From source file:org.cloudgraph.examples.test.model.StanfordCoreNLPTest.java

/**
 * Splits the buffered text into sentences and parses them one at a time.
 * <p>
 * Sentence boundaries are located with a US-locale sentence
 * {@link BreakIterator}. Each sentence is handed to the {@code parse(String)}
 * overload (not visible in this snippet), and the wall-clock time of that call
 * is logged in milliseconds together with the sentence text.
 *
 * @param buf the text to split and parse
 * @throws IOException declared by the original signature; presumably
 *         propagated from {@code parse(String)} - confirm against that overload
 */
private void parse(StringBuilder buf) throws IOException {
    String text = buf.toString();

    BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US);
    iterator.setText(text);

    // Walk consecutive boundary pairs: [firstIndex, lastIndex) is one sentence.
    int firstIndex = iterator.first();
    for (int lastIndex = iterator.next(); lastIndex != BreakIterator.DONE;
            firstIndex = lastIndex, lastIndex = iterator.next()) {
        String sentence = text.substring(firstIndex, lastIndex);

        long before = System.currentTimeMillis();
        parse(sentence);
        long after = System.currentTimeMillis();

        log.info("time4: " + (after - before) + ": " + sentence);
    }
}

From source file:org.jivesoftware.community.util.StringUtils.java

/**
 * Breaks a line of text into word tokens using the default-locale word
 * {@link BreakIterator}.
 * <p>
 * Every segment between two consecutive boundaries is trimmed and passed
 * through the {@code replace} helper (defined elsewhere in this class) for
 * each of {@code + / \ # * ) ( &} with an empty replacement; segments that
 * end up empty are discarded.
 * <p>
 * NOTE(review): despite the method name, nothing in this body changes the
 * case of the text - confirm whether callers lower-case the input themselves
 * before relying on lower-case output.
 *
 * @param text the text to split; may be {@code null}
 * @return the non-empty tokens in order of appearance, or an empty array when
 *         {@code text} is {@code null} or empty
 */
public static String[] toLowerCaseWordArray(String text) {
    if (text == null || text.length() == 0) {
        return new String[0];
    }
    ArrayList<String> wordList = new ArrayList<String>();
    BreakIterator boundary = BreakIterator.getWordInstance();
    boundary.setText(text);
    int start = 0;
    // Use the named sentinel rather than a bare -1 for end-of-iteration.
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String tmp = text.substring(start, end).trim();
        // Remove characters that are not needed.
        tmp = replace(tmp, "+", "");
        tmp = replace(tmp, "/", "");
        tmp = replace(tmp, "\\", "");
        tmp = replace(tmp, "#", "");
        tmp = replace(tmp, "*", "");
        tmp = replace(tmp, ")", "");
        tmp = replace(tmp, "(", "");
        tmp = replace(tmp, "&", "");
        if (tmp.length() > 0) {
            wordList.add(tmp);
        }
    }

    return wordList.toArray(new String[wordList.size()]);
}

From source file:org.jivesoftware.util.StringUtils.java

/**
 * Splits a line of text into word tokens located with
 * {@link BreakIterator#getWordInstance()}.<p>
 *
 * Each segment between two consecutive word boundaries is trimmed and run
 * through the {@code replace} helper (defined elsewhere in this class) for
 * each of {@code + / \ # * ) ( &} with an empty replacement; segments that
 * are empty afterwards are skipped.<p>
 *
 * This method is under the Jive Open Source Software License and was
 * written by Mark Imbriaco.<p>
 *
 * NOTE(review): despite the method name, no case conversion happens in this
 * body - confirm whether callers are expected to lower-case the input.
 *
 * @param text a String of text to convert into an array of words
 * @return text broken up into an array of words; an empty array when
 *         {@code text} is {@code null} or empty.
 */
public static String[] toLowerCaseWordArray(String text) {
    if (text == null || text.isEmpty()) {
        return new String[0];
    }

    // Stripped in this exact order from every token.
    final String[] unwanted = { "+", "/", "\\", "#", "*", ")", "(", "&" };

    BreakIterator wordBreaks = BreakIterator.getWordInstance();
    wordBreaks.setText(text);

    List<String> tokens = new ArrayList<>();
    int from = 0;
    int to = wordBreaks.next();
    while (to != BreakIterator.DONE) {
        String token = text.substring(from, to).trim();
        for (String symbol : unwanted) {
            token = replace(token, symbol, "");
        }
        if (!token.isEmpty()) {
            tokens.add(token);
        }
        from = to;
        to = wordBreaks.next();
    }
    return tokens.toArray(new String[tokens.size()]);
}

From source file:org.lnicholls.galleon.util.Tools.java

/**
 * Word-wraps {@code text} into lines no wider than {@code width}.
 * <p>
 * The text is tokenised with the default-locale word {@link BreakIterator}
 * and tokens are appended to the current line until adding the next one
 * would exceed {@code width}. A token consisting of a line terminator
 * ({@code \n}, {@code \r} or {@code \r\n}, ignoring spaces) always ends the
 * current line, so consecutive terminators yield empty lines. Every emitted
 * line is trimmed.
 * <p>
 * A token that is itself wider than {@code width} is placed alone on its own
 * line; no empty line is emitted in front of it when the current line holds
 * nothing but whitespace.
 *
 * @param width   maximum line width, in the units reported by {@code metrics}
 * @param metrics font metrics used to measure text; when {@code null} each
 *                character is estimated as 20 units wide
 * @param text    the text to wrap; may be {@code null}
 * @return the wrapped lines in order; an empty array when {@code text} is
 *         {@code null} or contains no tokens
 */
public static String[] layout(int width, FontMetrics metrics, String text) {
    ArrayList<String> lines = new ArrayList<String>();

    if (text != null) {
        String line = "";
        BreakIterator boundary = BreakIterator.getWordInstance();
        boundary.setText(text);
        int start = boundary.first();
        for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
            String word = text.substring(start, end);
            String trimmed = word.replaceAll(" ", "");
            String candidate = line + word;
            int metricsWidth = (metrics != null)
                    ? metrics.stringWidth(candidate)
                    : candidate.length() * 20;

            if (trimmed.equals("\n") || trimmed.equals("\r") || trimmed.equals("\r\n")) {
                // Explicit line break: always emit the line, even if it is empty.
                lines.add(line.trim());
                line = "";
            } else if (metricsWidth > width) {
                // Overflow: only emit the pending line if it has content, so an
                // over-long first word does not produce a spurious blank line.
                String finished = line.trim();
                if (finished.length() > 0) {
                    lines.add(finished);
                }
                line = word;
            } else {
                line = candidate;
            }
        }
        if (line.trim().length() > 0) {
            lines.add(line.trim());
        }
    }

    return lines.toArray(new String[0]);
}

From source file:pl.edu.icm.coansys.kwdextraction.RakeExtractor.java

/**
 * Finding words or word sequences separated by stopwords, punctuation marks
 * etc./*from w w  w.  j  a v  a 2 s .  c o m*/
 */
private void extractKeywordCandidates() {

    Map<String, KeywordCandidate> candidatesMap = new HashMap<String, KeywordCandidate>();

    BreakIterator wordIterator = BreakIterator.getWordInstance();

    wordIterator.setText(content);
    int wordStart = wordIterator.first();

    int candidateStart = wordStart;
    String candidateStr = null;
    KeywordCandidate kwdCand = new KeywordCandidate();

    for (int wordEnd = wordIterator
            .next(); wordEnd != BreakIterator.DONE; wordStart = wordEnd, wordEnd = wordIterator.next()) {

        String word = content.substring(wordStart, wordEnd).trim().toLowerCase();
        String alpha = word.replaceAll(ILLEGAL_CHARS, "");

        if (!word.isEmpty()) {

            if (stopwords.get(lang).contains(word) || word.matches("\\W+") || isNum(word)
                    || !word.equals(alpha)) {
                candidateStr = content.substring(candidateStart, wordStart);
            } else {
                kwdCand.addWord(word);
                if (wordEnd == content.length()) {
                    candidateStr = content.substring(candidateStart, wordEnd);
                }
            }
            if (candidateStr != null) {
                candidateStr = candidateStr.trim().toLowerCase().replaceAll(ILLEGAL_CHARS, "")
                        .replaceAll("\\s+", " ");
                if (!candidateStr.isEmpty()) {
                    if (candidatesMap.containsKey(candidateStr)) {
                        candidatesMap.get(candidateStr).incCounter();
                    } else {
                        kwdCand.setKeyword(candidateStr);
                        candidatesMap.put(candidateStr, kwdCand);
                    }
                }
                candidateStr = null;
                candidateStart = wordEnd;
                kwdCand = new KeywordCandidate();
            }
        }
    }

    keywordCandidates = new ArrayList<KeywordCandidate>();
    for (Entry<String, KeywordCandidate> e : candidatesMap.entrySet()) {
        keywordCandidates.add(e.getValue());
    }
}