Example usage for java.text BreakIterator DONE

Introduction

On this page you can find example usages of java.text.BreakIterator.DONE.

Prototype

int DONE

To view the source code for java.text.BreakIterator.DONE, click the Source Link.

Document

DONE is returned by previous(), next(), next(int), preceding(int) and following(int) when either the first or last text boundary has been reached.

Usage

From source file:org.jivesoftware.util.StringUtils.java

/**
 * Converts a line of text into an array of lower case words using
 * {@link BreakIterator#getWordInstance()}.<p>
 *
 * Each segment reported by the iterator is trimmed, lower-cased with
 * {@link Locale#ROOT} (so the result does not depend on the JVM's default
 * locale) and stripped of the characters {@code + / \ # * ) ( &}. Segments
 * that are empty after this clean-up are dropped.<p>
 *
 * This method is under the Jive Open Source Software License and was
 * written by Mark Imbriaco.
 *
 * @param text a String of text to convert into an array of words
 * @return text broken up into an array of lower case words; an empty array
 *      when {@code text} is {@code null} or empty.
 */
public static String[] toLowerCaseWordArray(String text) {
    if (text == null || text.isEmpty()) {
        return new String[0];
    }

    // Characters that carry no meaning for a word and are removed from it.
    final String[] unwanted = {"+", "/", "\\", "#", "*", ")", "(", "&"};

    List<String> wordList = new ArrayList<>();
    BreakIterator boundary = BreakIterator.getWordInstance();
    boundary.setText(text);

    int start = boundary.first();
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        // Lower-casing is what the method name and contract promise.
        String word = text.substring(start, end).trim().toLowerCase(Locale.ROOT);
        for (String symbol : unwanted) {
            word = word.replace(symbol, "");
        }
        if (!word.isEmpty()) {
            wordList.add(word);
        }
    }
    return wordList.toArray(new String[0]);
}

From source file:org.jivesoftware.util.StringUtils.java

/**
 * Reformats a string where lines that are longer than <tt>width</tt>
 * are split apart at the earliest wordbreak or at maxLength, whichever is
 * sooner. If the width specified is less than 5 or greater than the input
 * Strings length the string will be returned as is.
 * <p>/*from  ww  w.j  a v a  2s .  c  om*/
 * Please note that this method can be lossy - trailing spaces on wrapped
 * lines may be trimmed.</p>
 *
 * @param input the String to reformat.
 * @param width the maximum length of any one line.
 * @return a new String with reformatted as needed.
 */
public static String wordWrap(String input, int width, Locale locale) {
    // protect ourselves
    if (input == null) {
        return "";
    } else if (width < 5) {
        return input;
    } else if (width >= input.length()) {
        return input;
    }

    // default locale
    if (locale == null) {
        locale = JiveGlobals.getLocale();
    }

    StringBuilder buf = new StringBuilder(input);
    boolean endOfLine = false;
    int lineStart = 0;

    for (int i = 0; i < buf.length(); i++) {
        if (buf.charAt(i) == '\n') {
            lineStart = i + 1;
            endOfLine = true;
        }

        // handle splitting at width character
        if (i > lineStart + width - 1) {
            if (!endOfLine) {
                int limit = i - lineStart - 1;
                BreakIterator breaks = BreakIterator.getLineInstance(locale);
                breaks.setText(buf.substring(lineStart, i));
                int end = breaks.last();

                // if the last character in the search string isn't a space,
                // we can't split on it (looks bad). Search for a previous
                // break character
                if (end == limit + 1) {
                    if (!Character.isWhitespace(buf.charAt(lineStart + end))) {
                        end = breaks.preceding(end - 1);
                    }
                }

                // if the last character is a space, replace it with a \n
                if (end != BreakIterator.DONE && end == limit + 1) {
                    buf.replace(lineStart + end, lineStart + end + 1, "\n");
                    lineStart = lineStart + end;
                }
                // otherwise, just insert a \n
                else if (end != BreakIterator.DONE && end != 0) {
                    buf.insert(lineStart + end, '\n');
                    lineStart = lineStart + end + 1;
                } else {
                    buf.insert(i, '\n');
                    lineStart = i + 1;
                }
            } else {
                buf.insert(i, '\n');
                lineStart = i + 1;
                endOfLine = false;
            }
        }
    }

    return buf.toString();
}

From source file:org.lnicholls.galleon.util.Tools.java

/**
 * Breaks {@code text} into lines that fit within {@code width}.
 * <p>
 * The text is walked segment by segment using
 * {@link BreakIterator#getWordInstance()}. A segment that, once spaces are
 * removed, is exactly {@code "\n"}, {@code "\r"} or {@code "\r\n"} ends the
 * current line and is itself discarded. Otherwise the segment is appended to
 * the current line unless that would make the line wider than {@code width},
 * in which case the current line is emitted and the segment starts the next
 * line. Every emitted line is trimmed.
 * <p>
 * A line that is blank when an overflow occurs is not emitted, so a segment
 * that is wider than {@code width} on its own no longer produces a spurious
 * empty line in front of it. Blank lines caused by explicit line terminators
 * are still emitted.
 *
 * @param width the maximum line width, in the units reported by
 *      {@code metrics}.
 * @param metrics used to measure candidate lines via {@code stringWidth};
 *      when {@code null}, each character is counted as 20 units wide.
 * @param text the text to lay out; {@code null} yields an empty array.
 * @return the laid-out lines, in order.
 */
public static String[] layout(int width, FontMetrics metrics, String text) {
    ArrayList<String> lines = new ArrayList<String>();

    if (text != null) {
        String line = "";
        BreakIterator boundary = BreakIterator.getWordInstance();
        boundary.setText(text);
        int start = boundary.first();
        for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
            String word = text.substring(start, end);
            // literal replacement; no regular expression is needed here
            String trimmed = word.replace(" ", "");

            if (trimmed.equals("\n") || trimmed.equals("\r") || trimmed.equals("\r\n")) {
                // explicit line terminator: emit the line even when it is blank
                lines.add(line.trim());
                line = "";
                continue;
            }

            String candidate = line + word;
            int candidateWidth = (metrics != null) ? metrics.stringWidth(candidate) : candidate.length() * 20;

            if (candidateWidth > width) {
                // only emit real content; a blank pending line is dropped
                String finished = line.trim();
                if (finished.length() > 0) {
                    lines.add(finished);
                }
                line = word;
            } else {
                line = candidate;
            }
        }
        if (line.trim().length() > 0) {
            lines.add(line.trim());
        }
    }

    return lines.toArray(new String[0]);
}

From source file:org.omegat.tokenizer.BaseTokenizer.java

/**
 * {@inheritDoc}/* w ww  .  j  a va  2s. c om*/
 */
@Override
public Token[] tokenizeVerbatim(final String strOrig) {
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }

    if (!shouldDelegateTokenizeExactly) {
        return tokenize(strOrig, false, false, false, false);
    }

    List<Token> result = new ArrayList<Token>(DEFAULT_TOKENS_COUNT);

    WordIterator iterator = new WordIterator();
    iterator.setText(strOrig);

    int start = iterator.first();
    for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) {
        String tokenStr = strOrig.substring(start, end);
        result.add(new Token(tokenStr, start));
    }

    return result.toArray(new Token[result.size()]);
}

From source file:org.omegat.tokenizer.BaseTokenizer.java

@Override
public String[] tokenizeVerbatimToStrings(String str) {
    // Nothing to split.
    if (StringUtil.isEmpty(str)) {
        return EMPTY_STRING_LIST;
    }

    // Not delegating: fall back to tokenizeToStrings with every flag off.
    if (!shouldDelegateTokenizeExactly) {
        return tokenizeToStrings(str, false, false, false, false);
    }

    List<String> pieces = new ArrayList<String>(DEFAULT_TOKENS_COUNT);

    WordIterator words = new WordIterator();
    words.setText(str);

    // Cut the input at each reported boundary, keeping every piece verbatim.
    int from = words.first();
    int to = words.next();
    while (to != BreakIterator.DONE) {
        pieces.add(str.substring(from, to));
        from = to;
        to = words.next();
    }

    return pieces.toArray(new String[pieces.size()]);
}

From source file:pl.edu.icm.coansys.kwdextraction.RakeExtractor.java

/**
 * Finding words or word sequences separated by stopwords, punctuation marks
 * etc./*w  w  w  . j ava 2 s.  co m*/
 */
private void extractKeywordCandidates() {

    Map<String, KeywordCandidate> candidatesMap = new HashMap<String, KeywordCandidate>();

    BreakIterator wordIterator = BreakIterator.getWordInstance();

    wordIterator.setText(content);
    int wordStart = wordIterator.first();

    int candidateStart = wordStart;
    String candidateStr = null;
    KeywordCandidate kwdCand = new KeywordCandidate();

    for (int wordEnd = wordIterator
            .next(); wordEnd != BreakIterator.DONE; wordStart = wordEnd, wordEnd = wordIterator.next()) {

        String word = content.substring(wordStart, wordEnd).trim().toLowerCase();
        String alpha = word.replaceAll(ILLEGAL_CHARS, "");

        if (!word.isEmpty()) {

            if (stopwords.get(lang).contains(word) || word.matches("\\W+") || isNum(word)
                    || !word.equals(alpha)) {
                candidateStr = content.substring(candidateStart, wordStart);
            } else {
                kwdCand.addWord(word);
                if (wordEnd == content.length()) {
                    candidateStr = content.substring(candidateStart, wordEnd);
                }
            }
            if (candidateStr != null) {
                candidateStr = candidateStr.trim().toLowerCase().replaceAll(ILLEGAL_CHARS, "")
                        .replaceAll("\\s+", " ");
                if (!candidateStr.isEmpty()) {
                    if (candidatesMap.containsKey(candidateStr)) {
                        candidatesMap.get(candidateStr).incCounter();
                    } else {
                        kwdCand.setKeyword(candidateStr);
                        candidatesMap.put(candidateStr, kwdCand);
                    }
                }
                candidateStr = null;
                candidateStart = wordEnd;
                kwdCand = new KeywordCandidate();
            }
        }
    }

    keywordCandidates = new ArrayList<KeywordCandidate>();
    for (Entry<String, KeywordCandidate> e : candidatesMap.entrySet()) {
        keywordCandidates.add(e.getValue());
    }
}

From source file:pt.ua.ri.tokenizer.WordTokenizer.java

@Override
public boolean hasNext() {
    // Without a buffer there is nothing to iterate over.
    if (!nonNull(bf)) {
        return false;
    }
    // More input remains until the stored boundary index is the DONE sentinel.
    return nextIndex != BreakIterator.DONE;
}

From source file:pt.ua.ri.tokenizer.WordTokenizer.java

private void findNext() {
    checkState(bf.length() > 0);
    CharSequence rawSlice;
    String rawKey;
    Token resolved;
    // Advance boundary by boundary until a slice is already cached, is an
    // acceptable word, or no further input remains.
    while (true) {
        final int sliceStart = nextIndex;
        nextIndex = bi.next();
        final int sliceEnd;
        if (nextIndex == BreakIterator.DONE) {
            // the iterator is exhausted: the slice runs to its last boundary
            sliceEnd = bi.last();
        } else {
            sliceEnd = nextIndex;
        }
        rawSlice = bf.subSequence(sliceStart, sliceEnd);
        rawKey = rawSlice.toString();
        resolved = tokens.getOrDefault(rawKey, null);

        if (resolved != null || isAcceptableWord(rawSlice) || !hasNext()) {
            break;
        }
    }

    if (resolved == null) {
        // Look up (or create) the token under its cleaned form.
        final String normalized = cleanWord(rawSlice).toString();
        resolved = tokens.compute(normalized, (text, existing) -> existing == null ? new Token(text) : existing);
        // also store the dirty sequence, to increase performance
        tokens.putIfAbsent(rawKey, resolved);
    }
    current = resolved;
}