Example usage for java.text BreakIterator getWordInstance

Introduction

On this page you can find example usages of java.text BreakIterator getWordInstance.

Prototype

public static BreakIterator getWordInstance()

Source Link

Document

Returns a new BreakIterator instance for word breaks for the default locale (see Locale#getDefault()).

Usage

From source file:eu.fbk.utils.lsa.util.Anvur.java

/**
 * Splits the given text at the word boundaries of the default locale and
 * concatenates the pieces again, putting a single space in front of each one.
 *
 * <p>Every segment reported by the iterator is emitted, including the
 * whitespace and punctuation segments that lie between words.
 *
 * @param in the text to split
 * @return the segments of {@code in}, each preceded by one space; an empty
 *         string when {@code in} contains no segment
 */
static String tokenize(String in) {
    BreakIterator words = BreakIterator.getWordInstance();
    words.setText(in);

    StringBuilder joined = new StringBuilder();
    int from = words.first();
    int to = words.next();
    while (to != BreakIterator.DONE) {
        joined.append(" ").append(in, from, to);
        from = to;
        to = words.next();
    }
    return joined.toString();
}

From source file:ezbake.training.TweetWordDivideWorker.java

/**
 * Splits the text of a Tweet at word boundaries and hands every non-blank piece to the output pipe.
 *
 * <p>Nothing is done when the Tweet or its text is {@code null}. An {@link IOException} raised while
 * emitting one piece only has its stack trace printed; the remaining pieces are still processed.
 *
 * @param visibility The Visibility containing the Accumulo visibility string representing the classification
 * level of the data contained in the incoming thrift data object; passed on unchanged with every piece.
 * @param data The incoming Thrift object to be processed.
 */
@Override
public void process(Visibility visibility, Tweet data) {
    if (data == null || data.getText() == null) {
        return;
    }

    BreakIterator segments = BreakIterator.getWordInstance();
    segments.setText(data.getText());

    int from = segments.first();
    int to = segments.next();
    while (to != BreakIterator.DONE) {
        String piece = data.getText().substring(from, to);
        // whitespace-only segments between words are not forwarded
        if (StringUtils.isNotBlank(piece)) {
            try {
                outputResultsToPipe(visibility, piece);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        from = to;
        to = segments.next();
    }
}

From source file:org.eclipse.fx.core.text.TextUtil.java

/**
 * Locate the offset at which the word following the given position ends
 *
 * @param content
 *            the content
 * @param offset
 *            the offset to start the search from
 * @param pointAsBoundary
 *            whether a '.' is to be treated as a word boundary
 * @return the end offset or {@link BreakIterator#DONE}
 */
public static int findWordEndOffset(IterableCharSequence content, int offset, boolean pointAsBoundary) {
    BreakIterator iterator = BreakIterator.getWordInstance();
    iterator.setText(content.getIterator());

    int end = iterator.following(offset);
    if (end == BreakIterator.DONE || !pointAsBoundary) {
        return end;
    }

    // cut the range short at the first '.' between the offset and the boundary
    int dot = content.subSequence(offset, end).toString().indexOf('.');
    if (dot >= 0) {
        end = offset + dot;
    }

    // a '.' located directly at the offset would give an empty range - step over it
    return end == offset ? offset + 1 : end;
}

From source file:org.yamj.common.tools.StringTools.java

/**
 * Check that the passed string is not longer than the required length and
 * trim it if necessary/*w  w  w  . ja  va2s.c om*/
 *
 * @param sourceString The string to check
 * @param requiredLength The required length (Maximum)
 * @param trimToWord Trim the source string to the last space to avoid
 * partial words
 * @param endingSuffix The ending to append if the string is longer than the
 * required length
 * @return
 */
public static String trimToLength(String sourceString, int requiredLength, boolean trimToWord,
        String endingSuffix) {
    String changedString = sourceString.trim();

    if (StringUtils.isNotBlank(changedString)) {
        if (changedString.length() <= requiredLength) {
            // No need to do anything
            return changedString;
        } else if (trimToWord) {
            BreakIterator bi = BreakIterator.getWordInstance();
            bi.setText(changedString);
            int biLength = bi.preceding(requiredLength - endingSuffix.length());
            return changedString.substring(0, biLength).trim() + endingSuffix;
        } else {
            // We know that the source string is longer that the required length, so trim it to size
            return changedString.substring(0, requiredLength - endingSuffix.length()).trim() + endingSuffix;
        }
    }

    return changedString;
}

From source file:org.eclipse.fx.core.text.TextUtil.java

/**
 * Locate the offset at which the word preceding the given position starts
 *
 * @param content
 *            the content
 * @param offset
 *            the offset to start the search from
 * @param pointAsBoundary
 *            whether a '.' is to be treated as a word boundary
 * @return the start offset or {@link BreakIterator#DONE}
 */
public static int findWordStartOffset(IterableCharSequence content, int offset, boolean pointAsBoundary) {
    BreakIterator iterator = BreakIterator.getWordInstance();
    iterator.setText(content.getIterator());

    int start = iterator.preceding(offset);
    if (start == BreakIterator.DONE || !pointAsBoundary) {
        return start;
    }

    // start right behind the last '.' between the boundary and the offset
    int dot = content.subSequence(start, offset).toString().lastIndexOf('.');
    if (dot > 0) {
        start += dot + 1;
    }

    // the '.' sits directly in front of the offset - move before the point
    return start == offset ? start - 1 : start;
}

From source file:org.eclipse.fx.core.text.TextUtil.java

/**
 * Find the bounds of the word/*w  w  w.  ja va2s  .  c  o m*/
 *
 * @param content
 *            the content
 * @param offset
 *            the offset
 * @param pointAsBoundary
 *            should the '.' treated as word boundary
 * @return a tuple of value representing start and end
 */
public static IntTuple findWordBounds(IterableCharSequence content, int offset, boolean pointAsBoundary) {
    BreakIterator wordInstance = BreakIterator.getWordInstance();
    wordInstance.setText(content.getIterator());
    int previous = wordInstance.preceding(offset);
    int next = wordInstance.following(offset);

    if (pointAsBoundary && previous != BreakIterator.DONE && next != BreakIterator.DONE) {
        String preMatch = content.subSequence(previous, offset).toString();
        String postMatch = content.subSequence(offset, next).toString();

        int idx = preMatch.lastIndexOf('.');
        if (idx > 0) {
            previous += idx + 1;
        }

        idx = postMatch.indexOf('.');
        if (idx > 0) {
            next = offset + idx;
        }
    }

    return new IntTuple(previous, next);
}

From source file:com.arthackday.killerapp.util.Util.java

/**
 * Shortens a text to roughly {@code charLimit} characters without cutting a word in half.
 *
 * <p>The cut is placed at the first word boundary after {@code charLimit}, so the kept part can be
 * somewhat longer than the limit. {@code " ..."} is appended only when text was actually removed.
 *
 * @param text the text to shorten; must not be {@code null}
 * @param charLimit the position after which the text may be cut; negative values are treated as 0
 * @return {@code text} itself when it fits or when the cut would fall at its end, otherwise the
 *         leading part of {@code text} followed by {@code " ..."}
 */
public static String truncate(String text, int charLimit) {
    if (text.length() <= charLimit) {
        return text;
    }

    BreakIterator bi = BreakIterator.getWordInstance();
    bi.setText(text);
    // following() rejects offsets before the start of the text, so clamp negative limits
    int cutOff = bi.following(Math.max(0, charLimit));

    // The limit fell inside the last word: nothing would be removed, so do not add an ellipsis
    if (cutOff == BreakIterator.DONE || cutOff >= text.length()) {
        return text;
    }
    return text.substring(0, cutOff) + " ...";
}

From source file:org.yamj.core.tools.StringTools.java

/**
 * Check that the passed string is not longer than the required length and trim it if necessary
 *
 * @param sourceString The string to check
 * @param requiredLength The required length (Maximum)
 * @param trimToWord Trim the source string to the last space to avoid partial words
 * @param endingSuffix The ending to append if the string is longer than the required length
 * @return/*w w w .jav  a2 s. c o m*/
 */
public static String trimToLength(String sourceString, int requiredLength, boolean trimToWord,
        String endingSuffix) {
    String changedString = sourceString.trim();

    if (StringUtils.isNotBlank(changedString)) {
        if (changedString.length() <= requiredLength) {
            // No need to do anything
            return changedString;
        } else {
            if (trimToWord) {
                BreakIterator bi = BreakIterator.getWordInstance();
                bi.setText(changedString);
                int biLength = bi.preceding(requiredLength - endingSuffix.length());
                return changedString.substring(0, biLength).trim() + endingSuffix;
            } else {
                // We know that the source string is longer that the required length, so trim it to size
                return changedString.substring(0, requiredLength - endingSuffix.length()).trim() + endingSuffix;
            }
        }
    }

    return changedString;
}

From source file:com.moviejukebox.tools.StringTools.java

/**
 * Check that the passed string is not longer than the required length and
 * trim it if necessary.
 *
 * <p>The source string is always stripped of leading and trailing whitespace
 * first. If it is not a valid string or already fits, it is returned without
 * the suffix. Otherwise it is cut so that the kept text plus
 * {@code endingSuffix} does not exceed {@code requiredLength}. If the suffix
 * alone is longer than {@code requiredLength}, no text is kept and only the
 * suffix is returned.
 *
 * @param sourceString The string to check; must not be {@code null}
 * @param requiredLength The required length (Maximum)
 * @param trimToWord Cut at the last word boundary that still fits to avoid
 * partial words
 * @param endingSuffix The ending to append if the string is longer than the
 * required length; must not be {@code null}
 * @return the trimmed source string, shortened and suffixed when it was too long
 */
public static String trimToLength(String sourceString, int requiredLength, boolean trimToWord,
        String endingSuffix) {
    String changedString = sourceString.trim();

    if (isValidString(changedString)) {
        if (changedString.length() <= requiredLength) {
            // No need to do anything
            return changedString;
        }

        // Room left for text once the suffix is accounted for. Clamped so that a suffix longer
        // than the required length cannot produce a negative offset for preceding()/substring().
        int limit = Math.max(0, requiredLength - endingSuffix.length());

        if (trimToWord) {
            BreakIterator bi = BreakIterator.getWordInstance();
            bi.setText(changedString);
            // "+ 1" turns the strictly-before lookup into "last boundary at or before the limit";
            // with limit >= 0 the lookup always finds a boundary (offset 0 at the latest)
            int biLength = bi.preceding(limit + 1);
            return changedString.substring(0, biLength).trim() + endingSuffix;
        }

        // We know that the source string is longer than the required length, so trim it to size
        return changedString.substring(0, limit).trim() + endingSuffix;
    }

    return changedString;
}

From source file:pl.edu.icm.coansys.kwdextraction.RakeExtractor.java

/**
 * Finds words or word sequences in {@code content} that are separated by
 * stopwords, punctuation marks etc. and stores them in
 * {@code keywordCandidates}.
 *
 * <p>The content is walked segment by segment using a word
 * {@link BreakIterator}. Consecutive accepted words are collected into one
 * candidate; a separator segment closes the current candidate. Candidates
 * are keyed by their normalized text, and a repeated candidate only
 * increments the counter of the one stored first.
 */
private void extractKeywordCandidates() {

    // normalized candidate text -> candidate object stored for it
    Map<String, KeywordCandidate> candidatesMap = new HashMap<String, KeywordCandidate>();

    BreakIterator wordIterator = BreakIterator.getWordInstance();

    wordIterator.setText(content);
    int wordStart = wordIterator.first();

    // offset in content at which the candidate currently being collected begins
    int candidateStart = wordStart;
    // non-null only while a finished candidate is waiting to be stored
    String candidateStr = null;
    KeywordCandidate kwdCand = new KeywordCandidate();

    for (int wordEnd = wordIterator
            .next(); wordEnd != BreakIterator.DONE; wordStart = wordEnd, wordEnd = wordIterator.next()) {

        String word = content.substring(wordStart, wordEnd).trim().toLowerCase();
        // the word with everything matching ILLEGAL_CHARS removed
        String alpha = word.replaceAll(ILLEGAL_CHARS, "");

        // segments that are empty after trimming (e.g. whitespace) are skipped entirely
        // NOTE(review): because of this skip, a candidate that is followed only by whitespace at the
        // end of content never reaches the wordEnd == content.length() check below and looks like it
        // is dropped - confirm whether content is trimmed by the caller
        if (!word.isEmpty()) {

            // separator: a stopword of the current language, a segment of non-word characters only,
            // a number, or a word that contains characters matching ILLEGAL_CHARS
            if (stopwords.get(lang).contains(word) || word.matches("\\W+") || isNum(word)
                    || !word.equals(alpha)) {
                // close the candidate just before the separator
                candidateStr = content.substring(candidateStart, wordStart);
            } else {
                kwdCand.addWord(word);
                // last segment of the content: close the candidate including this word
                if (wordEnd == content.length()) {
                    candidateStr = content.substring(candidateStart, wordEnd);
                }
            }
            if (candidateStr != null) {
                // normalize: trim, lower-case, drop ILLEGAL_CHARS matches, collapse whitespace runs
                candidateStr = candidateStr.trim().toLowerCase().replaceAll(ILLEGAL_CHARS, "")
                        .replaceAll("\\s+", " ");
                if (!candidateStr.isEmpty()) {
                    if (candidatesMap.containsKey(candidateStr)) {
                        // seen before: only count the occurrence, the new kwdCand is discarded
                        candidatesMap.get(candidateStr).incCounter();
                    } else {
                        kwdCand.setKeyword(candidateStr);
                        candidatesMap.put(candidateStr, kwdCand);
                    }
                }
                // start collecting the next candidate right after the current segment
                candidateStr = null;
                candidateStart = wordEnd;
                kwdCand = new KeywordCandidate();
            }
        }
    }

    // publish the distinct candidates in the iteration order of the map
    keywordCandidates = new ArrayList<KeywordCandidate>();
    for (Entry<String, KeywordCandidate> e : candidatesMap.entrySet()) {
        keywordCandidates.add(e.getValue());
    }
}