Example usage for java.text BreakIterator getWordInstance

List of usage examples for java.text BreakIterator getWordInstance

Introduction

On this page you can find example usages of java.text.BreakIterator.getWordInstance().

Prototype

public static BreakIterator getWordInstance() 

Source Link

Document

Returns a new BreakIterator instance for word breaks for the default locale (see Locale#getDefault()).

Usage

From source file:eu.fbk.utils.lsa.util.Anvur.java

/**
 * Splits the text at the word boundaries of the default locale and
 * concatenates the resulting segments, each one preceded by a single space.
 * Every segment between two consecutive boundaries is kept, so whitespace
 * and punctuation segments appear in the result as well.
 *
 * @param in the text to split
 * @return the space-prefixed segments joined into one string; empty when
 *         the text contains no segment
 */
static String tokenize(String in) {
    BreakIterator words = BreakIterator.getWordInstance();
    words.setText(in);

    StringBuilder joined = new StringBuilder();
    int from = words.first();
    int to = words.next();
    while (to != BreakIterator.DONE) {
        joined.append(" ").append(in, from, to);
        from = to;
        to = words.next();
    }
    return joined.toString();
}

From source file:ezbake.training.TweetWordDivideWorker.java

/**
 * Divides the text of the Tweet into segments at word boundaries and sends
 * every non-blank segment, one at a time, to the output pipe. Nothing is
 * done when the Tweet or its text is null.
 *
 * @param visibility The Visibility containing the Accumulo visibility string representing the classification level
 * of the data contained in the incoming thrift data object; it is passed along with every word.
 * @param data The incoming Thrift object to be processed.
 */
@Override
public void process(Visibility visibility, Tweet data) {
    if (data == null || data.getText() == null) {
        return;
    }

    BreakIterator boundaries = BreakIterator.getWordInstance();
    boundaries.setText(data.getText());

    int from = boundaries.first();
    int to = boundaries.next();
    while (to != BreakIterator.DONE) {
        String token = data.getText().substring(from, to);
        if (StringUtils.isNotBlank(token)) {
            try {
                outputResultsToPipe(visibility, token);
            } catch (IOException e) {
                // A failure for one word is only printed; the remaining words are still processed.
                e.printStackTrace();
            }
        }
        from = to;
        to = boundaries.next();
    }
}

From source file:org.eclipse.fx.core.text.TextUtil.java

/**
 * Find the end offset of the word
 *
 * @param content
 *            the content
 * @param offset
 *            the offset to start the search from
 * @param pointAsBoundary
 *            whether a '.' between the offset and the word end should be
 *            treated as word boundary
 * @return the end offset or {@link BreakIterator#DONE}
 */
public static int findWordEndOffset(IterableCharSequence content, int offset, boolean pointAsBoundary) {
    BreakIterator words = BreakIterator.getWordInstance();
    words.setText(content.getIterator());

    int end = words.following(offset);
    if (end == BreakIterator.DONE || !pointAsBoundary) {
        return end;
    }

    // Stop at the first '.' found between the offset and the word boundary
    int dot = content.subSequence(offset, end).toString().indexOf('.');
    if (dot >= 0) {
        end = offset + dot;
    }

    // A '.' directly at the offset would give an empty range; step over it instead
    return end == offset ? offset + 1 : end;
}

From source file:org.yamj.common.tools.StringTools.java

/**
 * Check that the passed string is not longer than the required length and
 * trim it if necessary/*w  w  w  . ja  va2s.c om*/
 *
 * @param sourceString The string to check
 * @param requiredLength The required length (Maximum)
 * @param trimToWord Trim the source string to the last space to avoid
 * partial words
 * @param endingSuffix The ending to append if the string is longer than the
 * required length
 * @return
 */
public static String trimToLength(String sourceString, int requiredLength, boolean trimToWord,
        String endingSuffix) {
    String changedString = sourceString.trim();

    if (StringUtils.isNotBlank(changedString)) {
        if (changedString.length() <= requiredLength) {
            // No need to do anything
            return changedString;
        } else if (trimToWord) {
            BreakIterator bi = BreakIterator.getWordInstance();
            bi.setText(changedString);
            int biLength = bi.preceding(requiredLength - endingSuffix.length());
            return changedString.substring(0, biLength).trim() + endingSuffix;
        } else {
            // We know that the source string is longer that the required length, so trim it to size
            return changedString.substring(0, requiredLength - endingSuffix.length()).trim() + endingSuffix;
        }
    }

    return changedString;
}

From source file:org.eclipse.fx.core.text.TextUtil.java

/**
 * Find the start offset of the word
 *
 * @param content
 *            the content
 * @param offset
 *            the offset to start the search from
 * @param pointAsBoundary
 *            whether a '.' between the word start and the offset should be
 *            treated as word boundary
 * @return the start offset or {@link BreakIterator#DONE}
 */
public static int findWordStartOffset(IterableCharSequence content, int offset, boolean pointAsBoundary) {
    BreakIterator words = BreakIterator.getWordInstance();
    words.setText(content.getIterator());

    int start = words.preceding(offset);
    if (start == BreakIterator.DONE || !pointAsBoundary) {
        return start;
    }

    // Start right after the last '.' of the segment; a '.' at the very
    // first position of the segment is not considered
    int dot = content.subSequence(start, offset).toString().lastIndexOf('.');
    if (dot > 0) {
        start += dot + 1;
    }

    // The '.' sits directly in front of the offset: move before the point
    return start == offset ? start - 1 : start;
}

From source file:org.eclipse.fx.core.text.TextUtil.java

/**
 * Find the bounds of the word/*w  w  w.  ja va2s  .  c  o m*/
 *
 * @param content
 *            the content
 * @param offset
 *            the offset
 * @param pointAsBoundary
 *            should the '.' treated as word boundary
 * @return a tuple of value representing start and end
 */
public static IntTuple findWordBounds(IterableCharSequence content, int offset, boolean pointAsBoundary) {
    BreakIterator wordInstance = BreakIterator.getWordInstance();
    wordInstance.setText(content.getIterator());
    int previous = wordInstance.preceding(offset);
    int next = wordInstance.following(offset);

    if (pointAsBoundary && previous != BreakIterator.DONE && next != BreakIterator.DONE) {
        String preMatch = content.subSequence(previous, offset).toString();
        String postMatch = content.subSequence(offset, next).toString();

        int idx = preMatch.lastIndexOf('.');
        if (idx > 0) {
            previous += idx + 1;
        }

        idx = postMatch.indexOf('.');
        if (idx > 0) {
            next = offset + idx;
        }
    }

    return new IntTuple(previous, next);
}

From source file:com.arthackday.killerapp.util.Util.java

/**
 * Shortens a text that is longer than the given limit. The cut is made at
 * the first word boundary after the limit, so the word in progress is kept
 * complete and the result can be longer than the limit; " ..." is appended
 * to mark the cut.
 *
 * @param text the text to shorten
 * @param charLimit the number of characters after which the text is cut
 * @return the unchanged text when it is not longer than the limit,
 *         otherwise the shortened text followed by " ..."
 */
public static String truncate(String text, int charLimit) {
    if (text.length() <= charLimit) {
        return text;
    }

    BreakIterator boundaries = BreakIterator.getWordInstance();
    boundaries.setText(text);
    int end = boundaries.following(charLimit);
    return text.substring(0, end) + " ...";
}

From source file:org.yamj.core.tools.StringTools.java

/**
 * Check that the passed string is not longer than the required length and trim it if necessary
 *
 * @param sourceString The string to check
 * @param requiredLength The required length (Maximum)
 * @param trimToWord Trim the source string to the last space to avoid partial words
 * @param endingSuffix The ending to append if the string is longer than the required length
 * @return/*w w w .jav  a2 s. c o m*/
 */
public static String trimToLength(String sourceString, int requiredLength, boolean trimToWord,
        String endingSuffix) {
    String changedString = sourceString.trim();

    if (StringUtils.isNotBlank(changedString)) {
        if (changedString.length() <= requiredLength) {
            // No need to do anything
            return changedString;
        } else {
            if (trimToWord) {
                BreakIterator bi = BreakIterator.getWordInstance();
                bi.setText(changedString);
                int biLength = bi.preceding(requiredLength - endingSuffix.length());
                return changedString.substring(0, biLength).trim() + endingSuffix;
            } else {
                // We know that the source string is longer that the required length, so trim it to size
                return changedString.substring(0, requiredLength - endingSuffix.length()).trim() + endingSuffix;
            }
        }
    }

    return changedString;
}

From source file:com.moviejukebox.tools.StringTools.java

/**
 * Check that the passed string is not longer than the required length and
 * trim it if necessary
 *
 * @param sourceString The string to check
 * @param requiredLength The required length (Maximum)
 * @param trimToWord Cut at the last word boundary that fits instead of in
 * the middle of a word
 * @param endingSuffix The ending to append if the string is longer than the
 * required length
 * @return the trimmed source string when it is not valid or already fits,
 * otherwise the shortened text followed by the ending suffix. When the
 * suffix is longer than the required length, only the suffix is returned.
 */
public static String trimToLength(String sourceString, int requiredLength, boolean trimToWord,
        String endingSuffix) {
    String changedString = sourceString.trim();

    if (!isValidString(changedString) || changedString.length() <= requiredLength) {
        // No need to do anything
        return changedString;
    }

    // We know that the source string is longer than the required length.
    // The number of characters to keep is clamped at zero: a suffix longer
    // than the required length would otherwise lead to a negative index
    // and an exception from preceding() or substring().
    int keep = Math.max(0, requiredLength - endingSuffix.length());

    if (trimToWord) {
        BreakIterator bi = BreakIterator.getWordInstance();
        bi.setText(changedString);
        // The + 1 lets a boundary located exactly at the limit be found,
        // because preceding() only looks strictly before its argument
        keep = bi.preceding(keep + 1);
    }

    return changedString.substring(0, keep).trim() + endingSuffix;
}

From source file:pl.edu.icm.coansys.kwdextraction.RakeExtractor.java

/**
 * Finding words or word sequences separated by stopwords, punctuation marks
 * etc.
 *
 * The content is walked segment by segment using the word boundaries of the
 * default locale. A segment is a delimiter when it is a stopword of the
 * current language, consists only of non-word characters, is a number, or
 * contains illegal characters. Everything between two delimiters forms one
 * candidate. Distinct candidates are stored in {@code keywordCandidates};
 * a candidate that occurs again increments the counter of its first
 * occurrence.
 */
private void extractKeywordCandidates() {

    Map<String, KeywordCandidate> candidatesMap = new HashMap<String, KeywordCandidate>();

    BreakIterator wordIterator = BreakIterator.getWordInstance();
    wordIterator.setText(content);
    int wordStart = wordIterator.first();

    int candidateStart = wordStart;
    KeywordCandidate kwdCand = new KeywordCandidate();

    for (int wordEnd = wordIterator
            .next(); wordEnd != BreakIterator.DONE; wordStart = wordEnd, wordEnd = wordIterator.next()) {

        String word = content.substring(wordStart, wordEnd).trim().toLowerCase();
        if (word.isEmpty()) {
            // Whitespace-only segment: neither extends nor terminates the current candidate
            continue;
        }
        String alpha = word.replaceAll(ILLEGAL_CHARS, "");

        boolean delimiter = stopwords.get(lang).contains(word) || word.matches("\\W+") || isNum(word)
                || !word.equals(alpha);
        if (!delimiter) {
            kwdCand.addWord(word);
            continue;
        }

        // The delimiter closes the candidate collected so far; the next one starts after it
        registerKeywordCandidate(candidatesMap, content.substring(candidateStart, wordStart), kwdCand);
        candidateStart = wordEnd;
        kwdCand = new KeywordCandidate();
    }

    // Flush whatever is still pending at the end of the content. Doing this
    // unconditionally (instead of only when the last word ends exactly at the
    // end of the content) keeps the final candidate when the content ends
    // with whitespace or a line break. If nothing is pending the remainder
    // is empty or blank and is ignored.
    registerKeywordCandidate(candidatesMap, content.substring(candidateStart), kwdCand);

    keywordCandidates = new ArrayList<KeywordCandidate>(candidatesMap.values());
}

/**
 * Normalizes a raw candidate phrase (trimmed, lower-cased, illegal
 * characters removed, whitespace runs collapsed) and records it: a phrase
 * already present gets its counter incremented, a new phrase is stored with
 * the given candidate object. Phrases that are empty after normalization
 * are ignored.
 */
private void registerKeywordCandidate(Map<String, KeywordCandidate> candidatesMap, String rawCandidate,
        KeywordCandidate candidate) {
    String keyword = rawCandidate.trim().toLowerCase().replaceAll(ILLEGAL_CHARS, "").replaceAll("\\s+", " ");
    if (keyword.isEmpty()) {
        return;
    }

    KeywordCandidate known = candidatesMap.get(keyword);
    if (known != null) {
        known.incCounter();
    } else {
        candidate.setKeyword(keyword);
        candidatesMap.put(keyword, candidate);
    }
}