List of usage examples for java.text BreakIterator getWordInstance
public static BreakIterator getWordInstance()
Returns a new BreakIterator instance for word breaks for the default locale (see Locale#getDefault()).
From source file: eu.fbk.utils.lsa.util.Anvur.java
static String tokenize(String in) { //print each word in order BreakIterator boundary = BreakIterator.getWordInstance(); boundary.setText(in);/* w w w . j a va 2 s . c om*/ StringBuilder out = new StringBuilder(); int start = boundary.first(); for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { out.append(" "); out.append(in.substring(start, end)); } return out.toString(); }
From source file:ezbake.training.TweetWordDivideWorker.java
/**
 * Divides the text of a tweet into word-boundary segments and forwards every non-blank segment
 * to {@code outputResultsToPipe}.
 *
 * @param visibility The Visibility containing the Accumulo visibility string representing the
 *                   classification level of the data contained in the incoming thrift data object;
 *                   handed on unchanged with every segment.
 * @param data       The incoming Thrift object to be processed; ignored when it or its text is
 *                   {@code null}.
 */
@Override
public void process(Visibility visibility, Tweet data) {
    if (data == null || data.getText() == null) {
        return;
    }

    BreakIterator segments = BreakIterator.getWordInstance();
    segments.setText(data.getText());

    int from = segments.first();
    int to = segments.next();
    while (to != BreakIterator.DONE) {
        String piece = data.getText().substring(from, to);
        // Whitespace-only segments between words are not forwarded.
        if (StringUtils.isNotBlank(piece)) {
            try {
                outputResultsToPipe(visibility, piece);
            } catch (IOException e) {
                // A failed write is reported and the remaining segments are still processed.
                e.printStackTrace();
            }
        }
        from = to;
        to = segments.next();
    }
}
From source file:org.eclipse.fx.core.text.TextUtil.java
/**
 * Find the end offset of the word.
 *
 * @param content
 *            the content
 * @param offset
 *            the offset to start the search from
 * @param pointAsBoundary
 *            should the '.' be treated as word boundary
 * @return the end offset or {@link BreakIterator#DONE}
 */
public static int findWordEndOffset(IterableCharSequence content, int offset, boolean pointAsBoundary) {
    BreakIterator words = BreakIterator.getWordInstance();
    words.setText(content.getIterator());

    int end = words.following(offset);
    if (end == BreakIterator.DONE || !pointAsBoundary) {
        return end;
    }

    // Stop at the first '.' between the offset and the boundary found above.
    int dot = content.subSequence(offset, end).toString().indexOf('.');
    if (dot >= 0) {
        end = offset + dot;
    }
    // A '.' sitting directly at the offset would give an empty range; step over it instead.
    return end == offset ? offset + 1 : end;
}
From source file:org.yamj.common.tools.StringTools.java
/** * Check that the passed string is not longer than the required length and * trim it if necessary/*w w w . ja va2s.c om*/ * * @param sourceString The string to check * @param requiredLength The required length (Maximum) * @param trimToWord Trim the source string to the last space to avoid * partial words * @param endingSuffix The ending to append if the string is longer than the * required length * @return */ public static String trimToLength(String sourceString, int requiredLength, boolean trimToWord, String endingSuffix) { String changedString = sourceString.trim(); if (StringUtils.isNotBlank(changedString)) { if (changedString.length() <= requiredLength) { // No need to do anything return changedString; } else if (trimToWord) { BreakIterator bi = BreakIterator.getWordInstance(); bi.setText(changedString); int biLength = bi.preceding(requiredLength - endingSuffix.length()); return changedString.substring(0, biLength).trim() + endingSuffix; } else { // We know that the source string is longer that the required length, so trim it to size return changedString.substring(0, requiredLength - endingSuffix.length()).trim() + endingSuffix; } } return changedString; }
From source file:org.eclipse.fx.core.text.TextUtil.java
/** * Find the start offset of the word//from www. ja va 2 s . c o m * * @param content * the content * @param offset * the offset to start the search from * @param pointAsBoundary * should the '.' treated as word boundary * @return the start offset or or {@link BreakIterator#DONE} */ public static int findWordStartOffset(IterableCharSequence content, int offset, boolean pointAsBoundary) { BreakIterator wordInstance = BreakIterator.getWordInstance(); wordInstance.setText(content.getIterator()); int rv = wordInstance.preceding(offset); if (rv != BreakIterator.DONE && pointAsBoundary) { String s = content.subSequence(rv, offset).toString(); int idx = s.lastIndexOf('.'); if (idx > 0) { rv += idx + 1; } // move before the point if (rv == offset) { rv -= 1; } } return rv; }
From source file:org.eclipse.fx.core.text.TextUtil.java
/** * Find the bounds of the word/*w w w. ja va2s . c o m*/ * * @param content * the content * @param offset * the offset * @param pointAsBoundary * should the '.' treated as word boundary * @return a tuple of value representing start and end */ public static IntTuple findWordBounds(IterableCharSequence content, int offset, boolean pointAsBoundary) { BreakIterator wordInstance = BreakIterator.getWordInstance(); wordInstance.setText(content.getIterator()); int previous = wordInstance.preceding(offset); int next = wordInstance.following(offset); if (pointAsBoundary && previous != BreakIterator.DONE && next != BreakIterator.DONE) { String preMatch = content.subSequence(previous, offset).toString(); String postMatch = content.subSequence(offset, next).toString(); int idx = preMatch.lastIndexOf('.'); if (idx > 0) { previous += idx + 1; } idx = postMatch.indexOf('.'); if (idx > 0) { next = offset + idx; } } return new IntTuple(previous, next); }
From source file:com.arthackday.killerapp.util.Util.java
public static String truncate(String text, int charLimit) { if (text.length() > charLimit) { BreakIterator bi = BreakIterator.getWordInstance(); bi.setText(text);// w w w .j a va 2 s . c o m int cutOff = bi.following(charLimit); text = text.substring(0, cutOff) + " ..."; } return text; }
From source file:org.yamj.core.tools.StringTools.java
/** * Check that the passed string is not longer than the required length and trim it if necessary * * @param sourceString The string to check * @param requiredLength The required length (Maximum) * @param trimToWord Trim the source string to the last space to avoid partial words * @param endingSuffix The ending to append if the string is longer than the required length * @return/*w w w .jav a2 s. c o m*/ */ public static String trimToLength(String sourceString, int requiredLength, boolean trimToWord, String endingSuffix) { String changedString = sourceString.trim(); if (StringUtils.isNotBlank(changedString)) { if (changedString.length() <= requiredLength) { // No need to do anything return changedString; } else { if (trimToWord) { BreakIterator bi = BreakIterator.getWordInstance(); bi.setText(changedString); int biLength = bi.preceding(requiredLength - endingSuffix.length()); return changedString.substring(0, biLength).trim() + endingSuffix; } else { // We know that the source string is longer that the required length, so trim it to size return changedString.substring(0, requiredLength - endingSuffix.length()).trim() + endingSuffix; } } } return changedString; }
From source file:com.moviejukebox.tools.StringTools.java
/** * Check that the passed string is not longer than the required length and * trim it if necessary//from www .j a v a2s. c o m * * @param sourceString The string to check * @param requiredLength The required length (Maximum) * @param trimToWord Trim the source string to the last space to avoid * partial words * @param endingSuffix The ending to append if the string is longer than the * required length * @return */ public static String trimToLength(String sourceString, int requiredLength, boolean trimToWord, String endingSuffix) { String changedString = sourceString.trim(); if (isValidString(changedString)) { if (changedString.length() <= requiredLength) { // No need to do anything return changedString; } if (trimToWord) { BreakIterator bi = BreakIterator.getWordInstance(); bi.setText(changedString); int biLength = bi.preceding(requiredLength - endingSuffix.length() + 1); return changedString.substring(0, biLength).trim() + endingSuffix; } // We know that the source string is longer that the required length, so trim it to size return changedString.substring(0, requiredLength - endingSuffix.length()).trim() + endingSuffix; } return changedString; }
From source file:pl.edu.icm.coansys.kwdextraction.RakeExtractor.java
/**
 * Finding words or word sequences separated by stopwords, punctuation marks
 * etc.
 *
 * <p>Walks {@code content} segment by segment. A segment that is a stopword for {@code lang},
 * consists only of non-word characters, is numeric according to {@code isNum}, or contains
 * characters matched by {@code ILLEGAL_CHARS} ends the current candidate; every other
 * non-empty segment is added to it. The collected candidates are stored in
 * {@code keywordCandidates}, one entry per distinct normalized candidate text.
 */
private void extractKeywordCandidates() {
    Map<String, KeywordCandidate> candidatesMap = new HashMap<String, KeywordCandidate>();
    BreakIterator wordIterator = BreakIterator.getWordInstance();
    wordIterator.setText(content);

    int wordStart = wordIterator.first();
    int candidateStart = wordStart;
    KeywordCandidate kwdCand = new KeywordCandidate();

    for (int wordEnd = wordIterator.next(); wordEnd != BreakIterator.DONE;
            wordStart = wordEnd, wordEnd = wordIterator.next()) {
        String word = content.substring(wordStart, wordEnd).trim().toLowerCase();
        if (word.isEmpty()) {
            // Whitespace between words neither extends nor ends a candidate.
            continue;
        }

        String alpha = word.replaceAll(ILLEGAL_CHARS, "");
        boolean separator = stopwords.get(lang).contains(word) || word.matches("\\W+") || isNum(word)
                || !word.equals(alpha);

        String candidateStr = null;
        if (separator) {
            // The candidate is everything collected in front of the separator.
            candidateStr = content.substring(candidateStart, wordStart);
        } else {
            kwdCand.addWord(word);
            if (wordEnd == content.length()) {
                candidateStr = content.substring(candidateStart, wordEnd);
            }
        }

        if (candidateStr != null) {
            registerCandidate(candidatesMap, candidateStr, kwdCand);
            candidateStart = wordEnd;
            kwdCand = new KeywordCandidate();
        }
    }

    // When the content ends with whitespace, the last word does not end at content.length()
    // and was therefore never flushed inside the loop; store what is still pending.
    if (candidateStart < content.length()) {
        registerCandidate(candidatesMap, content.substring(candidateStart), kwdCand);
    }

    keywordCandidates = new ArrayList<KeywordCandidate>(candidatesMap.values());
}

/**
 * Normalizes a raw candidate text and, unless it is empty afterwards, either counts another
 * occurrence of an already known candidate or stores {@code candidate} under the new text.
 */
private void registerCandidate(Map<String, KeywordCandidate> candidatesMap, String rawCandidate,
        KeywordCandidate candidate) {
    String normalized = rawCandidate.trim().toLowerCase().replaceAll(ILLEGAL_CHARS, "")
            .replaceAll("\\s+", " ");
    if (normalized.isEmpty()) {
        return;
    }
    if (candidatesMap.containsKey(normalized)) {
        candidatesMap.get(normalized).incCounter();
    } else {
        candidate.setKeyword(normalized);
        candidatesMap.put(normalized, candidate);
    }
}