List of usage examples for java.text.BreakIterator.DONE
int DONE
To view the source code for java.text.BreakIterator.DONE, click the Source Link.
From source file:org.jivesoftware.util.StringUtils.java
/** * Converts a line of text into an array of lower case words using a * BreakIterator.wordInstance().<p> * * This method is under the Jive Open Source Software License and was * written by Mark Imbriaco.//ww w .ja v a2s . c o m * * @param text a String of text to convert into an array of words * @return text broken up into an array of words. */ public static String[] toLowerCaseWordArray(String text) { if (text == null || text.length() == 0) { return new String[0]; } List<String> wordList = new ArrayList<>(); BreakIterator boundary = BreakIterator.getWordInstance(); boundary.setText(text); int start = 0; for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { String tmp = text.substring(start, end).trim(); // Remove characters that are not needed. tmp = replace(tmp, "+", ""); tmp = replace(tmp, "/", ""); tmp = replace(tmp, "\\", ""); tmp = replace(tmp, "#", ""); tmp = replace(tmp, "*", ""); tmp = replace(tmp, ")", ""); tmp = replace(tmp, "(", ""); tmp = replace(tmp, "&", ""); if (tmp.length() > 0) { wordList.add(tmp); } } return wordList.toArray(new String[wordList.size()]); }
From source file:org.jivesoftware.util.StringUtils.java
/**
 * Reformats a string so that lines longer than <tt>width</tt> are split
 * apart. When a line grows past <tt>width</tt> characters, a locale-aware
 * line {@link BreakIterator} is used to look for a break opportunity inside
 * the overlong line; if none is found the line is split at the current
 * position instead.
 * <p>
 * The input is returned unchanged if <tt>width</tt> is less than 5 or is
 * greater than or equal to the input's length. A <tt>null</tt> input yields
 * an empty string.
 * <p>
 * Please note that this method can be lossy - a whitespace character at a
 * wrap position is replaced by the inserted newline.</p>
 *
 * @param input the String to reformat.
 * @param width the maximum length of any one line.
 * @param locale the locale used to obtain the line BreakIterator; when
 *      <tt>null</tt>, the value of <tt>JiveGlobals.getLocale()</tt> is used.
 * @return a new String reformatted as needed.
 */
public static String wordWrap(String input, int width, Locale locale) {
    // protect ourselves
    if (input == null) {
        return "";
    } else if (width < 5) {
        return input;
    } else if (width >= input.length()) {
        return input;
    }

    // default locale
    if (locale == null) {
        locale = JiveGlobals.getLocale();
    }

    // The buffer is edited in place while it is being scanned, so buf.length()
    // can grow during the loop.
    StringBuilder buf = new StringBuilder(input);
    // Set when a '\n' is seen; only cleared again by the hard-split branch below.
    boolean endOfLine = false;
    // Index in buf where the line currently being measured starts.
    int lineStart = 0;

    for (int i = 0; i < buf.length(); i++) {
        if (buf.charAt(i) == '\n') {
            lineStart = i + 1;
            endOfLine = true;
        }

        // handle splitting at width character
        if (i > lineStart + width - 1) {
            if (!endOfLine) {
                int limit = i - lineStart - 1;
                BreakIterator breaks = BreakIterator.getLineInstance(locale);
                // The iterator only sees the current line, so its offsets are
                // relative to lineStart.
                breaks.setText(buf.substring(lineStart, i));
                // last() is the end of that window, i.e. limit + 1.
                int end = breaks.last();

                // if the last character in the search string isn't a space,
                // we can't split on it (looks bad). Search for a previous
                // break character
                if (end == limit + 1) {
                    // buf.charAt(lineStart + end) is the character at index i,
                    // just past the window.
                    if (!Character.isWhitespace(buf.charAt(lineStart + end))) {
                        end = breaks.preceding(end - 1);
                    }
                }

                // if the last character is a space, replace it with a \n
                if (end != BreakIterator.DONE && end == limit + 1) {
                    buf.replace(lineStart + end, lineStart + end + 1, "\n");
                    // NOTE(review): lineStart now points at the inserted '\n'
                    // itself rather than the character after it - confirm intended.
                    lineStart = lineStart + end;
                }
                // otherwise, just insert a \n
                else if (end != BreakIterator.DONE && end != 0) {
                    buf.insert(lineStart + end, '\n');
                    lineStart = lineStart + end + 1;
                }
                // no usable break inside the line: hard split at the current index
                else {
                    buf.insert(i, '\n');
                    lineStart = i + 1;
                }
            } else {
                // NOTE(review): once any '\n' has been seen, the next overlong line
                // is hard-split at i without consulting the BreakIterator - confirm
                // this is the intended behavior.
                buf.insert(i, '\n');
                lineStart = i + 1;
                endOfLine = false;
            }
        }
    }

    return buf.toString();
}
From source file:org.lnicholls.galleon.util.Tools.java
public static String[] layout(int width, FontMetrics metrics, String text) { ArrayList lines = new ArrayList(); if (text != null) { String line = ""; BreakIterator boundary = BreakIterator.getWordInstance(); boundary.setText(text);//from w w w . j a va2 s . c o m int start = boundary.first(); for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { String word = text.substring(start, end); String trimmed = word.replaceAll(" ", ""); int metricsWidth = (line + word).length() * 20; if (metrics != null) metricsWidth = metrics.stringWidth(line + word); if (trimmed.equals("\n") || trimmed.equals("\r") || trimmed.equals("\r\n")) { lines.add(line.trim()); line = ""; } else if (metricsWidth > width) { lines.add(line.trim()); line = word; } else line = line + word; } if (line.trim().length() > 0) lines.add(line.trim()); } return (String[]) lines.toArray(new String[0]); }
From source file:org.omegat.tokenizer.BaseTokenizer.java
/** * {@inheritDoc}/* w ww . j a va 2s. c om*/ */ @Override public Token[] tokenizeVerbatim(final String strOrig) { if (StringUtil.isEmpty(strOrig)) { return EMPTY_TOKENS_LIST; } if (!shouldDelegateTokenizeExactly) { return tokenize(strOrig, false, false, false, false); } List<Token> result = new ArrayList<Token>(DEFAULT_TOKENS_COUNT); WordIterator iterator = new WordIterator(); iterator.setText(strOrig); int start = iterator.first(); for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) { String tokenStr = strOrig.substring(start, end); result.add(new Token(tokenStr, start)); } return result.toArray(new Token[result.size()]); }
From source file:org.omegat.tokenizer.BaseTokenizer.java
/**
 * Splits {@code str} into the consecutive segments reported by a
 * {@code WordIterator}, returning every segment unchanged. When
 * {@code shouldDelegateTokenizeExactly} is not set, the work is handed to
 * {@code tokenizeToStrings} with all four flags switched off instead.
 *
 * @param str the text to split
 * @return the segments in order of appearance, or {@code EMPTY_STRING_LIST}
 *         when {@code StringUtil.isEmpty(str)} holds
 */
@Override
public String[] tokenizeVerbatimToStrings(String str) {
    if (StringUtil.isEmpty(str)) {
        return EMPTY_STRING_LIST;
    }
    if (!shouldDelegateTokenizeExactly) {
        return tokenizeToStrings(str, false, false, false, false);
    }

    final List<String> pieces = new ArrayList<String>(DEFAULT_TOKENS_COUNT);
    final WordIterator breaker = new WordIterator();
    breaker.setText(str);

    // Each pair of neighbouring boundaries delimits one piece of the input.
    int from = breaker.first();
    int to = breaker.next();
    while (to != BreakIterator.DONE) {
        pieces.add(str.substring(from, to));
        from = to;
        to = breaker.next();
    }

    return pieces.toArray(new String[pieces.size()]);
}
From source file:pl.edu.icm.coansys.kwdextraction.RakeExtractor.java
/** * Finding words or word sequences separated by stopwords, punctuation marks * etc./*w w w . j ava 2 s. co m*/ */ private void extractKeywordCandidates() { Map<String, KeywordCandidate> candidatesMap = new HashMap<String, KeywordCandidate>(); BreakIterator wordIterator = BreakIterator.getWordInstance(); wordIterator.setText(content); int wordStart = wordIterator.first(); int candidateStart = wordStart; String candidateStr = null; KeywordCandidate kwdCand = new KeywordCandidate(); for (int wordEnd = wordIterator .next(); wordEnd != BreakIterator.DONE; wordStart = wordEnd, wordEnd = wordIterator.next()) { String word = content.substring(wordStart, wordEnd).trim().toLowerCase(); String alpha = word.replaceAll(ILLEGAL_CHARS, ""); if (!word.isEmpty()) { if (stopwords.get(lang).contains(word) || word.matches("\\W+") || isNum(word) || !word.equals(alpha)) { candidateStr = content.substring(candidateStart, wordStart); } else { kwdCand.addWord(word); if (wordEnd == content.length()) { candidateStr = content.substring(candidateStart, wordEnd); } } if (candidateStr != null) { candidateStr = candidateStr.trim().toLowerCase().replaceAll(ILLEGAL_CHARS, "") .replaceAll("\\s+", " "); if (!candidateStr.isEmpty()) { if (candidatesMap.containsKey(candidateStr)) { candidatesMap.get(candidateStr).incCounter(); } else { kwdCand.setKeyword(candidateStr); candidatesMap.put(candidateStr, kwdCand); } } candidateStr = null; candidateStart = wordEnd; kwdCand = new KeywordCandidate(); } } } keywordCandidates = new ArrayList<KeywordCandidate>(); for (Entry<String, KeywordCandidate> e : candidatesMap.entrySet()) { keywordCandidates.add(e.getValue()); } }
From source file:pt.ua.ri.tokenizer.WordTokenizer.java
/**
 * Reports whether another token can be produced.
 *
 * @return {@code true} when {@code bf} is non-null and {@code nextIndex}
 *         has not reached {@link BreakIterator#DONE}
 */
@Override
public boolean hasNext() {
    if (!nonNull(bf)) {
        return false;
    }
    return nextIndex != BreakIterator.DONE;
}
From source file:pt.ua.ri.tokenizer.WordTokenizer.java
private void findNext() { checkState(bf.length() > 0);/* w w w .jav a 2 s .c o m*/ CharSequence dirtySequence; String dirtyString; Token nextToken; do { int firstIndex = nextIndex; nextIndex = bi.next(); dirtySequence = bf.subSequence(firstIndex, (nextIndex == BreakIterator.DONE) ? bi.last() : nextIndex); dirtyString = dirtySequence.toString(); nextToken = tokens.getOrDefault(dirtyString, null); } while (nextToken == null && !isAcceptableWord(dirtySequence) && hasNext()); if (nextToken == null) { final String afterNormalization = cleanWord(dirtySequence).toString(); nextToken = tokens.compute(afterNormalization, (key, token) -> token == null ? new Token(key) : token); // also store the dirty sequence, to increase performance tokens.putIfAbsent(dirtyString, nextToken); } current = nextToken; }