List of usage examples for the java.text.BreakIterator.next() method
/**
 * Advances the iterator to the boundary following the current one.
 *
 * @return the index of the next boundary, or {@code BreakIterator.DONE} once the
 *         last boundary has already been reached
 */
public abstract int next();
From source file:org.cloudgraph.examples.test.model.StanfordCoreNLPTest.java
private void parse(StringBuilder buf) throws IOException { BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US); String text = buf.toString(); int counter = 0; iterator.setText(text);//from w ww.j av a2 s. c om int lastIndex = iterator.first(); while (lastIndex != BreakIterator.DONE) { int firstIndex = lastIndex; lastIndex = iterator.next(); if (lastIndex != BreakIterator.DONE) { String sentence = text.substring(firstIndex, lastIndex); long before = System.currentTimeMillis(); parse(sentence); long after = System.currentTimeMillis(); log.info("time4: " + String.valueOf(after - before) + ": " + sentence); counter++; } } }
From source file:org.jivesoftware.community.util.StringUtils.java
/**
 * Breaks a line of text into an array of words using a word-instance
 * {@link BreakIterator}.
 *
 * <p>Each segment between two boundaries is trimmed and then passed through the
 * {@code replace} helper with an empty replacement for each of the characters
 * {@code + / \ # * ) ( &} (presumably removing them; confirm against the
 * helper). Segments that end up empty are dropped.
 *
 * <p>NOTE(review): despite the method name, this method itself performs no case
 * conversion; the words keep whatever case they have in {@code text}.
 *
 * @param text the text to split; may be {@code null}
 * @return the remaining words in order of appearance, or an empty array when
 *         {@code text} is {@code null} or empty
 */
public static String[] toLowerCaseWordArray(String text) {
    if (text == null || text.length() == 0) {
        return new String[0];
    }

    ArrayList<String> wordList = new ArrayList<String>();
    BreakIterator boundary = BreakIterator.getWordInstance();
    boundary.setText(text);

    int start = 0;
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String tmp = text.substring(start, end).trim();
        // Remove characters that are not needed.
        tmp = replace(tmp, "+", "");
        tmp = replace(tmp, "/", "");
        tmp = replace(tmp, "\\", "");
        tmp = replace(tmp, "#", "");
        tmp = replace(tmp, "*", "");
        tmp = replace(tmp, ")", "");
        tmp = replace(tmp, "(", "");
        tmp = replace(tmp, "&", "");
        if (tmp.length() > 0) {
            wordList.add(tmp);
        }
    }
    return wordList.toArray(new String[wordList.size()]);
}
From source file:org.jivesoftware.util.StringUtils.java
/** * Converts a line of text into an array of lower case words using a * BreakIterator.wordInstance().<p> * * This method is under the Jive Open Source Software License and was * written by Mark Imbriaco.//w ww .j a v a2s .c o m * * @param text a String of text to convert into an array of words * @return text broken up into an array of words. */ public static String[] toLowerCaseWordArray(String text) { if (text == null || text.length() == 0) { return new String[0]; } List<String> wordList = new ArrayList<>(); BreakIterator boundary = BreakIterator.getWordInstance(); boundary.setText(text); int start = 0; for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { String tmp = text.substring(start, end).trim(); // Remove characters that are not needed. tmp = replace(tmp, "+", ""); tmp = replace(tmp, "/", ""); tmp = replace(tmp, "\\", ""); tmp = replace(tmp, "#", ""); tmp = replace(tmp, "*", ""); tmp = replace(tmp, ")", ""); tmp = replace(tmp, "(", ""); tmp = replace(tmp, "&", ""); if (tmp.length() > 0) { wordList.add(tmp); } } return wordList.toArray(new String[wordList.size()]); }
From source file:org.lnicholls.galleon.util.Tools.java
/**
 * Wraps text into lines that fit within the given width.
 *
 * <p>The text is split with a word-instance {@link BreakIterator} and segments
 * are appended to the current line for as long as the line still fits. The
 * width of a line is {@code metrics.stringWidth(...)} when {@code metrics} is
 * supplied, otherwise an estimate of 20 units per character. A segment that
 * consists of a line break ({@code \n}, {@code \r} or {@code \r\n}, ignoring
 * spaces) ends the current line, so empty lines in the input are kept.
 *
 * <p>When a segment does not fit, the current line is emitted only if it holds
 * something other than whitespace. This prevents a spurious empty line when the
 * overflowing segment follows a wrapped space or is the very first word.
 *
 * @param width   maximum line width, in the same unit {@code metrics} reports
 * @param metrics font metrics used to measure text; may be {@code null}
 * @param text    the text to lay out; may be {@code null}
 * @return the trimmed lines in order; an empty array when {@code text} is
 *         {@code null} or contains nothing to lay out
 */
public static String[] layout(int width, FontMetrics metrics, String text) {
    ArrayList<String> lines = new ArrayList<String>();
    if (text == null) {
        return lines.toArray(new String[0]);
    }

    BreakIterator boundary = BreakIterator.getWordInstance();
    boundary.setText(text);

    String line = "";
    int start = boundary.first();
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String word = text.substring(start, end);
        String trimmed = word.replace(" ", "");

        // An explicit line break always terminates the current line, even an empty one.
        if (trimmed.equals("\n") || trimmed.equals("\r") || trimmed.equals("\r\n")) {
            lines.add(line.trim());
            line = "";
            continue;
        }

        String candidate = line + word;
        int candidateWidth = (metrics != null)
                ? metrics.stringWidth(candidate)
                : candidate.length() * 20;

        if (candidateWidth > width) {
            // Flush only real content; a blank pending line would show up as a bogus empty row.
            String pending = line.trim();
            if (pending.length() > 0) {
                lines.add(pending);
            }
            line = word;
        } else {
            line = candidate;
        }
    }

    String last = line.trim();
    if (last.length() > 0) {
        lines.add(last);
    }
    return lines.toArray(new String[0]);
}
From source file:pl.edu.icm.coansys.kwdextraction.RakeExtractor.java
/** * Finding words or word sequences separated by stopwords, punctuation marks * etc./*from w w w. j a v a 2 s . c o m*/ */ private void extractKeywordCandidates() { Map<String, KeywordCandidate> candidatesMap = new HashMap<String, KeywordCandidate>(); BreakIterator wordIterator = BreakIterator.getWordInstance(); wordIterator.setText(content); int wordStart = wordIterator.first(); int candidateStart = wordStart; String candidateStr = null; KeywordCandidate kwdCand = new KeywordCandidate(); for (int wordEnd = wordIterator .next(); wordEnd != BreakIterator.DONE; wordStart = wordEnd, wordEnd = wordIterator.next()) { String word = content.substring(wordStart, wordEnd).trim().toLowerCase(); String alpha = word.replaceAll(ILLEGAL_CHARS, ""); if (!word.isEmpty()) { if (stopwords.get(lang).contains(word) || word.matches("\\W+") || isNum(word) || !word.equals(alpha)) { candidateStr = content.substring(candidateStart, wordStart); } else { kwdCand.addWord(word); if (wordEnd == content.length()) { candidateStr = content.substring(candidateStart, wordEnd); } } if (candidateStr != null) { candidateStr = candidateStr.trim().toLowerCase().replaceAll(ILLEGAL_CHARS, "") .replaceAll("\\s+", " "); if (!candidateStr.isEmpty()) { if (candidatesMap.containsKey(candidateStr)) { candidatesMap.get(candidateStr).incCounter(); } else { kwdCand.setKeyword(candidateStr); candidatesMap.put(candidateStr, kwdCand); } } candidateStr = null; candidateStart = wordEnd; kwdCand = new KeywordCandidate(); } } } keywordCandidates = new ArrayList<KeywordCandidate>(); for (Entry<String, KeywordCandidate> e : candidatesMap.entrySet()) { keywordCandidates.add(e.getValue()); } }