List of usage examples for java.text.BreakIterator.next()
public abstract int next();
From source file:Main.java
/**
 * Builds the frame content: adds the scrollable output area and fills it
 * with a fixed sample text in which every character boundary reported for
 * the default locale is followed by a {@code |} marker.
 */
public TextBoundaryFrame() {
    getContentPane().add(new JScrollPane(outputText));

    Locale currentLocale = Locale.getDefault();
    BreakIterator currentBreakIterator = BreakIterator.getCharacterInstance(currentLocale);

    String text = "The quick, brown fox jump-ed\n" + "over the lazy \"dog.\" And then...what happened?";
    currentBreakIterator.setText(text);

    outputText.setText("");
    int from = currentBreakIterator.first();
    int to;
    while ((to = currentBreakIterator.next()) != BreakIterator.DONE) {
        // One segment per boundary pair, each terminated by the marker.
        outputText.append(text.substring(from, to) + "|");
        from = to;
    }
    // Append whatever follows the last boundary that was consumed.
    outputText.append(text.substring(from));
}
From source file:be.idamediafoundry.sofa.livecycle.dsc.util.AbstractQDoxComponentInfoExtractor.java
final protected String getFirstSentence(String text) { String result = text;//from ww w. ja v a 2s . com if (text != null) { BreakIterator iterator = BreakIterator.getSentenceInstance(); iterator.setText(text); int start = iterator.first(); int end = iterator.next(); if (end != BreakIterator.DONE) { result = text.substring(start, end).trim(); } } return result; }
From source file:com.norconex.importer.handler.tagger.impl.TextStatisticsTagger.java
@Override protected void tagTextDocument(String reference, Reader input, ImporterMetadata metadata, boolean parsed) throws ImporterHandlerException { long charCount = 0; long wordCharCount = 0; long wordCount = 0; long sentenceCount = 0; long sentenceCharCount = 0; long paragraphCount = 0; //TODO make this more efficient, by doing all this in one pass. LineIterator it = IOUtils.lineIterator(input); while (it.hasNext()) { String line = it.nextLine().trim(); if (StringUtils.isBlank(line)) { continue; }/*from w ww .j av a 2s. c om*/ // Paragraph paragraphCount++; // Character charCount += line.length(); // Word Matcher matcher = PATTERN_WORD.matcher(line); while (matcher.find()) { int wordLength = matcher.end() - matcher.start(); wordCount++; wordCharCount += wordLength; } // Sentence BreakIterator boundary = BreakIterator.getSentenceInstance(); boundary.setText(line); int start = boundary.first(); for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { sentenceCharCount += (end - start); sentenceCount++; } } String field = StringUtils.EMPTY; if (StringUtils.isNotBlank(fieldName)) { field = fieldName.trim() + "."; } //--- Add fields --- metadata.addLong("document.stat." + field + "characterCount", charCount); metadata.addLong("document.stat." + field + "wordCount", wordCount); metadata.addLong("document.stat." + field + "sentenceCount", sentenceCount); metadata.addLong("document.stat." + field + "paragraphCount", paragraphCount); metadata.addString("document.stat." + field + "averageWordCharacterCount", divide(wordCharCount, wordCount)); metadata.addString("document.stat." + field + "averageSentenceCharacterCount", divide(sentenceCharCount, sentenceCount)); metadata.addString("document.stat." + field + "averageSentenceWordCount", divide(wordCount, sentenceCount)); metadata.addString("document.stat." + field + "averageParagraphCharacterCount", divide(charCount, paragraphCount)); metadata.addString("document.stat." 
+ field + "averageParagraphSentenceCount", divide(sentenceCount, paragraphCount)); metadata.addString("document.stat." + field + "averageParagraphWordCount", divide(wordCount, paragraphCount)); }
From source file:com.glaf.core.util.StringTools.java
public static String[] toLowerCaseWordArray(String text) { if (text == null || text.length() == 0) { return new String[0]; }// w w w .ja v a2 s . c o m List<String> wordList = new java.util.ArrayList<String>(); BreakIterator boundary = BreakIterator.getWordInstance(); boundary.setText(text); int start = 0; for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { String tmp = text.substring(start, end).trim(); tmp = replace(tmp, "+", ""); tmp = replace(tmp, "/", ""); tmp = replace(tmp, "\\", ""); tmp = replace(tmp, "#", ""); tmp = replace(tmp, "*", ""); tmp = replace(tmp, ")", ""); tmp = replace(tmp, "(", ""); tmp = replace(tmp, "&", ""); if (tmp.length() > 0) { wordList.add(tmp); } } return wordList.toArray(new String[wordList.size()]); }
From source file:IteratorTest.java
protected void refreshDisplay() { int startIndex, nextIndex; Vector items = new Vector(); String msgText = textArea.getText(); Locale locale = (Locale) (localeButton.getSelectedItem()); BreakIterator iterator = null; if (charButton.isSelected()) { iterator = BreakIterator.getCharacterInstance(locale); } else if (wordButton.isSelected()) { iterator = BreakIterator.getWordInstance(locale); } else if (lineButton.isSelected()) { iterator = BreakIterator.getLineInstance(locale); } else if (sentButton.isSelected()) { iterator = BreakIterator.getSentenceInstance(locale); }//w w w. j a va2s . c o m iterator.setText(msgText); startIndex = iterator.first(); nextIndex = iterator.next(); while (nextIndex != BreakIterator.DONE) { items.addElement(msgText.substring(startIndex, nextIndex)); startIndex = nextIndex; nextIndex = iterator.next(); } itemList.setListData(items); }
From source file:com.redhat.rcm.version.Cli.java
private static void printTextLine(final String line, final String indent, final int max, final PrintWriter pw) { final String fmt = "%s%-" + max + "s\n"; final List<String> lines = new ArrayList<String>(); final BreakIterator iter = BreakIterator.getLineInstance(); iter.setText(line);//from www .j a va 2 s . c om int start = iter.first(); int end = BreakIterator.DONE; final StringBuilder currentLine = new StringBuilder(); String seg; while (start != BreakIterator.DONE && (end = iter.next()) != BreakIterator.DONE) { seg = line.substring(start, end); if (currentLine.length() + seg.length() > max) { lines.add(currentLine.toString()); currentLine.setLength(0); } currentLine.append(seg); start = end; } if (currentLine.length() > 0) { lines.add(currentLine.toString()); } for (final String ln : lines) { pw.printf(fmt, indent, ln); } }
From source file:com.redhat.rcm.version.Cli.java
private static void printKVLine(final String key, final String value, final String fmt, final int valMax, final PrintWriter pw) { final List<String> lines = new ArrayList<String>(); final BreakIterator iter = BreakIterator.getLineInstance(); iter.setText(value);/*from w w w .j a va 2 s . c o m*/ int start = iter.first(); int end = BreakIterator.DONE; final StringBuilder currentLine = new StringBuilder(); String seg; while (start != BreakIterator.DONE && (end = iter.next()) != BreakIterator.DONE) { seg = value.substring(start, end); if (currentLine.length() + seg.length() > valMax) { lines.add(currentLine.toString()); currentLine.setLength(0); } currentLine.append(seg); start = end; } if (currentLine.length() > 0) { lines.add(currentLine.toString()); } pw.printf(fmt, key, lines.isEmpty() ? "" : lines.get(0)); if (lines.size() > 1) { for (int i = 1; i < lines.size(); i++) { // blank string to serve for indentation in format with two fields. pw.printf(fmt, "", lines.get(i)); } } }
From source file:Utils.java
/** * Wrap multi-line strings (and get the individual lines). * /*from w w w. j av a 2 s . c o m*/ * @param original * the original string to wrap * @param width * the maximum width of lines * @param breakIterator * breaks original to chars, words, sentences, depending on what * instance you provide. * @param removeNewLines * if <code>true</code>, any newlines in the original string are * ignored * @return the lines after wrapping */ public static String[] wrapStringToArray(String original, int width, BreakIterator breakIterator, boolean removeNewLines) { if (original.length() == 0) { return new String[] { original }; } String[] workingSet; // substitute original newlines with spaces, // remove newlines from head and tail if (removeNewLines) { original = trimString(original); original = original.replace('\n', ' '); workingSet = new String[] { original }; } else { StringTokenizer tokens = new StringTokenizer(original, "\n"); // NOI18N int len = tokens.countTokens(); workingSet = new String[len]; for (int i = 0; i < len; i++) { workingSet[i] = tokens.nextToken(); } } if (width < 1) { width = 1; } if (original.length() <= width) { return workingSet; } widthcheck: { boolean ok = true; for (int i = 0; i < workingSet.length; i++) { ok = ok && (workingSet[i].length() < width); if (!ok) { break widthcheck; } } return workingSet; } java.util.ArrayList<String> lines = new java.util.ArrayList<String>(); int lineStart = 0; // the position of start of currently processed line in // the original string for (int i = 0; i < workingSet.length; i++) { if (workingSet[i].length() < width) { lines.add(workingSet[i]); } else { breakIterator.setText(workingSet[i]); int nextStart = breakIterator.next(); int prevStart = 0; do { while (((nextStart - lineStart) < width) && (nextStart != BreakIterator.DONE)) { prevStart = nextStart; nextStart = breakIterator.next(); } if (nextStart == BreakIterator.DONE) { nextStart = prevStart = workingSet[i].length(); } if (prevStart == 0) { prevStart = 
nextStart; } lines.add(workingSet[i].substring(lineStart, prevStart)); lineStart = prevStart; prevStart = 0; } while (lineStart < workingSet[i].length()); lineStart = 0; } } String[] s = new String[lines.size()]; return (String[]) lines.toArray(s); }
From source file:com.tao.realweb.util.StringUtil.java
/** * Converts a line of text into an array of lower case words using a * BreakIterator.wordInstance().<p> * * This method is under the Jive Open Source Software License and was * written by Mark Imbriaco./* www. j av a 2s .c o m*/ * * @param text a String of text to convert into an array of words * @return text broken up into an array of words. */ public static String[] toLowerCaseWordArray(String text) { if (text == null || text.length() == 0) { return new String[0]; } List<String> wordList = new ArrayList<String>(); BreakIterator boundary = BreakIterator.getWordInstance(); boundary.setText(text); int start = 0; for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { String tmp = text.substring(start, end).trim(); // Remove characters that are not needed. tmp = replace(tmp, "+", ""); tmp = replace(tmp, "/", ""); tmp = replace(tmp, "\\", ""); tmp = replace(tmp, "#", ""); tmp = replace(tmp, "*", ""); tmp = replace(tmp, ")", ""); tmp = replace(tmp, "(", ""); tmp = replace(tmp, "&", ""); if (tmp.length() > 0) { wordList.add(tmp); } } return wordList.toArray(new String[wordList.size()]); }
From source file:org.cloudgraph.examples.test.model.NLPWikiParseTest.java
private void parse(StringBuilder buf) throws IOException { BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US); String text = buf.toString(); int counter = 0; iterator.setText(text);//w w w . j a v a 2s . c o m int lastIndex = iterator.first(); while (lastIndex != BreakIterator.DONE) { int firstIndex = lastIndex; lastIndex = iterator.next(); if (lastIndex != BreakIterator.DONE) { String sentence = text.substring(firstIndex, lastIndex); long before = System.currentTimeMillis(); //parse(sentence); long after = System.currentTimeMillis(); log.info("time4: " + String.valueOf(after - before) + ": " + sentence); counter++; } } }