List of usage examples for java.text.BreakIterator.setText
/**
 * Sets the text this iterator works on, supplied as a {@link CharacterIterator}.
 *
 * @param newText the new text
 */
public abstract void setText(CharacterIterator newText);
From source file:de.tudarmstadt.lt.lm.service.BreakIteratorStringProvider.java
/**
 * Splits {@code text} into sentences with a locale-specific sentence {@link BreakIterator}.
 *
 * <p>The text is first passed through {@code trim_and_replace_emptyspace(text, " ")} and then
 * read line by line. Each line is segmented on its own; every segment is trimmed and empty
 * segments are skipped.
 *
 * @param text the text to split
 * @param language_code locale identifier handed to {@code LocaleUtils.toLocale}
 * @return the trimmed, non-empty sentences in order of appearance
 * @throws Exception as declared by the signature; the visible code only propagates what its
 *         helpers throw
 */
@Override
public List<String> splitSentences(String text, String language_code) throws Exception {
    LOG.trace(String.format("Splitting sentences from text: %s", StringUtils.abbreviate(text, 200)));

    List<String> sentences = new ArrayList<String>();
    text = de.tudarmstadt.lt.utilities.StringUtils.trim_and_replace_emptyspace(text, " ");

    // Created on first use and then shared by all lines: setText() rebinds the iterator to the
    // new line, so resolving the locale and building an iterator per line is unnecessary.
    BreakIterator sentence_bounds = null;

    for (LineIterator iter = new LineIterator(new StringReader(text)); iter.hasNext();) {
        String line = iter.nextLine();
        if (sentence_bounds == null) {
            sentence_bounds = BreakIterator.getSentenceInstance(LocaleUtils.toLocale(language_code));
        }
        sentence_bounds.setText(line);

        int begin_s = sentence_bounds.first();
        for (int end_s = sentence_bounds.next(); end_s != BreakIterator.DONE;
                begin_s = end_s, end_s = sentence_bounds.next()) {
            String sentence = de.tudarmstadt.lt.utilities.StringUtils.trim(line.substring(begin_s, end_s));
            if (sentence.isEmpty()) {
                continue;
            }
            sentences.add(sentence);
            LOG.trace(String.format("Current sentence: %s", StringUtils.abbreviate(sentence, 200)));
        }
    }

    LOG.trace(String.format("Split text '%s' into '%d' sentences.", StringUtils.abbreviate(text, 200),
            sentences.size()));
    return sentences;
}
From source file:Main.java
/**
 * Builds the frame: puts {@code outputText} inside a scroll pane and fills it with a sample
 * text in which every boundary reported by a character {@link BreakIterator} for the default
 * locale is marked with a {@code |}.
 */
public TextBoundaryFrame() {
    getContentPane().add(new JScrollPane(outputText));

    BreakIterator characterBounds = BreakIterator.getCharacterInstance(Locale.getDefault());
    String text = "The quick, brown fox jump-ed\n" + "over the lazy \"dog.\" And then...what happened?";
    characterBounds.setText(text);

    outputText.setText("");

    // Emit each [segmentStart, segmentEnd) slice followed by the marker.
    int segmentStart = characterBounds.first();
    for (int segmentEnd = characterBounds.next(); segmentEnd != BreakIterator.DONE;
            segmentEnd = characterBounds.next()) {
        outputText.append(text.substring(segmentStart, segmentEnd) + "|");
        segmentStart = segmentEnd;
    }

    // Whatever is left after the last reported boundary is appended without a marker.
    outputText.append(text.substring(segmentStart));
}
From source file:be.idamediafoundry.sofa.livecycle.dsc.util.AbstractQDoxComponentInfoExtractor.java
final protected String getFirstSentence(String text) { String result = text;// w ww. j av a2 s .co m if (text != null) { BreakIterator iterator = BreakIterator.getSentenceInstance(); iterator.setText(text); int start = iterator.first(); int end = iterator.next(); if (end != BreakIterator.DONE) { result = text.substring(start, end).trim(); } } return result; }
From source file:com.norconex.importer.handler.tagger.impl.TextStatisticsTagger.java
@Override protected void tagTextDocument(String reference, Reader input, ImporterMetadata metadata, boolean parsed) throws ImporterHandlerException { long charCount = 0; long wordCharCount = 0; long wordCount = 0; long sentenceCount = 0; long sentenceCharCount = 0; long paragraphCount = 0; //TODO make this more efficient, by doing all this in one pass. LineIterator it = IOUtils.lineIterator(input); while (it.hasNext()) { String line = it.nextLine().trim(); if (StringUtils.isBlank(line)) { continue; }/*from w w w . j ava 2 s .c o m*/ // Paragraph paragraphCount++; // Character charCount += line.length(); // Word Matcher matcher = PATTERN_WORD.matcher(line); while (matcher.find()) { int wordLength = matcher.end() - matcher.start(); wordCount++; wordCharCount += wordLength; } // Sentence BreakIterator boundary = BreakIterator.getSentenceInstance(); boundary.setText(line); int start = boundary.first(); for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { sentenceCharCount += (end - start); sentenceCount++; } } String field = StringUtils.EMPTY; if (StringUtils.isNotBlank(fieldName)) { field = fieldName.trim() + "."; } //--- Add fields --- metadata.addLong("document.stat." + field + "characterCount", charCount); metadata.addLong("document.stat." + field + "wordCount", wordCount); metadata.addLong("document.stat." + field + "sentenceCount", sentenceCount); metadata.addLong("document.stat." + field + "paragraphCount", paragraphCount); metadata.addString("document.stat." + field + "averageWordCharacterCount", divide(wordCharCount, wordCount)); metadata.addString("document.stat." + field + "averageSentenceCharacterCount", divide(sentenceCharCount, sentenceCount)); metadata.addString("document.stat." + field + "averageSentenceWordCount", divide(wordCount, sentenceCount)); metadata.addString("document.stat." + field + "averageParagraphCharacterCount", divide(charCount, paragraphCount)); metadata.addString("document.stat." 
+ field + "averageParagraphSentenceCount", divide(sentenceCount, paragraphCount)); metadata.addString("document.stat." + field + "averageParagraphWordCount", divide(wordCount, paragraphCount)); }
From source file:com.glaf.core.util.StringTools.java
/**
 * Breaks {@code text} into segments at the boundaries of a default-locale word
 * {@link BreakIterator}, trims each segment, strips the characters
 * {@code + / \ # * ) ( &} from it via {@code replace}, and keeps the segments that are still
 * non-empty.
 *
 * <p>NOTE(review): despite the method name, nothing in this method lower-cases the words;
 * confirm whether callers rely on the case being preserved.
 *
 * @param text the text to split; may be {@code null}
 * @return the remaining segments in order; an empty array for {@code null} or empty input
 */
public static String[] toLowerCaseWordArray(String text) {
    if (text == null || text.length() == 0) {
        return new String[0];
    }

    // Symbols removed from every segment, listed in the order the removals are applied.
    String[] unwanted = { "+", "/", "\\", "#", "*", ")", "(", "&" };

    List<String> words = new java.util.ArrayList<String>();
    BreakIterator wordBounds = BreakIterator.getWordInstance();
    wordBounds.setText(text);

    int from = 0;
    int to = wordBounds.next();
    while (to != BreakIterator.DONE) {
        String segment = text.substring(from, to).trim();
        for (String symbol : unwanted) {
            segment = replace(segment, symbol, "");
        }
        if (segment.length() > 0) {
            words.add(segment);
        }
        from = to;
        to = wordBounds.next();
    }

    return words.toArray(new String[words.size()]);
}
From source file:org.cloudgraph.examples.test.model.StanfordCoreNLPTest.java
/**
 * Splits the buffer content into sentences with a {@code Locale.US} sentence
 * {@link BreakIterator}, hands each sentence to {@code parse(String)}, and logs how many
 * milliseconds that call took together with the sentence.
 *
 * @param buf the text to process
 * @throws IOException as declared by the signature
 */
private void parse(StringBuilder buf) throws IOException {
    String text = buf.toString();
    BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US);
    iterator.setText(text);

    // Walk consecutive boundary pairs; each [firstIndex, lastIndex) slice is one sentence.
    int firstIndex = iterator.first();
    for (int lastIndex = iterator.next(); lastIndex != BreakIterator.DONE;
            firstIndex = lastIndex, lastIndex = iterator.next()) {
        String sentence = text.substring(firstIndex, lastIndex);
        long before = System.currentTimeMillis();
        parse(sentence);
        long after = System.currentTimeMillis();
        log.info("time4: " + (after - before) + ": " + sentence);
    }
}
From source file:com.amazon.android.ui.widget.EllipsizedTextView.java
/** * Find the first word/character break index before ellipsisIndex. *///from w w w . j a va 2 s . c o m private int breakBefore(final String displayText, final int ellipsisIndex, final BreakIterator iterator) { iterator.setText(displayText); return iterator.preceding(ellipsisIndex); }
From source file:org.cloudgraph.examples.test.model.NLPWikiParseTest.java
private void parse(StringBuilder buf) throws IOException { BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US); String text = buf.toString(); int counter = 0; iterator.setText(text); int lastIndex = iterator.first(); while (lastIndex != BreakIterator.DONE) { int firstIndex = lastIndex; lastIndex = iterator.next();//from ww w.j a v a2 s . c om if (lastIndex != BreakIterator.DONE) { String sentence = text.substring(firstIndex, lastIndex); long before = System.currentTimeMillis(); //parse(sentence); long after = System.currentTimeMillis(); log.info("time4: " + String.valueOf(after - before) + ": " + sentence); counter++; } } }
From source file:com.redhat.rcm.version.Cli.java
private static void printTextLine(final String line, final String indent, final int max, final PrintWriter pw) { final String fmt = "%s%-" + max + "s\n"; final List<String> lines = new ArrayList<String>(); final BreakIterator iter = BreakIterator.getLineInstance(); iter.setText(line); int start = iter.first(); int end = BreakIterator.DONE; final StringBuilder currentLine = new StringBuilder(); String seg;// w w w. ja v a2 s . co m while (start != BreakIterator.DONE && (end = iter.next()) != BreakIterator.DONE) { seg = line.substring(start, end); if (currentLine.length() + seg.length() > max) { lines.add(currentLine.toString()); currentLine.setLength(0); } currentLine.append(seg); start = end; } if (currentLine.length() > 0) { lines.add(currentLine.toString()); } for (final String ln : lines) { pw.printf(fmt, indent, ln); } }
From source file:com.redhat.rcm.version.Cli.java
private static void printKVLine(final String key, final String value, final String fmt, final int valMax, final PrintWriter pw) { final List<String> lines = new ArrayList<String>(); final BreakIterator iter = BreakIterator.getLineInstance(); iter.setText(value); int start = iter.first(); int end = BreakIterator.DONE; final StringBuilder currentLine = new StringBuilder(); String seg;//from w w w .java2s .c o m while (start != BreakIterator.DONE && (end = iter.next()) != BreakIterator.DONE) { seg = value.substring(start, end); if (currentLine.length() + seg.length() > valMax) { lines.add(currentLine.toString()); currentLine.setLength(0); } currentLine.append(seg); start = end; } if (currentLine.length() > 0) { lines.add(currentLine.toString()); } pw.printf(fmt, key, lines.isEmpty() ? "" : lines.get(0)); if (lines.size() > 1) { for (int i = 1; i < lines.size(); i++) { // blank string to serve for indentation in format with two fields. pw.printf(fmt, "", lines.get(i)); } } }