Usage examples for java.text.BreakIterator.first()
public abstract int first();
From source file:Main.java
/**
 * Builds the frame: places the output text area inside a scroll pane and fills it with a
 * sample text in which every character boundary (for the default locale) is marked by "|".
 */
public TextBoundaryFrame() {
    getContentPane().add(new JScrollPane(outputText));

    final BreakIterator boundaries = BreakIterator.getCharacterInstance(Locale.getDefault());
    final String sample = "The quick, brown fox jump-ed\n" + "over the lazy \"dog.\" And then...what happened?";
    boundaries.setText(sample);

    outputText.setText("");

    // Emit each segment followed by a separator; the tail after the last boundary is appended as-is.
    int segmentStart = boundaries.first();
    for (int segmentEnd = boundaries.next(); segmentEnd != BreakIterator.DONE; segmentEnd = boundaries.next()) {
        outputText.append(sample.substring(segmentStart, segmentEnd) + "|");
        segmentStart = segmentEnd;
    }
    outputText.append(sample.substring(segmentStart));
}
From source file:be.idamediafoundry.sofa.livecycle.dsc.util.AbstractQDoxComponentInfoExtractor.java
final protected String getFirstSentence(String text) { String result = text;//from w w w . j a va 2 s. c o m if (text != null) { BreakIterator iterator = BreakIterator.getSentenceInstance(); iterator.setText(text); int start = iterator.first(); int end = iterator.next(); if (end != BreakIterator.DONE) { result = text.substring(start, end).trim(); } } return result; }
From source file:com.norconex.importer.handler.tagger.impl.TextStatisticsTagger.java
@Override protected void tagTextDocument(String reference, Reader input, ImporterMetadata metadata, boolean parsed) throws ImporterHandlerException { long charCount = 0; long wordCharCount = 0; long wordCount = 0; long sentenceCount = 0; long sentenceCharCount = 0; long paragraphCount = 0; //TODO make this more efficient, by doing all this in one pass. LineIterator it = IOUtils.lineIterator(input); while (it.hasNext()) { String line = it.nextLine().trim(); if (StringUtils.isBlank(line)) { continue; }// w ww.j a va 2 s . c om // Paragraph paragraphCount++; // Character charCount += line.length(); // Word Matcher matcher = PATTERN_WORD.matcher(line); while (matcher.find()) { int wordLength = matcher.end() - matcher.start(); wordCount++; wordCharCount += wordLength; } // Sentence BreakIterator boundary = BreakIterator.getSentenceInstance(); boundary.setText(line); int start = boundary.first(); for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { sentenceCharCount += (end - start); sentenceCount++; } } String field = StringUtils.EMPTY; if (StringUtils.isNotBlank(fieldName)) { field = fieldName.trim() + "."; } //--- Add fields --- metadata.addLong("document.stat." + field + "characterCount", charCount); metadata.addLong("document.stat." + field + "wordCount", wordCount); metadata.addLong("document.stat." + field + "sentenceCount", sentenceCount); metadata.addLong("document.stat." + field + "paragraphCount", paragraphCount); metadata.addString("document.stat." + field + "averageWordCharacterCount", divide(wordCharCount, wordCount)); metadata.addString("document.stat." + field + "averageSentenceCharacterCount", divide(sentenceCharCount, sentenceCount)); metadata.addString("document.stat." + field + "averageSentenceWordCount", divide(wordCount, sentenceCount)); metadata.addString("document.stat." + field + "averageParagraphCharacterCount", divide(charCount, paragraphCount)); metadata.addString("document.stat." 
+ field + "averageParagraphSentenceCount", divide(sentenceCount, paragraphCount)); metadata.addString("document.stat." + field + "averageParagraphWordCount", divide(wordCount, paragraphCount)); }
From source file:com.redhat.rcm.version.Cli.java
private static void printTextLine(final String line, final String indent, final int max, final PrintWriter pw) { final String fmt = "%s%-" + max + "s\n"; final List<String> lines = new ArrayList<String>(); final BreakIterator iter = BreakIterator.getLineInstance(); iter.setText(line);//from www.j a va2 s . c o m int start = iter.first(); int end = BreakIterator.DONE; final StringBuilder currentLine = new StringBuilder(); String seg; while (start != BreakIterator.DONE && (end = iter.next()) != BreakIterator.DONE) { seg = line.substring(start, end); if (currentLine.length() + seg.length() > max) { lines.add(currentLine.toString()); currentLine.setLength(0); } currentLine.append(seg); start = end; } if (currentLine.length() > 0) { lines.add(currentLine.toString()); } for (final String ln : lines) { pw.printf(fmt, indent, ln); } }
From source file:com.redhat.rcm.version.Cli.java
private static void printKVLine(final String key, final String value, final String fmt, final int valMax, final PrintWriter pw) { final List<String> lines = new ArrayList<String>(); final BreakIterator iter = BreakIterator.getLineInstance(); iter.setText(value);/* ww w. ja v a 2 s . c o m*/ int start = iter.first(); int end = BreakIterator.DONE; final StringBuilder currentLine = new StringBuilder(); String seg; while (start != BreakIterator.DONE && (end = iter.next()) != BreakIterator.DONE) { seg = value.substring(start, end); if (currentLine.length() + seg.length() > valMax) { lines.add(currentLine.toString()); currentLine.setLength(0); } currentLine.append(seg); start = end; } if (currentLine.length() > 0) { lines.add(currentLine.toString()); } pw.printf(fmt, key, lines.isEmpty() ? "" : lines.get(0)); if (lines.size() > 1) { for (int i = 1; i < lines.size(); i++) { // blank string to serve for indentation in format with two fields. pw.printf(fmt, "", lines.get(i)); } } }
From source file:IteratorTest.java
protected void refreshDisplay() { int startIndex, nextIndex; Vector items = new Vector(); String msgText = textArea.getText(); Locale locale = (Locale) (localeButton.getSelectedItem()); BreakIterator iterator = null; if (charButton.isSelected()) { iterator = BreakIterator.getCharacterInstance(locale); } else if (wordButton.isSelected()) { iterator = BreakIterator.getWordInstance(locale); } else if (lineButton.isSelected()) { iterator = BreakIterator.getLineInstance(locale); } else if (sentButton.isSelected()) { iterator = BreakIterator.getSentenceInstance(locale); }// w ww .j a v a 2 s . c om iterator.setText(msgText); startIndex = iterator.first(); nextIndex = iterator.next(); while (nextIndex != BreakIterator.DONE) { items.addElement(msgText.substring(startIndex, nextIndex)); startIndex = nextIndex; nextIndex = iterator.next(); } itemList.setListData(items); }
From source file:org.cloudgraph.examples.test.model.NLPWikiParseTest.java
private void parse(StringBuilder buf) throws IOException { BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US); String text = buf.toString(); int counter = 0; iterator.setText(text);/* ww w .ja v a 2s.c om*/ int lastIndex = iterator.first(); while (lastIndex != BreakIterator.DONE) { int firstIndex = lastIndex; lastIndex = iterator.next(); if (lastIndex != BreakIterator.DONE) { String sentence = text.substring(firstIndex, lastIndex); long before = System.currentTimeMillis(); //parse(sentence); long after = System.currentTimeMillis(); log.info("time4: " + String.valueOf(after - before) + ": " + sentence); counter++; } } }
From source file:org.cloudgraph.examples.test.model.StanfordCoreNLPTest.java
private void parse(StringBuilder buf) throws IOException { BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US); String text = buf.toString(); int counter = 0; iterator.setText(text);//from ww w. j a va 2s.c o m int lastIndex = iterator.first(); while (lastIndex != BreakIterator.DONE) { int firstIndex = lastIndex; lastIndex = iterator.next(); if (lastIndex != BreakIterator.DONE) { String sentence = text.substring(firstIndex, lastIndex); long before = System.currentTimeMillis(); parse(sentence); long after = System.currentTimeMillis(); log.info("time4: " + String.valueOf(after - before) + ": " + sentence); counter++; } } }
From source file:org.lnicholls.galleon.util.Tools.java
/**
 * Breaks {@code text} into display lines no wider than {@code width}.
 * <p>
 * The text is split at word boundaries. A segment consisting of a line terminator
 * ("\n", "\r" or "\r\n", ignoring spaces) ends the current line. Otherwise a segment is
 * appended to the current line unless that would exceed {@code width}, in which case the
 * current line is emitted and the segment starts a new one. Width is measured with
 * {@code metrics} when supplied, or estimated as 20 units per character when it is
 * {@code null}. Emitted lines are trimmed.
 * <p>
 * A word that is too wide on its own is kept whole; it no longer causes an empty line to
 * be emitted in front of it.
 *
 * @param width   maximum line width, in the units reported by {@code metrics}
 * @param metrics font metrics used to measure text; may be {@code null}
 * @param text    the text to lay out; may be {@code null}
 * @return the laid-out lines; an empty array when {@code text} is {@code null}
 */
public static String[] layout(int width, FontMetrics metrics, String text) {
    ArrayList<String> lines = new ArrayList<String>();
    if (text != null) {
        String line = "";
        BreakIterator boundary = BreakIterator.getWordInstance();
        boundary.setText(text);
        int start = boundary.first();
        for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
            String word = text.substring(start, end);
            String trimmed = word.replace(" ", "");
            String candidate = line + word;
            int metricsWidth = (metrics != null) ? metrics.stringWidth(candidate) : candidate.length() * 20;

            if (trimmed.equals("\n") || trimmed.equals("\r") || trimmed.equals("\r\n")) {
                // Explicit line terminator: always emit, so intentional blank lines survive.
                lines.add(line.trim());
                line = "";
            } else if (metricsWidth > width) {
                // Overflow: emit the pending line only if it has content, so an over-wide
                // word on an empty line does not produce a spurious blank line.
                String pending = line.trim();
                if (pending.length() > 0) {
                    lines.add(pending);
                }
                line = word;
            } else {
                line = candidate;
            }
        }
        if (line.trim().length() > 0) {
            lines.add(line.trim());
        }
    }
    return lines.toArray(new String[0]);
}
From source file:pl.edu.icm.coansys.kwdextraction.RakeExtractor.java
/**
 * Finds keyword candidates: words or word sequences separated by stopwords,
 * punctuation marks etc. The distinct candidates are stored in {@code keywordCandidates};
 * a candidate seen again has its counter incremented instead of being stored twice.
 */
private void extractKeywordCandidates() {
    Map<String, KeywordCandidate> byKeyword = new HashMap<String, KeywordCandidate>();
    BreakIterator tokens = BreakIterator.getWordInstance();
    tokens.setText(content);

    int tokenStart = tokens.first();
    int phraseStart = tokenStart;
    KeywordCandidate current = new KeywordCandidate();

    for (int tokenEnd = tokens.next(); tokenEnd != BreakIterator.DONE;
            tokenStart = tokenEnd, tokenEnd = tokens.next()) {
        String token = content.substring(tokenStart, tokenEnd).trim().toLowerCase();
        String stripped = token.replaceAll(ILLEGAL_CHARS, "");
        if (token.isEmpty()) {
            continue;
        }

        // A delimiter token closes the phrase before it; a regular word extends the phrase
        // and closes it only when it is the last token of the content.
        String phrase = null;
        boolean delimiter = stopwords.get(lang).contains(token) || token.matches("\\W+") || isNum(token)
                || !token.equals(stripped);
        if (delimiter) {
            phrase = content.substring(phraseStart, tokenStart);
        } else {
            current.addWord(token);
            if (tokenEnd == content.length()) {
                phrase = content.substring(phraseStart, tokenEnd);
            }
        }
        if (phrase == null) {
            continue;
        }

        String keyword = phrase.trim().toLowerCase().replaceAll(ILLEGAL_CHARS, "").replaceAll("\\s+", " ");
        if (!keyword.isEmpty()) {
            KeywordCandidate known = byKeyword.get(keyword);
            if (known != null) {
                known.incCounter();
            } else {
                current.setKeyword(keyword);
                byKeyword.put(keyword, current);
            }
        }

        // Start collecting the next phrase right after the current token.
        phraseStart = tokenEnd;
        current = new KeywordCandidate();
    }

    keywordCandidates = new ArrayList<KeywordCandidate>(byKeyword.values());
}