List of usage examples for java.text.BreakIterator.DONE
int DONE
To view the source code for java.text.BreakIterator.DONE, click the Source Link.
From source file:de.tudarmstadt.lt.lm.service.BreakIteratorStringProvider.java
@Override public List<String> tokenizeSentence_intern(String sentence, String language_code) { ArrayList<String> tokens = new ArrayList<String>(); BreakIterator token_bounds = BreakIterator.getWordInstance(LocaleUtils.toLocale(language_code)); token_bounds.setText(sentence.trim()); int begin_t = token_bounds.first(); for (int end_t = token_bounds.next(); end_t != BreakIterator.DONE; begin_t = end_t, end_t = token_bounds .next()) {// w w w .jav a2 s. com String token = de.tudarmstadt.lt.utilities.StringUtils .trim_and_replace_emptyspace(sentence.substring(begin_t, end_t), "_"); if (!token.isEmpty()) { // add token iff token is not empty tokens.add(token); } } return tokens; }
From source file:com.norconex.importer.handler.tagger.impl.TextStatisticsTagger.java
@Override protected void tagTextDocument(String reference, Reader input, ImporterMetadata metadata, boolean parsed) throws ImporterHandlerException { long charCount = 0; long wordCharCount = 0; long wordCount = 0; long sentenceCount = 0; long sentenceCharCount = 0; long paragraphCount = 0; //TODO make this more efficient, by doing all this in one pass. LineIterator it = IOUtils.lineIterator(input); while (it.hasNext()) { String line = it.nextLine().trim(); if (StringUtils.isBlank(line)) { continue; }//from w ww . j a v a 2s .c o m // Paragraph paragraphCount++; // Character charCount += line.length(); // Word Matcher matcher = PATTERN_WORD.matcher(line); while (matcher.find()) { int wordLength = matcher.end() - matcher.start(); wordCount++; wordCharCount += wordLength; } // Sentence BreakIterator boundary = BreakIterator.getSentenceInstance(); boundary.setText(line); int start = boundary.first(); for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { sentenceCharCount += (end - start); sentenceCount++; } } String field = StringUtils.EMPTY; if (StringUtils.isNotBlank(fieldName)) { field = fieldName.trim() + "."; } //--- Add fields --- metadata.addLong("document.stat." + field + "characterCount", charCount); metadata.addLong("document.stat." + field + "wordCount", wordCount); metadata.addLong("document.stat." + field + "sentenceCount", sentenceCount); metadata.addLong("document.stat." + field + "paragraphCount", paragraphCount); metadata.addString("document.stat." + field + "averageWordCharacterCount", divide(wordCharCount, wordCount)); metadata.addString("document.stat." + field + "averageSentenceCharacterCount", divide(sentenceCharCount, sentenceCount)); metadata.addString("document.stat." + field + "averageSentenceWordCount", divide(wordCount, sentenceCount)); metadata.addString("document.stat." + field + "averageParagraphCharacterCount", divide(charCount, paragraphCount)); metadata.addString("document.stat." 
+ field + "averageParagraphSentenceCount", divide(sentenceCount, paragraphCount)); metadata.addString("document.stat." + field + "averageParagraphWordCount", divide(wordCount, paragraphCount)); }
From source file:graphene.util.StringUtils.java
/** * Convert a string to a list of strings broken up by end of sentence * tokens./*from w w w. j a v a 2s . com*/ * * @param input * @param locale * @return */ public static List<String> convertToSentences(final String input, final Locale locale) { final BreakIterator iterator = BreakIterator.getSentenceInstance(locale); iterator.setText(input); final ArrayList<String> sentences = new ArrayList<String>(); int start = iterator.first(); for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) { sentences.add(input.substring(start, end)); } return sentences; }
From source file:be.idamediafoundry.sofa.livecycle.dsc.util.AbstractQDoxComponentInfoExtractor.java
final protected String getFirstSentence(String text) { String result = text;// w w w.jav a 2 s.c om if (text != null) { BreakIterator iterator = BreakIterator.getSentenceInstance(); iterator.setText(text); int start = iterator.first(); int end = iterator.next(); if (end != BreakIterator.DONE) { result = text.substring(start, end).trim(); } } return result; }
From source file:com.conversantmedia.mapreduce.tool.RunJob.java
/**
 * Wraps {@code text} at line-break opportunities and appends the resulting
 * lines to {@code lines}.
 *
 * Segments reported by the line {@link BreakIterator} are accumulated until
 * adding the next one would exceed {@code maxLength}; the accumulated line is
 * then emitted and a new one is started. A single segment longer than
 * {@code maxLength} is kept intact on its own line.
 *
 * The buffer is flushed only when it holds something. Without that guard, a
 * first segment longer than {@code maxLength} would emit a spurious empty
 * line before it.
 *
 * @param lines receives the wrapped lines; one final line is always appended
 *        (an empty string when {@code text} is empty)
 * @param text the text to wrap
 * @param maxLength maximum line length in characters
 */
private static void splitLine(List<String> lines, String text, int maxLength) {
    BreakIterator boundary = BreakIterator.getLineInstance();
    boundary.setText(text);
    StringBuilder buffer = new StringBuilder();
    int start = boundary.first();
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String word = text.substring(start, end);
        // Never flush an empty buffer: that would add a blank line.
        if (buffer.length() > 0 && buffer.length() + word.length() > maxLength) {
            lines.add(buffer.toString());
            buffer.setLength(0);
        }
        buffer.append(word);
    }
    lines.add(buffer.toString());
}
From source file:net.sf.jasperreports.engine.fill.SimpleTextLineWrapper.java
protected TextLine measureLine(float width, boolean requireWord, int endLimit) { // try to guess how much of the text would fit based on the average char width int measureIndex = estimateBreakIndex(width, endLimit); // if estimating that there's more than a line, check measureExactMultiline if (measureIndex < endLimit && measureExactMultiline) { return measureExactLine(width, endLimit, requireWord); }// w ww .ja va 2 s .c o m // measure the text Rectangle2D bounds = measureParagraphFragment(measureIndex); //FIXME fast exit when the height is exceeded Rectangle2D measuredBounds = bounds; if (bounds.getWidth() <= width) { // see if there's more that could fit boolean done = false; do { int nextBreakIndex = measureIndex < endLimit ? paragraphBreakIterator.following(measureIndex) : BreakIterator.DONE; if (nextBreakIndex == BreakIterator.DONE || nextBreakIndex > endLimit) { // the next break is after the limit, we're done done = true; } else { // measure to the next break Rectangle2D nextBounds = measureParagraphFragment(nextBreakIndex); if (nextBounds.getWidth() <= width) { measuredBounds = nextBounds; measureIndex = nextBreakIndex; // loop } else { done = true; } } } while (!done); } else { // didn't fit, try shorter texts boolean done = false; do { int previousBreakIndex = measureIndex > paragraphPosition ? paragraphBreakIterator.preceding(measureIndex) : BreakIterator.DONE; if (previousBreakIndex == BreakIterator.DONE || previousBreakIndex <= paragraphPosition) { if (requireWord) { // no full word fits, returning empty measureIndex = paragraphPosition; } else { // we need to break inside the word. // measuring the exact break index as estimating/guessing might be slower. 
measureIndex = measureExactLineBreakIndex(width, endLimit, requireWord); measuredBounds = measureParagraphFragment(measureIndex); } done = true; } else { measureIndex = previousBreakIndex; Rectangle2D prevBounds = measureParagraphFragment(measureIndex); if (prevBounds.getWidth() <= width) { // fitted, we're done measuredBounds = prevBounds; done = true; } } } while (!done); } if (measureIndex <= paragraphPosition) { // nothing fit return null; } return toTextLine(measureIndex, measuredBounds); }
From source file:com.tao.realweb.util.StringUtil.java
/** * Converts a line of text into an array of lower case words using a * BreakIterator.wordInstance().<p> * * This method is under the Jive Open Source Software License and was * written by Mark Imbriaco.// w w w . j av a 2s . c om * * @param text a String of text to convert into an array of words * @return text broken up into an array of words. */ public static String[] toLowerCaseWordArray(String text) { if (text == null || text.length() == 0) { return new String[0]; } List<String> wordList = new ArrayList<String>(); BreakIterator boundary = BreakIterator.getWordInstance(); boundary.setText(text); int start = 0; for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { String tmp = text.substring(start, end).trim(); // Remove characters that are not needed. tmp = replace(tmp, "+", ""); tmp = replace(tmp, "/", ""); tmp = replace(tmp, "\\", ""); tmp = replace(tmp, "#", ""); tmp = replace(tmp, "*", ""); tmp = replace(tmp, ")", ""); tmp = replace(tmp, "(", ""); tmp = replace(tmp, "&", ""); if (tmp.length() > 0) { wordList.add(tmp); } } return wordList.toArray(new String[wordList.size()]); }
From source file:net.sf.jasperreports.engine.fill.SimpleTextLineWrapper.java
protected int estimateBreakIndex(float width, int endLimit) { double avgCharWidth = fontInfo.charWidthEstimate(); if ((endLimit - paragraphPosition) * avgCharWidth <= width * FONT_WIDTH_CHECK_FACTOR) { // there are chances that the entire text would fit, let's be optimistic return endLimit; }// w ww . j a v a 2s. c om // estimate how many characters would fit int charCountEstimate = (int) Math.ceil(width / avgCharWidth); int estimateFitPosition = paragraphPosition + charCountEstimate; if (estimateFitPosition > endLimit) { // estimated that everything would fit return endLimit; } // find the break after the estimate int breakAfterEstimatePosition = paragraphBreakIterator.following(estimateFitPosition); if (breakAfterEstimatePosition == BreakIterator.DONE || breakAfterEstimatePosition > endLimit) { breakAfterEstimatePosition = endLimit; } int estimateIndex = breakAfterEstimatePosition; // if the after break is too far way from the estimate, see if the break before is closer if (breakAfterEstimatePosition > estimateFitPosition + NEXT_BREAK_INDEX_THRESHOLD) { int breakBeforeEstimatePosition = paragraphBreakIterator.previous(); // if the break before is closer than the break after, use the break before if (breakBeforeEstimatePosition == BreakIterator.DONE && breakBeforeEstimatePosition > paragraphPosition && estimateFitPosition - breakBeforeEstimatePosition < breakAfterEstimatePosition - estimateFitPosition) { estimateIndex = breakBeforeEstimatePosition; } } return estimateIndex; }
From source file:net.sf.jasperreports.engine.fill.SimpleTextLineWrapper.java
protected Rectangle2D measureParagraphFragment(int measureIndex) { int endIndex = measureIndex; if (endIndex > paragraphPosition + 1) { char lastMeasureChar = paragraphText.charAt(endIndex - 1); if (Character.isWhitespace(lastMeasureChar)) { // exclude trailing white space from the text to measure. // use the previous break as limit, but always keep at least one character to measure. int preceding = paragraphBreakIterator.preceding(endIndex); if (preceding == BreakIterator.DONE || preceding <= paragraphPosition) { preceding = paragraphPosition + 1; }/*w w w . j ava 2s . co m*/ do { --endIndex; lastMeasureChar = paragraphText.charAt(endIndex - 1); } while (endIndex > preceding && Character.isWhitespace(lastMeasureChar)); } } // note that trailing white space will not be included in the advance Rectangle2D bounds = fontInfo.fontInfo.font.getStringBounds(paragraphText, paragraphPosition, endIndex, context.getFontRenderContext()); // adding the measurement to the font info statistics fontInfo.recordMeasurement(bounds.getWidth() / (endIndex - paragraphPosition)); if (logTrace) { log.trace("measured to index " + (endIndex + paragraphOffset) + " at width " + bounds.getWidth()); } return bounds; }
From source file:com.redhat.rcm.version.Cli.java
private static void printTextLine(final String line, final String indent, final int max, final PrintWriter pw) { final String fmt = "%s%-" + max + "s\n"; final List<String> lines = new ArrayList<String>(); final BreakIterator iter = BreakIterator.getLineInstance(); iter.setText(line);//from ww w .j a va2 s . c o m int start = iter.first(); int end = BreakIterator.DONE; final StringBuilder currentLine = new StringBuilder(); String seg; while (start != BreakIterator.DONE && (end = iter.next()) != BreakIterator.DONE) { seg = line.substring(start, end); if (currentLine.length() + seg.length() > max) { lines.add(currentLine.toString()); currentLine.setLength(0); } currentLine.append(seg); start = end; } if (currentLine.length() > 0) { lines.add(currentLine.toString()); } for (final String ln : lines) { pw.printf(fmt, indent, ln); } }