// Java tutorial
// This should be the same as org.apache.pdfbox.text except that in two // cases a private has been turned to protected. Search for "kwa". // Because the package has been changed, some imports needed to be added. /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.repeatability.pdf; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.LineNumberReader; import java.io.StringWriter; import java.io.Writer; import java.text.Bidi; import java.text.Normalizer; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.SortedMap; import java.util.SortedSet; import java.util.StringTokenizer; import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageTree; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; import 
org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead; import org.apache.pdfbox.util.QuickSort; // kwa import org.apache.pdfbox.text.*; /** * This class will take a pdf document and strip out all of the text and ignore the formatting and such. Please note; it * is up to clients of this class to verify that a specific user has the correct permissions to extract text from the * PDF document. * * The basic flow of this process is that we get a document and use a series of processXXX() functions that work on * smaller and smaller chunks of the page. Eventually, we fully process each page and then print it. * * @author Ben Litchfield */ public class PDFTextStripper extends PDFTextStreamEngine { private static float defaultIndentThreshold = 2.0f; private static float defaultDropThreshold = 2.5f; private static final boolean useCustomQuickSort; private static final Log LOG = LogFactory.getLog(PDFTextStripper.class); // enable the ability to set the default indent/drop thresholds // with -D system properties: // pdftextstripper.indent // pdftextstripper.drop static { String strDrop = null, strIndent = null; try { String className = PDFTextStripper.class.getSimpleName().toLowerCase(); String prop = className + ".indent"; strIndent = System.getProperty(prop); prop = className + ".drop"; strDrop = System.getProperty(prop); } catch (SecurityException e) { // PDFBOX-1946 when run in an applet // ignore and use default } if (strIndent != null && strIndent.length() > 0) { try { defaultIndentThreshold = Float.parseFloat(strIndent); } catch (NumberFormatException nfe) { // ignore and use default } } if (strDrop != null && strDrop.length() > 0) { try { defaultDropThreshold = Float.parseFloat(strDrop); } catch (NumberFormatException nfe) { // ignore and use default } } } static { // check if we need to use the custom quicksort algorithm as a // workaround to the PDFBOX-1512 transitivity issue of TextPositionComparator: boolean is16orLess = false; try { String version = 
System.getProperty("java.specification.version"); StringTokenizer st = new StringTokenizer(version, "."); int majorVersion = Integer.parseInt(st.nextToken()); int minorVersion = 0; if (st.hasMoreTokens()) { minorVersion = Integer.parseInt(st.nextToken()); } is16orLess = majorVersion == 1 && minorVersion <= 6; } catch (SecurityException x) { // when run in an applet ignore and use default // assume 1.7 or higher so that quicksort is used } catch (NumberFormatException nfe) { // should never happen, but if it does, // assume 1.7 or higher so that quicksort is used } useCustomQuickSort = !is16orLess; } /** * The platform's line separator. */ protected final String LINE_SEPARATOR = System.getProperty("line.separator"); private String lineSeparator = LINE_SEPARATOR; private String wordSeparator = " "; private String paragraphStart = ""; private String paragraphEnd = ""; private String pageStart = ""; private String pageEnd = LINE_SEPARATOR; private String articleStart = ""; private String articleEnd = ""; private int currentPageNo = 0; private int startPage = 1; private int endPage = Integer.MAX_VALUE; private PDOutlineItem startBookmark = null; // 1-based bookmark pages private int startBookmarkPageNumber = -1; private int endBookmarkPageNumber = -1; private PDOutlineItem endBookmark = null; private boolean suppressDuplicateOverlappingText = true; private boolean shouldSeparateByBeads = true; private boolean sortByPosition = false; private boolean addMoreFormatting = false; private float indentThreshold = defaultIndentThreshold; private float dropThreshold = defaultDropThreshold; // we will need to estimate where to add spaces, these are used to help guess private float spacingTolerance = .5f; private float averageCharTolerance = .3f; private List<PDRectangle> beadRectangles = null; /** * The charactersByArticle is used to extract text by article divisions. 
For example a PDF that has two columns like * a newspaper, we want to extract the first column and then the second column. In this example the PDF would have 2 * beads(or articles), one for each column. The size of the charactersByArticle would be 5, because not all text on * the screen will fall into one of the articles. The five divisions are shown below * * Text before first article * first article text * text between first article and second article * second article text * text after second article * * Most PDFs won't have any beads, so charactersByArticle will contain a single entry. */ protected ArrayList<List<TextPosition>> charactersByArticle = new ArrayList<List<TextPosition>>(); private Map<String, TreeMap<Float, TreeSet<Float>>> characterListMapping = new HashMap<String, TreeMap<Float, TreeSet<Float>>>(); protected PDDocument document; protected Writer output; /** * True if we started a paragraph but haven't ended it yet. */ private boolean inParagraph; /** * Instantiate a new PDFTextStripper object. * * @throws IOException If there is an error loading the properties. */ public PDFTextStripper() throws IOException { } /** * This will return the text of a document. See writeText. <br /> * NOTE: The document must not be encrypted when coming into this method. * * @param doc The document to get the text from. * @return The text of the PDF document. * @throws IOException if the doc state is invalid or it is encrypted. */ public String getText(PDDocument doc) throws IOException { StringWriter outputStream = new StringWriter(); writeText(doc, outputStream); return outputStream.toString(); } private void resetEngine() { currentPageNo = 0; document = null; if (charactersByArticle != null) { charactersByArticle.clear(); } if (characterListMapping != null) { characterListMapping.clear(); } } /** * This will take a PDDocument and write the text of that document to the print writer. * * @param doc The document to get the data from. 
* @param outputStream The location to put the text. * * @throws IOException If the doc is in an invalid state. */ public void writeText(PDDocument doc, Writer outputStream) throws IOException { resetEngine(); document = doc; output = outputStream; if (getAddMoreFormatting()) { paragraphEnd = lineSeparator; pageStart = lineSeparator; articleStart = lineSeparator; articleEnd = lineSeparator; } startDocument(document); processPages(document.getPages()); endDocument(document); } /** * This will process all of the pages and the text that is in them. * * @param pages The pages object in the document. * * @throws IOException If there is an error parsing the text. */ protected void processPages(PDPageTree pages) throws IOException { PDPage startBookmarkPage = startBookmark == null ? null : startBookmark.findDestinationPage(document); if (startBookmarkPage != null) { startBookmarkPageNumber = pages.indexOf(startBookmarkPage) + 1; } else { // -1 = undefined startBookmarkPageNumber = -1; } PDPage endBookmarkPage = endBookmark == null ? null : endBookmark.findDestinationPage(document); if (endBookmarkPage != null) { endBookmarkPageNumber = pages.indexOf(endBookmarkPage) + 1; } else { // -1 = undefined endBookmarkPageNumber = -1; } if (startBookmarkPageNumber == -1 && startBookmark != null && endBookmarkPageNumber == -1 && endBookmark != null && startBookmark.getCOSObject() == endBookmark.getCOSObject()) { // this is a special case where both the start and end bookmark // are the same but point to nothing. In this case // we will not extract any text. startBookmarkPageNumber = 0; endBookmarkPageNumber = 0; } for (PDPage page : pages) { currentPageNo++; if (page.hasContents()) { processPage(page); } } } /** * This method is available for subclasses of this class. It will be called before processing of the document start. * * @param document The PDF document that is being processed. * @throws IOException If an IO error occurs. 
*/ protected void startDocument(PDDocument document) throws IOException { // no default implementation, but available for subclasses } /** * This method is available for subclasses of this class. It will be called after processing of the document * finishes. * * @param document The PDF document that is being processed. * @throws IOException If an IO error occurs. */ protected void endDocument(PDDocument document) throws IOException { // no default implementation, but available for subclasses } /** * This will process the contents of a page. * * @param page The page to process. * * @throws IOException If there is an error processing the page. */ @Override public void processPage(PDPage page) throws IOException { if (currentPageNo >= startPage && currentPageNo <= endPage && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) { startPage(page); int numberOfArticleSections = 1; if (shouldSeparateByBeads) { fillBeadRectangles(page); numberOfArticleSections += beadRectangles.size() * 2; } int originalSize = charactersByArticle.size(); charactersByArticle.ensureCapacity(numberOfArticleSections); int lastIndex = Math.max(numberOfArticleSections, originalSize); for (int i = 0; i < lastIndex; i++) { if (i < originalSize) { charactersByArticle.get(i).clear(); } else { if (numberOfArticleSections < originalSize) { charactersByArticle.remove(i); } else { charactersByArticle.add(new ArrayList<TextPosition>()); } } } characterListMapping.clear(); super.processPage(page); writePage(); endPage(page); } } private void fillBeadRectangles(PDPage page) { beadRectangles = new ArrayList<PDRectangle>(); for (PDThreadBead bead : page.getThreadBeads()) { if (bead == null) { // can't skip, because of null entry handling in processTextPosition() beadRectangles.add(null); continue; } PDRectangle rect = bead.getRectangle(); // bead rectangle is in PDF coordinates (y=0 is bottom), // glyphs are in 
image coordinates (y=0 is top), // so we must flip PDRectangle mediaBox = page.getMediaBox(); float upperRightY = mediaBox.getUpperRightY() - rect.getLowerLeftY(); float lowerLeftY = mediaBox.getUpperRightY() - rect.getUpperRightY(); rect.setLowerLeftY(lowerLeftY); rect.setUpperRightY(upperRightY); // adjust for cropbox PDRectangle cropBox = page.getCropBox(); if (cropBox.getLowerLeftX() != 0 || cropBox.getLowerLeftY() != 0) { rect.setLowerLeftX(rect.getLowerLeftX() - cropBox.getLowerLeftX()); rect.setLowerLeftY(rect.getLowerLeftY() - cropBox.getLowerLeftY()); rect.setUpperRightX(rect.getUpperRightX() - cropBox.getLowerLeftX()); rect.setUpperRightY(rect.getUpperRightY() - cropBox.getLowerLeftY()); } beadRectangles.add(rect); } } /** * Start a new article, which is typically defined as a column on a single page (also referred to as a bead). This * assumes that the primary direction of text is left to right. Default implementation is to do nothing. Subclasses * may provide additional information. * * @throws IOException If there is any error writing to the stream. */ protected void startArticle() throws IOException { startArticle(true); } /** * Start a new article, which is typically defined as a column on a single page (also referred to as a bead). * Default implementation is to do nothing. Subclasses may provide additional information. * * @param isLTR true if primary direction of text is left to right. * @throws IOException If there is any error writing to the stream. */ protected void startArticle(boolean isLTR) throws IOException { output.write(getArticleStart()); } /** * End an article. Default implementation is to do nothing. Subclasses may provide additional information. * * @throws IOException If there is any error writing to the stream. */ protected void endArticle() throws IOException { output.write(getArticleEnd()); } /** * Start a new page. Default implementation is to do nothing. Subclasses may provide additional information. 
* * @param page The page we are about to process. * * @throws IOException If there is any error writing to the stream. */ protected void startPage(PDPage page) throws IOException { // default is to do nothing } /** * End a page. Default implementation is to do nothing. Subclasses may provide additional information. * * @param page The page we are about to process. * * @throws IOException If there is any error writing to the stream. */ protected void endPage(PDPage page) throws IOException { // default is to do nothing } private static final float END_OF_LAST_TEXT_X_RESET_VALUE = -1; private static final float MAX_Y_FOR_LINE_RESET_VALUE = -Float.MAX_VALUE; private static final float EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE = -Float.MAX_VALUE; private static final float MAX_HEIGHT_FOR_LINE_RESET_VALUE = -1; private static final float MIN_Y_TOP_FOR_LINE_RESET_VALUE = Float.MAX_VALUE; private static final float LAST_WORD_SPACING_RESET_VALUE = -1; /** * This will print the text of the processed page to "output". It will estimate, based on the coordinates of the * text, where newlines and word spacings should be placed. The text will be sorted only if that feature was * enabled. * * @throws IOException If there is an error writing the text. 
*/ protected void writePage() throws IOException { float maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE; float minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE; float endOfLastTextX = END_OF_LAST_TEXT_X_RESET_VALUE; float lastWordSpacing = LAST_WORD_SPACING_RESET_VALUE; float maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE; PositionWrapper lastPosition = null; PositionWrapper lastLineStartPosition = null; boolean startOfPage = true; // flag to indicate start of page boolean startOfArticle; if (charactersByArticle.size() > 0) { writePageStart(); } for (List<TextPosition> textList : charactersByArticle) { if (getSortByPosition()) { TextPositionComparator comparator = new TextPositionComparator(); // because the TextPositionComparator is not transitive, but // JDK7+ enforces transitivity on comparators, we need to use // a custom quicksort implementation (which is slower, unfortunately). if (useCustomQuickSort) { QuickSort.sort(textList, comparator); } else { Collections.sort(textList, comparator); } } Iterator<TextPosition> textIter = textList.iterator(); startArticle(); startOfArticle = true; // Now cycle through to print the text. // We queue up a line at a time before we print so that we can convert // the line from presentation form to logical form (if needed). List<LineItem> line = new ArrayList<LineItem>(); textIter = textList.iterator(); // start from the beginning again // PDF files don't always store spaces. We will need to guess where we should add // spaces based on the distances between TextPositions. Historically, this was done // based on the size of the space character provided by the font. In general, this // worked but there were cases where it did not work. Calculating the average character // width and using that as a metric works better in some cases but fails in some cases // where the spacing worked. So we use both. NOTE: Adobe reader also fails on some of // these examples. 
// Keeps track of the previous average character width float previousAveCharWidth = -1; while (textIter.hasNext()) { TextPosition position = textIter.next(); PositionWrapper current = new PositionWrapper(position); String characterValue = position.getUnicode(); // Resets the average character width when we see a change in font // or a change in the font size if (lastPosition != null && (position.getFont() != lastPosition.getTextPosition().getFont() || position.getFontSize() != lastPosition.getTextPosition().getFontSize())) { previousAveCharWidth = -1; } float positionX; float positionY; float positionWidth; float positionHeight; // If we are sorting, then we need to use the text direction // adjusted coordinates, because they were used in the sorting. if (getSortByPosition()) { positionX = position.getXDirAdj(); positionY = position.getYDirAdj(); positionWidth = position.getWidthDirAdj(); positionHeight = position.getHeightDir(); } else { positionX = position.getX(); positionY = position.getY(); positionWidth = position.getWidth(); positionHeight = position.getHeight(); } // The current amount of characters in a word int wordCharCount = position.getIndividualWidths().length; // Estimate the expected width of the space based on the // space character with some margin. float wordSpacing = position.getWidthOfSpace(); float deltaSpace; if (wordSpacing == 0 || Float.isNaN(wordSpacing)) { deltaSpace = Float.MAX_VALUE; } else { if (lastWordSpacing < 0) { deltaSpace = wordSpacing * getSpacingTolerance(); } else { deltaSpace = (wordSpacing + lastWordSpacing) / 2f * getSpacingTolerance(); } } // Estimate the expected width of the space based on the average character width // with some margin. This calculation does not make a true average (average of // averages) but we found that it gave the best results after numerous experiments. // Based on experiments we also found that .3 worked well. 
float averageCharWidth; if (previousAveCharWidth < 0) { averageCharWidth = positionWidth / wordCharCount; } else { averageCharWidth = (previousAveCharWidth + positionWidth / wordCharCount) / 2f; } float deltaCharWidth = averageCharWidth * getAverageCharTolerance(); // Compares the values obtained by the average method and the wordSpacing method // and picks the smaller number. float expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE; if (endOfLastTextX != END_OF_LAST_TEXT_X_RESET_VALUE) { if (deltaCharWidth > deltaSpace) { expectedStartOfNextWordX = endOfLastTextX + deltaSpace; } else { expectedStartOfNextWordX = endOfLastTextX + deltaCharWidth; } } if (lastPosition != null) { if (startOfArticle) { lastPosition.setArticleStart(); startOfArticle = false; } // RDD - Here we determine whether this text object is on the current // line. We use the lastBaselineFontSize to handle the superscript // case, and the size of the current font to handle the subscript case. // Text must overlap with the last rendered baseline text by at least // a small amount in order to be considered as being on the same line. // XXX BC: In theory, this check should really check if the next char is in // full range seen in this line. This is what I tried to do with minYTopForLine, // but this caused a lot of regression test failures. 
So, I'm leaving it be for // now if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) { writeLine(normalize(line)); line.clear(); lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine); expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE; maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE; maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE; minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE; } // test if our TextPosition starts after a new word would be expected to start if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE && expectedStartOfNextWordX < positionX && // only bother adding a space if the last character was not a space lastPosition.getTextPosition().getUnicode() != null && !lastPosition.getTextPosition().getUnicode().endsWith(" ")) { line.add(LineItem.getWordSeparator()); } } if (positionY >= maxYForLine) { maxYForLine = positionY; } // RDD - endX is what PDF considers to be the x coordinate of the // end position of the text. We use it in computing our metrics below. endOfLastTextX = positionX + positionWidth; // add it to the list if (characterValue != null) { if (startOfPage && lastPosition == null) { writeParagraphStart();// not sure this is correct for RTL? 
} line.add(new LineItem(position)); } maxHeightForLine = Math.max(maxHeightForLine, positionHeight); minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight); lastPosition = current; if (startOfPage) { lastPosition.setParagraphStart(); lastPosition.setLineStart(); lastLineStartPosition = lastPosition; startOfPage = false; } lastWordSpacing = wordSpacing; previousAveCharWidth = averageCharWidth; } // print the final line if (line.size() > 0) { writeLine(normalize(line)); writeParagraphEnd(); } endArticle(); } writePageEnd(); } private boolean overlap(float y1, float height1, float y2, float height2) { return within(y1, y2, .1f) || y2 <= y1 && y2 >= y1 - height1 || y1 <= y2 && y1 >= y2 - height2; } /** * Write the line separator value to the output stream. * * @throws IOException If there is a problem writing out the lineseparator to the document. */ protected void writeLineSeparator() throws IOException { output.write(getLineSeparator()); } /** * Write the word separator value to the output stream. * * @throws IOException If there is a problem writing out the wordseparator to the document. */ protected void writeWordSeparator() throws IOException { output.write(getWordSeparator()); } /** * Write the string in TextPosition to the output stream. * * @param text The text to write to the stream. * @throws IOException If there is an error when writing the text. */ protected void writeCharacters(TextPosition text) throws IOException { output.write(text.getUnicode()); } /** * Write a Java string to the output stream. The default implementation will ignore the <code>textPositions</code> * and just calls {@link #writeString(String)}. * * @param text The text to write to the stream. * @param textPositions The TextPositions belonging to the text. * @throws IOException If there is an error when writing the text. 
*/ protected void writeString(String text, List<TextPosition> textPositions) throws IOException { writeString(text); } /** * Write a Java string to the output stream. * * @param text The text to write to the stream. * @throws IOException If there is an error when writing the text. */ protected void writeString(String text) throws IOException { output.write(text); } /** * This will determine of two floating point numbers are within a specified variance. * * @param first The first number to compare to. * @param second The second number to compare to. * @param variance The allowed variance. */ private boolean within(float first, float second, float variance) { return second < first + variance && second > first - variance; } /** * This will process a TextPosition object and add the text to the list of characters on a page. It takes care of * overlapping text. * * @param text The text to process. */ @Override protected void processTextPosition(TextPosition text) { boolean showCharacter = true; if (suppressDuplicateOverlappingText) { showCharacter = false; String textCharacter = text.getUnicode(); float textX = text.getX(); float textY = text.getY(); TreeMap<Float, TreeSet<Float>> sameTextCharacters = characterListMapping.get(textCharacter); if (sameTextCharacters == null) { sameTextCharacters = new TreeMap<Float, TreeSet<Float>>(); characterListMapping.put(textCharacter, sameTextCharacters); } // RDD - Here we compute the value that represents the end of the rendered // text. This value is used to determine whether subsequent text rendered // on the same line overwrites the current text. // // We subtract any positive padding to handle cases where extreme amounts // of padding are applied, then backed off (not sure why this is done, but there // are cases where the padding is on the order of 10x the character width, and // the TJ just backs up to compensate after each character). 
Also, we subtract // an amount to allow for kerning (a percentage of the width of the last // character). boolean suppressCharacter = false; float tolerance = text.getWidth() / textCharacter.length() / 3.0f; SortedMap<Float, TreeSet<Float>> xMatches = sameTextCharacters.subMap(textX - tolerance, textX + tolerance); for (TreeSet<Float> xMatch : xMatches.values()) { SortedSet<Float> yMatches = xMatch.subSet(textY - tolerance, textY + tolerance); if (!yMatches.isEmpty()) { suppressCharacter = true; break; } } if (!suppressCharacter) { TreeSet<Float> ySet = sameTextCharacters.get(textX); if (ySet == null) { ySet = new TreeSet<Float>(); sameTextCharacters.put(textX, ySet); } ySet.add(textY); showCharacter = true; } } if (showCharacter) { // if we are showing the character then we need to determine which article it belongs to int foundArticleDivisionIndex = -1; int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1; int notFoundButFirstLeftArticleDivisionIndex = -1; int notFoundButFirstAboveArticleDivisionIndex = -1; float x = text.getX(); float y = text.getY(); if (shouldSeparateByBeads) { for (int i = 0; i < beadRectangles.size() && foundArticleDivisionIndex == -1; i++) { PDRectangle rect = beadRectangles.get(i); if (rect != null) { if (rect.contains(x, y)) { foundArticleDivisionIndex = i * 2 + 1; } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY()) && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) { notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2; } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) { notFoundButFirstLeftArticleDivisionIndex = i * 2; } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) { notFoundButFirstAboveArticleDivisionIndex = i * 2; } } else { foundArticleDivisionIndex = 0; } } } else { foundArticleDivisionIndex = 0; } int articleDivisionIndex; if (foundArticleDivisionIndex != -1) { articleDivisionIndex = foundArticleDivisionIndex; } else if 
(notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex; } else if (notFoundButFirstLeftArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex; } else if (notFoundButFirstAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex; } else { articleDivisionIndex = charactersByArticle.size() - 1; } List<TextPosition> textList = charactersByArticle.get(articleDivisionIndex); // In the wild, some PDF encoded documents put diacritics (accents on // top of characters) into a separate Tj element. When displaying them // graphically, the two chunks get overlayed. With text output though, // we need to do the overlay. This code recombines the diacritic with // its associated character if the two are consecutive. if (textList.isEmpty()) { textList.add(text); } else { // test if we overlap the previous entry. // Note that we are making an assumption that we need to only look back // one TextPosition to find what we are overlapping. // This may not always be true. */ TextPosition previousTextPosition = textList.get(textList.size() - 1); if (text.isDiacritic() && previousTextPosition.contains(text)) { previousTextPosition.mergeDiacritic(text); } // If the previous TextPosition was the diacritic, merge it into this // one and remove it from the list. else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) { text.mergeDiacritic(previousTextPosition); textList.remove(textList.size() - 1); textList.add(text); } else { textList.add(text); } } } } /** * This is the page that the text extraction will start on. The pages start at page 1. For example in a 5 page PDF * document, if the start page is 1 then all pages will be extracted. If the start page is 4 then pages 4 and 5 will * be extracted. The default value is 1. * * @return Value of property startPage. 
*/
    public int getStartPage() {
        return this.startPage;
    }

    /**
     * Sets the first page that this class will extract.
     *
     * @param startPageValue New value of 1-based startPage property.
     */
    public void setStartPage(int startPageValue) {
        this.startPage = startPageValue;
    }

    /**
     * Returns the last page that will be extracted, inclusive. For a 5 page PDF, an endPage value of 5 extracts
     * the entire document and an end page of 2 extracts pages 1 and 2. This defaults to Integer.MAX_VALUE such
     * that all pages of the pdf will be extracted.
     *
     * @return Value of property endPage.
     */
    public int getEndPage() {
        return this.endPage;
    }

    /**
     * Sets the last page that this class will extract.
     *
     * @param endPageValue New value of 1-based endPage property.
     */
    public void setEndPage(int endPageValue) {
        this.endPage = endPageValue;
    }

    /**
     * Sets the line separator to use for output text. When this method is never called, the line.separator
     * system property is used.
     *
     * @param separator The desired line separator string.
     */
    public void setLineSeparator(String separator) {
        this.lineSeparator = separator;
    }

    /**
     * Returns the line separator used for output text.
     *
     * @return The desired line separator string.
     */
    public String getLineSeparator() {
        return this.lineSeparator;
    }

    /**
     * Returns the word separator used for output text.
     *
     * @return The desired word separator string.
     */
    public String getWordSeparator() {
        return this.wordSeparator;
    }

    /**
     * Set the desired word separator for output text. The PDFBox text extraction algorithm will output a space
     * character if there is enough space between two words. By default a space character is used. If you need an
     * accurate count of characters that are found in a PDF document then you might want to set the word separator to
     * the empty string.
     *
     * @param separator The desired word separator string.
*/
    public void setWordSeparator(String separator) {
        this.wordSeparator = separator;
    }

    /**
     * Tells whether duplicate overlapping text is suppressed.
     *
     * @return Returns the suppressDuplicateOverlappingText.
     */
    public boolean getSuppressDuplicateOverlappingText() {
        return this.suppressDuplicateOverlappingText;
    }

    /**
     * Returns the number of the page that is currently being processed.
     *
     * @return A 1 based number representing the current page.
     */
    protected int getCurrentPageNo() {
        return this.currentPageNo;
    }

    /**
     * Returns the writer that receives the extracted text.
     *
     * @return The stream that output is being written to.
     */
    protected Writer getOutput() {
        return this.output;
    }

    /**
     * Character strings are grouped by articles. It is quite common that there will only be a single article. This
     * returns a List that contains List objects, the inner lists will contain TextPosition objects.
     *
     * @return A double List of TextPositions for all text strings on the page.
     */
    protected List<List<TextPosition>> getCharactersByArticle() {
        return this.charactersByArticle;
    }

    /**
     * By default the text stripper will attempt to remove text that overlaps each other. Word paints the same
     * character several times in order to make it look bold. By setting this to false all text will be extracted, which
     * means that certain sections will be duplicated, but better performance will be noticed.
     *
     * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set.
     */
    public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingTextValue) {
        this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
    }

    /**
     * Tells whether the text stripper separates the text by beads.
     *
     * @return If the text will be grouped by beads.
     */
    public boolean getSeparateByBeads() {
        return this.shouldSeparateByBeads;
    }

    /**
     * Set if the text stripper should group the text output by a list of beads. The default value is true!
     *
     * @param aShouldSeparateByBeads The new grouping of beads.
*/
    public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
    {
        this.shouldSeparateByBeads = aShouldSeparateByBeads;
    }

    /**
     * Returns the bookmark where text extraction should end, inclusive. Default is null.
     *
     * @return The ending bookmark.
     */
    public PDOutlineItem getEndBookmark()
    {
        return this.endBookmark;
    }

    /**
     * Sets the bookmark where the text extraction should stop.
     *
     * @param aEndBookmark The ending bookmark.
     */
    public void setEndBookmark(PDOutlineItem aEndBookmark)
    {
        this.endBookmark = aEndBookmark;
    }

    /**
     * Returns the bookmark where text extraction should start, inclusive. Default is null.
     *
     * @return The starting bookmark.
     */
    public PDOutlineItem getStartBookmark()
    {
        return this.startBookmark;
    }

    /**
     * Sets the bookmark where text extraction should start, inclusive.
     *
     * @param aStartBookmark The starting bookmark.
     */
    public void setStartBookmark(PDOutlineItem aStartBookmark)
    {
        this.startBookmark = aStartBookmark;
    }

    /**
     * Tells whether the text stripper should add some more text formatting.
     *
     * @return true if some more text formatting will be added
     */
    public boolean getAddMoreFormatting()
    {
        return this.addMoreFormatting;
    }

    /**
     * Some additional text formatting will be added if addMoreFormatting is set to true. Default is false.
     *
     * @param newAddMoreFormatting Tell PDFBox to add some more text formatting
     */
    public void setAddMoreFormatting(boolean newAddMoreFormatting)
    {
        this.addMoreFormatting = newAddMoreFormatting;
    }

    /**
     * Tells whether the text stripper should sort the text tokens before writing to the stream.
     *
     * @return true If the text tokens will be sorted before being written.
     */
    public boolean getSortByPosition()
    {
        return this.sortByPosition;
    }

    /**
     * The order of the text tokens in a PDF file may not be the same as they appear visually on the screen.
For * example, a PDF writer may write out all text by font, so all bold or larger text, then make a second pass and * write out the normal text.<br/> * The default is to <b>not</b> sort by position.<br/> * <br/> * A PDF writer could choose to write each character in a different order. By default PDFBox does <b>not</b> sort * the text tokens before processing them due to performance reasons. * * @param newSortByPosition Tell PDFBox to sort the text positions. */ public void setSortByPosition(boolean newSortByPosition) { sortByPosition = newSortByPosition; } /** * Get the current space width-based tolerance value that is being used to estimate where spaces in text should be * added. Note that the default value for this has been determined from trial and error. * * @return The current tolerance / scaling factor */ public float getSpacingTolerance() { return spacingTolerance; } /** * Set the space width-based tolerance value that is used to estimate where spaces in text should be added. Note * that the default value for this has been determined from trial and error. Setting this value larger will reduce * the number of spaces added. * * @param spacingToleranceValue tolerance / scaling factor to use */ public void setSpacingTolerance(float spacingToleranceValue) { spacingTolerance = spacingToleranceValue; } /** * Get the current character width-based tolerance value that is being used to estimate where spaces in text should * be added. Note that the default value for this has been determined from trial and error. * * @return The current tolerance / scaling factor */ public float getAverageCharTolerance() { return averageCharTolerance; } /** * Set the character width-based tolerance value that is used to estimate where spaces in text should be added. Note * that the default value for this has been determined from trial and error. Setting this value larger will reduce * the number of spaces added. 
* * @param averageCharToleranceValue average tolerance / scaling factor to use */ public void setAverageCharTolerance(float averageCharToleranceValue) { averageCharTolerance = averageCharToleranceValue; } /** * returns the multiple of whitespace character widths for the current text which the current line start can be * indented from the previous line start beyond which the current line start is considered to be a paragraph start. * * @return the number of whitespace character widths to use when detecting paragraph indents. */ public float getIndentThreshold() { return indentThreshold; } /** * sets the multiple of whitespace character widths for the current text which the current line start can be * indented from the previous line start beyond which the current line start is considered to be a paragraph start. * The default value is 2.0. * * @param indentThresholdValue the number of whitespace character widths to use when detecting paragraph indents. */ public void setIndentThreshold(float indentThresholdValue) { indentThreshold = indentThresholdValue; } /** * the minimum whitespace, as a multiple of the max height of the current characters beyond which the current line * start is considered to be a paragraph start. * * @return the character height multiple for max allowed whitespace between lines in the same paragraph. */ public float getDropThreshold() { return dropThreshold; } /** * sets the minimum whitespace, as a multiple of the max height of the current characters beyond which the current * line start is considered to be a paragraph start. The default value is 2.5. * * @param dropThresholdValue the character height multiple for max allowed whitespace between lines in the same * paragraph. */ public void setDropThreshold(float dropThresholdValue) { dropThreshold = dropThresholdValue; } /** * Returns the string which will be used at the beginning of a paragraph. 
* * @return the paragraph start string */ public String getParagraphStart() { return paragraphStart; } /** * Sets the string which will be used at the beginning of a paragraph. * * @param s the paragraph start string */ public void setParagraphStart(String s) { paragraphStart = s; } /** * Returns the string which will be used at the end of a paragraph. * * @return the paragraph end string */ public String getParagraphEnd() { return paragraphEnd; } /** * Sets the string which will be used at the end of a paragraph. * * @param s the paragraph end string */ public void setParagraphEnd(String s) { paragraphEnd = s; } /** * Returns the string which will be used at the beginning of a page. * * @return the page start string */ public String getPageStart() { return pageStart; } /** * Sets the string which will be used at the beginning of a page. * * @param pageStartValue the page start string */ public void setPageStart(String pageStartValue) { pageStart = pageStartValue; } /** * Returns the string which will be used at the end of a page. * * @return the page end string */ public String getPageEnd() { return pageEnd; } /** * Sets the string which will be used at the end of a page. * * @param pageEndValue the page end string */ public void setPageEnd(String pageEndValue) { pageEnd = pageEndValue; } /** * Returns the string which will be used at the beginning of an article. * * @return the article start string */ public String getArticleStart() { return articleStart; } /** * Sets the string which will be used at the beginning of an article. * * @param articleStartValue the article start string */ public void setArticleStart(String articleStartValue) { articleStart = articleStartValue; } /** * Returns the string which will be used at the end of an article. * * @return the article end string */ public String getArticleEnd() { return articleEnd; } /** * Sets the string which will be used at the end of an article. 
* * @param articleEndValue the article end string */ public void setArticleEnd(String articleEndValue) { articleEnd = articleEndValue; } /** * handles the line separator for a new line given the specified current and previous TextPositions. * * @param current the current text position * @param lastPosition the previous text position * @param lastLineStartPosition the last text position that followed a line separator. * @param maxHeightForLine max height for positions since lastLineStartPosition * @return start position of the last line * @throws IOException if something went wrong */ private PositionWrapper handleLineSeparation(PositionWrapper current, PositionWrapper lastPosition, PositionWrapper lastLineStartPosition, float maxHeightForLine) throws IOException { current.setLineStart(); isParagraphSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine); lastLineStartPosition = current; if (current.isParagraphStart()) { if (lastPosition.isArticleStart()) { if (lastPosition.isLineStart()) { writeLineSeparator(); } writeParagraphStart(); } else { writeLineSeparator(); writeParagraphSeparator(); } } else { writeLineSeparator(); } return lastLineStartPosition; } /** * tests the relationship between the last text position, the current text position and the last text position that * followed a line separator to decide if the gap represents a paragraph separation. This should <i>only</i> be * called for consecutive text positions that first pass the line separation test. * <p> * This base implementation tests to see if the lastLineStartPosition is null OR if the current vertical position * has dropped below the last text vertical position by at least 2.5 times the current text height OR if the current * horizontal position is indented by at least 2 times the current width of a space character. * </p> * <p> * This also attempts to identify text that is indented under a hanging indent. 
* </p> * <p> * This method sets the isParagraphStart and isHangingIndent flags on the current position object. * </p> * * @param position the current text position. This may have its isParagraphStart or isHangingIndent flags set upon * return. * @param lastPosition the previous text position (should not be null). * @param lastLineStartPosition the last text position that followed a line separator, or null. * @param maxHeightForLine max height for text positions since lasLineStartPosition. */ private void isParagraphSeparation(PositionWrapper position, PositionWrapper lastPosition, PositionWrapper lastLineStartPosition, float maxHeightForLine) { boolean result = false; if (lastLineStartPosition == null) { result = true; } else { float yGap = Math .abs(position.getTextPosition().getYDirAdj() - lastPosition.getTextPosition().getYDirAdj()); float newYVal = multiplyFloat(getDropThreshold(), maxHeightForLine); // do we need to flip this for rtl? float xGap = position.getTextPosition().getXDirAdj() - lastLineStartPosition.getTextPosition().getXDirAdj(); float newXVal = multiplyFloat(getIndentThreshold(), position.getTextPosition().getWidthOfSpace()); float positionWidth = multiplyFloat(0.25f, position.getTextPosition().getWidth()); if (yGap > newYVal) { result = true; } else if (xGap > newXVal) { // text is indented, but try to screen for hanging indent if (!lastLineStartPosition.isParagraphStart()) { result = true; } else { position.setHangingIndent(); } } else if (xGap < -position.getTextPosition().getWidthOfSpace()) { // text is left of previous line. Was it a hanging indent? if (!lastLineStartPosition.isParagraphStart()) { result = true; } } else if (Math.abs(xGap) < positionWidth) { // current horizontal position is within 1/4 a char of the last // linestart. We'll treat them as lined up. 
if (lastLineStartPosition.isHangingIndent()) { position.setHangingIndent(); } else if (lastLineStartPosition.isParagraphStart()) { // check to see if the previous line looks like // any of a number of standard list item formats Pattern liPattern = matchListItemPattern(lastLineStartPosition); if (liPattern != null) { Pattern currentPattern = matchListItemPattern(position); if (liPattern == currentPattern) { result = true; } } } } } if (result) { position.setParagraphStart(); } } private float multiplyFloat(float value1, float value2) { // multiply 2 floats and truncate the resulting value to 3 decimal places // to avoid wrong results when comparing with another float return Math.round(value1 * value2 * 1000) / 1000f; } /** * writes the paragraph separator string to the output. * * @throws IOException if something went wrong */ protected void writeParagraphSeparator() throws IOException { writeParagraphEnd(); writeParagraphStart(); } /** * Write something (if defined) at the start of a paragraph. * * @throws IOException if something went wrong */ protected void writeParagraphStart() throws IOException { if (inParagraph) { writeParagraphEnd(); inParagraph = false; } output.write(getParagraphStart()); inParagraph = true; } /** * Write something (if defined) at the end of a paragraph. * * @throws IOException if something went wrong */ protected void writeParagraphEnd() throws IOException { if (!inParagraph) { writeParagraphStart(); } output.write(getParagraphEnd()); inParagraph = false; } /** * Write something (if defined) at the start of a page. * * @throws IOException if something went wrong */ protected void writePageStart() throws IOException { output.write(getPageStart()); } /** * Write something (if defined) at the end of a page. 
* * @throws IOException if something went wrong */ protected void writePageEnd() throws IOException { output.write(getPageEnd()); } /** * returns the list item Pattern object that matches the text at the specified PositionWrapper or null if the text * does not match such a pattern. The list of Patterns tested against is given by the {@link #getListItemPatterns()} * method. To add to the list, simply override that method (if sub-classing) or explicitly supply your own list * using {@link #setListItemPatterns(List)}. * * @param pw position * @return the matching pattern */ private Pattern matchListItemPattern(PositionWrapper pw) { TextPosition tp = pw.getTextPosition(); String txt = tp.getUnicode(); return matchPattern(txt, getListItemPatterns()); } /** * a list of regular expressions that match commonly used list item formats, i.e. bullets, numbers, letters, Roman * numerals, etc. Not meant to be comprehensive. */ private static final String[] LIST_ITEM_EXPRESSIONS = { "\\.", "\\d+\\.", "\\[\\d+\\]", "\\d+\\)", "[A-Z]\\.", "[a-z]\\.", "[A-Z]\\)", "[a-z]\\)", "[IVXL]+\\.", "[ivxl]+\\.", }; private List<Pattern> listOfPatterns = null; /** * use to supply a different set of regular expression patterns for matching list item starts. * * @param patterns list of patterns */ protected void setListItemPatterns(List<Pattern> patterns) { listOfPatterns = patterns; } /** * returns a list of regular expression Patterns representing different common list item formats. For example * numbered items of form: * <ol> * <li>some text</li> * <li>more text</li> * </ol> * or * <ul> * <li>some text</li> * <li>more text</li> * </ul> * etc., all begin with some character pattern. The pattern "\\d+\." (matches "1.", "2.", ...) or "\[\\d+\]" * (matches "[1]", "[2]", ...). * <p> * This method returns a list of such regular expression Patterns. * * @return a list of Pattern objects. 
*/ protected List<Pattern> getListItemPatterns() { if (listOfPatterns == null) { listOfPatterns = new ArrayList<Pattern>(); for (String expression : LIST_ITEM_EXPRESSIONS) { Pattern p = Pattern.compile(expression); listOfPatterns.add(p); } } return listOfPatterns; } /** * iterates over the specified list of Patterns until it finds one that matches the specified string. Then returns * the Pattern. * <p> * Order of the supplied list of patterns is important as most common patterns should come first. Patterns should be * strict in general, and all will be used with case sensitivity on. * </p> * * @param string the string to be searched * @param patterns list of patterns * @return matching pattern */ protected static Pattern matchPattern(String string, List<Pattern> patterns) { for (Pattern p : patterns) { if (p.matcher(string).matches()) { return p; } } return null; } /** * Write a list of string containing a whole line of a document. * * @param line a list with the words of the given line * @throws IOException if something went wrong */ //private void writeLine(List<WordWithTextPositions> line) protected void writeLine(List<WordWithTextPositions> line) throws IOException { int numberOfStrings = line.size(); for (int i = 0; i < numberOfStrings; i++) { WordWithTextPositions word = line.get(i); writeString(word.getText(), word.getTextPositions()); if (i < numberOfStrings - 1) { writeWordSeparator(); } } } /** * Normalize the given list of TextPositions. 
* * @param line list of TextPositions * @return a list of strings, one string for every word */ private List<WordWithTextPositions> normalize(List<LineItem> line) { List<WordWithTextPositions> normalized = new LinkedList<WordWithTextPositions>(); StringBuilder lineBuilder = new StringBuilder(); List<TextPosition> wordPositions = new ArrayList<TextPosition>(); for (LineItem item : line) { lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, item); } if (lineBuilder.length() > 0) { // kwa if ("Fe?legyha?zi,".equals(lineBuilder.toString())) { // String lbs = lineBuilder.toString(); // int len = lbs.length(); // System.out.println("It is here, too! " + len); } normalized.add(createWord(lineBuilder.toString(), wordPositions)); } return normalized; } /** * Handles the LTR and RTL direction of the given words. The whole implementation stands and falls with the given * word. If the word is a full line, the results will be the best. If the word contains of single words or * characters, the order of the characters in a word or words in a line may wrong, due to RTL and LTR marks and * characters! 
* * Based on http://www.nesterovsky-bros.com/weblog/2013/07/28/VisualToLogicalConversionInJava.aspx * * @param word The word that shall be processed * @return new word with the correct direction of the containing characters */ // kwa //private String handleDirection(String word) protected String handleDirection(String word) { Bidi bidi = new Bidi(word, Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT); // if there is pure LTR text no need to process further if (!bidi.isMixed() && bidi.getBaseLevel() == Bidi.DIRECTION_LEFT_TO_RIGHT) { return word; } // collect individual bidi information int runCount = bidi.getRunCount(); byte[] levels = new byte[runCount]; Integer[] runs = new Integer[runCount]; for (int i = 0; i < runCount; i++) { levels[i] = (byte) bidi.getRunLevel(i); runs[i] = i; } // reorder individual parts based on their levels Bidi.reorderVisually(levels, 0, runs, 0, runCount); // collect the parts based on the direction within the run StringBuilder result = new StringBuilder(); for (int i = 0; i < runCount; i++) { int index = runs[i]; int start = bidi.getRunStart(index); int end = bidi.getRunLimit(index); int level = levels[index]; if ((level & 1) != 0) { for (; --end >= start;) { char character = word.charAt(end); if (Character.isMirrored(word.codePointAt(end))) { if (MIRRORING_CHAR_MAP.containsKey(character)) { result.append(MIRRORING_CHAR_MAP.get(character)); } else { result.append(character); } } else { result.append(character); } } } else { result.append(word, start, end); } } return result.toString(); } private static Map<Character, Character> MIRRORING_CHAR_MAP = new HashMap<Character, Character>(); static { String path = "org/apache/pdfbox/resources/text/BidiMirroring.txt"; InputStream input = PDFTextStripper.class.getClassLoader().getResourceAsStream(path); try { parseBidiFile(input); } catch (IOException e) { LOG.warn("Could not parse BidiMirroring.txt, mirroring char map will be empty: " + e.getMessage()); } finally { try { input.close(); } catch 
(IOException e) { LOG.error("Could not close BidiMirroring.txt ", e); } } }; /** * This method parses the bidi file provided as inputstream. * * @param inputStream - The bidi file as inputstream * @throws IOException if any line could not be read by the LineNumberReader */ private static void parseBidiFile(InputStream inputStream) throws IOException { LineNumberReader rd = new LineNumberReader(new InputStreamReader(inputStream)); do { String s = rd.readLine(); if (s == null) { break; } int comment = s.indexOf('#'); // ignore comments if (comment != -1) { s = s.substring(0, comment); } if (s.length() < 2) { continue; } StringTokenizer st = new StringTokenizer(s, ";"); int nFields = st.countTokens(); Character[] fields = new Character[nFields]; for (int i = 0; i < nFields; i++) { fields[i] = (char) Integer.parseInt(st.nextToken().trim(), 16); } if (fields.length == 2) { // initialize the MIRRORING_CHAR_MAP MIRRORING_CHAR_MAP.put(fields[0], fields[1]); } } while (true); } /** * Used within {@link #normalize(List, boolean, boolean)} to create a single {@link WordWithTextPositions} entry. */ // kwa //private WordWithTextPositions createWord(String word, List<TextPosition> wordPositions) protected WordWithTextPositions createWord(String word, List<TextPosition> wordPositions) { return new WordWithTextPositions(normalizeWord(word), wordPositions); } /** * Normalize certain Unicode characters. For example, convert the single "fi" ligature to "f" and "i". Also * normalises Arabic and Hebrew presentation forms. * * @param word Word to normalize * @return Normalized word */ private String normalizeWord(String word) { StringBuilder builder = null; int p = 0; int q = 0; int strLength = word.length(); for (; q < strLength; q++) { // We only normalize if the codepoint is in a given range. // Otherwise, NFKC converts too many things that would cause // confusion. For example, it converts the micro symbol in // extended Latin to the value in the Greek script. 
We normalize // the Unicode Alphabetic and Arabic A&B Presentation forms. char c = word.charAt(q); if (0xFB00 <= c && c <= 0xFDFF || 0xFE70 <= c && c <= 0xFEFF) { if (builder == null) { builder = new StringBuilder(strLength * 2); } builder.append(word.substring(p, q)); // Some fonts map U+FDF2 differently than the Unicode spec. // They add an extra U+0627 character to compensate. // This removes the extra character for those fonts. if (c == 0xFDF2 && q > 0 && (word.charAt(q - 1) == 0x0627 || word.charAt(q - 1) == 0xFE8D)) { builder.append("\u0644\u0644\u0647"); } else { // Trim because some decompositions have an extra space, such as U+FC5E builder.append(Normalizer.normalize(word.substring(q, q + 1), Normalizer.Form.NFKC).trim()); } p = q + 1; } } if (builder == null) { return handleDirection(word); } else { builder.append(word.substring(p, q)); return handleDirection(builder.toString()); } } /** * Used within {@link #normalize(List, boolean, boolean)} to handle a {@link TextPosition}. * * @return The StringBuilder that must be used when calling this method. */ // kwa // private StringBuilder normalizeAdd(List<WordWithTextPositions> normalized, // StringBuilder lineBuilder, List<TextPosition> wordPositions, LineItem item) protected StringBuilder normalizeAdd(List<WordWithTextPositions> normalized, StringBuilder lineBuilder, List<TextPosition> wordPositions, LineItem item) { if (item.isWordSeparator()) { normalized.add(createWord(lineBuilder.toString(), new ArrayList<TextPosition>(wordPositions))); lineBuilder = new StringBuilder(); wordPositions.clear(); } else { TextPosition text = item.getTextPosition(); if (text.getUnicode().length() != 1) { ; // System.out.println("Ha!"); } lineBuilder.append(text.getUnicode()); wordPositions.add(text); } return lineBuilder; } /** * internal marker class. Used as a place holder in a line of TextPositions. 
*/
    // kwa
    //private static final class LineItem
    protected static class LineItem
    {
        public static LineItem WORD_SEPARATOR = new LineItem();

        // null marks the item as a word separator
        private final TextPosition textPosition;

        private LineItem()
        {
            this.textPosition = null;
        }

        LineItem(TextPosition textPosition)
        {
            this.textPosition = textPosition;
        }

        public static LineItem getWordSeparator()
        {
            return WORD_SEPARATOR;
        }

        public TextPosition getTextPosition()
        {
            return this.textPosition;
        }

        public boolean isWordSeparator()
        {
            return this.textPosition == null;
        }
    }

    /**
     * Internal class that pairs the text of a word with the list of {@link TextPosition} objects it was built
     * from. Note that the number of entries in that list may differ from the number of characters in the string
     * due to normalization.
     *
     * @author Axel Dörfler
     */
    // kwa
    //private static final class WordWithTextPositions
    protected static class WordWithTextPositions
    {
        String text;
        List<TextPosition> textPositions;

        WordWithTextPositions(String word, List<TextPosition> positions)
        {
            this.text = word;
            this.textPositions = positions;
        }

        public String getText()
        {
            return this.text;
        }

        public List<TextPosition> getTextPositions()
        {
            return this.textPositions;
        }
    }

    /**
     * Wrapper around a TextPosition that adds flags to track its status as line start, paragraph start, article
     * start and hanging indent. All flags start out false and can only be switched on.
     * <p>
     * This is implemented as a wrapper since the TextPosition class doesn't provide complete access to its state
     * fields to subclasses. Also, conceptually TextPosition is immutable while these flags need to be set
     * post-creation so it makes sense to put these flags in this separate class.
     * </p>
     *
     * @author m.martinez@ll.mit.edu
     */
    private static final class PositionWrapper
    {
        private boolean isLineStart;
        private boolean isParagraphStart;
        private boolean isHangingIndent;
        private boolean isArticleStart;

        private TextPosition position;

        /**
         * Creates a wrapper for the given TextPosition object.
         *
         * @param position the text position.
         */
        PositionWrapper(TextPosition position)
        {
            this.position = position;
        }

        /**
         * Gives access to the wrapped TextPosition object.
         *
         * @return the text position
         */
        public TextPosition getTextPosition()
        {
            return this.position;
        }

        public boolean isLineStart()
        {
            return this.isLineStart;
        }

        /**
         * Switches the isLineStart() flag on.
         */
        public void setLineStart()
        {
            this.isLineStart = true;
        }

        public boolean isParagraphStart()
        {
            return this.isParagraphStart;
        }

        /**
         * Switches the isParagraphStart() flag on.
         */
        public void setParagraphStart()
        {
            this.isParagraphStart = true;
        }

        public boolean isArticleStart()
        {
            return this.isArticleStart;
        }

        /**
         * Switches the isArticleStart() flag on.
         */
        public void setArticleStart()
        {
            this.isArticleStart = true;
        }

        public boolean isHangingIndent()
        {
            return this.isHangingIndent;
        }

        /**
         * Switches the isHangingIndent() flag on.
         */
        public void setHangingIndent()
        {
            this.isHangingIndent = true;
        }
    }
}