de.tudarmstadt.ukp.dkpro.core.io.pdf.PdfLayoutEventStripper.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.core.io.pdf.PdfLayoutEventStripper.java
Source

/*******************************************************************************
 * Copyright 2010
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * This code is based on the PDFTextStripper written by Ben Litchfield from
 * the PDFbox 0.7.x project and licensed under the BSD license. In accordance
 * with the terms of this license, the following copyright statement is retained:
 *
 * Copyright (c) 2003-2007, www.pdfbox.org
 * All rights reserved.
 *
 * Furthermore the modified code is re-licensed under the Apache License,
 * Version 2.0 as stated above.
 *****************************************************************************/
package de.tudarmstadt.ukp.dkpro.core.io.pdf;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Vector;

import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
import org.apache.pdfbox.util.PDFStreamEngine;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.ResourceLoader;
import org.apache.pdfbox.util.TextPosition;

/**
 * This class will take a PDF document and strip out all of the text and ignore the formatting and
 * such. Please note; it is up to clients of this class to verify that a specific user has the
 * correct permissions to extract text from the PDF document.
 * <p>
 * This class is based on the {@link PDFTextStripper} class and was substantially modified and
 * enhanced for basic paragraph and heading detection. Unfortunately it was not possible to add
 * these enhancements through sub-classing, thus the code was copied and adapted.
 */
public abstract class PdfLayoutEventStripper extends PDFStreamEngine {
    /**
     * Names of layout measurements.
     * <p>
     * NOTE(review): no use of this enum is visible in this part of the file; confirm the exact
     * meaning of each constant against its callers.
     */
    public enum Values {
        LEFT,
        RIGHT,
        TOP,
        BOTTOM,
        LINESPACING,
        LINEHEIGHT
    }

    /**
     * Kind of region reported through {@link #startRegion(Style)} and {@link #endRegion(Style)}.
     * <p>
     * {@link #getStyle(TextPosition)} only ever yields {@link #HEADING} or {@link #PARAGRAPH};
     * {@link #PAGE} is not produced by the code visible in this class.
     */
    public enum Style {
        PAGE,
        PARAGRAPH,
        HEADING
    }

    // Document currently being processed; assigned in writeText.
    private PDDocument document;

    // 1-based number of the page being processed; reset to 0 in writeText and incremented
    // once per page in processPages.
    private int currentPageNo = 0;
    // First page (inclusive, 1-based) for which events are generated.
    private int startPage = 1;
    // Number of pages in the document; set in processPages.
    private int maxPage = 0;
    // Last page (inclusive) for which events are generated.
    private int endPage = Integer.MAX_VALUE;
    // When true, processTextPosition drops a glyph that lies (within a tolerance) on top of an
    // already collected glyph with the same text.
    private boolean suppressDuplicateOverlappingText = true;
    // When true, collected text is split into article sections according to the page's beads.
    private boolean shouldSeparateByBeads = true;

    // Thread beads of the page currently being processed; assigned in processPage.
    private List<PDThreadBead> pageArticles = null;
    /**
     * The charactersByArticle is used to extract text by article divisions. For example, for a
     * PDF that has two columns like a newspaper, we want to extract the first column and then the
     * second column. In this example the PDF would have 2 beads (or articles), one for each
     * column. The size of charactersByArticle would be 5, because not all text on the page will
     * fall into one of the articles. The five divisions are:
     * <ol>
     * <li>text before the first article</li>
     * <li>first article text</li>
     * <li>text between the first article and the second article</li>
     * <li>second article text</li>
     * <li>text after the second article</li>
     * </ol>
     * Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
     */
    protected Vector<List<TextPosition>> charactersByArticle = new Vector<List<TextPosition>>();

    // Per-page index from glyph text to the positions already collected for that text; used for
    // duplicate suppression and cleared at the start of every page in processPage.
    private final Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();

    /**
     * Instantiate a new stripper whose operator mappings are loaded from the resource
     * {@code org/apache/pdfbox/resources/PDFTextStripper.properties}, i.e. the same mappings that
     * file provides for PDFBox' own {@link PDFTextStripper}.
     * 
     * @throws IOException
     *             If there is an error loading the properties.
     */
    public PdfLayoutEventStripper() throws IOException {
        // NOTE(review): the meaning of the boolean flag passed to loadProperties is not visible
        // here - confirm against the PDFBox ResourceLoader documentation.
        super(ResourceLoader.loadProperties("org/apache/pdfbox/resources/PDFTextStripper.properties", true));
    }

    /**
     * Instantiate a new stripper, taking all operator mappings from the properties object that
     * is passed in. The properties are handed unchanged to the {@link PDFStreamEngine}
     * constructor.
     * 
     * @param props
     *            The properties containing the mapping of operators to PDFOperator classes.
     * 
     * @throws IOException
     *             If there is an error reading the properties.
     */
    public PdfLayoutEventStripper(final Properties props) throws IOException {
        super(props);
    }

    /**
     * Processes a complete document and reports its content through the abstract event methods
     * of this class.
     * <p>
     * The engine is reset, {@link #startDocument(PDDocument)} is called, an encrypted document is
     * decrypted with the empty password, all pages are processed and finally
     * {@link #endDocument(PDDocument)} is called.
     * 
     * @param doc
     *            The document to get the data from.
     * 
     * @throws IOException
     *             If the doc is in an invalid state or cannot be decrypted with the empty
     *             password.
     */
    public void writeText(final PDDocument doc) throws IOException {
        resetEngine();

        document = doc;
        currentPageNo = 0;
        startDocument(document);

        decryptWithEmptyPassword(document);

        processPages(document.getDocumentCatalog().getAllPages());
        endDocument(document);
    }

    /**
     * Decrypts the document with the empty password if it is encrypted.
     * <p>
     * Non-encrypted documents are expected here, but it is common for users to pass in a document
     * that is encrypted with an empty password (such a document appears to not be encrypted to
     * someone viewing it, thus the confusion).
     * 
     * @param pdf
     *            the document to decrypt.
     * @throws IOException
     *             if decryption fails or the empty password is not accepted.
     */
    private static void decryptWithEmptyPassword(final PDDocument pdf) throws IOException {
        if (!pdf.isEncrypted()) {
            return;
        }
        try {
            pdf.decrypt("");
        } catch (CryptographyException e) {
            throw new IOException("Error decrypting document, details: ", e);
        } catch (InvalidPasswordException e) {
            throw new IOException("Error: document is encrypted", e);
        }
    }

    /**
     * Processes every page of the document in order. The page counter is advanced for each page,
     * including pages that have no content stream; only pages with a content stream are handed to
     * {@link #processPage(PDPage, COSStream)}.
     * 
     * @param pages
     *            The pages object in the document.
     * 
     * @throws IOException
     *             If there is an error parsing the text.
     */
    protected void processPages(List<PDPage> pages) throws IOException {
        maxPage = pages.size();

        for (final PDPage page : pages) {
            currentPageNo++;
            final PDStream pageContents = page.getContents();
            if (pageContents == null) {
                // Page without content stream: it is counted, but there is nothing to process.
                continue;
            }
            processPage(page, pageContents.getStream());
        }
    }

    /**
     * Processes the contents of a single page if the current page number lies within the
     * configured page range; pages outside the range are ignored.
     * <p>
     * For a page in range, {@link #startPage(int, int, int, PDPage)} is called, the article
     * sections are (re)initialised, the content stream is run through the engine to collect all
     * text positions, every article section is passed to {@link #processArticle(List)} and
     * finally {@link #endPage(int, int, int, PDPage)} is called. Both page callbacks receive the
     * same last-page value, {@code min(maxPage, endPage)}.
     * 
     * @param page
     *            The page to process.
     * @param content
     *            The contents of the page.
     * 
     * @throws IOException
     *             If there is an error processing the page.
     */
    protected void processPage(final PDPage page, final COSStream content) throws IOException {
        if ((currentPageNo < startPage) || (currentPageNo > endPage)) {
            return;
        }

        // Effective last page; used for both callbacks so that start and end events agree.
        final int lastPage = Math.min(maxPage, endPage);
        startPage(startPage, lastPage, currentPageNo, page);

        pageArticles = page.getThreadBeads();
        // One section per bead plus one section before, between and after the beads.
        final int numberOfArticleSections = shouldSeparateByBeads ? 1 + pageArticles.size() * 2 : 1;

        final int originalSize = charactersByArticle.size();
        charactersByArticle.setSize(numberOfArticleSections);
        final boolean shrunk = numberOfArticleSections < originalSize;
        for (int i = 0; i < numberOfArticleSections; i++) {
            if (shrunk) {
                // All remaining slots already hold a list from the previous page: reuse it.
                charactersByArticle.get(i).clear();
            } else {
                // Slots may be new (null after setSize), so install fresh lists everywhere.
                charactersByArticle.set(i, new ArrayList<TextPosition>());
            }
        }

        characterListMapping.clear();

        // processStream calls processTextPosition, which simply collects all the TextPositions
        // of the page into charactersByArticle.
        processStream(page, page.findResources(), content);

        // Now we do the real processing. Index-based on purpose: size is re-read on each step.
        for (int i = 0; i < charactersByArticle.size(); i++) {
            processArticle(charactersByArticle.get(i));
        }

        endPage(startPage, lastPage, currentPageNo, page);
    }

    /**
     * Walks over the text positions of one article in order and translates them into events,
     * trying to detect headings, paragraphs and line boundaries.
     * <p>
     * Events issued: {@link #startRegion(Style)} / {@link #endRegion(Style)} whenever the style
     * changes or a new region is detected, {@link #processLineSeparator()} when the text moves to
     * a new line within the same region, {@link #processWordSeparator()} when two consecutive
     * positions on a line are not directly adjacent, and {@link #writeCharacters(TextPosition)}
     * for every position. The last open region is closed at the end.
     * 
     * @param textList
     *            the text.
     * @throws IOException
     *             if there is an error writing to the stream.
     */
    protected void processArticle(final List<TextPosition> textList) throws IOException {
        // Nothing to do in this article?
        if (textList.size() == 0) {
            return;
        }

        // Number of lines to look ahead when predicting the structure of a region.
        final int prediction_depth = 10;
        // Prediction for the current region; set when the first line is initialised below.
        Prediction pred = null;
        final Block block = new Block(textList, 0);
        // null means "a new line has to be started at position cur".
        Line currentLine = null;

        // Set when a region boundary was detected; consumed when the next line is initialised.
        boolean newRegion = false;
        Style currentStyle = null;
        Style prevStyle = null;
        int cur = 0;
        while (cur < textList.size()) {
            // Initialize the line (if not already done)
            if (currentLine == null) {
                currentLine = new Line(textList, cur);

                // Get the style for the line (based on the style of the current element)
                prevStyle = currentStyle;
                currentStyle = getStyle(textList.get(cur));

                // Test for a style change or a pending region boundary
                if ((newRegion) || (prevStyle != currentStyle)) {
                    if (newRegion) {
                        newRegion = false;
                    }
                    // Close the previous region (there is none before the very first line) and
                    // open the new one.
                    if (prevStyle != null) {
                        endRegion(prevStyle);
                    }
                    startRegion(currentStyle);
                    pred = predictGeneralStructure(textList, cur, prediction_depth);
                }
            }

            // Check if we left the line; superscripts and subscripts still count as the same line
            if (!currentLine.withinLine(textList.get(cur)) && !currentLine.isSuperscript(textList.get(cur))
                    && !currentLine.isSubscript(textList.get(cur))) {
                // We left the line
                currentLine = null;

                // Check if we left the region
                // NOTE(review): textList.get(cur - 1) assumes cur > 0 here, i.e. that the first
                // position of a freshly created Line is always within that line - confirm
                // against Line.withinLine.
                final boolean columnSwitch = isColumnSwitch(textList.get(cur), block);
                final boolean leftIndented = isLeftIndented(textList.get(cur), pred);
                final boolean leftOutdented = isLeftOutdented(textList.get(cur), pred);
                // A font-size switch check (fontSwitch) is currently disabled.
                final boolean vAdjacent = isVerticallyAdjacent(textList.get(cur).getY(),
                        textList.get(cur - 1).getY(), block.linespacing);

                if (!columnSwitch && !leftIndented && !leftOutdented &&
                /* !fontSwitch && */vAdjacent) {
                    // Same region. Issue a line separator and restart
                    processLineSeparator();
                } else {
                    // New region
                    newRegion = true;
                    block.reset(cur);

                    if ((pred == null) || !vAdjacent) {
                        pred = predictGeneralStructure(textList, cur, prediction_depth);
                    } else if (vAdjacent) {
                        // If the block is directly adjacent, we may be better off with the old
                        // prediction... let's see if we can get a comparatively good new one.
                        final Prediction new_pred = predictGeneralStructure(textList, cur, prediction_depth);
                        final boolean badPred = isSignifiantlyWorse(new_pred.quality, pred.quality, 0.4);
                        if (!badPred) {
                            pred = new_pred;
                        }
                    }
                }

                continue; // Start again to create a new currentLine (cur is not advanced)
            }

            // Ok, we are in the same line still.

            // Let's check if the position is adjacent to the previous one or needs a space
            if ((cur > 0) && !isNextChar(textList.get(cur), textList.get(cur - 1))) {
                processWordSeparator();
            }

            // Grow the current block to calculate better spacings.
            block.grow(cur);

            // Write out the characters and advance.
            writeCharacters(textList.get(cur));
            cur++;
        }

        // Close region
        if (currentStyle != null) {
            endRegion(currentStyle);
        }
    }

    /**
     * Collects a text position reported by the stream engine into the article section it belongs
     * to.
     * <p>
     * If duplicate suppression is enabled, a position is dropped when its text is
     * {@code null} or empty, or when an already collected position with the same text lies
     * within a tolerance of one third of the average per-character width in both x and y
     * direction (some producers paint the same glyph several times to simulate bold text).
     * 
     * @param text
     *            The description of the character to display.
     */
    @Override
    protected void processTextPosition(final TextPosition text) {
        boolean showCharacter = true;
        if (suppressDuplicateOverlappingText) {
            showCharacter = false;
            final String textCharacter = text.getCharacter();
            // Guard before using the string: a null value would otherwise fail when computing the
            // tolerance below, and empty text is never shown in this mode anyway.
            if ((textCharacter != null) && (textCharacter.length() > 0)) {
                final float textX = text.getX();
                final float textY = text.getY();
                List<TextPosition> sameTextCharacters = characterListMapping.get(textCharacter);
                if (sameTextCharacters == null) {
                    sameTextCharacters = new ArrayList<TextPosition>();
                    characterListMapping.put(textCharacter, sameTextCharacters);
                }

                // Allow for kerning: a third of the average width of one character.
                final float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f;
                boolean suppressCharacter = false;
                for (final TextPosition character : sameTextCharacters) {
                    if ((character.getCharacter() != null) && within(character.getX(), textX, tolerance)
                            && within(character.getY(), textY, tolerance)) {
                        // One overlapping duplicate is enough.
                        suppressCharacter = true;
                        break;
                    }
                }
                if (!suppressCharacter) {
                    sameTextCharacters.add(text);
                    showCharacter = true;
                }
            }
        }

        if (!showCharacter) {
            return;
        }

        // We are showing the character, so we need to determine which article section it
        // belongs to. Bead i owns section 2 * i + 1; section 2 * i lies before bead i.
        int foundArticleDivisionIndex = -1;
        int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
        int notFoundButFirstLeftArticleDivisionIndex = -1;
        int notFoundButFirstAboveArticleDivisionIndex = -1;
        final float x = text.getX();
        final float y = text.getY();
        if (shouldSeparateByBeads) {
            for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++) {
                final PDThreadBead bead = pageArticles.get(i);
                if (bead != null) {
                    final PDRectangle rect = bead.getRectangle();
                    if (rect.contains(x, y)) {
                        foundArticleDivisionIndex = i * 2 + 1;
                    } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY())
                            && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) {
                        // NOTE(review): despite the name this tests "left OR above", which makes
                        // the two fallbacks below reachable only once this index is already
                        // set. Presumably inherited from the upstream PDFTextStripper - confirm
                        // before changing, since it affects the output order.
                        notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
                    } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) {
                        notFoundButFirstLeftArticleDivisionIndex = i * 2;
                    } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) {
                        notFoundButFirstAboveArticleDivisionIndex = i * 2;
                    }
                } else {
                    foundArticleDivisionIndex = 0;
                }
            }
        } else {
            foundArticleDivisionIndex = 0;
        }

        // Pick the best match; text that fits nowhere goes into the last section.
        final int articleDivisionIndex;
        if (foundArticleDivisionIndex != -1) {
            articleDivisionIndex = foundArticleDivisionIndex;
        } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
        } else if (notFoundButFirstLeftArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
        } else if (notFoundButFirstAboveArticleDivisionIndex != -1) {
            articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
        } else {
            articleDivisionIndex = charactersByArticle.size() - 1;
        }
        charactersByArticle.get(articleDivisionIndex).add(text);
    }

    /**
     * Determines whether two floating point numbers differ by strictly less than a given
     * variance.
     * 
     * @param first
     *            The reference number.
     * @param second
     *            The number that is tested against the reference.
     * @param variance
     *            The allowed variance (exclusive).
     * @return if {@code second} lies strictly between {@code first - variance} and
     *         {@code first + variance}.
     */
    private static boolean within(final float first, final float second, final float variance) {
        final float lowerBound = first - variance;
        final float upperBound = first + variance;
        return (lowerBound < second) && (second < upperBound);
    }

    /**
     * Returns the word spacing to assume for a text position.
     * 
     * @param position
     *            the text position; may be {@code null}.
     * @return {@code 0} if the position is {@code null}; otherwise the width of a space as
     *         reported by the position or, if that is zero, the width of the position itself.
     */
    private static float getWordSpacing(final TextPosition position) {
        if (position == null) {
            return 0;
        }

        // Try to get the width of a space character first.
        final float widthOfSpace = position.getWidthOfSpace();
        if (widthOfSpace != 0) {
            return widthOfSpace;
        }

        // Still zero: fall back to the width of the current character.
        return position.getWidth();
    }

    /**
     * Checks whether an index addresses an element of the text list.
     * 
     * @param textList
     *            the text.
     * @param pos
     *            the index to check.
     * @return if {@code pos} is a valid index into {@code textList}.
     */
    private static boolean validPosition(final List<TextPosition> textList, final int pos) {
        if (pos < 0) {
            return false;
        }
        return pos < textList.size();
    }

    /**
     * Detects whether text in two positions is on the same line. This method is a bit fuzzy so
     * that potential superscripts and subscripts are accepted as well: positions with different
     * y values still count as the same line if their vertical centers are less than a quarter
     * of the previous position's height apart.
     * 
     * @param cur
     *            current position.
     * @param prev
     *            previous position.
     * @return if both are in the same line.
     */
    private static boolean isSameLine(final TextPosition cur, final TextPosition prev) {
        // Identical baseline: trivially the same line.
        if (cur.getY() == prev.getY()) {
            return true;
        }

        final float referenceHeight = prev.getHeight();
        final float referenceCenter = prev.getY() + prev.getHeight() / 2.0f;
        final float candidateCenter = cur.getY() + cur.getHeight() / 2.0f;
        final float centerDistance = Math.abs(candidateCenter - referenceCenter);

        return centerDistance < (referenceHeight * 0.25f);
    }

    /**
     * Tests if two objects are vertically adjacent or if they are so far away from each other
     * that they have to be considered different blocks. The current object is adjacent if its
     * top is strictly greater than the previous top and less than the previous top plus 1.27
     * times the spacing.
     * 
     * @param cur_top
     *            current top.
     * @param prev_top
     *            previous top.
     * @param spacing
     *            spacing.
     * @return if the two objects are vertically adjacent.
     */
    private static boolean isVerticallyAdjacent(final float cur_top, final float prev_top, final float spacing) {
        // Vertical error margin.
        final float margin = (float) (spacing * 1.27);

        if (cur_top <= prev_top) {
            // Not below the previous object at all.
            return false;
        }
        return cur_top < (prev_top + margin);
    }

    /**
     * Checks whether a position starts noticeably to the right of the predicted left border,
     * i.e. more than 20% of the predicted line spacing beyond it.
     * 
     * @param cur
     *            current text position.
     * @param pred
     *            structure prediction for the current region; must not be {@code null}.
     * @return if the position is indented.
     */
    private static boolean isLeftIndented(final TextPosition cur, final Prediction pred) {
        final double indentLimit = pred.left + (pred.linespacing * 0.2);
        return cur.getX() > indentLimit;
    }

    /**
     * Checks whether a position starts noticeably to the left of the predicted left border,
     * i.e. more than 20% of the predicted line spacing before it.
     * 
     * @param cur
     *            current text position.
     * @param pred
     *            structure prediction for the current region; must not be {@code null}.
     * @return if the position is outdented.
     */
    private static boolean isLeftOutdented(final TextPosition cur, final Prediction pred) {
        final double outdentLimit = pred.left - (pred.linespacing * 0.2);
        return cur.getX() < outdentLimit;
    }

    /**
     * Check if the current fragment is in a new column. Only the vertical position is
     * considered: a fragment whose y value is smaller than the top of the current block counts
     * as a column switch.
     * 
     * @param cur
     *            current text position.
     * @param block
     *            current block.
     * @return if the fragment is in a new column.
     */
    private static boolean isColumnSwitch(final TextPosition cur, final Block block) {
        final boolean aboveBlockTop = cur.getY() < block.top;
        return aboveBlockTop;
    }

    /**
     * Decides whether a new quality value is significantly worse than an old one: it has to be
     * smaller, and the relative deviation {@code |qnew - qold| / |qnew + qold|} has to exceed
     * the limit.
     * 
     * @param qnew
     *            new quality.
     * @param qold
     *            old quality.
     * @param limit
     *            relative deviation that must be exceeded.
     * @return if the new quality is significantly worse.
     */
    private static boolean isSignifiantlyWorse(final double qnew, final double qold, final double limit) {
        // Written as a negated "<" so that NaN inputs keep yielding false.
        if (!(qnew < qold)) {
            return false;
        }
        final double relativeDeviation = Math.abs((qnew - qold) / (qnew + qold));
        return relativeDeviation > limit;
    }

    /**
     * Determine whether the current position directly follows the previous one, i.e. whether no
     * word separator has to be inserted between the two.
     * <p>
     * A separator is needed if the current position starts to the right of the end of the
     * previous position plus half of the (averaged) word spacing, unless the previous text
     * is {@code null} or already ends with a space.
     * <p>
     * Adapted from PDFBox PDFTextStripper.flushText()
     * 
     * @param cur
     *            current position.
     * @param prev
     *            previous position; must not be {@code null}.
     * @return if the two positions are immediately adjacent.
     */
    private static boolean isNextChar(final TextPosition cur, final TextPosition prev) {
        final float prevSpacing = getWordSpacing(prev);
        final float curSpacing = getWordSpacing(cur);
        final float endOfLastTextX = prev.getX() + prev.getWidth();

        // Conservative approximation for space determination: if there is a blank area between
        // two characters that is at least 50% of the word spacing, then that is the start of
        // the next word.
        final float startOfNextWordX;
        if (prevSpacing <= 0) {
            startOfNextWordX = endOfLastTextX + (curSpacing * 0.50f);
        } else {
            startOfNextWordX = endOfLastTextX + (((curSpacing + prevSpacing) / 2f) * 0.50f);
        }

        final boolean gapBeforeCurrent = startOfNextWordX < cur.getX();

        // Only bother with a separator if the last text did not already end with a space.
        final String prevText = prev.getCharacter();
        final boolean prevNeedsSeparator = (prevText != null) && !prevText.endsWith(" ");

        return !(gapBeforeCurrent && prevNeedsSeparator);
    }

    /**
     * Collects up to {@code depth} consecutive lines starting at the given position. Collection
     * stops early when there is no next line or when a line's top is smaller than the bottom of
     * the line before it (potential column switch).
     * 
     * @param textList
     *            text.
     * @param blk_start
     *            index of the first position of the first line.
     * @param depth
     *            maximum number of lines to collect.
     * @return the collected lines; always contains at least the first line.
     */
    private List<Line> collectLines(final List<TextPosition> textList, final int blk_start, final int depth) {
        final ArrayList<Line> collected = new ArrayList<Line>(depth);
        Line current = new Line(textList, blk_start);
        collected.add(current);

        int count = 1;
        while (count < depth && current.hasNextLine()) {
            // "current" is the line that was added last.
            final Line previous = current;
            current = current.getNextLine();

            // Bail out if we have a potential column switch
            if (current.top < previous.bottom) {
                break;
            }
            collected.add(current);
            count++;
        }
        return collected;
    }

    /**
     * Return a prediction with the probable linespacing, lineheight and left and right borders
     * of the block starting at the given position.
     * <p>
     * Up to {@code depth} lines are looked at; lines after the first vertical gap are discarded.
     * The left and right borders are the best buckets of the lines' left and right values, and
     * the quality is the size of the best left bucket divided by {@code depth}.
     * 
     * @param textList
     *            text.
     * @param blk_start
     *            block start.
     * @param depth
     *            depth.
     * @return structure prediction.
     */
    private Prediction predictGeneralStructure(final List<TextPosition> textList, final int blk_start,
            final int depth) {
        // Try to fetch the next lines up to depth and get a first estimate of the spacing.
        final List<Line> candidates = collectLines(textList, blk_start, depth);
        final LineBlock estimate = new LineBlock(candidates);

        // Iterate once more over the lines because a big spacing may indicate a new block:
        // keep only the leading run of vertically adjacent lines.
        final List<Line> adjacentLines = new ArrayList<Line>(depth);
        adjacentLines.add(candidates.get(0));
        for (int i = 1; i < candidates.size(); i++) {
            final Line candidate = candidates.get(i);
            // Bail out if we have too much distance
            if (!isVerticallyAdjacent(candidate.top, candidates.get(i - 1).top, estimate.linespacing)) {
                break;
            }
            adjacentLines.add(candidate);
        }

        // Get the bounds in buckets
        final Buckets leftBounds = new Buckets(estimate.linespacing * 0.1);
        final Buckets rightBounds = new Buckets(estimate.linespacing * 0.1);
        for (final Line line : adjacentLines) {
            leftBounds.put(line.left);
            rightBounds.put(line.right);
        }

        // Recalculate the line block parameters for the lines that were kept.
        final LineBlock refined = new LineBlock(adjacentLines);

        // Return values
        final Prediction prediction = new Prediction();
        prediction.linespacing = refined.linespacing;
        prediction.lineheight = refined.avglineheight;
        prediction.left = (float) leftBounds.getBest().getValue();
        prediction.right = (float) rightBounds.getBest().getValue();
        prediction.quality = (float) leftBounds.getBest().size() / (float) depth;

        return prediction;
    }

    /**
     * Determines the style of a text position from its scaled font size: text whose font size
     * multiplied by the vertical scale exceeds 14 is a heading, everything else is paragraph
     * text.
     * 
     * @param pos
     *            the text position.
     * @return {@link Style#HEADING} or {@link Style#PARAGRAPH}.
     */
    protected Style getStyle(final TextPosition pos) {
        return ((pos.getFontSize() * pos.getYScale()) > 14) ? Style.HEADING : Style.PARAGRAPH;
    }

    /**
     * Hook for subclasses. {@link #writeText(PDDocument)} calls it once per document, after the
     * engine has been reset and before any page is processed. Note that at this point an
     * encrypted document has not yet been decrypted.
     * 
     * @param pdf
     *            The PDF document that is being processed.
     * @throws IOException
     *             If an IO error occurs.
     */
    protected abstract void startDocument(PDDocument pdf) throws IOException;

    /**
     * Hook for subclasses. {@link #writeText(PDDocument)} calls it once per document after all
     * pages have been processed.
     * 
     * @param pdf
     *            The PDF document that is being processed.
     * @throws IOException
     *             If an IO error occurs.
     */
    protected abstract void endDocument(PDDocument pdf) throws IOException;

    /**
     * Start a new region. Called by {@link #processArticle(List)} when the first line of an
     * article begins, when the style changes, or when a new region was detected.
     * 
     * @param style
     *            the style of the region that starts.
     * @throws IOException
     *             If there is any error writing to the stream.
     */
    protected abstract void startRegion(Style style) throws IOException;

    /**
     * End a region. Called by {@link #processArticle(List)} before a new region is started and
     * once more for the last open region at the end of an article.
     * 
     * @param style
     *            the style of the region that ends.
     * @throws IOException
     *             If there is any error writing to the stream.
     */
    protected abstract void endRegion(Style style) throws IOException;

    /**
     * Start a new page. Called by {@link #processPage(PDPage, COSStream)} before the content of
     * a page inside the configured page range is processed.
     * 
     * @param firstPage
     *            first page of the range that is extracted.
     * @param lastPage
     *            last page of the range that is extracted.
     * @param currentPage
     *            1-based number of the page that starts.
     * @param page
     *            The page we are about to process.
     * 
     * @throws IOException
     *             If there is any error writing to the stream.
     */
    protected abstract void startPage(int firstPage, int lastPage, int currentPage, PDPage page) throws IOException;

    /**
     * End a page. Called by {@link #processPage(PDPage, COSStream)} after all article sections
     * of a page inside the configured page range have been processed.
     * 
     * @param firstPage
     *            first page of the range that is extracted.
     * @param lastPage
     *            last page, as passed by the caller.
     * @param currentPage
     *            1-based number of the page that ends.
     * @param page
     *            The page that has just been processed.
     * 
     * @throws IOException
     *             If there is any error writing to the stream.
     */
    protected abstract void endPage(int firstPage, int lastPage, int currentPage, PDPage page) throws IOException;

    /**
     * Called by {@link #processArticle(List)} when the text continues on a new line within the
     * same region.
     * 
     * @throws IOException
     *             If there is any error writing to the stream.
     */
    protected abstract void processLineSeparator() throws IOException;

    /**
     * Called by {@link #processArticle(List)} when two consecutive text positions on the same
     * line are not immediately adjacent, i.e. a word boundary was detected between them.
     * 
     * @throws IOException
     *             If there is any error writing to the stream.
     */
    protected abstract void processWordSeparator() throws IOException;

    /**
     * Output the text of a single text position. Called by {@link #processArticle(List)} for
     * every collected position, in order.
     * 
     * @param text
     *            The text position to output.
     * @throws IOException
     *             If there is an error when writing the text.
     */
    protected abstract void writeCharacters(TextPosition text) throws IOException;

    /**
     * Returns the page on which the text extraction starts. Pages are counted from 1: in a
     * 5 page PDF document a start page of 1 extracts all pages, a start page of 4 extracts
     * pages 4 and 5. The default value is 1.
     * 
     * @return Value of property startPage.
     */
    public int getStartPage() {
        return this.startPage;
    }

    /**
     * Sets the first page (1-based, inclusive) to be extracted by this class.
     * 
     * @param startPageValue
     *            New value of property startPage.
     */
    public void setStartPage(final int startPageValue) {
        this.startPage = startPageValue;
    }

    /**
     * Returns the last page that will be extracted. The value is inclusive: for a 5 page PDF an
     * end page of 5 extracts the entire document, an end page of 2 extracts pages 1 and 2. The
     * default is Integer.MAX_VALUE, so that all pages of the PDF are extracted.
     * 
     * @return Value of property endPage.
     */
    public int getEndPage() {
        return this.endPage;
    }

    /**
     * Sets the last page (inclusive) to be extracted by this class.
     * 
     * @param endPageValue
     *            New value of property endPage.
     */
    public void setEndPage(final int endPageValue) {
        this.endPage = endPageValue;
    }

    /**
     * Tells whether overlapping duplicates of the same text are dropped during extraction.
     * 
     * @return Returns the suppressDuplicateOverlappingText.
     */
    public boolean shouldSuppressDuplicateOverlappingText() {
        return this.suppressDuplicateOverlappingText;
    }

    /**
     * Returns the number of the page that is currently being processed.
     *
     * @return the current page number, counted from 1.
     */
    protected int getCurrentPageNo() {
        return this.currentPageNo;
    }

    /**
     * Returns the character strings of the page grouped by article. Very often there is only a
     * single article. The outer list holds one inner list per article, and each inner list holds
     * the TextPosition objects of that article. The internal list is returned directly, not a
     * copy.
     *
     * @return a list of lists of TextPositions covering all text strings on the page.
     */
    protected List<List<TextPosition>> getCharactersByArticle() {
        return this.charactersByArticle;
    }

    /**
     * Switches the removal of overlapping duplicate text on or off. By default the text stripper
     * tries to remove text that overlaps other text, because Word paints the same character
     * several times to make it look bold. Setting this to false extracts all text, so certain
     * sections will be duplicated, but performance improves.
     *
     * @param suppressDuplicateOverlappingTextValue
     *            the new value of the suppressDuplicateOverlappingText flag.
     */
    public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingTextValue) {
        suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
    }

    /**
     * Tells whether the text stripper separates its output by beads.
     *
     * @return true if the text will be grouped by beads.
     */
    public boolean shouldSeparateByBeads() {
        return this.shouldSeparateByBeads;
    }

    /**
     * Chooses whether the text stripper groups the text output by a list of beads. The default
     * value is true.
     *
     * @param aShouldSeparateByBeads
     *            the new bead grouping setting.
     */
    public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) {
        shouldSeparateByBeads = aShouldSeparateByBeads;
    }

    /**
     * A list of {@link Line}s together with two figures derived from it at construction time:
     * the average distance between the tops of neighbouring lines and the average line height.
     */
    static class LineBlock {
        final List<Line> lines;
        final float linespacing;
        final float avglineheight;

        /**
         * Keeps a reference to the given list (no copy is made) and computes both averages.
         *
         * @param ls
         *            the lines that make up this block.
         */
        LineBlock(final List<Line> ls) {
            lines = ls;
            linespacing = calcLinespacing();
            avglineheight = calcAvgLineheight();
        }

        /**
         * Computes the line spacing of the block. For a single line this is the absolute
         * distance between its top and bottom; otherwise it is the mean absolute distance
         * between the tops of each pair of neighbouring lines.
         *
         * @return the line spacing.
         */
        float calcLinespacing() {
            final int count = lines.size();
            if (count == 1) {
                final Line only = lines.get(0);
                return Math.abs(only.top - only.bottom);
            }

            // One gap per pair of neighbours.
            final int gaps = count - 1;
            float total = 0.0f;
            for (int idx = 1; idx < count; idx++) {
                total += Math.abs(lines.get(idx - 1).top - lines.get(idx).top);
            }
            return total / gaps;
        }

        /**
         * Computes the arithmetic mean of the {@code lineheight} of all lines.
         *
         * @return the average line height.
         */
        private float calcAvgLineheight() {
            float total = 0.0f;
            for (final Line current : lines) {
                total += current.lineheight;
            }
            final int count = lines.size();
            return total / count;
        }
    }

    /**
     * Plain mutable holder of five float values. NOTE(review): the code that fills and reads
     * these fields is not visible here, so the per-field descriptions below are inferred from
     * the names only - confirm against the users of this class.
     */
    static class Prediction {
        // Presumably the predicted line height - TODO confirm.
        float lineheight;
        // Presumably the predicted line spacing - TODO confirm.
        float linespacing;
        // Presumably the predicted left boundary - TODO confirm.
        float left;
        // Presumably the predicted right boundary - TODO confirm.
        float right;
        // Presumably a score for how reliable the prediction is - TODO confirm scale and direction.
        float quality;
    }

    /**
     * One line of text inside a list of text positions. The line covers the positions from index
     * {@code start} (inclusive) to {@code end} (exclusive): every consecutive position that
     * {@code isSameLine} groups with the position at {@code start}. The inherited bounding box
     * is grown over all covered positions during construction.
     */
    static class Line extends BasicBlock {
        final int start;
        final int end;
        final float lineheight;

        /**
         * Builds the line that begins at the given index.
         *
         * @param tl
         *            the list of text positions the line is taken from.
         * @param pos
         *            index of the first text position of the line.
         */
        Line(final List<TextPosition> tl, final int pos) {
            super(tl);
            start = pos;
            end = findEnd();
            lineheight = growAndCalcLineheight();
        }

        /**
         * Resets the bounding box to the first position, grows it over the remaining positions
         * of the line and tracks the largest glyph height seen on the way.
         *
         * @return the largest height of any text position in the line.
         */
        private float growAndCalcLineheight() {
            float tallest = textList.get(start).getHeight();
            reset(start);
            int idx = start + 1;
            while (idx < end) {
                tallest = Math.max(tallest, textList.get(idx).getHeight());
                grow(idx);
                idx++;
            }
            return tallest;
        }

        /**
         * Scans forward from {@code start} until the list ends or a position no longer belongs
         * to the same line as the first one.
         *
         * @return the index just past the last position of the line.
         */
        private int findEnd() {
            int idx = start;
            for (; validPosition(textList, idx); idx++) {
                if (!isSameLine(textList.get(idx), textList.get(start))) {
                    break;
                }
            }
            return idx;
        }

        /**
         * @return true if {@code end} is still a valid index, i.e. more text follows this line.
         */
        boolean hasNextLine() {
            return validPosition(textList, end);
        }

        /**
         * @return a new line starting where this one ends, or null if there is no further text.
         */
        Line getNextLine() {
            return hasNextLine() ? new Line(textList, end) : null;
        }

        /**
         * Checks whether the text position lies within the vertical extent of this line. The
         * left and right boundaries are not examined.
         *
         * @param pos
         *            text position.
         * @return true if the position is vertically inside the line.
         */
        boolean withinLine(final TextPosition pos) {
            final float glyphTop = pos.getY();
            final float glyphBottom = pos.getY() + pos.getHeight();
            return top <= glyphTop && glyphBottom <= bottom;
        }

        /**
         * Same test as {@link #withinLine(TextPosition)}, except that the upper bound is relaxed
         * by 0.6 times the line height, so a position may start above the top of the line.
         *
         * @param pos
         *            text position.
         * @return true if the position fits between the relaxed top and the bottom of the line.
         */
        boolean isSuperscript(final TextPosition pos) {
            final float relaxedTop = top - lineheight * 0.6f;
            final float glyphBottom = pos.getY() + pos.getHeight();
            return relaxedTop <= pos.getY() && glyphBottom <= bottom;
        }

        /**
         * Tests whether the position starts at or below the top of the line and whether its
         * lower edge plus 0.6 times the line height still lies at or above the bottom of the line.
         * <p>
         * NOTE(review): the margin is added to the glyph's lower edge, so for a non-negative
         * line height this test is stricter than {@link #withinLine(TextPosition)} rather than
         * more lenient. Confirm that this is intended and not a sign slip.
         *
         * @param pos
         *            text position.
         * @return the result of the test described above.
         */
        boolean isSubscript(final TextPosition pos) {
            final float glyphTop = pos.getY();
            final float paddedBottom = pos.getY() + pos.getHeight() + lineheight * 0.6f;
            return top <= glyphTop && paddedBottom <= bottom;
        }

        /**
         * @return a debug representation showing top, bottom and the collected content.
         */
        @Override
        public String toString() {
            final StringBuilder out = new StringBuilder();
            out.append("[t:").append(top);
            out.append(" b:").append(bottom);
            out.append("|").append(content.toString());
            out.append("]");
            return out.toString();
        }
    }

    /**
     * Bounding box over a run of text positions taken from a shared list. {@link #reset(int)}
     * starts the box at one position and {@link #grow(int)} extends it by another one. After
     * these calls {@code left}/{@code top} hold the smallest and {@code right}/{@code bottom}
     * the largest coordinates seen.
     */
    static class BasicBlock {
        float left;
        float top;
        float right;
        float bottom;
        // Number of times grow() saw a position that was not on the same line as the previous one.
        int lines;
        // Index of the position most recently passed to reset() or grow().
        int last_pos;
        final List<TextPosition> textList;

        // Debugging aid only: the characters of all covered positions, separated by spaces.
        final StringBuilder content = new StringBuilder();

        /**
         * @param tl
         *            the list of text positions that the indices given to reset/grow refer to.
         */
        public BasicBlock(final List<TextPosition> tl) {
            textList = tl;
        }

        /**
         * Looks up one of the four boundaries by its selector.
         *
         * @param v
         *            which boundary to return.
         * @return the value of the selected boundary.
         * @throws IllegalArgumentException
         *             if the selector is none of LEFT, RIGHT, TOP or BOTTOM.
         */
        float getValue(final Values v) {
            switch (v) {
            case LEFT:
                return left;
            case RIGHT:
                return right;
            case TOP:
                return top;
            case BOTTOM:
                return bottom;
            default:
                throw new IllegalArgumentException("Unsupported value");
            }
        }

        /**
         * Swaps the boundaries where necessary so that {@code top >= bottom} and
         * {@code left <= right} afterwards.
         * <p>
         * NOTE(review): reset() and grow() produce {@code top <= bottom}, which is the opposite
         * vertical orientation - confirm which one the callers of this method expect.
         */
        void normalize() {
            if (bottom > top) {
                final float swap = bottom;
                bottom = top;
                top = swap;
            }

            if (right < left) {
                final float swap = right;
                right = left;
                left = swap;
            }
        }

        /**
         * Restarts the block at the given position: the box becomes exactly the box of that
         * position, the line counter is cleared and the debug content is replaced.
         *
         * @param pos
         *            index into the text list of the position to start from.
         */
        void reset(final int pos) {
            final TextPosition first = textList.get(pos);
            final float x = first.getX();
            final float y = first.getY();

            last_pos = pos;
            lines = 0;

            left = x;
            right = x + first.getWidth();
            top = y;
            bottom = y + first.getHeight();

            content.setLength(0);
            content.append(first.getCharacter());
        }

        /**
         * Extends the block by the given position. The line counter is increased when the
         * position is not on the same line as the previously added one, and the box is widened
         * to include the new position.
         *
         * @param pos
         *            index into the text list of the position to add.
         */
        void grow(final int pos) {
            final TextPosition added = textList.get(pos);

            final boolean startsNewLine = !isSameLine(added, textList.get(last_pos));
            if (startsNewLine) {
                lines++;
            }
            last_pos = pos;

            final float x = added.getX();
            final float y = added.getY();
            left = Math.min(x, left);
            right = Math.max(x + added.getWidth(), right);
            top = Math.min(y, top);
            bottom = Math.max(y + added.getHeight(), bottom);

            content.append(" ").append(added.getCharacter());
        }
    }

    /**
     * A {@link BasicBlock} that additionally tracks a line spacing and a line height. This is an
     * inner (non-static) class because it calls {@code collectLines} without a qualifier.
     */
    class Block extends BasicBlock {
        float linespacing;
        float lineheight;

        /**
         * Creates a block that starts at the given position.
         *
         * @param textList
         *            the list of text positions the block refers to.
         * @param pos
         *            index of the position to start from.
         */
        Block(final List<TextPosition> textList, final int pos) {
            super(textList);
            reset(pos);
        }

        /**
         * Restarts the block at the given position. The line spacing is taken from a
         * {@link LineBlock} built over the result of {@code collectLines(textList, pos, 3)};
         * presumably that is up to three lines starting at the position - verify against
         * collectLines. The line height becomes the height of the freshly reset box.
         *
         * @param pos
         *            index of the position to start from.
         */
        @Override
        void reset(final int pos) {
            super.reset(pos);
            final LineBlock sample = new LineBlock(collectLines(textList, pos, 3));
            linespacing = sample.linespacing;
            lineheight = Math.abs(bottom - top);
        }

        /**
         * Extends the block by the given position and raises the line height if the new
         * position is taller than anything seen so far.
         *
         * @param pos
         *            index of the position to add.
         */
        @Override
        void grow(final int pos) {
            super.grow(pos);
            final float addedHeight = textList.get(pos).getHeight();
            lineheight = Math.max(lineheight, addedHeight);
        }
    }
}