org.elacin.pdfextract.datasource.pdfbox.PDFBoxIntegration.java Source code

Java tutorial

Introduction

Here is the source code for org.elacin.pdfextract.datasource.pdfbox.PDFBoxIntegration.java

Source

/*
 * ---------------
 *
 * This file is derivative work.
  *
 * Copyright 2010-2011 Øyvind Berg (elacin [at] gmail.com)
 *
 * ---------------- Original notice: ----------------
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *
 *
 */

package org.elacin.pdfextract.datasource.pdfbox;

import org.apache.fontbox.util.BoundingBox;
import org.apache.log4j.Logger;
import org.apache.log4j.MDC;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdfviewer.PageDrawer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDMatrix;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.TextNormalize;
import org.apache.pdfbox.util.TextPosition;
import org.elacin.pdfextract.Constants;
import org.elacin.pdfextract.content.PhysicalText;
import org.elacin.pdfextract.datasource.DocumentContent;
import org.elacin.pdfextract.datasource.PageContent;
import org.elacin.pdfextract.datasource.graphics.DrawingSurface;
import org.elacin.pdfextract.datasource.graphics.DrawingSurfaceImpl;
import org.elacin.pdfextract.geom.MathUtils;
import org.elacin.pdfextract.geom.Rectangle;
import org.jetbrains.annotations.NotNull;

import java.awt.*;
import java.awt.geom.AffineTransform;
import java.io.IOException;
import java.util.*;
import java.util.List;

import static org.elacin.pdfextract.Constants.USE_EXISTING_WHITESPACE;

public class PDFBoxIntegration extends PageDrawer {

    // ------------------------------ FIELDS ------------------------------
    /* class-wide log4j logger */
    private static final Logger log = Logger.getLogger(PDFBoxIntegration.class);
    /* the single byte 32 (space); handed to the font to look up the width of a space glyph */
    @NotNull
    private static final byte[] SPACE_BYTES = { (byte) 32 };
    /* when true, correctPosition() measures glyph height from the top of the character bounding
       box only (descenders are left out), except for math symbols */
    private static final boolean NO_DESCENDERS = true;

    /**
     * page state
     */
    /* receives the images, fills and strokes encountered while a page is parsed; cleared per page */
    @NotNull
    protected final DrawingSurface graphicsDrawer = new DrawingSurfaceImpl();

    /* i couldnt find this information anywhere, so calculate it and cache it here
       (filled lazily by isMonoSpacedFont()) */
    @NotNull
    protected Map<PDFont, Boolean> areFontsMonospaced = new HashMap<PDFont, Boolean>();

    /* The normalizer is used to remove text ligatures/presentation forms and to correct
    the direction of right to left text, such as Arabic and Hebrew. Here it is passed on
    when diacritics are merged in processTextPosition(). */
    @NotNull
    private final TextNormalize normalize = new TextNormalize("UTF-8");
    /* the glyphs accepted so far for the page currently being processed; cleared per page */
    @NotNull
    private final List<ETextPosition> charactersForPage = new ArrayList<ETextPosition>();
    /* maps a character string to the positions where it was already seen on the current page;
       used to filter out text which is written several times to create a bold effect */
    @NotNull
    private final Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
    /* last stroke handed to setStroke(); returned unchanged by getStroke() */
    private BasicStroke basicStroke;
    /* number of the page being processed; reset to 0 in processDocument() and incremented
       before each page is handled, so the first page is number 1 */
    private int currentPageNo;

    /**
     * pdfbox state
     */

    /* the document whose pages are processed */
    private final PDDocument doc;

    /**
     * document state
     */
    /* the extracted result; created in processDocument() and extended by one entry per processed page */
    public DocumentContent docContent;
    /* last page number (inclusive) that is processed, compared against currentPageNo */
    private final int endPage;
    /* font/style registry; created in processDocument() and passed to every converted glyph */
    public Fonts fonts;
    /* rotation of the current page as reported by PDPage.findRotation() */
    public float rotation;
    /* first page number (inclusive) that is processed, compared against currentPageNo */
    private final int startPage;

    // --------------------------- CONSTRUCTORS ---------------------------
    /**
     * Creates an extractor for a range of pages of a document. Nothing is read until
     * {@link #processDocument()} is called.
     *
     * @param doc       the document to extract content from
     * @param startPage first page number to process (inclusive); pages are counted from 1
     * @param endPage   last page number to process (inclusive)
     * @throws IOException declared because of the superclass constructor; nothing in this body
     *                     throws it (NOTE(review): confirm against PageDrawer)
     */
    public PDFBoxIntegration(final PDDocument doc, final int startPage, final int endPage) throws IOException {

        this.doc = doc;
        this.startPage = startPage;
        this.endPage = endPage;
    }

    // ------------------------ OVERRIDING METHODS ------------------------
    /**
     * Records an image on the drawing surface instead of painting it.
     *
     * @param awtImage the image to record
     * @param at       the transform which places the image on the page
     */
    @Override
    public void drawImage(Image awtImage, AffineTransform at) {
        /* the clipping path active right now decides how much of the image is visible */
        graphicsDrawer.drawImage(awtImage, at, getGraphicsState().getCurrentClippingPath());
    }

    /**
     * Delegates unchanged to the superclass implementation.
     *
     * @param g             the graphics context handed on to the superclass
     * @param p             the page handed on to the superclass
     * @param pageDimension the page dimension handed on to the superclass
     * @throws IOException if the superclass fails to draw the page
     */
    @Override
    public void drawPage(final Graphics g, final PDPage p, final Dimension pageDimension) throws IOException {
        super.drawPage(g, p, pageDimension);
    }

    /**
     * Records the current path as a filled shape on the drawing surface and then resets the path.
     *
     * @param windingRule the winding rule to apply to the current path before it is recorded
     * @throws IOException if the non-stroking colour cannot be converted to a Java colour
     */
    @Override
    public void fillPath(int windingRule) throws IOException {

        /* fills are painted with the non-stroking colour */
        final Color fillColor = getGraphicsState().getNonStrokingColor().getJavaColor();

        getLinePath().setWindingRule(windingRule);

        final Shape clip = getGraphicsState().getCurrentClippingPath();

        graphicsDrawer.fill(getLinePath(), fillColor, clip);

        /* the path has been consumed, start over for the next drawing operation */
        getLinePath().reset();
    }

    /**
     * Always fails: this class records drawing operations on {@link #graphicsDrawer} and has no
     * {@link Graphics2D} to hand out.
     *
     * @return never returns normally
     * @throws RuntimeException always
     */
    @NotNull
    @Override
    public Graphics2D getGraphics() {
        throw new RuntimeException("PDFBoxSource does not have Graphics2D");
    }

    /**
     * @return the stroke most recently passed to {@link #setStroke(BasicStroke)}, or
     *         <code>null</code> if none has been set yet
     */
    @Override
    public BasicStroke getStroke() {
        return basicStroke;
    }

    /**
     * Decodes a string of encoded character codes with the font of the current text state and
     * turns every code into one {@link ETextPosition}.
     * <p/>
     * For each code (one byte, or two bytes if the single byte cannot be decoded) the glyph
     * displacement is computed, the text matrix is advanced by the glyph width and then by the
     * character/word spacing, and the resulting position is corrected by
     * {@link #correctPosition} and handed to {@link #processTextPosition(TextPosition)}.
     * A failure while creating or registering a single glyph is logged and does not stop the
     * remaining codes from being processed.
     * <p/>
     * NOTE(review): presumably this replaces the method of the same name in the PDFBox stream
     * engine; there is no <code>@Override</code> annotation to confirm it.
     *
     * @param string the encoded character codes to show
     * @throws IOException if the font fails to decode a code or to report glyph metrics
     */
    public void processEncodedText(@NotNull byte[] string) throws IOException {

        /*
         *  Note on variable names.  There are three different units being used
         *     in this code.  Character sizes are given in glyph units, text locations
         *     are initially given in text units, and we want to save the data in
         *     display units. The variable names should end with Text or Disp to
         *     represent if the values are in text or disp units (no glyph units are saved).
         */
        final float fontSizeText = getGraphicsState().getTextState().getFontSize();
        final float horizontalScalingText = getGraphicsState().getTextState().getHorizontalScalingPercent() / 100f;

        // float verticalScalingText = horizontalScaling;//not sure if this is right but what else to
        // do???
        final float riseText = getGraphicsState().getTextState().getRise();
        final float wordSpacingText = getGraphicsState().getTextState().getWordSpacing();
        final float characterSpacingText = getGraphicsState().getTextState().getCharacterSpacing();

        /*
         *  We won't know the actual number of characters until
         * we process the byte data (could be two bytes each) but
         * it won't ever be more than string.length*2 (there are some cases
         * where a single byte will result in two output characters, e.g. "fi")
         */
        final PDFont font = getGraphicsState().getTextState().getFont();

        /*
         *  This will typically be 1000 but in the case of a type3 font this might be a different
         * number
         */
        final float glyphSpaceToTextSpaceFactor;

        if (font instanceof PDType3Font) {
            PDMatrix fontMatrix = font.getFontMatrix();
            float fontMatrixXScaling = fontMatrix.getValue(0, 0);

            glyphSpaceToTextSpaceFactor = 1.0f / fontMatrixXScaling;
        } else {
            glyphSpaceToTextSpaceFactor = /* 1.0f / */ 1000f;
        }

        /* width of a space in text units; first try the real glyph for code 32 */
        float spaceWidthText = 0.0F;

        try {
            spaceWidthText = (font.getFontWidth(SPACE_BYTES, 0, 1) / glyphSpaceToTextSpaceFactor);
        } catch (Throwable exception) {
            /* best effort only: the fallback below is used when the lookup fails */
            log.warn(exception, exception);
        }

        /* no usable space glyph: estimate it as 80% of the average glyph width */
        if (spaceWidthText == 0.0F) {
            spaceWidthText = (font.getAverageFontWidth() / glyphSpaceToTextSpaceFactor);
            spaceWidthText *= .80f;
        }

        /* Convert textMatrix to display units. Every cell is set explicitly; apart from the
           text rise in cell (2,1) the values are those of an identity matrix. */
        final Matrix initialMatrix = new Matrix();

        initialMatrix.setValue(0, 0, 1.0F);
        initialMatrix.setValue(0, 1, 0.0F);
        initialMatrix.setValue(0, 2, 0.0F);
        initialMatrix.setValue(1, 0, 0.0F);
        initialMatrix.setValue(1, 1, 1.0F);
        initialMatrix.setValue(1, 2, 0.0F);
        initialMatrix.setValue(2, 0, 0.0F);
        initialMatrix.setValue(2, 1, riseText);
        initialMatrix.setValue(2, 2, 1.0F);

        final Matrix ctm = getGraphicsState().getCurrentTransformationMatrix();
        final Matrix dispMatrix = initialMatrix.multiply(ctm);
        Matrix textMatrixStDisp = getTextMatrix().multiply(dispMatrix);
        final float xScaleDisp = textMatrixStDisp.getXScale();
        final float yScaleDisp = textMatrixStDisp.getYScale();
        final float spaceWidthDisp = spaceWidthText * xScaleDisp * fontSizeText;
        final float wordSpacingDisp = wordSpacingText * xScaleDisp * fontSizeText;
        float maxVerticalDisplacementText = 0.0F;
        /* holds the decoded text of the glyph being processed; emptied after every glyph */
        StringBuilder characterBuffer = new StringBuilder(string.length);
        int codeLength = 1;

        for (int i = 0; i < string.length; i += codeLength) {

            // Decode the value to a Unicode character
            codeLength = 1;

            String c = font.encode(string, i, codeLength);

            if ((c == null) && (i + 1 < string.length)) {

                // maybe a multibyte encoding
                codeLength++;
                c = font.encode(string, i, codeLength);
            }

            c = inspectFontEncoding(c);

            // todo, handle horizontal displacement
            // get the width and height of this character in text units
            // NOTE(review): the width is shrunk by 5% here; the reason is not documented
            float fontWidth = font.getFontWidth(string, i, codeLength) * 0.95f;

            if (fontWidth == 0.0f) {
                // NOTE(review): spaceWidthDisp is already in display units, while fontWidth is
                // divided by glyphSpaceToTextSpaceFactor right below - confirm this is intended
                fontWidth = spaceWidthDisp;
            }

            float characterHorizontalDisplacementText = (fontWidth / glyphSpaceToTextSpaceFactor);

            /* the tallest glyph seen so far in this string decides the height */
            maxVerticalDisplacementText = Math.max(maxVerticalDisplacementText,
                    font.getFontHeight(string, i, codeLength) / glyphSpaceToTextSpaceFactor);

            /* no glyph height available: fall back to the height of the font bounding box */
            if (maxVerticalDisplacementText <= 0.0f) {
                maxVerticalDisplacementText = font.getFontBoundingBox().getHeight() / glyphSpaceToTextSpaceFactor;
            }

            /**
             * PDF Spec - 5.5.2 Word Spacing
             *
             * Word spacing works the same way as character spacing, but applies
             * only to the space character, code 32.
             *
             * Note: Word spacing is applied to every occurrence of the single-byte
             * character code 32 in a string.  This can occur when using a simple
             * font or a composite font that defines code 32 as a single-byte code.
             * It does not apply to occurrences of the byte value 32 in multiple-byte
             * codes.
             *
             * RDD - My interpretation of this is that only character code 32's that
             * encode to spaces should have word spacing applied.  Cases have been
             * observed where a font has a space character with a character code
             * other than 32, and where word spacing (Tw) was used.  In these cases,
             * applying word spacing to either the non-32 space or to the character
             * code 32 non-space resulted in errors consistent with this interpretation.
             */
            float spacingText = characterSpacingText;

            if ((string[i] == (byte) 0x20) && (codeLength == 1)) {
                spacingText += wordSpacingText;
            }

            /*
             *  The text matrix gets updated after each glyph is placed.  The updated
             *          version will have the X and Y coordinates for the next glyph.
             */
            Matrix glyphMatrixStDisp = getTextMatrix().multiply(dispMatrix);

            // The adjustment will always be zero.  The adjustment as shown in the
            // TJ operator will be handled separately.
            float adjustment = 0.0F;

            // TODO : tx should be set for horizontal text and ty for vertical text
            // which seems to be specified in the font (not the direction in the matrix).
            float tx = ((characterHorizontalDisplacementText - adjustment / glyphSpaceToTextSpaceFactor)
                    * fontSizeText) * horizontalScalingText;
            Matrix td = new Matrix();

            td.setValue(2, 0, tx);

            float ty = 0.0F;

            td.setValue(2, 1, ty);
            /* first advance: move the text matrix past the glyph itself */
            setTextMatrix(td.multiply(getTextMatrix()));

            /* the glyph ends here; the spacing added below is not part of its width */
            Matrix glyphMatrixEndDisp = getTextMatrix().multiply(dispMatrix);
            float sx = spacingText * horizontalScalingText;
            Matrix sd = new Matrix();

            sd.setValue(2, 0, sx);

            float sy = 0.0F;

            sd.setValue(2, 1, sy);
            /* second advance: move the text matrix past the character/word spacing */
            setTextMatrix(sd.multiply(getTextMatrix()));

            float widthText = glyphMatrixEndDisp.getXPosition() - glyphMatrixStDisp.getXPosition();

            characterBuffer.append(c);

            Matrix textMatrixEndDisp = glyphMatrixEndDisp;
            float totalVerticalDisplacementDisp = maxVerticalDisplacementText * fontSizeText * yScaleDisp;

            try {
                final ETextPosition text = new ETextPosition(page, textMatrixStDisp, textMatrixEndDisp,
                        totalVerticalDisplacementDisp, new float[] { widthText }, spaceWidthDisp,
                        characterBuffer.toString(), font, fontSizeText,
                        (int) (fontSizeText * getTextMatrix().getXScale()), wordSpacingDisp);

                correctPosition(font, string, i, c, fontSizeText, glyphSpaceToTextSpaceFactor,
                        horizontalScalingText, codeLength, text);
                processTextPosition(text);
            } catch (Exception e) {
                /* a glyph which cannot be created or registered is skipped; only the message is logged */
                log.warn("LOG00570:Error adding '" + characterBuffer + "': " + e.getMessage());
            }

            /* the next glyph starts where the text matrix ended up, spacing included */
            textMatrixStDisp = getTextMatrix().multiply(dispMatrix);
            characterBuffer.setLength(0);
        }
    }

    /**
     * Filters one decoded glyph and, if it survives, stores it in the character list of the
     * current page.
     * <p/>
     * A glyph is dropped when its font size is zero, when it is blank while existing whitespace
     * is not used, when it contains no characters, when the same character was already seen at
     * about the same place, or when its direction does not match the page rotation. A diacritic
     * which overlaps its neighbour is merged with it rather than stored on its own.
     *
     * @param text_ the glyph to process; it is cast to {@link ETextPosition}
     */
    protected void processTextPosition(@NotNull TextPosition text_) {

        final ETextPosition glyph = (ETextPosition) text_;

        if (glyph.getFontSize() == 0.0f) {
            if (log.isDebugEnabled()) {
                log.debug("LOG01100:ignoring text " + glyph.getCharacter() + " because fontSize is 0");
            }

            return;
        }

        final boolean blank = "".equals(glyph.getCharacter().trim());

        if (blank && !USE_EXISTING_WHITESPACE) {
            return;
        }

        if (glyph.getCharacter().length() == 0) {
            if (log.isDebugEnabled()) {
                log.debug("LOG01110:Tried to render no text. wtf?");
            }

            return;
        }

        /* filtering on the clipping path used to happen here, it is currently switched off */

        /* despite its name the helper answers true when the glyph has NOT been seen before */
        final boolean firstRendering = textAlreadyRenderedAtSamePlace(glyph);

        if (!firstRendering) {
            if (log.isDebugEnabled()) {
                log.debug("LOG00770: ignoring text " + glyph.getCharacter()
                        + " because it seems to be rendered two times");
            }

            return;
        }

        if (!MathUtils.isWithinPercent(glyph.getDir(), rotation, 1)) {
            if (log.isDebugEnabled()) {
                log.debug("LOG00560: ignoring textposition " + glyph.getCharacter() + "because it has "
                        + "wrong rotation. TODO :)");
            }

            return;
        }

        /*
         * In the wild, some PDF documents put diacritics (accents on top of characters) into a
         * separate Tj element. Drawn graphically the two chunks simply overlay each other, for
         * text output they have to be recombined. Only the directly preceding glyph is looked
         * at, which may not always be enough.
         */
        if (!charactersForPage.isEmpty()) {
            final int lastIndex = charactersForPage.size() - 1;
            final TextPosition previous = charactersForPage.get(lastIndex);

            /* this glyph is the accent: fold it into the character stored before it */
            if (glyph.isDiacritic() && previous.contains(glyph)) {
                previous.mergeDiacritic(glyph, normalize);

                return;
            }

            /* the stored glyph was the accent: fold it into this one, which takes its place */
            if (previous.isDiacritic() && glyph.contains(previous)) {
                glyph.mergeDiacritic(previous, normalize);
                charactersForPage.set(lastIndex, glyph);

                return;
            }
        }

        charactersForPage.add(glyph);
    }

    /**
     * Remembers the stroke so that {@link #getStroke()} can return it.
     *
     * @param newStroke the stroke to remember
     */
    @Override
    public void setStroke(final BasicStroke newStroke) {
        basicStroke = newStroke;
    }

    /**
     * Records the current path as a stroked outline on the drawing surface and then resets
     * the path.
     *
     * @throws IOException if the stroking colour cannot be converted to a Java colour
     */
    @Override
    public void strokePath() throws IOException {

        /* outlines are painted with the stroking colour */
        final Color strokeColor = getGraphicsState().getStrokingColor().getJavaColor();
        final Shape clip = getGraphicsState().getCurrentClippingPath();

        graphicsDrawer.strokePath(getLinePath(), strokeColor, clip);

        /* the path has been consumed, start over for the next drawing operation */
        getLinePath().reset();
    }

    // -------------------------- PUBLIC METHODS --------------------------
    /**
     * @return the content collected by {@link #processDocument()}, or <code>null</code> if the
     *         document has not been processed yet
     */
    public DocumentContent getContents() {
        return docContent;
    }

    /**
     * Walks through all pages of the document and collects their content in {@link #docContent}.
     * <p/>
     * An encrypted document is decrypted with the empty password first. Pages are counted from 1;
     * pages without a content stream are counted as well but not processed.
     *
     * @throws IOException      if a page cannot be read or parsed
     * @throws RuntimeException if the document cannot be decrypted
     */
    public void processDocument() throws IOException {

        resetEngine();

        try {
            if (doc.isEncrypted()) {
                doc.decrypt("");
            }
        } catch (Exception e) {
            throw new RuntimeException("Could not decrypt document", e);
        }

        /* start out with fresh state for this run */
        currentPageNo = 0;
        docContent = new DocumentContent();
        fonts = new Fonts();

        final List<PDPage> allPages = (List<PDPage>) doc.getDocumentCatalog().getAllPages();

        for (final PDPage nextPage : allPages) {
            final PDStream pageStream = nextPage.getContents();

            currentPageNo++;

            /* nothing to parse on a page without a content stream */
            if (pageStream == null) {
                continue;
            }

            processPage(nextPage, pageStream.getStream());
        }

        /* the styles are only complete once every page has been seen */
        docContent.setStyles(fonts.styles.values());
    }

    // -------------------------- OTHER METHODS --------------------------
    /**
     * Replaces the position of a freshly created glyph with a tighter one and stores the
     * original y coordinate as its baseline.
     *
     * @param fontObj                     the font the glyph is drawn with
     * @param string                      the encoded string the glyph comes from
     * @param i                           offset of the glyph's code within <code>string</code>
     * @param c                           the decoded text of the glyph; its first character decides
     *                                    whether the glyph counts as a math symbol
     * @param fontSizeText                font size from the text state
     * @param glyphSpaceToTextSpaceFactor divisor which converts glyph units to text units
     * @param horizontalScalingText       horizontal scaling from the text state, as a factor
     * @param codeLength                  number of bytes the glyph's code occupies
     * @param text                        the glyph whose position and baseline are updated
     * @throws IOException if the font fails to report the character bounding box
     */
    private void correctPosition(@NotNull final PDFont fontObj, final byte[] string, final int i,
            @NotNull final String c, final float fontSizeText, final float glyphSpaceToTextSpaceFactor,
            float horizontalScalingText, final int codeLength, @NotNull final ETextPosition text)
            throws IOException {

        /**
         * Provide precise positioning of glyphs.
         *
         * There are several problems which need to be worked around:
         *
         * 1. Sometimes the PDF will make room for a glyph which belongs to a font with
         *      one or more very tall glyphs by jumping up on the page before drawing.
         *   Since most glyphs are (much) shorter than the tallest one, we need to make
         *      up for that by adjusting the Y coordinate back down. The distance which
         *      is jumped up is embedded in the PDF files, so there is no other way to go
         *      about this.
         *
         *   The corrected coordinate is built up in 'yStart' from the end of the original
         *      position, the font height and the room the font bounding box leaves over
         *      and under the character.
         *
         * 2. The default height we get might also be too big, so recalculate that based
         *      on the character bounding box
         *
         */
        final BoundingBox character = fontObj.getCharacterBoundingBox(string, i, codeLength);
        PDRectangle fontBB = null;

        try {
            fontBB = fontObj.getFontBoundingBox();
        } catch (RuntimeException e) {

            // ignore, this is frequently not implemented
        }

        final Rectangle pos = text.getPos();
        /* factor which scales the glyph-unit values of the bounding boxes to the size used on the page */
        float adjust = (fontSizeText * horizontalScalingText) / glyphSpaceToTextSpaceFactor;

        adjust *= getTextMatrix().getXScale();

        final Rectangle newPos;

        /* the precise correction needs both bounding boxes, each with a positive height */
        if ((character != null) && (fontBB != null) && (character.getHeight() > 0.0f)
                && (fontBB.getHeight() > 0.0f)) {

            /* remove the upper and lower bounds filtered away by character */
            final float spaceUnderChar = Math.min(fontBB.getLowerLeftY(), character.getLowerLeftY());
            final float spaceOverChar = fontBB.getUpperRightY() - character.getUpperRightY();
            final float fontHeight = fontBB.getHeight();

            /* calculate the upper left corner of the rendered glyph */
            float yStart = pos.endY - adjust * fontHeight;
            yStart += adjust * spaceOverChar;
            yStart -= adjust * spaceUnderChar;
            yStart -= pos.height;

            /* determine start X coordinate. Both branches currently give the same value; the
               monospace check is still carried out and caches its answer for the font. */
            final float x;

            if (isMonoSpacedFont(fontObj)) {
                x = pos.x;
            } else {
                //                float leftOfText = text.getX() - (adjust * fontBB.getWidth());
                //
                //                x = leftOfText + adjust * character.getLowerLeftX();
                x = pos.x;
            }

            /*
             *  It was much easier to write the word segmentation code with full font width,
             *   so lets keep that. I havent seen this causing any problems
             */
            float w = pos.width;

            /*
             *  Line segmentation code was obviously much easier by not having any descenders which
             *   can even overlap into the following line. Math symbols need to stay full length
             */
            final float characterHeight;

            if (NO_DESCENDERS && (Character.getType(c.charAt(0)) != (int) Character.MATH_SYMBOL)) {
                /* only the part reaching up to the top of the character box is counted */
                characterHeight = character.getUpperRightY();
            } else {
                characterHeight = character.getHeight();
            }

            float h = adjust * (characterHeight);

            /* correct if the NO_DESCENDERS hack made this character have no height*/
            if (NO_DESCENDERS && h < 0.1f) {
                h = pos.height;
            }

            newPos = new Rectangle(x, yStart, w, h);
        } else {

            /*
             *  here we have a lot less information, so keep most of what was calculated. Just offset
             *   the Y coordinate
             */
            float h = pos.height;
            float w = pos.width;
            float startY = pos.y - h;// * 0.8f;

            if (fontObj instanceof PDType3Font) {

                /*
                 *  type 3 fonts typically have almost no information
                 * try to mitigate the damage by keeping them small.
                 */
                h *= 0.5f;
                startY += h; /* this is a _very_ quick and dirty hack */
            }

            newPos = new Rectangle(pos.x, startY, w, h);
        }

        if (log.isTraceEnabled()) {
            log.trace("LOG00730:Text " + c + ", " + "pos from " + pos + " to " + newPos);
        }

        /* the uncorrected y coordinate is kept as the baseline of the glyph */
        text.setBaseLine(pos.y);
        text.setPos(newPos);
    }

    /**
     * Removes every glyph which belongs to a font that mostly decodes to garbage.
     * <p/>
     * A font is dropped when more than 10% of its glyphs start with an ISO control character.
     *
     * @param text the glyphs of a page; offending glyphs are removed from this list in place
     */
    private void filterOutBadFonts(@NotNull List<ETextPosition> text) {

        final Map<PDFont, Integer> controlCharsPerFont = new HashMap<PDFont, Integer>();
        final Map<PDFont, Integer> glyphsPerFont = new HashMap<PDFont, Integer>();

        /* count, per font, the glyphs which decoded to a control character and all glyphs */
        for (TextPosition glyph : text) {
            final PDFont font = glyph.getFont();

            if (!controlCharsPerFont.containsKey(font)) {
                controlCharsPerFont.put(font, 0);
                glyphsPerFont.put(font, 0);
            }

            final char first = glyph.getCharacter().charAt(0);

            if (Character.isISOControl(first)) {
                controlCharsPerFont.put(font, controlCharsPerFont.get(font) + 1);
            }

            glyphsPerFont.put(font, glyphsPerFont.get(font) + 1);
        }

        /* pick the fonts which went over the limit */
        final Collection<PDFont> rejectedFonts = new ArrayList<PDFont>();

        for (Map.Entry<PDFont, Integer> entry : glyphsPerFont.entrySet()) {
            final PDFont font = entry.getKey();
            final int controlChars = controlCharsPerFont.get(font);
            final int allGlyphs = entry.getValue();

            if (controlChars > allGlyphs * 0.10f) {
                rejectedFonts.add(font);
                log.warn("LOG01060:Ignoring all content using font " + font.getBaseFont()
                        + " as it " + "seems to be missing UTF-8 conversion information");
            }
        }

        /* finally drop everything which was written with one of those fonts */
        final Iterator<ETextPosition> remaining = text.iterator();

        while (remaining.hasNext()) {
            final TextPosition glyph = remaining.next();

            if (rejectedFonts.contains(glyph.getFont())) {
                remaining.remove();
            }
        }
    }

    /**
     * Removes every glyph whose text starts with an ISO control character.
     *
     * @param text the glyphs of a page; offending glyphs are removed from this list in place
     */
    private void filterOutControlCodes(@NotNull List<ETextPosition> text) {

        final Iterator<ETextPosition> remaining = text.iterator();

        while (remaining.hasNext()) {
            final TextPosition glyph = remaining.next();

            /* ordinary characters stay where they are */
            if (!Character.isISOControl(glyph.getCharacter().charAt(0))) {
                continue;
            }

            if (log.isDebugEnabled()) {
                log.debug("Removing character \"" + glyph.getCharacter() + "\"");
            }

            remaining.remove();
        }
    }

    /**
     * Decides whether a glyph should be kept, i.e. whether it has NOT been rendered before at
     * about the same place. Some documents write the same text several times with a tiny offset
     * to fake a bold effect; only the first rendering is wanted.
     * <p/>
     * Beware of the name: the method answers <code>true</code> for a glyph which is new and
     * <code>false</code> for a duplicate. The name is kept because the caller relies on it.
     * <p/>
     * Every glyph which is kept is remembered in {@link #characterListMapping}. This includes the
     * very first occurrence of a character; it used to be left out, which let the second
     * rendering of a character at the same place slip through.
     *
     * @param text the glyph to check
     * @return <code>true</code> if the glyph is new and was remembered, <code>false</code> if the
     *         same character was already seen within the tolerance
     */
    private boolean textAlreadyRenderedAtSamePlace(@NotNull final TextPosition text) {

        final String c = text.getCharacter();
        List<TextPosition> sameTextCharacters = characterListMapping.get(c);

        if (sameTextCharacters == null) {

            /* first time this character shows up on the page: nothing to compare against, but it
               has to be remembered so that later renderings can be compared with it */
            sameTextCharacters = new ArrayList<TextPosition>();
            sameTextCharacters.add(text);
            characterListMapping.put(c, sameTextCharacters);

            return true;
        }

        /* allow a third of the width of one character as slack, this leaves room for kerning */
        final float tolerance = (text.getWidth() / (float) c.length()) / 3.0f;

        for (TextPosition other : sameTextCharacters) {
            if ((other.getCharacter() != null)
                    && MathUtils.isWithinVariance(other.getX(), text.getX(), tolerance)
                    && MathUtils.isWithinVariance(other.getY(), text.getY(), tolerance)) {

                /* same character at about the same x and y: this is a repeated rendering */
                return false;
            }
        }

        sameTextCharacters.add(text);

        return true;
    }

    /**
     * Tells whether all glyph widths reported by a font are equal to the first one, as judged by
     * {@link MathUtils#isWithinPercent} with a percentage of 1. The answer is computed once per
     * font and cached in {@link #areFontsMonospaced}.
     * <p/>
     * A font which reports no widths at all (<code>null</code> or an empty list) is treated as
     * not monospaced. An empty list used to end in an IndexOutOfBoundsException.
     *
     * @param fontObj the font to examine
     * @return <code>true</code> if the font reports at least one width and all widths match
     */
    private boolean isMonoSpacedFont(@NotNull PDFont fontObj) {

        /* a single lookup is enough, this method never caches a null answer */
        final Boolean cached = areFontsMonospaced.get(fontObj);

        if (cached != null) {
            return cached;
        }

        final List<Float> widths = fontObj.getWidths();

        /* without any width information there is nothing to base a positive answer on */
        boolean monospaced = (widths != null) && !widths.isEmpty();

        if (monospaced) {
            final float firstWidth = widths.get(0);

            for (int i = 1; i < widths.size(); i++) {
                final float width = widths.get(i);

                if (!MathUtils.isWithinPercent(width, firstWidth, 1.0f)) {
                    monospaced = false;

                    break;
                }
            }
        }

        if (monospaced) {
            log.debug("LOG01080:Font " + fontObj.getBaseFont() + " is monospaced");
        }

        areFontsMonospaced.put(fontObj, monospaced);

        return monospaced;
    }

    /**
     * This will process the contents of a page, provided that the current page number lies
     * within the requested range (both ends inclusive). Pages outside the range are skipped
     * without any effect.
     * <p/>
     * The per-page state is reset, the content stream is parsed, glyphs from broken fonts and
     * control characters are filtered out, and the result is added to {@link #docContent}.
     * While a page is being worked on its number is available to the logging framework under the
     * MDC key "page"; the key is removed again even if processing fails.
     *
     * @param page    The page to process.
     * @param content The contents of the page.
     * @throws IOException If there is an error processing the page.
     */
    protected void processPage(@NotNull PDPage page, COSStream content) throws IOException {

        if ((currentPageNo < startPage) || (currentPageNo > endPage)) {
            return;
        }

        /* show which page we are working on in the log */
        MDC.put("page", currentPageNo);

        try {
            charactersForPage.clear();
            characterListMapping.clear();
            pageSize = page.findCropBox().createDimension();
            rotation = (float) page.findRotation();

            /* this is used to 'draw' images on during pdf parsing */
            graphicsDrawer.clearSurface();
            setGraphicsState(null);
            resetEngine();
            processStream(page, page.findResources(), content);
            filterOutBadFonts(charactersForPage);

            /* filter out remaining definite bad characters */
            filterOutControlCodes(charactersForPage);

            List<PhysicalText> texts = new ArrayList<PhysicalText>(charactersForPage.size());

            for (ETextPosition tp : charactersForPage) {
                texts.add(tp.convertText(fonts));
            }

            final PDRectangle mediaBox = page.findMediaBox();
            Rectangle dimensions = new Rectangle(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(),
                    mediaBox.getWidth(), mediaBox.getHeight());
            PageContent thisPage = new PageContent(texts, graphicsDrawer.getGraphicContents(), currentPageNo,
                    dimensions);

            docContent.addPage(thisPage);
        } finally {

            /* never leave a stale page number behind in the logging context, a page which fails
               to parse would otherwise label all later log output of this thread */
            MDC.remove("page");
        }
    }

    /**
     * Delegates unchanged to the superclass implementation.
     *
     * @param ShadingName the name of the shading, handed on to the superclass
     * @throws IOException if the superclass fails to apply the shading
     */
    @Override
    public void SHFill(final COSName ShadingName) throws IOException {
        super.SHFill(ShadingName);
    }
}