org.apache.pdfbox.util.PDFStreamEngine.java Source code

Introduction

Here is the source code for org.apache.pdfbox.util.PDFStreamEngine.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.util;

import java.io.IOException;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.exceptions.WrappedIOException;

import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;

import org.apache.pdfbox.pdmodel.common.PDMatrix;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType3Font;

import org.apache.pdfbox.pdmodel.graphics.PDExtendedGraphicsState;
import org.apache.pdfbox.pdmodel.graphics.PDGraphicsState;
import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;

import org.apache.pdfbox.util.operator.OperatorProcessor;

/**
 * This class will run through a PDF content stream and execute certain operations
 * and provide a callback interface for clients that want to do things with the stream.
 * See the PDFTextStripper class for an example of how to use this class.
 *
 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
 * @version $Revision: 1.38 $
 */
public class PDFStreamEngine {

    /**
     * Log instance.
     */
    private static final Log LOG = LogFactory.getLog(PDFStreamEngine.class);

    /**
     * The PDF operators that are ignored by this engine.  Filled from
     * properties whose value is the empty string, and extended at runtime
     * with every operator for which no processor is registered (so that the
     * "unsupported" message is logged only once per operator).
     */
    private final Set<String> unsupportedOperators = new HashSet<String>();

    /**
     * The single byte 32, used to ask the current font for the width of a space.
     */
    private static final byte[] SPACE_BYTES = { (byte) 32 };

    /**
     * The current graphics state; re-created at the start of processStream().
     */
    private PDGraphicsState graphicsState = null;

    /**
     * The current text matrix; reset to null by processStream() and advanced
     * glyph by glyph in processEncodedText().
     */
    private Matrix textMatrix = null;
    /**
     * The current text line matrix; reset to null by processStream().
     */
    private Matrix textLineMatrix = null;
    /**
     * The stack of graphics states; cleared by processStream().
     */
    private Stack<PDGraphicsState> graphicsStack = new Stack<PDGraphicsState>();

    /**
     * Registered operator processors, keyed by the operator string.
     */
    private Map<String, OperatorProcessor> operators = new HashMap<String, OperatorProcessor>();

    /**
     * Resources of the streams currently being processed; the top entry is
     * the one consulted by the resource accessors of this class.
     */
    private Stack<PDResources> streamResourcesStack = new Stack<PDResources>();

    /**
     * The page handed to the most recent processSubStream() call.
     */
    private PDPage page;

    /**
     * Number of character codes that could be decoded in processEncodedText().
     */
    private int validCharCnt;
    /**
     * Number of character codes seen in processEncodedText(), decodable or not.
     */
    private int totalCharCnt;

    /**
     * Flag to skip malformed or otherwise unparseable input where possible.
     * It is passed on to the PDFStreamParser created for every sub stream.
     */
    private boolean forceParsing = false;

    /**
     * Creates an engine without any registered operator processors and with
     * both character counters set to zero.
     */
    public PDFStreamEngine() {
        this.validCharCnt = 0;
        this.totalCharCnt = 0;
    }

    /**
     * Creates an engine configured from a set of properties.  Every property
     * key is a PDF operator and every value is the name of the class that
     * executes it; the class is loaded and instantiated through its no-arg
     * constructor.  An operator mapped to the empty string is recorded as
     * ignored and will not be reported when it is encountered.
     *
     * @param properties The engine properties; must not be null.
     *
     * @throws NullPointerException If properties is null.
     * @throws IOException If a processor class cannot be loaded or
     *         instantiated; the original failure is attached as the cause.
     */
    public PDFStreamEngine(Properties properties) throws IOException {
        if (properties == null) {
            throw new NullPointerException("properties cannot be null");
        }
        Enumeration<?> keys = properties.propertyNames();
        while (keys.hasMoreElements()) {
            String operatorName = keys.nextElement().toString();
            String className = properties.getProperty(operatorName);
            if ("".equals(className)) {
                // empty value: the operator is to be silently ignored
                unsupportedOperators.add(operatorName);
                continue;
            }
            try {
                OperatorProcessor instance = (OperatorProcessor) Class.forName(className).newInstance();
                registerOperatorProcessor(operatorName, instance);
            } catch (Exception e) {
                throw new WrappedIOException(
                        "OperatorProcessor class " + className + " could not be instantiated", e);
            }
        }
        this.validCharCnt = 0;
        this.totalCharCnt = 0;
    }

    /**
     * Tells whether force parsing is switched on.
     *
     * @return true if force parsing is active
     */
    public boolean isForceParsing() {
        return this.forceParsing;
    }

    /**
     * Switches force parsing on or off.  The value is handed to the stream
     * parser of every sub stream processed afterwards.
     *
     * @param forceParsingValue true activates force parsing
     */
    public void setForceParsing(boolean forceParsingValue) {
        this.forceParsing = forceParsingValue;
    }

    /**
     * Registers a custom operator processor with the engine.  The processor
     * is bound to this engine as its context and replaces any processor
     * previously registered for the same operator.
     *
     * @param operator The operator as a string.
     * @param op Processor instance.
     */
    public void registerOperatorProcessor(String operator, OperatorProcessor op) {
        op.setContext(this);
        this.operators.put(operator, op);
    }

    /**
     * Resets the per-document state kept by the engine, i.e. the counters of
     * valid and total characters.  Call this between documents so that the
     * statistics of one document do not leak into the next; it is not needed
     * between pages of the same document.
     */
    public void resetEngine() {
        this.validCharCnt = 0;
        this.totalCharCnt = 0;
    }

    /**
     * Processes a content stream from a clean state: a fresh graphics state
     * is created from the crop box of the page, both text matrices are
     * dropped and the graphics and resources stacks are emptied before the
     * stream is handed to processSubStream().
     *
     * @param aPage The page.
     * @param resources The location to retrieve resources.
     * @param cosStream the Stream to execute.
     *
     * @throws IOException if there is an error accessing the stream.
     */
    public void processStream(PDPage aPage, PDResources resources, COSStream cosStream) throws IOException {
        // the graphics state is built first so that a bad page fails before any state is discarded
        this.graphicsState = new PDGraphicsState(aPage.findCropBox());
        this.textMatrix = null;
        this.textLineMatrix = null;
        this.graphicsStack.clear();
        this.streamResourcesStack.clear();
        processSubStream(aPage, resources, cosStream);
    }

    /**
     * Processes a sub stream of the current stream.  When resources are
     * given they are made the current resources for the duration of the
     * sub stream and are removed and cleared again afterwards, even if
     * processing fails.  Without resources the stream runs against whatever
     * is currently on top of the resources stack.
     *
     * @param aPage The page used for drawing.
     * @param resources The resources used when processing the stream.
     * @param cosStream The stream to process.
     *
     * @throws IOException If there is an exception while processing the stream.
     */
    public void processSubStream(PDPage aPage, PDResources resources, COSStream cosStream) throws IOException {
        page = aPage;
        if (resources == null) {
            processSubStream(cosStream);
            return;
        }
        streamResourcesStack.push(resources);
        try {
            processSubStream(cosStream);
        } finally {
            PDResources finished = streamResourcesStack.pop();
            finished.clear();
        }
    }

    /**
     * Tokenizes the stream and dispatches its operators.  Operands are
     * collected until an operator token arrives; the operator is then
     * processed with the collected operands and a new operand list is
     * started.  The parser is always closed.
     *
     * @param cosStream The stream to process.
     *
     * @throws IOException If there is an exception while processing the stream.
     */
    private void processSubStream(COSStream cosStream) throws IOException {
        List<COSBase> operands = new ArrayList<COSBase>();
        PDFStreamParser streamParser = new PDFStreamParser(cosStream, forceParsing);
        try {
            for (Iterator<Object> tokens = streamParser.getTokenIterator(); tokens.hasNext();) {
                Object token = tokens.next();
                if (LOG.isDebugEnabled()) {
                    LOG.debug("processing substream token: " + token);
                }
                if (token instanceof COSObject) {
                    // indirect reference: the operand is the object it refers to
                    COSObject reference = (COSObject) token;
                    operands.add(reference.getObject());
                } else if (token instanceof PDFOperator) {
                    processOperator((PDFOperator) token, operands);
                    // a fresh list, the processor may have kept the old one
                    operands = new ArrayList<COSBase>();
                } else {
                    operands.add((COSBase) token);
                }
            }
        } finally {
            streamParser.close();
        }
    }

    /**
     * A method provided as an event interface to allow a subclass to perform
     * some specific functionality when text needs to be processed.  It is
     * called by processEncodedText() once for every character code of the
     * encoded string.  The default implementation does nothing.
     *
     * @param text The text to be processed.
     */
    protected void processTextPosition(TextPosition text) {
        //subclasses can override to provide specific functionality.
    }

    /**
     * A method provided as an event interface to allow a subclass to perform
     * some specific functionality on the string encoded by a glyph.
     * The default implementation returns the string unchanged.
     *
     * @param str The string to be processed.
     * @return The processed string; by default the given string itself.
     */
    protected String inspectFontEncoding(String str) {
        return str;
    }

    /**
     * Process encoded text from the PDF Stream.
     * You should override this method if you want to perform an action when
     * encoded text is being processed.
     *
     * For every character code in the string this method decodes the code
     * with the current font, computes the start and end position of the
     * glyph in display units, advances the text matrix and reports the
     * result through processTextPosition().  Codes that cannot be decoded
     * are reported as "?".  The valid/total character counters are updated
     * along the way.
     *
     * The graphics state, the current page and the text matrix must have
     * been set before this method is called; they are dereferenced without
     * a null check.
     *
     * @param string The encoded text
     *
     * @throws IOException If there is an error processing the string
     */
    public void processEncodedText(byte[] string) throws IOException {
        /* Note on variable names.  There are three different units being used
         * in this code.  Character sizes are given in glyph units, text locations
         * are initially given in text units, and we want to save the data in
         * display units. The variable names should end with Text or Disp to
         * represent if the values are in text or disp units (no glyph units are saved).
         */
        final float fontSizeText = graphicsState.getTextState().getFontSize();
        final float horizontalScalingText = graphicsState.getTextState().getHorizontalScalingPercent() / 100f;
        //float verticalScalingText = horizontalScaling;//not sure if this is right but what else to do???
        final float riseText = graphicsState.getTextState().getRise();
        final float wordSpacingText = graphicsState.getTextState().getWordSpacing();
        final float characterSpacingText = graphicsState.getTextState().getCharacterSpacing();

        //We won't know the actual number of characters until
        //we process the byte data (could be two bytes each) but
        //it won't ever be more than string.length*2 (there are some cases
        //where a single byte will result in two output characters, e.g. "fi").
        //NOTE(review): nothing below is sized by string.length; this remark
        //looks like a leftover from an earlier implementation - confirm.

        final PDFont font = graphicsState.getTextState().getFont();
        // all fonts are providing the width/height of a character in thousandths of a unit of text space
        float fontMatrixXScaling = 1 / 1000f;
        float fontMatrixYScaling = 1 / 1000f;
        float glyphSpaceToTextSpaceFactor = 1 / 1000f;
        // except Type3 fonts, those are providing the width of a character in glyph space units
        if (font instanceof PDType3Font) {
            PDMatrix fontMatrix = font.getFontMatrix();
            fontMatrixXScaling = fontMatrix.getValue(0, 0);
            fontMatrixYScaling = fontMatrix.getValue(1, 1);
            //This will typically be 1000 but in the case of a type3 font
            //this might be a different number
            //NOTE(review): the default factor above is 1/1000, whereas this is the
            //reciprocal of the matrix entry (e.g. 1000 for an entry of 0.001) -
            //confirm that the two are meant to be on different scales.
            glyphSpaceToTextSpaceFactor = 1f / fontMatrix.getValue(0, 0);
        }
        float spaceWidthText = 0;
        try {
            // to avoid crash as described in PDFBOX-614
            // lets see what the space displacement should be
            spaceWidthText = (font.getFontWidth(SPACE_BYTES, 0, 1) * glyphSpaceToTextSpaceFactor);
        } catch (Throwable exception) {
            // best effort: a failure here only means the fallback below is used
            LOG.warn(exception, exception);
        }

        if (spaceWidthText == 0) {
            // the font gave no usable space width, estimate it from the average glyph width
            spaceWidthText = (font.getAverageFontWidth() * glyphSpaceToTextSpaceFactor);
            //The average space width appears to be higher than necessary
            //so lets make it a little bit smaller.
            spaceWidthText *= .80f;
        }

        // tallest glyph seen so far in this string, in text units
        float maxVerticalDisplacementText = 0;

        // font size, horizontal scaling and rise folded into one matrix
        Matrix textStateParameters = new Matrix();
        textStateParameters.setValue(0, 0, fontSizeText * horizontalScalingText);
        textStateParameters.setValue(1, 1, fontSizeText);
        textStateParameters.setValue(2, 1, riseText);

        int pageRotation = page.findRotation();
        float pageHeight = page.findMediaBox().getHeight();
        float pageWidth = page.findMediaBox().getWidth();

        Matrix ctm = getGraphicsState().getCurrentTransformationMatrix();
        // scratch matrices, allocated once and reused as multiplication targets in the loop
        Matrix textXctm = new Matrix();
        Matrix textMatrixEnd = new Matrix();
        Matrix td = new Matrix();
        Matrix tempMatrix = new Matrix();

        // number of bytes consumed by the current character code (1 or 2)
        int codeLength = 1;
        for (int i = 0; i < string.length; i += codeLength) {
            // Decode the value to a Unicode character
            codeLength = 1;
            String c = font.encode(string, i, codeLength);
            int[] codePoints = null;
            if (c == null && i + 1 < string.length) {
                //maybe a multibyte encoding
                codeLength++;
                c = font.encode(string, i, codeLength);
                codePoints = new int[] { font.getCodeFromArray(string, i, codeLength) };
            }

            // the space width has to be transformed into display units
            float spaceWidthDisp = spaceWidthText * fontSizeText * horizontalScalingText * textMatrix.getValue(0, 0)
                    * ctm.getValue(0, 0);

            //todo, handle horizontal displacement
            // get the width and height of this character in text units
            float characterHorizontalDisplacementText = font.getFontWidth(string, i, codeLength);
            float characterVerticalDisplacementText = font.getFontHeight(string, i, codeLength);

            // multiply the width/height with the scaling factor
            characterHorizontalDisplacementText = characterHorizontalDisplacementText * fontMatrixXScaling;
            characterVerticalDisplacementText = characterVerticalDisplacementText * fontMatrixYScaling;

            maxVerticalDisplacementText = Math.max(maxVerticalDisplacementText, characterVerticalDisplacementText);

            // PDF Spec - 5.5.2 Word Spacing
            //
            // Word spacing works the same way as character spacing, but applies
            // only to the space character, code 32.
            //
            // Note: Word spacing is applied to every occurrence of the single-byte
            // character code 32 in a string.  This can occur when using a simple
            // font or a composite font that defines code 32 as a single-byte code.
            // It does not apply to occurrences of the byte value 32 in multiple-byte
            // codes.
            //
            // RDD - My interpretation of this is that only character code 32's that
            // encode to spaces should have word spacing applied.  Cases have been
            // observed where a font has a space character with a character code
            // other than 32, and where word spacing (Tw) was used.  In these cases,
            // applying word spacing to either the non-32 space or to the character
            // code 32 non-space resulted in errors consistent with this interpretation.
            //
            float spacingText = 0;
            if ((string[i] == 0x20) && codeLength == 1) {
                spacingText += wordSpacingText;
            }
            textXctm = textMatrix.multiply(ctm, textXctm);
            // Convert textMatrix to display units
            // We need to instantiate a new Matrix instance here as it is passed to the TextPosition constructor below.
            Matrix textMatrixStart = textStateParameters.multiply(textXctm);

            // TODO : tx should be set for horizontal text and ty for vertical text
            // which seems to be specified in the font (not the direction in the matrix).
            float tx = ((characterHorizontalDisplacementText) * fontSizeText) * horizontalScalingText;
            float ty = 0;
            // reset the matrix instead of creating a new one
            td.reset();
            td.setValue(2, 0, tx);
            td.setValue(2, 1, ty);

            // The text matrix gets updated after each glyph is placed.  The updated
            // version will have the X and Y coordinates for the next glyph.
            // textMatrixEnd contains the coordinates of the end of the last glyph without
            // taking characterSpacingText and spacingText into account, otherwise it'll be
            // impossible to detect new words within text extraction
            tempMatrix = textStateParameters.multiply(td, tempMatrix);
            textMatrixEnd = tempMatrix.multiply(textXctm, textMatrixEnd);
            final float endXPosition = textMatrixEnd.getXPosition();
            final float endYPosition = textMatrixEnd.getYPosition();

            // add some spacing to the text matrix (see comment above)
            tx = ((characterHorizontalDisplacementText) * fontSizeText + characterSpacingText + spacingText)
                    * horizontalScalingText;
            td.setValue(2, 0, tx);
            // advance the engine's text matrix to the origin of the next glyph
            textMatrix = td.multiply(textMatrix, textMatrix);

            // determine the width of this character
            // XXX: Note that if we handled vertical text, we should be using Y here
            float startXPosition = textMatrixStart.getXPosition();
            float widthText = endXPosition - startXPosition;

            //there are several cases where one character code will
            //output multiple characters.  For example "fi" or a
            //glyphname that has no mapping like "visiblespace"
            if (c != null) {
                validCharCnt++;
            } else {
                // PDFBOX-373: Replace a null entry with "?" so it is
                // not printed as "(null)"
                c = "?";
            }
            totalCharCnt++;

            float totalVerticalDisplacementDisp = maxVerticalDisplacementText * fontSizeText * textXctm.getYScale();

            // process the decoded text
            processTextPosition(new TextPosition(pageRotation, pageWidth, pageHeight, textMatrixStart, endXPosition,
                    endYPosition, totalVerticalDisplacementDisp, widthText, spaceWidthDisp, c, codePoints, font,
                    fontSizeText, (int) (fontSizeText * textMatrix.getXScale())));
        }
    }

    /**
     * Handles an operation given by its name.  The name is turned into an
     * operator object and passed on to the operator based variant.  An
     * IOException raised on the way is logged as a warning and not passed
     * on to the caller.
     *
     * @param operation The operation to perform.
     * @param arguments The list of arguments.
     *
     * @throws IOException If there is an error processing the operation.
     */
    public void processOperator(String operation, List<COSBase> arguments) throws IOException {
        try {
            processOperator(PDFOperator.getOperator(operation), arguments);
        } catch (IOException e) {
            LOG.warn(e, e);
        }
    }

    /**
     * Handles an operation.  If a processor is registered for the operator
     * it is bound to this engine and executed with the given arguments.
     * Otherwise the operator is reported once at info level and remembered
     * as unsupported so that it is not reported again.  Any exception
     * raised while handling the operator is logged as a warning and
     * processing continues.
     *
     * @param operator The operation to perform.
     * @param arguments The list of arguments.
     *
     * @throws IOException If there is an error processing the operation.
     */
    protected void processOperator(PDFOperator operator, List<COSBase> arguments) throws IOException {
        try {
            String name = operator.getOperation();
            OperatorProcessor handler = operators.get(name);
            if (handler != null) {
                handler.setContext(this);
                handler.process(operator, arguments);
                return;
            }
            boolean alreadyKnown = unsupportedOperators.contains(name);
            if (!alreadyKnown) {
                LOG.info("unsupported/disabled operation: " + name);
                unsupportedOperators.add(name);
            }
        } catch (Exception e) {
            LOG.warn(e, e);
        }
    }

    /**
     * Gets the color spaces of the resources currently being used.
     *
     * @return Returns the colorSpaces.
     * @throws java.util.EmptyStackException If no resources are on the stack.
     */
    public Map<String, PDColorSpace> getColorSpaces() {
        PDResources current = streamResourcesStack.peek();
        return current.getColorSpaces();
    }

    /**
     * Gets the XObjects of the resources currently being used.
     *
     * @return Returns the xobjects.
     * @throws java.util.EmptyStackException If no resources are on the stack.
     */
    public Map<String, PDXObject> getXObjects() {
        PDResources current = streamResourcesStack.peek();
        return current.getXObjects();
    }

    /**
     * Sets the color spaces on the resources currently being used.
     *
     * @param value The colorSpaces to set.
     * @throws java.util.EmptyStackException If no resources are on the stack.
     */
    public void setColorSpaces(Map<String, PDColorSpace> value) {
        PDResources current = streamResourcesStack.peek();
        current.setColorSpaces(value);
    }

    /**
     * Gets the fonts of the resources currently being used.  Unlike the
     * other resource accessors this one tolerates an empty resources stack
     * and answers with an empty map in that case.
     *
     * @return Returns the fonts.
     */
    public Map<String, PDFont> getFonts() {
        if (!streamResourcesStack.isEmpty()) {
            PDResources current = streamResourcesStack.peek();
            return current.getFonts();
        }
        return Collections.emptyMap();
    }

    /**
     * Sets the fonts on the resources currently being used.
     *
     * @param value The fonts to set.
     * @throws java.util.EmptyStackException If no resources are on the stack.
     */
    public void setFonts(Map<String, PDFont> value) {
        PDResources current = streamResourcesStack.peek();
        current.setFonts(value);
    }

    /**
     * Gets the stack of graphics states.  The stack itself is handed out,
     * not a copy.
     *
     * @return Returns the graphicsStack.
     */
    public Stack<PDGraphicsState> getGraphicsStack() {
        return this.graphicsStack;
    }

    /**
     * Replaces the stack of graphics states.
     *
     * @param value The graphicsStack to set.
     */
    public void setGraphicsStack(Stack<PDGraphicsState> value) {
        this.graphicsStack = value;
    }

    /**
     * Gets the current graphics state.
     *
     * @return Returns the graphicsState.
     */
    public PDGraphicsState getGraphicsState() {
        return this.graphicsState;
    }

    /**
     * Replaces the current graphics state.
     *
     * @param value The graphicsState to set.
     */
    public void setGraphicsState(PDGraphicsState value) {
        this.graphicsState = value;
    }

    /**
     * Gets the extended graphics states of the resources currently being used.
     *
     * @return Returns the graphicsStates.
     * @throws java.util.EmptyStackException If no resources are on the stack.
     */
    public Map<String, PDExtendedGraphicsState> getGraphicsStates() {
        PDResources current = streamResourcesStack.peek();
        return current.getGraphicsStates();
    }

    /**
     * Sets the extended graphics states on the resources currently being used.
     *
     * @param value The graphicsStates to set.
     * @throws java.util.EmptyStackException If no resources are on the stack.
     */
    public void setGraphicsStates(Map<String, PDExtendedGraphicsState> value) {
        PDResources current = streamResourcesStack.peek();
        current.setGraphicsStates(value);
    }

    /**
     * Gets the current text line matrix.  It is null after processStream()
     * has reset the engine and until a matrix has been set.
     *
     * @return Returns the textLineMatrix.
     */
    public Matrix getTextLineMatrix() {
        return this.textLineMatrix;
    }

    /**
     * Replaces the current text line matrix.
     *
     * @param value The textLineMatrix to set.
     */
    public void setTextLineMatrix(Matrix value) {
        this.textLineMatrix = value;
    }

    /**
     * Gets the current text matrix.  It is null after processStream() has
     * reset the engine and until a matrix has been set.
     *
     * @return Returns the textMatrix.
     */
    public Matrix getTextMatrix() {
        return this.textMatrix;
    }

    /**
     * Replaces the current text matrix.
     *
     * @param value The textMatrix to set.
     */
    public void setTextMatrix(Matrix value) {
        this.textMatrix = value;
    }

    /**
     * Gets the resources currently being used, i.e. the top of the
     * resources stack.
     *
     * @return Returns the resources.
     * @throws java.util.EmptyStackException If no resources are on the stack.
     */
    public PDResources getResources() {
        PDResources current = streamResourcesStack.peek();
        return current;
    }

    /**
     * Gets the page that was handed to the most recent processSubStream()
     * call.
     *
     * @return The page being processed.
     */
    public PDPage getCurrentPage() {
        return this.page;
    }

    /**
     * Gets the number of characters that could be decoded in
     * processEncodedText() since the engine was created or last reset.
     *
     * @return The number of valid characters.
     */
    public int getValidCharCnt() {
        return this.validCharCnt;
    }

    /**
     * Gets the number of characters seen in processEncodedText() since the
     * engine was created or last reset, including the ones that could not
     * be mapped.
     *
     * @return The number of characters.
     */
    public int getTotalCharCnt() {
        return this.totalCharCnt;
    }

}