org.apache.pdfbox.pdfparser.PDFStreamParser.java Source code

Introduction

Here is the source code for org.apache.pdfbox.pdfparser.PDFStreamParser.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.pdfparser;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSBoolean;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNull;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSObject;

/**
 * Tokenizer for PDF content streams: it reads a byte stream and turns it into a
 * sequence of operands (COS objects) and {@link Operator} instances.
 *
 * <p>Tokens can be pulled one at a time with {@link #parseNextToken()}, or collected
 * in one go with {@link #parse()} and retrieved through {@link #getTokens()}.
 *
 * @author Ben Litchfield
 */
public class PDFStreamParser extends BaseParser {
    /**
     * Log instance.
     */
    private static final Log LOG = LogFactory.getLog(PDFStreamParser.class);

    /** Number of bytes inspected after a candidate "EI" to decide whether image data continues. */
    private static final int MAX_BIN_CHAR_TEST_LENGTH = 10;

    /** Tokens accumulated by {@link #parse()}. */
    private final List<Object> streamObjects = new ArrayList<>(100);

    /** Reusable look-ahead buffer for {@link #hasNoFollowingBinData(SequentialSource)}. */
    private final byte[] binCharTestArr = new byte[MAX_BIN_CHAR_TEST_LENGTH];

    /**
     * Constructor.
     *
     * @param stream The content stream to parse.
     * @throws IOException If there is an error initializing the stream.
     */
    public PDFStreamParser(InputStream stream) throws IOException {
        super(new InputStreamSource(stream));
    }

    /**
     * Constructor.
     *
     * @param bytes the bytes to parse.
     * @throws IOException If there is an error initializing the stream.
     */
    public PDFStreamParser(byte[] bytes) throws IOException {
        super(new InputStreamSource(new ByteArrayInputStream(bytes)));
    }

    /**
     * Parses every remaining token in the stream and appends each one to the internal
     * token list, which can then be accessed with {@link #getTokens() getTokens()}.
     *
     * <p>This method does not itself close the underlying source.
     *
     * @throws IOException If there is an error while parsing the stream.
     */
    public void parse() throws IOException {
        Object token;
        while ((token = parseNextToken()) != null) {
            streamObjects.add(token);
        }
    }

    /**
     * This will get the tokens that were parsed from the stream by the {@link #parse() parse()} method.
     *
     * <p>The returned list is the parser's own backing list, not a copy.
     *
     * @return All of the tokens in the stream.
     */
    public List<Object> getTokens() {
        return streamObjects;
    }

    /**
     * This will parse the next token in the stream.
     *
     * <p>The first non-space byte selects the token kind: strings, names, arrays,
     * dictionaries, numbers, the keywords {@code null}/{@code true}/{@code false}, the
     * inline image operators {@code BI}/{@code ID}, or a plain operator.
     *
     * @return The next token in the stream or null if there are no more tokens in the stream.
     *
     * @throws IOException If an io error occurs while parsing the stream.
     */
    public Object parseNextToken() throws IOException {
        skipSpaces();
        int nextByte = seqSource.peek();
        // Compare the int itself: narrowing to byte first would also match a literal
        // 0xFF data byte and end parsing prematurely.
        if (nextByte == -1) {
            return null;
        }
        char c = (char) nextByte;
        switch (c) {
        case '<':
            return parseAngleBracketToken();
        case '[':
            // array
            return parseCOSArray();
        case '(':
            // string
            return parseCOSString();
        case '/':
            // name
            return parseCOSName();
        case 'n': {
            // "null" keyword, or an operator that happens to start with 'n'
            String word = readString();
            if (word.equals("null")) {
                return COSNull.NULL;
            }
            return Operator.getOperator(word);
        }
        case 't':
        case 'f': {
            // "true"/"false" keyword, or an operator starting with 't' or 'f'
            String word = readString();
            if (word.equals("true")) {
                return COSBoolean.TRUE;
            }
            if (word.equals("false")) {
                return COSBoolean.FALSE;
            }
            return Operator.getOperator(word);
        }
        case 'R': {
            String word = readString();
            if (word.equals("R")) {
                return new COSObject(null);
            }
            return Operator.getOperator(word);
        }
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        case '-':
        case '+':
        case '.':
            return parseNumberToken(c);
        case 'B':
            return parseBeginInlineImageToken();
        case 'I':
            return parseInlineImageDataToken();
        case ']':
            // some ']' around without its previous '['
            // this means a PDF is somewhat corrupt but we will continue to parse.
            seqSource.read();
            // must be a better solution than null...
            return COSNull.NULL;
        default: {
            // we must be an operator
            String operator = readOperator();
            if (operator.trim().length() == 0) {
                // we have a corrupt stream, stop reading here
                return null;
            }
            return Operator.getOperator(operator);
        }
        }
    }

    /**
     * Handles a token starting with {@code '<'}: a dictionary when a second {@code '<'}
     * follows, otherwise a (hex) string.
     */
    private Object parseAngleBracketToken() throws IOException {
        // pull off first left bracket
        int leftBracket = seqSource.read();
        // check for second left bracket
        char second = (char) seqSource.peek();
        // put back first bracket so the delegate parser sees the complete token
        seqSource.unread(leftBracket);

        if (second == '<') {
            return parseCOSDictionary();
        }
        return parseCOSString();
    }

    /**
     * Reads a numeric token whose first character {@code first} has been peeked but not
     * yet consumed. Only one "." is accepted, and "-"/"+" only at the start of the number.
     */
    private COSNumber parseNumberToken(char first) throws IOException {
        StringBuilder buf = new StringBuilder();
        buf.append(first);
        seqSource.read();

        // Ignore double negative (this is consistent with Adobe Reader)
        if (first == '-' && seqSource.peek() == first) {
            seqSource.read();
        }

        boolean dotNotRead = first != '.';
        char c;
        while (Character.isDigit(c = (char) seqSource.peek()) || dotNotRead && c == '.' || c == '-') {
            if (c != '-') {
                // PDFBOX-4064: ignore "-" in the middle of a number
                buf.append(c);
            }
            seqSource.read();

            if (dotNotRead && c == '.') {
                dotNotRead = false;
            }
        }
        return COSNumber.get(buf.toString());
    }

    /**
     * Reads an operator starting with {@code 'B'}. For the begin-inline-image operator the
     * following name/value pairs are collected as image parameters, and the image data
     * carried by the subsequent data operator token is copied onto the returned operator.
     */
    private Operator parseBeginInlineImageToken() throws IOException {
        String name = readString();
        Operator beginImageOP = Operator.getOperator(name);
        if (!name.equals(OperatorName.BEGIN_INLINE_IMAGE)) {
            return beginImageOP;
        }

        COSDictionary imageParams = new COSDictionary();
        beginImageOP.setImageParameters(imageParams);
        Object nextToken;
        while ((nextToken = parseNextToken()) instanceof COSName) {
            Object value = parseNextToken();
            if (!(value instanceof COSBase)) {
                // Malformed dictionary (e.g. an operator or EOF where a value belongs):
                // stop here instead of failing with a ClassCastException.
                LOG.warn("Unexpected token in inline image dictionary at offset "
                        + seqSource.getPosition());
                break;
            }
            imageParams.setItem((COSName) nextToken, (COSBase) value);
        }
        // final token will be the image data, maybe??
        if (nextToken instanceof Operator) {
            Operator imageData = (Operator) nextToken;
            if (imageData.getImageData() == null || imageData.getImageData().length == 0) {
                LOG.warn("empty inline image at stream offset " + seqSource.getPosition());
            }
            beginImageOP.setImageData(imageData.getImageData());
        }
        return beginImageOP;
    }

    /**
     * Special case for the ID operator: reads the two operator characters, then collects
     * the raw inline image bytes up to the terminating "EI".
     *
     * @throws IOException if the two characters read are not the ID operator.
     */
    private Operator parseInlineImageDataToken() throws IOException {
        char first = (char) seqSource.read();
        char second = (char) seqSource.read();
        String id = new String(new char[] { first, second });
        if (!id.equals(OperatorName.BEGIN_INLINE_IMAGE_DATA)) {
            throw new IOException("Error: Expected operator 'ID' actual='" + id + "' at stream offset "
                    + seqSource.getPosition());
        }
        ByteArrayOutputStream imageData = new ByteArrayOutputStream();
        if (isWhitespace()) {
            // pull off the whitespace character
            seqSource.read();
        }
        int lastByte = seqSource.read();
        int currentByte = seqSource.read();
        // PDF spec is kinda unclear about this. Should a whitespace
        // always appear before EI? Not sure, so that we just read
        // until EI<whitespace>.
        // Be aware not all kind of whitespaces are allowed here. see PDFBOX-1561
        while (!(lastByte == 'E' && currentByte == 'I' && hasNextSpaceOrReturn()
                && hasNoFollowingBinData(seqSource)) && !seqSource.isEOF()) {
            imageData.write(lastByte);
            lastByte = currentByte;
            currentByte = seqSource.read();
        }
        // the EI operator isn't unread, as it won't be processed anyway
        Operator dataOperator = Operator.getOperator(OperatorName.BEGIN_INLINE_IMAGE_DATA);
        // save the image data to the operator, so that it can be accessed later
        dataOperator.setImageData(imageData.toByteArray());
        return dataOperator;
    }

    /**
     * Tells whether {@code b} is one of the bytes treated as a blank when looking for an
     * operator after "EI": NUL, TAB, LF, CR or SPACE.
     */
    private static boolean isOperatorSeparator(byte b) {
        return b == 0 || b == 9 || b == 0x20 || b == 0x0a || b == 0x0d;
    }

    /**
     * Looks up an amount of bytes if they contain only ASCII characters (no
     * control sequences etc.), and that these ASCII characters begin with a
     * sequence of 1-3 non-blank characters between blanks.
     *
     * <p>The inspected bytes are pushed back onto the source before returning, so the
     * read position is unchanged.
     *
     * @param pdfSource the source to look ahead in.
     * @return <code>true</code> if next bytes are probably printable ASCII
     * characters starting with a PDF operator, otherwise <code>false</code>
     * @throws IOException if reading from or unreading to the source fails.
     */
    private boolean hasNoFollowingBinData(SequentialSource pdfSource) throws IOException {
        // as suggested in PDFBOX-1164
        final int readBytes = pdfSource.read(binCharTestArr, 0, MAX_BIN_CHAR_TEST_LENGTH);
        boolean noBinData = true;
        int startOpIdx = -1;
        int endOpIdx = -1;

        if (readBytes > 0) {
            for (int bIdx = 0; bIdx < readBytes; bIdx++) {
                final byte b = binCharTestArr[bIdx];
                // byte is signed, so values above 0x7f are negative and caught by "b < 0x09"
                if (b != 0 && b < 0x09 || b > 0x0a && b < 0x20 && b != 0x0d) {
                    // control character or > 0x7f -> we have binary data
                    noBinData = false;
                    break;
                }
                // find the start and the end of a PDF operator
                if (startOpIdx == -1 && !isOperatorSeparator(b)) {
                    startOpIdx = bIdx;
                } else if (startOpIdx != -1 && endOpIdx == -1 && isOperatorSeparator(b)) {
                    endOpIdx = bIdx;
                }
            }

            // PDFBOX-3742: just assuming that 1-3 non blanks is a PDF operator isn't enough
            if (endOpIdx != -1 && startOpIdx != -1) {
                // usually, the operator here is Q, sometimes EMC (PDFBOX-2376), S (PDFBOX-3784).
                // The bytes in this range passed the check above, so they are 7-bit;
                // decode them explicitly instead of relying on the platform charset.
                String s = new String(binCharTestArr, startOpIdx, endOpIdx - startOpIdx,
                        StandardCharsets.US_ASCII);
                if (!"Q".equals(s) && !"EMC".equals(s) && !"S".equals(s)) {
                    noBinData = false;
                }
            }

            // only if not close to eof
            if (readBytes == MAX_BIN_CHAR_TEST_LENGTH) {
                // a PDF operator is 1-3 bytes long
                if (startOpIdx != -1 && endOpIdx == -1) {
                    endOpIdx = MAX_BIN_CHAR_TEST_LENGTH;
                }
                if (endOpIdx != -1 && startOpIdx != -1 && endOpIdx - startOpIdx > 3) {
                    noBinData = false;
                }
            }
            pdfSource.unread(binCharTestArr, 0, readBytes);
        }
        if (!noBinData) {
            LOG.warn("ignoring 'EI' assumed to be in the middle of inline image at stream offset "
                    + pdfSource.getPosition());
        }
        return noBinData;
    }

    /**
     * This will read an operator from the stream.
     *
     * <p>Reading stops at EOF, whitespace, a closing character, a digit, or one of
     * {@code [ < ( /}. A digit 0 or 1 directly after a {@code 'd'} is kept as part of
     * the operator.
     *
     * @return The operator that was read from the stream.
     *
     * @throws IOException If there is an error reading from the stream.
     */
    protected String readOperator() throws IOException {
        skipSpaces();

        //average string size is around 2 and the normal string buffer size is
        //about 16 so lets save some space.
        StringBuilder buffer = new StringBuilder(4);
        int nextChar = seqSource.peek();
        while (nextChar != -1 && // EOF
                !isWhitespace(nextChar) && !isClosing(nextChar) && nextChar != '[' && nextChar != '<'
                && nextChar != '(' && nextChar != '/' && (nextChar < '0' || nextChar > '9')) {
            char currentChar = (char) seqSource.read();
            nextChar = seqSource.peek();
            buffer.append(currentChar);
            // Type3 Glyph description has operators with a number in the name
            if (currentChar == 'd' && (nextChar == '0' || nextChar == '1')) {
                buffer.append((char) seqSource.read());
                nextChar = seqSource.peek();
            }
        }
        return buffer.toString();
    }

    /**
     * Tells whether {@code c} is a line feed (10), carriage return (13) or space (32).
     */
    private boolean isSpaceOrReturn(int c) {
        return c == 10 || c == 13 || c == 32;
    }

    /**
     * Checks if the next char is a space or a return.
     * 
     * @return true if the next char is a space or a return
     * @throws IOException if something went wrong
     */
    private boolean hasNextSpaceOrReturn() throws IOException {
        return isSpaceOrReturn(seqSource.peek());
    }
}