org.apache.pdfbox.pdfparser.BaseParser.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.pdfbox.pdfparser.BaseParser.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.pdfparser;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.io.PushBackInputStream;
import org.apache.pdfbox.io.RandomAccess;

import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSBoolean;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNull;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.exceptions.WrappedIOException;

import org.apache.pdfbox.persistence.util.COSObjectKey;

/**
 * This class is used to contain parsing logic that will be used by both the
 * PDFParser and the COSStreamParser.
 *
 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
 * @version $Revision: 1.61 $
 */
public abstract class BaseParser {

    /**
     * Name of the system property that overrides the size of the push back buffer
     * created around the input stream by the stream based constructor.
     */
    public static final String PROP_PUSHBACK_SIZE = "org.apache.pdfbox.baseParser.pushBackSize";

    /**
     * Log instance.
     */
    private static final Log LOG = LogFactory.getLog(BaseParser.class);

    // Character codes of the letters that make up the keywords 'endstream' and 'endobj'.
    // They are compared against the int values returned by read() and are also used
    // to build the ENDSTREAM / ENDOBJ byte arrays below.
    private static final int E = 'e';
    private static final int N = 'n';
    private static final int D = 'd';

    private static final int S = 's';
    private static final int T = 't';
    private static final int R = 'r';
    private static final int A = 'a';
    private static final int M = 'm';

    private static final int O = 'o';
    private static final int B = 'b';
    private static final int J = 'j';

    // Scratch buffer (and its size) used while copying and scanning stream data in
    // parseCOSStream() and readUntilEndStream(); it is an instance field, so its content
    // is only meaningful within a single call.
    private final int strmBufLen = 2048;
    private final byte[] strmBuf = new byte[strmBufLen];

    /**
     * The bytes of the keyword 'endstream', used when scanning stream data for its end.
     */
    public static final byte[] ENDSTREAM = new byte[] { E, N, D, S, T, R, E, A, M };

    /**
     * The bytes of the keyword 'endobj', used as a fallback end marker when scanning
     * stream data that lacks an 'endstream' keyword.
     */
    public static final byte[] ENDOBJ = new byte[] { E, N, D, O, B, J };

    /**
     * The keyword 'def'; parseCOSDictionary() skips it after a dictionary value
     * (it shows up when parsing cmap streams).
     */
    public static final String DEF = "def";
    /**
     * The keyword 'endobj' as a string; parseCOSStream() checks the token read after
     * the stream data against it.
     */
    private static final String ENDOBJ_STRING = "endobj";
    /**
     * The keyword 'endstream' as a string; parseCOSStream() expects it after the stream data.
     */
    private static final String ENDSTREAM_STRING = "endstream";
    /**
     * The keyword 'stream' as a string; parseCOSStream() expects it before the stream data.
     */
    private static final String STREAM_STRING = "stream";
    /**
     * The keyword 'true'; presumably compared against tokens when parsing boolean objects
     * (usage is outside the part of the class shown here).
     */
    private static final String TRUE = "true";
    /**
     * The keyword 'false'; presumably compared against tokens when parsing boolean objects
     * (usage is outside the part of the class shown here).
     */
    private static final String FALSE = "false";
    /**
     * The keyword 'null'; presumably compared against tokens when parsing the null object
     * (usage is outside the part of the class shown here).
     */
    private static final String NULL = "null";

    /**
     * Default value of the {@link #forceParsing} flag, read once from the boolean
     * system property "org.apache.pdfbox.forceParsing".
     */
    protected static final boolean FORCE_PARSING = Boolean.getBoolean("org.apache.pdfbox.forceParsing");

    /**
     * This is the stream that will be read from.
     */
    protected PushBackInputStream pdfSource;

    /**
     * This is the document that will be parsed; it is assigned through
     * {@link #setDocument(COSDocument)}.
     */
    protected COSDocument document;

    /**
     * Flag to skip malformed or otherwise unparseable input where possible.
     */
    protected final boolean forceParsing;

    /**
     * Default constructor.
     *
     * Only initialises {@link #forceParsing} from {@link #FORCE_PARSING}. It does not
     * assign {@link #pdfSource}, which therefore stays {@code null} until it is set
     * elsewhere.
     */
    public BaseParser() {
        this.forceParsing = FORCE_PARSING;
    }

    /**
     * Constructor.
     *
     * @since Apache PDFBox 1.3.0
     * @param input The input stream to read the data from.
     * @param forceParsingValue flag to skip malformed or otherwise unparseable
     *                     input where possible
     * @throws IOException If there is an error reading the input stream.
     */
    public BaseParser(InputStream input, boolean forceParsingValue) throws IOException {
        this.pdfSource = new PushBackInputStream(new BufferedInputStream(input, 16384),
                Integer.getInteger(PROP_PUSHBACK_SIZE, 65536));
        this.forceParsing = forceParsingValue;
    }

    /**
     * Constructor using the default {@link #FORCE_PARSING} setting.
     *
     * Delegates to {@link #BaseParser(InputStream, boolean)}.
     *
     * @param input The input stream to read the data from.
     * @throws IOException If there is an error reading the input stream
     *                     (declared by the delegated constructor).
     */
    public BaseParser(InputStream input) throws IOException {
        this(input, FORCE_PARSING);
    }

    /**
     * Constructor reading from an in-memory byte array.
     *
     * The array is wrapped in a {@link ByteArrayInputStream} and passed to
     * {@link #BaseParser(InputStream)}.
     *
     * @param input The array to read the data from.
     * @throws IOException If there is an error reading the byte data
     *                     (declared by the delegated constructor).
     */
    protected BaseParser(byte[] input) throws IOException {
        this(new ByteArrayInputStream(input));
    }

    /**
     * Set the document for this stream.
     *
     * The document's object pool is used when a dictionary value turns out to be an
     * indirect reference ("n g R"), so it has to be set before such values are parsed.
     *
     * @param doc The current document.
     */
    public void setDocument(COSDocument doc) {
        document = doc;
    }

    /**
     * Tells whether the given character is an ASCII hexadecimal digit
     * (0-9, a-f or A-F).
     *
     * Plain range comparisons are used on purpose: a lookup via String.indexOf was
     * reported to cause problems with certain versions of the IBM JIT compiler and
     * is slower anyway.
     *
     * @param ch the character to test
     * @return true if the character is a hexadecimal digit
     */
    private static boolean isHexDigit(char ch) {
        if (ch >= '0' && ch <= '9') {
            return true;
        }
        if (ch >= 'a' && ch <= 'f') {
            return true;
        }
        return ch >= 'A' && ch <= 'F';
    }

    /**
     * This will parse the value of a PDF dictionary entry.
     *
     * A direct object is read first. If the next non-space character is a digit, the
     * value is treated as an indirect reference of the form "objNr genNr R": the
     * generation number and the 'R' keyword are read and the referenced object is
     * looked up in the document's object pool. Otherwise the direct object is returned.
     *
     * @return The parsed value; either the direct object or the pooled object an
     *         indirect reference points to.
     *
     * @throws IOException If there is an error parsing the value, if the reference is
     *                     malformed (missing 'R', non-integer object or generation
     *                     number) or if no document is available to resolve it.
     */
    private COSBase parseCOSDictionaryValue() throws IOException {
        COSBase number = parseDirObject();
        skipSpaces();
        char next = (char) pdfSource.peek();
        if (next < '0' || next > '9') {
            // no second number follows: plain direct value
            return number;
        }

        COSBase generationNumber = parseDirObject();
        skipSpaces();
        char r = (char) pdfSource.read();
        if (r != 'R') {
            throw new IOException("expected='R' actual='" + r + "' " + pdfSource);
        }
        // malformed input must surface as IOException, not as ClassCastException or
        // NullPointerException from a blind cast
        if (!(number instanceof COSInteger)) {
            throw new IOException("expected object number actual='" + number + "' " + pdfSource);
        }
        if (!(generationNumber instanceof COSInteger)) {
            throw new IOException(
                    "expected generation number actual='" + generationNumber + "' " + pdfSource);
        }
        COSObjectKey key = new COSObjectKey(((COSInteger) number).intValue(),
                ((COSInteger) generationNumber).intValue());
        if (document == null) {
            // without a document there is no object pool to resolve the reference against
            throw new IOException("cannot resolve object reference, no document set " + pdfSource);
        }
        return document.getObjectFromPool(key);
    }

    /**
     * This will parse a PDF dictionary.
     *
     * The dictionary must start with "&lt;&lt;". Entries are read as name/value pairs
     * until "&gt;&gt;" is found. Content that does not start with a '/' where a key is
     * expected is logged and skipped up to the next '/', '&gt;', 'endstream', 'endobj'
     * or the end of the input. In the last three cases the entries collected so far
     * are returned immediately.
     *
     * @return The parsed dictionary.
     *
     * @throws IOException IF there is an error reading the stream.
     */
    protected COSDictionary parseCOSDictionary() throws IOException {
        char c = (char) pdfSource.read();
        if (c != '<') {
            throw new IOException("expected='<' actual='" + c + "'");
        }
        c = (char) pdfSource.read();
        if (c != '<') {
            throw new IOException("expected='<' actual='" + c + "' " + pdfSource);
        }
        skipSpaces();
        COSDictionary obj = new COSDictionary();
        boolean done = false;
        while (!done) {
            skipSpaces();
            c = (char) pdfSource.peek();
            if (c == '>') {
                done = true;
            } else if (c != '/') {
                //an invalid dictionary, we are expecting
                //the key, read until we can recover
                LOG.warn("Invalid dictionary, found: '" + c + "' but expected: '/'");
                if (skipUntilDictionaryRecoveryPoint()) {
                    // end of input or end of the enclosing object reached
                    return obj;
                }
            } else {
                COSName key = parseCOSName();
                COSBase value = parseCOSDictionaryValue();
                skipSpaces();
                if (((char) pdfSource.peek()) == 'd') {
                    //if the next string is 'def' then we are parsing a cmap stream
                    //and want to ignore it, otherwise push the token back.
                    String potentialDEF = readString();
                    if (!potentialDEF.equals(DEF)) {
                        pdfSource.unread(potentialDEF.getBytes("ISO-8859-1"));
                    } else {
                        skipSpaces();
                    }
                }

                if (value == null) {
                    LOG.warn("Bad Dictionary Declaration " + pdfSource);
                } else {
                    obj.setItem(key, value);
                }
            }
        }
        char ch = (char) pdfSource.read();
        if (ch != '>') {
            throw new IOException("expected='>' actual='" + ch + "'");
        }
        ch = (char) pdfSource.read();
        if (ch != '>') {
            throw new IOException("expected='>' actual='" + ch + "'");
        }
        return obj;
    }

    /**
     * Skips invalid dictionary content until parsing can resume or has to stop.
     *
     * Bytes are consumed until a '/' or '&gt;' is found (it is pushed back so the caller
     * sees it), or until the keyword 'endstream' or 'endobj' has been consumed, or
     * until the end of the input.
     *
     * A byte that breaks a partially matched keyword is examined again instead of
     * being dropped, so a '/', '&gt;' or a new 'e' directly after a partial match is
     * not missed.
     *
     * @return true if the dictionary is finished (keyword consumed or end of input),
     *         false if a '/' or '&gt;' was pushed back and parsing can continue.
     *
     * @throws IOException If there is an error reading the stream.
     */
    private boolean skipUntilDictionaryRecoveryPoint() throws IOException {
        int read = pdfSource.read();
        while (read != -1 && read != '/' && read != '>') {
            if (read != E) {
                read = pdfSource.read();
                continue;
            }
            // possible start of 'endstream' or 'endobj'; on any mismatch the offending
            // byte stays in 'read' and is re-evaluated by the loop condition
            read = pdfSource.read();
            if (read != N) {
                continue;
            }
            read = pdfSource.read();
            if (read != D) {
                continue;
            }
            read = pdfSource.read();
            final byte[] keyword;
            if (read == S) {
                keyword = ENDSTREAM;
            } else if (read == O) {
                keyword = ENDOBJ;
            } else {
                continue;
            }
            // the first four characters ('ends' or 'endo') are matched, check the rest
            int matched = 4;
            while (matched < keyword.length) {
                read = pdfSource.read();
                if (read != keyword[matched]) {
                    break;
                }
                matched++;
            }
            if (matched == keyword.length) {
                return true; // we're done reading this object!
            }
        }
        if (read == -1) {
            return true;
        }
        pdfSource.unread(read);
        return false;
    }

    /**
     * This will read a COSStream from the input stream.
     *
     * The keyword 'stream' and the end-of-line marker following it are consumed first.
     * If the dictionary holds a direct numeric /Length, that many bytes are copied and
     * the following bytes are checked for 'endstream'. If the check fails, the copied
     * bytes are pushed back and the data is re-read by scanning for 'endstream' (or
     * 'endobj'). Without a usable length the data is scanned right away.
     *
     * @param dic The dictionary that goes with this stream.
     * @param file The file to write the stream to when reading.
     *
     * @return The parsed pdf stream.
     *
     * @throws IOException If there is an error reading the stream, if the 'stream' or
     *                     'endstream' keyword is missing, or if the push back buffer is
     *                     too small to re-read a stream with a wrong length.
     */
    protected COSStream parseCOSStream(COSDictionary dic, RandomAccess file) throws IOException {
        COSStream stream = new COSStream(dic, file);
        OutputStream out = null;
        try {
            String streamString = readString();

            if (!streamString.equals(STREAM_STRING)) {
                throw new IOException("expected='stream' actual='" + streamString + "'");
            }

            //PDF Ref 3.2.7 A stream must be followed by either
            //a CRLF or LF but nothing else.

            int whitespace = pdfSource.read();

            //see brother_scan_cover.pdf, it adds whitespaces
            //after the stream but before the start of the
            //data, so just read those first
            while (whitespace == 0x20) {
                whitespace = pdfSource.read();
            }

            if (whitespace == 0x0D) {
                whitespace = pdfSource.read();
                //A lone CR is invalid according to the spec but it happens in the real
                //world so we must support it: keep the byte that followed it.
                //Never push back the EOF marker, it would show up as a 0xFF data byte.
                if (whitespace != 0x0A && whitespace != -1) {
                    pdfSource.unread(whitespace);
                }
            } else if (whitespace != 0x0A && whitespace != -1) {
                //we are in an error.
                //but again we will do a lenient parsing and just assume that everything
                //is fine
                pdfSource.unread(whitespace);
            }

            /*This needs to be dic.getItem because when we are parsing, the underlying object
             * might still be null.
             */
            COSBase streamLength = dic.getItem(COSName.LENGTH);

            out = stream.createFilteredStream(streamLength);

            // Only a direct length is used. For the sequentially working PDFParser
            // we do not know if an indirect length object is redefined later on and the
            // currently read indirect object might be obsolete (e.g. not referenced in
            // the xref table); this would result in reading a wrong number of bytes.
            int length = -1;
            if (streamLength instanceof COSNumber) {
                length = ((COSNumber) streamLength).intValue();
            }

            if (length == -1) {
                // Couldn't determine length from dict: just
                // scan until we find endstream:
                readUntilEndStream(out);
            } else {
                // Copy length bytes over:
                int left = length;
                while (left > 0) {
                    final int chunk = Math.min(left, strmBufLen);
                    final int readCount = pdfSource.read(strmBuf, 0, chunk);
                    if (readCount == -1) {
                        break;
                    }
                    out.write(strmBuf, 0, readCount);
                    left -= readCount;
                }

                // in order to handle broken documents we test if 'endstream' is reached
                // if not, length value possibly was wrong, fall back to scanning for endstream

                // fill buffer with next bytes and test for 'endstream' (with leading whitespaces)
                int readCount = pdfSource.read(strmBuf, 0, 20);
                if (readCount > 0) {
                    boolean foundEndstream = false;
                    int nextEndstreamCIdx = 0;
                    for (int cIdx = 0; cIdx < readCount; cIdx++) {
                        final int ch = strmBuf[cIdx] & 0xff;
                        if (ch == ENDSTREAM[nextEndstreamCIdx]) {
                            if (++nextEndstreamCIdx >= ENDSTREAM.length) {
                                foundEndstream = true;
                                break;
                            }
                        } else if ((nextEndstreamCIdx > 0) || (!isWhitespace(ch))) {
                            // not found
                            break;
                        }
                    }

                    // push back test bytes
                    pdfSource.unread(strmBuf, 0, readCount);

                    // if 'endstream' was not found fall back to scanning
                    if (!foundEndstream) {
                        LOG.warn("Specified stream length " + length
                                + " is wrong. Fall back to reading stream until 'endstream'.");

                        // push back all read stream bytes
                        out.flush(); // we got a buffered stream wrapper around filteredStream thus first flush to underlying stream

                        // Size the copy by the bytes actually written, not by the (wrong)
                        // declared length: a negative length would make the constructor
                        // throw IllegalArgumentException, a huge one would over-allocate.
                        ByteArrayOutputStream bout = new ByteArrayOutputStream(Math.max(0, length - left));
                        InputStream writtenStreamBytes = stream.getFilteredStream();
                        try {
                            int copied;
                            while ((copied = writtenStreamBytes.read(strmBuf)) >= 0) {
                                bout.write(strmBuf, 0, copied);
                            }
                        } finally {
                            writtenStreamBytes.close();
                        }
                        try {
                            pdfSource.unread(bout.toByteArray());
                        } catch (IOException ioe) {
                            throw new WrappedIOException("Could not push back " + bout.size()
                                    + " bytes in order to reparse stream. "
                                    + "Try increasing push back buffer using system property " + PROP_PUSHBACK_SIZE,
                                    ioe);
                        }
                        // create new filtered stream
                        // NOTE(review): the previous 'out' is flushed but not closed here;
                        // confirm whether COSStream expects it to be closed first.
                        out = stream.createFilteredStream(streamLength);
                        // scan until we find endstream:
                        readUntilEndStream(out);
                    }
                }
            }

            skipSpaces();
            String endStream = readString();

            if (!endStream.equals(ENDSTREAM_STRING)) {
                /*
                 * Sometimes stream objects don't have an endstream tag so readUntilEndStream(out)
                 * also can stop on endobj tags. If that's the case we need to make sure to unread
                 * the endobj so parseObject() can handle that case normally.
                 */
                if (endStream.startsWith(ENDOBJ_STRING)) {
                    byte[] endobjarray = endStream.getBytes("ISO-8859-1");
                    pdfSource.unread(endobjarray);
                }
                /*
                 * Some PDF files don't contain a new line after endstream so we
                 * need to make sure that the next object number is getting read separately
                 * and not part of the endstream keyword. Ex. Some files would have "endstream8"
                 * instead of "endstream"
                 */
                else if (endStream.startsWith(ENDSTREAM_STRING)) {
                    final int keywordLength = ENDSTREAM_STRING.length();
                    String extra = endStream.substring(keywordLength);
                    endStream = endStream.substring(0, keywordLength);
                    byte[] array = extra.getBytes("ISO-8859-1");
                    pdfSource.unread(array);
                } else {
                    /*
                     * If for some reason we get something else here, Read until we find the next
                     * "endstream"
                     */
                    readUntilEndStream(out);
                    endStream = readString();
                    if (!endStream.equals(ENDSTREAM_STRING)) {
                        throw new IOException("expected='endstream' actual='" + endStream + "' " + pdfSource);
                    }
                }
            }
        } finally {
            if (out != null) {
                out.close();
            }
        }
        return stream;
    }

    /**
     * This method will read through the current stream object until
     * we find the keyword "endstream" meaning we're at the end of this
     * object. Some pdf files, however, forget to write some endstream tags
     * and just close off objects with an "endobj" tag so we have to handle
     * this case as well.
     *
     * Everything in front of the keyword is written to the given output stream.
     * The keyword itself and all bytes buffered behind it are pushed back to
     * pdfSource, so the caller reads the keyword next.
     *
     * This method is optimized using buffered IO and reduced number of
     * byte compare operations.
     *
     * NOTE(review): if the input ends while a keyword is only partially matched,
     * the matched bytes appear never to be written to the output; confirm
     * whether that matters for truncated files.
     *
     * @param out  stream we write out to.
     *
     * @throws IOException if reading from or pushing back to the source, or writing
     *                     to the output stream, fails.
     */
    private void readUntilEndStream(final OutputStream out) throws IOException {

        int bufSize;
        // number of leading keyword characters matched so far; survives buffer refills
        int charMatchCount = 0;
        // keyword currently being matched: ENDSTREAM, or ENDOBJ once 'endo' was seen
        byte[] keyw = ENDSTREAM;

        final int quickTestOffset = 5; // last character position of shortest keyword ('endobj')

        // read next chunk into buffer; already matched chars are added to beginning of buffer
        while ((bufSize = pdfSource.read(strmBuf, charMatchCount, strmBufLen - charMatchCount)) > 0) {
            // bufSize now counts the carried-over matched chars plus the bytes just read
            bufSize += charMatchCount;

            int bIdx = charMatchCount;
            int quickTestIdx;

            // iterate over buffer, trying to find keyword match
            for (int maxQuicktestIdx = bufSize - quickTestOffset; bIdx < bufSize; bIdx++) {
                // reduce compare operations by first test last character we would have to
                // match if current one matches; if it is not a character from keywords
                // we can move behind the test character;
                // this shortcut is inspired by BoyerMoore string search algorithm
                // and can reduce parsing time by approx. 20%
                if ((charMatchCount == 0) && ((quickTestIdx = bIdx + quickTestOffset) < maxQuicktestIdx)) {

                    final byte ch = strmBuf[quickTestIdx];
                    // all keyword letters lie within 'a'..'t'
                    if ((ch > 't') || (ch < 'a')) {
                        // last character we would have to match if current character would match
                        // is not a character from keywords -> jump behind and start over
                        bIdx = quickTestIdx;
                        continue;
                    }
                }

                final byte ch = strmBuf[bIdx]; // could be negative - but we only compare to ASCII

                if (ch == keyw[charMatchCount]) {
                    if (++charMatchCount == keyw.length) {
                        // match found; step past the last keyword character
                        bIdx++;
                        break;
                    }
                } else {
                    if ((charMatchCount == 3) && (ch == ENDOBJ[charMatchCount])) {
                        // maybe ENDSTREAM is missing but we could have ENDOBJ
                        keyw = ENDOBJ;
                        charMatchCount++;

                    } else {
                        // no match; incrementing match start by 1 would be dumb since we already know matched chars
                        // depending on current char read we may already have beginning of a new match:
                        // 'e': first char matched;
                        // 'n': if we are at match position idx 7 we already read 'e' thus 2 chars matched
                        // for each other char we have to start matching first keyword char beginning with next
                        // read position
                        charMatchCount = (ch == E) ? 1 : ((ch == N) && (charMatchCount == 7)) ? 2 : 0;
                        // search again for 'endstream'
                        keyw = ENDSTREAM;
                    }
                }
            } // for

            // bytes in front of the (partially) matched keyword are stream content
            int contentBytes = Math.max(0, bIdx - charMatchCount);

            // write buffer content until first matched char to output stream
            if (contentBytes > 0) {
                out.write(strmBuf, 0, contentBytes);
            }
            if (charMatchCount == keyw.length) {
                // keyword matched; unread matched keyword (endstream/endobj) and following buffered content
                pdfSource.unread(strmBuf, contentBytes, bufSize - contentBytes);
                break;

            } else {
                // copy matched chars at start of buffer so the next read appends behind them
                System.arraycopy(keyw, 0, strmBuf, 0, charMatchCount);
            }

        } // while
    }

    /**
     * Works around literal strings whose parentheses are unbalanced because of a bug
     * in the producing application. Two shapes were seen in real files:
     *
     * <pre>
     *   /Title ( (5)
     *   /Creator ...
     *
     *   /Title (c:\)
     *   /Producer ...
     * </pre>
     *
     * (see "Bull River Rules and Regulations.pdf" for the first one). After a closing
     * parenthesis the next three bytes are inspected. If they are CR LF '/' or start
     * with CR '/', the string is assumed to end here and the brace counter is reset
     * to zero. The inspected bytes are always pushed back.
     *
     * The check lives in its own method because parseCOSString needs it in two places.
     *
     * @param bracesParameter the number of braces currently open.
     *
     * @return the corrected value of the brace counter
     * @throws IOException if reading from or pushing back to the source fails.
     */
    private int checkForMissingCloseParen(final int bracesParameter) throws IOException {
        final byte[] lookahead = new byte[3];
        final int available = pdfSource.read(lookahead);

        int corrected = bracesParameter;
        // the heuristic is only applied when all three bytes could be read
        if (available == 3 && lookahead[0] == 0x0d) {
            final boolean crLfSlash = lookahead[1] == 0x0a && lookahead[2] == 0x2f;
            final boolean crSlash = lookahead[1] == 0x2f;
            if (crLfSlash || crSlash) {
                // a new dictionary key follows on the next line: treat the string as closed
                corrected = 0;
            }
        }

        if (available > 0) {
            pdfSource.unread(lookahead, 0, available);
        }
        return corrected;
    }

    /**
     * This will parse a PDF string.
     *
     * A leading '(' starts a literal string that is decoded here: nested parentheses
     * are tracked, backslash escapes (n, r, t, b, f, parentheses, backslash, up to
     * three octal digits, line continuation) are resolved and an unknown escape is
     * kept literally including its backslash. A leading '&lt;' hands over to the hex
     * string parser. The byte read after the end of the string is pushed back.
     *
     * @return The parsed PDF string.
     *
     * @throws IOException If there is an error reading from the stream or the
     *                     string does not start with '(' or '&lt;'.
     */
    protected COSString parseCOSString() throws IOException {
        final char first = (char) pdfSource.read();
        final COSString result = new COSString();
        if (first == '<') {
            return parseCOSHexString();
        }
        if (first != '(') {
            throw new IOException(
                    "parseCOSString string should start with '(' or '<' and not '" + first + "' " + pdfSource);
        }

        // nesting depth of parentheses; the opening one has been consumed already
        int depth = 1;
        int current = pdfSource.read();
        while (depth > 0 && current != -1) {
            final char ch = (char) current;
            // set when an escape handler had to read one byte past its own sequence;
            // that byte (possibly -1) becomes the next loop input instead of a fresh read
            boolean havePending = false;
            int pending = -1;

            if (ch == ')') {
                depth = checkForMissingCloseParen(depth - 1);
                if (depth != 0) {
                    result.append(ch);
                }
            } else if (ch == '(') {
                depth++;
                result.append(ch);
            } else if (ch != '\\') {
                result.append(ch);
            } else {
                final char escaped = (char) pdfSource.read();
                switch (escaped) {
                case 'n':
                    result.append('\n');
                    break;
                case 'r':
                    result.append('\r');
                    break;
                case 't':
                    result.append('\t');
                    break;
                case 'b':
                    result.append('\b');
                    break;
                case 'f':
                    result.append('\f');
                    break;
                case ')':
                    // PDFBox 276 /Title (c:\)
                    depth = checkForMissingCloseParen(depth);
                    if (depth == 0) {
                        // the parenthesis really closes the string, the backslash was literal
                        result.append('\\');
                    } else {
                        result.append(escaped);
                    }
                    break;
                case '(':
                case '\\':
                    result.append(escaped);
                    break;
                case '\n':
                case '\r': {
                    // backslash at the end of a line: drop it together with the line break(s)
                    int afterBreak = pdfSource.read();
                    while (isEOL(afterBreak) && afterBreak != -1) {
                        afterBreak = pdfSource.read();
                    }
                    pending = afterBreak;
                    havePending = true;
                    break;
                }
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7': {
                    // octal character code made of one to three digits
                    final StringBuilder octal = new StringBuilder();
                    octal.append(escaped);
                    int digit = pdfSource.read();
                    boolean isOctal = digit >= '0' && digit <= '7';
                    if (isOctal) {
                        octal.append((char) digit);
                        digit = pdfSource.read();
                        isOctal = digit >= '0' && digit <= '7';
                        if (isOctal) {
                            octal.append((char) digit);
                        }
                    }
                    if (!isOctal) {
                        // the byte behind the digits belongs to the string content
                        pending = digit;
                        havePending = true;
                    }

                    final int character;
                    try {
                        character = Integer.parseInt(octal.toString(), 8);
                    } catch (NumberFormatException e) {
                        throw new IOException("Error: Expected octal character, actual='" + octal + "'");
                    }
                    result.append(character);
                    break;
                }
                default:
                    // sometimes the \ doesn't really mean escape like the PDF spec says it
                    // does, sometimes it should be literal, which is what we assume here
                    result.append('\\');
                    result.append(escaped);
                    break;
                }
            }

            current = havePending ? pending : pdfSource.read();
        }
        if (current != -1) {
            pdfSource.unread(current);
        }
        return result;
    }

    /**
     * This will parse a PDF HEX string with fail fast semantic
     * meaning that we stop if a not allowed character is found.
     * This is necessary in order to detect malformed input and
     * be able to skip to next object start.
     *
     * We assume starting '<' was already read.
     * 
     * @return The parsed PDF string.
     *
     * @throws IOException If there is an error reading from the stream.
     */
    private final COSString parseCOSHexString() throws IOException {
        // Only the hex digits are collected; tolerated separators between them are dropped.
        final StringBuilder hexDigits = new StringBuilder();
        boolean closed = false;
        while (!closed) {
            final int current = pdfSource.read();
            if (isHexDigit((char) current)) {
                hexDigits.append((char) current);
            } else if (current == '>') {
                // closing bracket: the string is complete
                closed = true;
            } else if (current < 0) {
                throw new IOException("Missing closing bracket for hex string. Reached EOS.");
            } else {
                switch (current) {
                case ' ':
                case '\n':
                case '\t':
                case '\r':
                case '\b':
                case '\f':
                    // separator inside the hex string, nothing to collect
                    break;
                default:
                    // neither a hex digit, the closing bracket, end of stream nor a separator:
                    // fail fast so the caller can skip ahead
                    throw new IOException("Not allowed character in hex string; char code: " + current);
                }
            }
        }
        return COSString.createFromHexString(hexDigits.toString(), forceParsing);
    }

    /**
     * This will parse a PDF array object.
     *
     * @return The parsed PDF array.
     *
     * @throws IOException If there is an error parsing the stream.
     */
    protected COSArray parseCOSArray() throws IOException {
        char ch = (char) pdfSource.read();
        if (ch != '[') {
            throw new IOException("expected='[' actual='" + ch + "'");
        }
        COSArray po = new COSArray();
        COSBase pbo = null;
        skipSpaces();
        int i = 0;
        // Parse elements until the closing bracket (or a non-positive peek result) is reached.
        while (((i = pdfSource.peek()) > 0) && ((char) i != ']')) {
            pbo = parseDirObject();
            if (pbo instanceof COSObject) {
                // An 'R' was read: the two preceding elements must be the object number and the
                // generation number (PDFBOX-385). The size checks keep a malformed array such as
                // "[R]" or "[1 R]" from indexing element -1; such input is handled as a corrupt
                // reference below instead of failing with an unchecked exception.
                if (po.size() > 0 && po.get(po.size() - 1) instanceof COSInteger) {
                    COSInteger genNumber = (COSInteger) po.remove(po.size() - 1);
                    if (po.size() > 0 && po.get(po.size() - 1) instanceof COSInteger) {
                        COSInteger number = (COSInteger) po.remove(po.size() - 1);
                        COSObjectKey key = new COSObjectKey(number.intValue(), genNumber.intValue());
                        pbo = document.getObjectFromPool(key);
                    } else {
                        // the object reference is somehow wrong
                        pbo = null;
                    }
                } else {
                    pbo = null;
                }
            }
            if (pbo != null) {
                po.add(pbo);
            } else {
                //it could be a bad object in the array which is just skipped
                LOG.warn("Corrupt object reference");

                // This could also be an "endobj" or "endstream" which means we can assume that
                // the array has ended. The token is pushed back either way so the caller sees it.
                String isThisTheEnd = readString();
                pdfSource.unread(isThisTheEnd.getBytes("ISO-8859-1"));
                if (ENDOBJ_STRING.equals(isThisTheEnd) || ENDSTREAM_STRING.equals(isThisTheEnd)) {
                    return po;
                }
            }
            skipSpaces();
        }
        pdfSource.read(); //read ']'
        skipSpaces();
        return po;
    }

    /**
     * Determine if a character terminates a PDF name.
     *
     * @param ch The character
     * @return <code>true</code> if the character terminates a PDF name, otherwise <code>false</code>.
     */
    protected boolean isEndOfName(char ch) {
        // The callers in this class pass the int result of a read cast to char, so an
        // end-of-stream value of -1 arrives here as (char) -1. Comparing a char with the
        // int literal -1 can never be true, hence the explicit cast on the EOF test.
        return (ch == ' ' || ch == 13 || ch == 10 || ch == 9 || ch == '>' || ch == '<' || ch == '[' || ch == '/'
                || ch == ']' || ch == ')' || ch == '(' || ch == (char) -1 //EOF
        );
    }

    /**
     * This will parse a PDF name from the stream.
     *
     * @return The parsed PDF name.
     *
     * @throws IOException If there is an error reading from the stream.
     */
    protected COSName parseCOSName() throws IOException {
        COSName retval = null;
        int c = pdfSource.read();
        if ((char) c != '/') {
            throw new IOException("expected='/' actual='" + (char) c + "'-" + c + " " + pdfSource);
        }
        // build the name
        StringBuilder buffer = new StringBuilder();
        c = pdfSource.read();
        while (c != -1) {
            char ch = (char) c;
            if (ch == '#') {
                // Keep the raw int results so that end of stream (-1) stays distinguishable
                // from real data until it has been checked.
                int hex1 = pdfSource.read();
                int hex2 = pdfSource.read();

                // Prior to PDF v1.2, the # was not a special character.  Also,
                // it has been observed that various PDF tools do not follow the
                // spec with respect to the # escape, even though they report
                // PDF versions of 1.2 or later.  The solution here is that we
                // interpret the # as an escape only when it is followed by two
                // valid hex digits.
                //
                if (hex1 != -1 && hex2 != -1 && isHexDigit((char) hex1) && isHexDigit((char) hex2)) {
                    String hex = "" + (char) hex1 + (char) hex2;
                    try {
                        buffer.append((char) Integer.parseInt(hex, 16));
                    } catch (NumberFormatException e) {
                        throw new IOException("Error: expected hex number, actual='" + hex + "'");
                    }
                    c = pdfSource.read();
                } else {
                    // Not an escape: keep the '#' literally and re-process what followed it.
                    // An end-of-stream marker must never be pushed back, and when the stream
                    // ended right after the '#', c becomes -1 and the loop terminates.
                    if (hex2 != -1) {
                        pdfSource.unread(hex2);
                    }
                    c = hex1;
                    buffer.append(ch);
                }
            } else if (isEndOfName(ch)) {
                break;
            } else {
                buffer.append(ch);
                c = pdfSource.read();
            }
        }
        if (c != -1) {
            // give the terminating character back to the stream
            pdfSource.unread(c);
        }
        retval = COSName.getPDFName(buffer.toString());
        return retval;
    }

    /**
     * This will parse a boolean object from the stream.
     *
     * @return The parsed boolean object.
     *
     * @throws IOException If an IO error occurs during parsing.
     */
    protected COSBoolean parseBoolean() throws IOException {
        COSBoolean retval = null;
        // Only peek: the first character decides which literal has to follow.
        char c = (char) pdfSource.peek();
        if (c == 't') {
            String trueString = new String(pdfSource.readFully(4), "ISO-8859-1");
            if (!trueString.equals(TRUE)) {
                throw new IOException("Error parsing boolean: expected='true' actual='" + trueString + "'");
            } else {
                retval = COSBoolean.TRUE;
            }
        } else if (c == 'f') {
            String falseString = new String(pdfSource.readFully(5), "ISO-8859-1");
            if (!falseString.equals(FALSE)) {
                // the expected literal in this branch is 'false', not 'true'
                throw new IOException("Error parsing boolean: expected='false' actual='" + falseString + "'");
            } else {
                retval = COSBoolean.FALSE;
            }
        } else {
            throw new IOException("Error parsing boolean expected='t or f' actual='" + c + "'");
        }
        return retval;
    }

    /**
     * This will parse a direct object from the stream.
     *
     * @return The parsed object.
     *
     * @throws IOException If there is an error during parsing.
     */
    protected COSBase parseDirObject() throws IOException {
        skipSpaces();
        // The first significant character selects the kind of object to parse.
        final char first = (char) pdfSource.peek();
        switch (first) {
        case '<': {
            // Look at the character after the first '<', then put the bracket back so the
            // delegate parser sees the complete token. A second '<' selects the dictionary
            // parser; anything else is handed to the string parser.
            final int opening = pdfSource.read();
            final char second = (char) pdfSource.peek();
            pdfSource.unread(opening);
            if (second != '<') {
                return parseCOSString();
            }
            final COSBase dictionary = parseCOSDictionary();
            skipSpaces();
            return dictionary;
        }
        case '[': // array
            return parseCOSArray();
        case '(':
            return parseCOSString();
        case '/': // name
            return parseCOSName();
        case 'n': { // null
            final String nullWord = readString();
            if (!nullWord.equals(NULL)) {
                throw new IOException("Expected='null' actual='" + nullWord + "'");
            }
            return COSNull.NULL;
        }
        case 't': {
            final String trueWord = new String(pdfSource.readFully(4), "ISO-8859-1");
            if (!trueWord.equals(TRUE)) {
                throw new IOException("expected true actual='" + trueWord + "' " + pdfSource);
            }
            return COSBoolean.TRUE;
        }
        case 'f': {
            final String falseWord = new String(pdfSource.readFully(5), "ISO-8859-1");
            if (!falseWord.equals(FALSE)) {
                throw new IOException("expected false actual='" + falseWord + "' " + pdfSource);
            }
            return COSBoolean.FALSE;
        }
        case 'R':
            // consume the 'R' and hand back an empty object as a marker for the caller
            pdfSource.read();
            return new COSObject(null);
        case (char) -1:
            // nothing left to read
            return null;
        default:
            break;
        }

        if (Character.isDigit(first) || first == '-' || first == '+' || first == '.') {
            // Collect every character that can be part of a number and let COSNumber decide.
            final StringBuilder number = new StringBuilder();
            int raw = pdfSource.read();
            for (char ch = (char) raw; Character.isDigit(ch) || ch == '-' || ch == '+' || ch == '.' || ch == 'E'
                    || ch == 'e'; ch = (char) raw) {
                number.append(ch);
                raw = pdfSource.read();
            }
            if (raw != -1) {
                pdfSource.unread(raw);
            }
            return COSNumber.get(number.toString());
        }

        // This is not supposed to happen, but it is tolerated to stay compatible with
        // PDF writers that don't follow the spec.
        final String unknown = readString();
        if (unknown == null || unknown.length() == 0) {
            // nothing was consumed; failing here avoids an infinite loop in the caller
            final int peeked = pdfSource.peek();
            throw new IOException("Unknown dir object c='" + first + "' cInt=" + (int) first + " peek='"
                    + (char) peeked + "' peekInt=" + peeked + " " + pdfSource.getOffset());
        }

        // if it's an endstream/endobj, put it back so the caller will see it
        if (ENDOBJ_STRING.equals(unknown) || ENDSTREAM_STRING.equals(unknown)) {
            pdfSource.unread(unknown.getBytes("ISO-8859-1"));
        }
        return null;
    }

    /**
     * This will read the next string from the stream.
     *
     * @return The string that was read from the stream.
     *
     * @throws IOException If there is an error reading from the stream.
     */
    protected String readString() throws IOException {
        skipSpaces();
        final StringBuilder token = new StringBuilder();
        int current;
        // Collect characters until a name terminator, a closing bracket or end of stream.
        for (current = pdfSource.read();
                !isEndOfName((char) current) && !isClosing(current) && current != -1;
                current = pdfSource.read()) {
            token.append((char) current);
        }
        // The terminating character belongs to whatever follows, so hand it back.
        if (current != -1) {
            pdfSource.unread(current);
        }
        return token.toString();
    }

    /**
     * This will skip leading whitespace, read the expected string from the stream
     * and then skip any end of line markers that follow it.
     *
     * @param theString The next expected string in the stream.
     *
     * @return The characters that were read and matched against theString.
     *
     * @throws IOException If there is an error reading from the stream or theString does not match what was read.
     */
    protected String readExpectedString(String theString) throws IOException {
        // Skip whatever whitespace precedes the expected text.
        int current;
        do {
            current = pdfSource.read();
        } while (isWhitespace(current) && current != -1);

        final int expectedLength = theString.length();
        final StringBuilder matched = new StringBuilder(expectedLength);
        int position = 0;
        // Compare character by character; stop at end of line, end of stream,
        // or once the whole expected text has been seen.
        while (!(isEOL(current) || current == -1 || position >= expectedLength)) {
            final char actual = (char) current;
            matched.append(actual);
            if (theString.charAt(position) != actual) {
                // Mismatch: restore everything consumed for the comparison before failing.
                pdfSource.unread(matched.toString().getBytes("ISO-8859-1"));
                throw new IOException("Error: Expected to read '" + theString + "' instead started reading '"
                        + matched.toString() + "'");
            }
            position++;
            current = pdfSource.read();
        }

        // Swallow the end of line markers that follow the text.
        for (; isEOL(current) && current != -1; current = pdfSource.read()) {
            // nothing to do, the loop header advances the stream
        }
        if (current != -1) {
            pdfSource.unread(current);
        }
        return matched.toString();
    }

    /**
     * This will read the next string from the stream up to a certain length.
     *
     * @param length The length to stop reading at.
     *
     * @return The string that was read from the stream of length 0 to length.
     *
     * @throws IOException If there is an error reading from the stream.
     */
    protected String readString(int length) throws IOException {
        skipSpaces();

        int current = pdfSource.read();

        // Size the buffer to the requested maximum instead of the default capacity.
        final StringBuilder token = new StringBuilder(length);
        while (true) {
            // stop on whitespace, a closing bracket, end of stream or when the limit is reached
            if (isWhitespace(current) || isClosing(current) || current == -1 || token.length() >= length) {
                break;
            }
            // stop on a character that opens another object
            if (current == '[' || current == '<' || current == '(' || current == '/') {
                break;
            }
            token.append((char) current);
            current = pdfSource.read();
        }
        if (current != -1) {
            pdfSource.unread(current);
        }
        return token.toString();
    }

    /**
     * This will tell if the next character is a closing brace( close of PDF array ).
     *
     * @return true if the next byte is ']', false otherwise.
     *
     * @throws IOException If an IO error occurs.
     */
    protected boolean isClosing() throws IOException {
        // Peek only: the byte stays in the stream for the caller.
        final int upcoming = pdfSource.peek();
        return isClosing(upcoming);
    }

    /**
     * This will tell if the given character is a closing bracket (close of a PDF array).
     *
     * @param c The character to check against the closing bracket
     * @return true if the character is ']', false otherwise.
     */
    protected boolean isClosing(int c) {
        // ']' is the only character treated as closing here
        final boolean closesArray = (']' == c);
        return closesArray;
    }

    /**
     * This will read bytes until the first end of line marker occurs.
     * Note: if you later unread the results of this function, you'll
     * need to add a newline character to the end of the string.
     *
     * @return The characters between the current position and the end of the line.
     *
     * @throws IOException If there is an error reading from the stream.
     */
    protected String readLine() throws IOException {
        if (pdfSource.isEOF()) {
            throw new IOException("Error: End-of-File, expected line");
        }

        final StringBuilder line = new StringBuilder(11);
        // Stop at end of stream or at the first end of line byte; that byte is
        // consumed but not made part of the result.
        for (int current = pdfSource.read(); current != -1 && !isEOL(current); current = pdfSource.read()) {
            line.append((char) current);
        }
        return line.toString();
    }

    /**
     * This will tell if the next byte to be read is an end of line byte.
     *
     * @return true if the next byte is 0x0A or 0x0D.
     *
     * @throws IOException If there is an error reading from the stream.
     */
    protected boolean isEOL() throws IOException {
        // Peek only: the byte stays in the stream for the caller.
        final int upcoming = pdfSource.peek();
        return isEOL(upcoming);
    }

    /**
     * This will tell if the given byte is an end of line byte.
     *
     * @param c The character to check against end of line
     * @return true if the given byte is 0x0A or 0x0D.
     */
    protected boolean isEOL(int c) {
        switch (c) {
        case 10: // line feed
        case 13: // carriage return
            return true;
        default:
            return false;
        }
    }

    /**
     * This will tell if the next byte is whitespace or not.
     *
     * @return true if the next byte in the stream is a whitespace character.
     *
     * @throws IOException If there is an error reading from the stream.
     */
    protected boolean isWhitespace() throws IOException {
        // Peek only: the byte stays in the stream for the caller.
        final int upcoming = pdfSource.peek();
        return isWhitespace(upcoming);
    }

    /**
     * This will tell if the given byte is whitespace or not.  These values are
     * specified in table 1 (page 12) of ISO 32000-1:2008.
     * @param c The character to check against whitespace
     * @return true if the given byte is a whitespace character.
     */
    protected boolean isWhitespace(int c) {
        switch (c) {
        case 0:  // null
        case 9:  // horizontal tab
        case 10: // line feed
        case 12: // form feed
        case 13: // carriage return
        case 32: // space
            return true;
        default:
            return false;
        }
    }

    /**
     * This will skip all spaces and comments that are present.
     *
     * @throws IOException If there is an error reading from the stream.
     */
    protected void skipSpaces() throws IOException {
        int current = pdfSource.read();
        boolean skipping = true;
        while (skipping) {
            switch (current) {
            case 37: // '%' starts a comment
                // drop everything up to the end of the line or the end of the stream
                do {
                    current = pdfSource.read();
                } while (!isEOL(current) && current != -1);
                break;
            case 0:
            case 9:
            case 10:
            case 12:
            case 13:
            case 32:
                // same values as isWhitespace(int), listed inline
                current = pdfSource.read();
                break;
            default:
                skipping = false;
                break;
            }
        }
        // The first character that is neither whitespace nor comment goes back to the stream.
        if (current != -1) {
            pdfSource.unread(current);
        }
    }

    /**
     * This will read an integer from the stream.
     *
     * @return The integer that was read from the stream.
     *
     * @throws IOException If there is an error reading from the stream.
     */
    protected int readInt() throws IOException {
        skipSpaces();

        // Collect characters up to the next terminator. Every value accepted by
        // isWhitespace(int) ends the number, so a tab or form feed after the digits
        // terminates it just like a space or line break does (null: see sourceforge
        // bug 853328). '<' (60) also ends it, see sourceforge bug 1714707.
        final StringBuilder intBuffer = new StringBuilder();
        int lastByte = pdfSource.read();
        while (!isWhitespace(lastByte) && lastByte != 60 && lastByte != -1) {
            intBuffer.append((char) lastByte);
            lastByte = pdfSource.read();
        }
        if (lastByte != -1) {
            // the terminator is not part of the number
            pdfSource.unread(lastByte);
        }

        try {
            return Integer.parseInt(intBuffer.toString());
        } catch (NumberFormatException e) {
            // restore the consumed characters so the caller can try another interpretation
            pdfSource.unread(intBuffer.toString().getBytes("ISO-8859-1"));
            throw new IOException("Error: Expected an integer type, actual='" + intBuffer + "'");
        }
    }
}