org.apache.pdfbox.pdfparser.NonSequentialPDFParser.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.pdfbox.pdfparser.NonSequentialPDFParser.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.pdfparser;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.security.KeyStore;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.TreeMap;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNull;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.io.PushBackInputStream;
import org.apache.pdfbox.io.RandomAccess;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.DecryptionMaterial;
import org.apache.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
import org.apache.pdfbox.pdmodel.encryption.PublicKeyDecryptionMaterial;
import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
import org.apache.pdfbox.pdmodel.encryption.SecurityHandlersManager;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.persistence.util.COSObjectKey;

/**
 * PDFParser which first reads startxref and xref tables in order to know valid
 * objects and parse only these objects. Thus it is closer to a conforming parser
 * than the sequential reading of {@link PDFParser}.
 * 
 * This class can be used as a {@link PDFParser} replacement. First {@link #parse()}
 * must be called before page objects can be retrieved, e.g. {@link #getPDDocument()}.
 * 
 * This class is a much enhanced version of <code>QuickParser</code> presented in 
 * <a href="https://issues.apache.org/jira/browse/PDFBOX-1104">PDFBOX-1104</a>
 * by Jeremy Villalobos.
 */
public class NonSequentialPDFParser extends PDFParser {

    /** Name of the system property which, when set to "true", switches on {@link #parseMinimalCatalog}. */
    public static final String SYSPROP_PARSEMINIMAL = "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.parseMinimal";
    /** Name of the system property read by the constructor to preset the range passed to {@link #setEOFLookupRange(int)}. */
    public static final String SYSPROP_EOFLOOKUPRANGE = "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.eofLookupRange";

    /** Empty stream handed to the super constructor; the constructor replaces the source afterwards. */
    private static final InputStream EMPTY_INPUT_STREAM = new ByteArrayInputStream(new byte[0]);

    /** Default number of trailing file bytes searched for the EOF and startxref markers. */
    private static final int DEFAULT_TRAIL_BYTECOUNT = 2048;
    /** Marker searched for at the end of the file. */
    private static final char[] EOF_MARKER = new char[] { '%', '%', 'E', 'O', 'F' };
    /** Marker searched for in front of the EOF marker. */
    private static final char[] STARTXREF_MARKER = new char[] { 's', 't', 'a', 'r', 't', 'x', 'r', 'e', 'f' };
    /** Keyword expected after the two integers of an indirect object head. */
    private static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' };

    /** The PDF file being parsed. */
    private final File pdfFile;
    /** Random access stream on {@link #pdfFile}; wrapped by the parser's pdfSource. */
    private final RandomAccessBufferedFileInputStream raStream;

    /** Security handler created during the initial parse if the trailer has an ENCRYPT entry; otherwise <code>null</code>. */
    private SecurityHandler securityHandler = null;

    // NOTE(review): keyStoreFilename and alias are only read, never assigned, in the part of
    // the class reviewed here - confirm whether they are set elsewhere.
    private String keyStoreFilename = null;
    private String alias = null;
    /** Password used for decryption (and for loading the key store if one is configured). */
    private String password = "";
    private int readTrailBytes = DEFAULT_TRAIL_BYTECOUNT; // how many trailing bytes to read for EOF marker

    /** If <code>true</code> object references in catalog are not followed;
     *  pro: page objects will be only parsed when needed; cons: some information of catalog
     *  might not be available (e.g. outline).
     *  Catalog parsing without pages is not an option since a number of entries will
     *  also refer to page objects (like OpenAction).
     */
    private boolean parseMinimalCatalog = "true".equals(System.getProperty(SYSPROP_PARSEMINIMAL));

    /** Set to <code>true</code> at the end of the initial parse so it is run only once. */
    private boolean initialParseDone = false;
    /** Set to <code>true</code> once all catalog objects or all pages have been parsed. */
    private boolean allPagesParsed = false;

    private static final Log LOG = LogFactory.getLog(NonSequentialPDFParser.class);

    // ------------------------------------------------------------------------
    /**
     * Constructs a parser for the PDF file with the given name.
     * Delegates to {@link #NonSequentialPDFParser(File, RandomAccess)} with a
     * <code>null</code> buffer, in which case a new in-memory
     * {@link RandomAccessBuffer} is used for the document.
     *
     * @param filename the filename of the pdf to be parsed
     *
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(String filename) throws IOException {
        this(new File(filename), null);
    }

    /**
     * Constructs a parser for the given file using the given buffer for temporary storage.
     * Delegates to {@link #NonSequentialPDFParser(File, RandomAccess, String)}
     * with an empty decryption password.
     *
     * @param file the pdf to be parsed
     * @param raBuf the buffer to be used for parsing; may be <code>null</code>
     *
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(File file, RandomAccess raBuf) throws IOException {
        this(file, raBuf, "");
    }

    /**
     * Constructs a parser for the given file using the given buffer for temporary
     * storage and the given password for decryption.
     *
     * <p>If system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined, its value is
     * handed to {@link #setEOFLookupRange(int)}; a non-integer value is only logged.</p>
     *
     * @param file the pdf to be parsed
     * @param raBuf the buffer to be used for parsing; if <code>null</code> a new
     *              {@link RandomAccessBuffer} is created
     * @param decryptionPassword password to be used for decryption
     *
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(File file, RandomAccess raBuf, String decryptionPassword) throws IOException {
        super(EMPTY_INPUT_STREAM, null, false);

        // optional override of the EOF lookup range via system property
        final String lookupRangeProp = System.getProperty(SYSPROP_EOFLOOKUPRANGE);
        if (lookupRangeProp != null) {
            try {
                setEOFLookupRange(Integer.parseInt(lookupRangeProp));
            } catch (NumberFormatException nfe) {
                LOG.warn("System property " + SYSPROP_EOFLOOKUPRANGE + " does not contain an integer value, but: '"
                        + lookupRangeProp + "'");
            }
        }

        pdfFile = file;
        raStream = new RandomAccessBufferedFileInputStream(pdfFile);

        // document backed either by the caller's buffer or by a fresh in-memory one
        final COSDocument cosDocument;
        if (raBuf == null) {
            cosDocument = new COSDocument(new RandomAccessBuffer(), false);
        } else {
            cosDocument = new COSDocument(raBuf, false);
        }
        setDocument(cosDocument);

        pdfSource = new PushBackInputStream(raStream, 4096);

        password = decryptionPassword;
    }

    // ------------------------------------------------------------------------
    /**
     * Sets how many trailing bytes of the PDF file are searched for the
     * EOF marker and the 'startxref' marker.
     * If not set the default value {@link #DEFAULT_TRAIL_BYTECOUNT} is used.
     *
     * <p>Values below 16 are ignored. However for practical use
     * cases this value should not be lower than 1000; even 2000
     * was found to not be enough in some cases where some trailing
     * garbage like HTML snippets followed the EOF marker.</p>
     *
     * <p>In case system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined
     * this value will be set on initialization but can be overwritten later.</p>
     *
     * @param byteCount number of trailing bytes
     */
    public void setEOFLookupRange(int byteCount) {
        if (byteCount < 16) {
            // too small to be useful; keep the current range
            return;
        }
        readTrailBytes = byteCount;
    }

    // ------------------------------------------------------------------------
    /**
     * The initial parse will first parse only the trailer, the xrefstart and
     * all xref tables to have a pointer (offset) to all the pdf's objects.
     * It can handle linearized pdfs, which will have an xref at the
     * end pointing to an xref at the beginning of the file.
     * If the trailer has an ENCRYPT entry the security handler is prepared for
     * decryption. Last the root object is parsed and, unless minimal catalog
     * parsing is enabled, all objects reachable from the catalog are resolved.
     *
     * @throws IOException if the xref chain, the trailer or the root object cannot
     *                     be read, or if the security handler cannot be created
     */
    private void initialParse() throws IOException {
        final long startxrefOff = getStartxrefOffset();

        // ---- parse startxref
        setPdfSource(startxrefOff);
        parseStartXref();

        final long xrefOffset = document.getStartXref();
        long prev = xrefOffset;

        // ---- parse whole chain of xref tables/object streams using PREV reference
        while (prev > -1) {
            // seek to xref table
            setPdfSource(prev);

            // -- parse xref
            if (pdfSource.peek() == 'x') {
                // xref table and trailer
                // use existing parser to parse xref table
                parseXrefTable(prev);

                // parse the last trailer.
                if (!parseTrailer()) {
                    throw new IOException("Expected trailer object at position: " + pdfSource.getOffset());
                }
                COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer();
                prev = trailer.getInt(COSName.PREV);
            } else {
                // xref stream
                prev = parseXrefObjStream(prev);
            }
        }

        // ---- build valid xrefs out of the xref chain
        xrefTrailerResolver.setStartxref(xrefOffset);
        document.setTrailer(xrefTrailerResolver.getTrailer());

        // ---- prepare encryption if necessary
        COSBase trailerEncryptItem = document.getTrailer().getItem(COSName.ENCRYPT);
        if (trailerEncryptItem != null) {
            if (trailerEncryptItem instanceof COSObject) {
                COSObject trailerEncryptObj = (COSObject) trailerEncryptItem;
                parseObjectDynamically(trailerEncryptObj, true);
            }

            try {
                PDEncryptionDictionary encParameters = new PDEncryptionDictionary(
                        document.getEncryptionDictionary());

                DecryptionMaterial decryptionMaterial = null;
                if (keyStoreFilename != null) {
                    KeyStore ks = KeyStore.getInstance("PKCS12");
                    // the stream is ours to release once the key store has been loaded
                    FileInputStream keyStoreIn = new FileInputStream(keyStoreFilename);
                    try {
                        ks.load(keyStoreIn, password.toCharArray());
                    } finally {
                        try {
                            keyStoreIn.close();
                        } catch (IOException ignored) {
                            // best effort: a failing close must not hide the load result
                        }
                    }

                    decryptionMaterial = new PublicKeyDecryptionMaterial(ks, alias, password);
                } else {
                    decryptionMaterial = new StandardDecryptionMaterial(password);
                }

                securityHandler = SecurityHandlersManager.getInstance()
                        .getSecurityHandler(encParameters.getFilter());
                securityHandler.prepareForDecryption(encParameters, document.getDocumentID(), decryptionMaterial);

                AccessPermission permission = securityHandler.getCurrentAccessPermission();
                if (!permission.canExtractContent()) {
                    LOG.warn("PDF file '" + pdfFile.getPath() + "' does not allow extracting content.");
                }

            } catch (Exception e) {
                // keep the original exception as cause; initCause is used instead of the
                // two-argument IOException constructor to stay compatible with Java 1.5
                IOException wrapped = new IOException("Error (" + e.getClass().getSimpleName()
                        + ") while creating security handler for decryption: "
                        + e.getMessage());
                wrapped.initCause(e);
                throw wrapped;
            }
        }

        // ---- parse catalog or root object
        COSObject root = (COSObject) xrefTrailerResolver.getTrailer().getItem(COSName.ROOT);

        if (root == null) {
            throw new IOException("Missing root object specification in trailer.");
        }

        parseObjectDynamically(root, false);

        // ---- resolve all objects (including pages)
        if (!parseMinimalCatalog) {
            COSObject catalogObj = document.getCatalog();
            if (catalogObj != null) {
                if (catalogObj.getObject() instanceof COSDictionary) {
                    parseDictObjects((COSDictionary) catalogObj.getObject(), (COSName[]) null);
                    allPagesParsed = true;
                    document.setDecrypted();
                }
            }
        }
        initialParseDone = true;
    }

    // ------------------------------------------------------------------------
    /**
     * Parses an xref object stream; the source must be positioned at the head
     * of the indirect object holding the stream.
     *
     * @param objByteOffset file offset of the xref stream object
     *
     * @return value of the PREV item of the stream dictionary as returned by
     *         <code>getLong</code>; the caller stops following the chain on values below 0
     *
     * @throws IOException if the object head, dictionary or stream cannot be parsed
     */
    private long parseXrefObjStream(long objByteOffset) throws IOException {
        // ---- indirect object head: two integers followed by the 'obj' keyword
        readInt();
        readInt();
        readPattern(OBJ_MARKER);

        // ---- dictionary and stream body
        final COSDictionary streamDict = parseCOSDictionary();
        final COSStream stream = parseCOSStream(streamDict, getDocument().getScratchFile());
        parseXrefStream(stream, (int) objByteOffset);

        final long prevOffset = streamDict.getLong(COSName.PREV);
        return prevOffset;
    }

    // ------------------------------------------------------------------------
    /**
     * Get current offset in file at which next byte would be read.
     *
     * @return the offset reported by {@link #pdfSource}
     */
    private final long getPdfSourceOffset() {
        return pdfSource.getOffset();
    }

    /**
     * Sets {@link #pdfSource} to start next parsing at given file offset.
     *
     * @param fileOffset absolute offset within the file to seek to
     *
     * @throws IOException if seeking fails
     */
    private final void setPdfSource(long fileOffset) throws IOException {

        pdfSource.seek(fileOffset);

        // alternative using 'old fashioned' input stream (kept for reference only)
        //        if ( pdfSource != null )
        //            pdfSource.close();
        //        
        //        pdfSource = new PushBackInputStream(
        //                            new BufferedInputStream(
        //                                new FileInputStream( file ), 16384),  4096);
        //        pdfSource.skip( _fileOffset );
    }

    /**
     * Enable handling of alternative pdfSource implementation.
     * Currently a no-op: the body is commented out because the seekable
     * source used by {@link #setPdfSource(long)} is not re-opened per seek.
     *
     * @throws IOException declared for the alternative implementation; never thrown at present
     */
    private final void releasePdfSourceInputStream() throws IOException {
        //        if ( pdfSource != null )
        //            pdfSource.close();
    }

    /**
     * Closes {@link #pdfSource} if one is set.
     *
     * @throws IOException if closing the source fails
     */
    private final void closeFileStream() throws IOException {
        if (pdfSource == null) {
            return;
        }
        pdfSource.close();
    }

    // ------------------------------------------------------------------------
    /**
     * Looks for the file offset of <code>startxref</code>. We first look for the last
     * '%%EOF' marker within the last {@link #DEFAULT_TRAIL_BYTECOUNT} bytes (or the range
     * set via {@link #setEOFLookupRange(int)}) and go back from there to find
     * <code>startxref</code>.
     *
     * @return absolute file offset of the last 'startxref' marker preceding the last EOF marker
     *
     * @throws IOException if the trailing bytes cannot be read or one of the markers is missing
     */
    private final long getStartxrefOffset() throws IOException {
        // ---- read trailing bytes into buffer
        final long fileLen = pdfFile.length();
        final int trailByteCount = (fileLen < readTrailBytes) ? (int) fileLen : readTrailBytes;
        final long skipBytes = fileLen - trailByteCount;
        final byte[] buf = new byte[trailByteCount];

        FileInputStream fIn = null;
        try {
            fIn = new FileInputStream(pdfFile);

            // skip() may skip fewer bytes than requested, so repeat until the
            // trailing section is reached; otherwise the wrong bytes would be searched
            long toSkip = skipBytes;
            while (toSkip > 0) {
                final long skipped = fIn.skip(toSkip);
                if (skipped < 1) {
                    throw new IOException("Unable to skip to trailing bytes of file; still to skip: " + toSkip);
                }
                toSkip -= skipped;
            }

            int off = 0;
            while (off < trailByteCount) {
                final int readBytes = fIn.read(buf, off, trailByteCount - off);
                // in order to not get stuck in a loop we check readBytes (this should never happen)
                if (readBytes < 1) {
                    throw new IOException(
                            "No more bytes to read for trailing buffer, but expected: " + (trailByteCount - off));
                }
                off += readBytes;
            }
        } finally {
            if (fIn != null) {
                try {
                    fIn.close();
                } catch (IOException ignored) {
                    // stream was only read from; a failing close cannot invalidate the buffer
                }
            }
        }

        // ---- find last '%%EOF'
        int bufOff = lastIndexOf(EOF_MARKER, buf, buf.length);

        if (bufOff < 0) {
            throw new IOException("Missing end of file marker '" + (new String(EOF_MARKER)) + "'");
        }
        // ---- find last startxref preceding EOF marker
        bufOff = lastIndexOf(STARTXREF_MARKER, buf, bufOff);

        if (bufOff < 0) {
            throw new IOException("Missing 'startxref' marker.");
        }
        return skipBytes + bufOff;
    }

    // ------------------------------------------------------------------------
    /**
     * Searches the last appearance of pattern within buffer. Only matches lying
     * completely before <code>endOff</code> are considered; the search goes back until 0.
     *
     * <p>Every candidate start position is compared against the whole pattern, so a
     * match is also found when the byte following a failed partial match is itself
     * the last pattern character (e.g. "ef" in "eff").</p>
     *
     * @param pattern  pattern to search for
     * @param buf      buffer to search pattern in
     * @param endOff   offset (exclusive) where lookup starts at
     *
     * @return  start offset of pattern within buffer or <code>-1</code> if pattern could not be found
     */
    private final int lastIndexOf(final char[] pattern, final byte[] buf, final int endOff) {
        final int patLen = pattern.length;

        // try candidate start positions from the right-most possible one downwards
        for (int start = endOff - patLen; start >= 0; start--) {
            int matched = 0;
            while ((matched < patLen) && (buf[start + matched] == pattern[matched])) {
                matched++;
            }
            if (matched == patLen) {
                // whole pattern matched
                return start;
            }
        }

        return -1;
    }

    // ------------------------------------------------------------------------
    /**
     * Reads given pattern from {@link #pdfSource}, skipping whitespace at start and end.
     *
     * @param pattern the characters expected next in the source
     *
     * @throws IOException if pattern could not be read
     */
    private final void readPattern(final char[] pattern) throws IOException {
        skipSpaces();

        for (char c : pattern) {
            if (pdfSource.read() != c) {
                // closing quote after the pattern was missing in the message
                throw new IOException(
                        "Expected pattern '" + new String(pattern) + "' but missed at character '" + c + "'");
            }
        }

        skipSpaces();
    }

    // ------------------------------------------------------------------------
    /** Cached PAGES dictionary; resolved on first call of {@link #getPagesObject()}. */
    private COSDictionary pagesDictionary = null;

    /**
     * Returns the PAGES {@link COSDictionary} referenced by the document catalog.
     * The dictionary is parsed on first use and cached afterwards.
     *
     * @return the PAGES dictionary
     *
     * @throws IOException if the catalog has no PAGES entry or the entry
     *                     does not resolve to a dictionary
     */
    private COSDictionary getPagesObject() throws IOException {
        if (pagesDictionary == null) {
            final COSObject pagesRef = (COSObject) document.getCatalog().getItem(COSName.PAGES);
            if (pagesRef == null) {
                throw new IOException("Missing PAGES entry in document catalog.");
            }

            final COSBase resolved = parseObjectDynamically(pagesRef, false);
            if (resolved instanceof COSDictionary) {
                pagesDictionary = (COSDictionary) resolved;
            } else {
                throw new IOException("PAGES not a dictionary object, but: " + resolved.getClass().getSimpleName());
            }
        }

        return pagesDictionary;
    }

    // ------------------------------------------------------------------------
    /**
     * {@inheritDoc}
     *
     * <p>Runs the initial parse if it was not done yet, parses all objects needed by
     * the pages and closes the input stream. If parsing fails the document is
     * closed as well.</p>
     */
    @Override
    public void parse() throws IOException {
        boolean completed = false; // flipped once everything was processed

        try {
            if (!initialParseDone) {
                initialParse();
            }

            final int pageCount = getPageNumber();

            if (!allPagesParsed) {
                int pageIdx = 0;
                while (pageIdx < pageCount) {
                    getPage(pageIdx);
                    pageIdx++;
                }
                allPagesParsed = true;
                document.setDecrypted();
            }

            completed = true;
        } finally {
            try {
                closeFileStream();
            } catch (IOException ioe) {
                // failure on close is deliberately not reported
            }

            if (!completed && (document != null)) {
                try {
                    document.close();
                } catch (IOException ioe) {
                    // the exception that aborted parsing takes precedence
                }
            }
        }
    }

    // ------------------------------------------------------------------------
    /**
     * Returns security handler of the document or <code>null</code> if document
     * is not encrypted or {@link #parse()} wasn't called before.
     * The handler is created during the initial parse when the trailer
     * contains an ENCRYPT entry.
     *
     * @return the security handler.
     */
    public SecurityHandler getSecurityHandler() {
        return securityHandler;
    }

    // ------------------------------------------------------------------------
    /**
     * This will get the PD document that was parsed.  When you are done with
     * this document you must call close() on it to release resources.
     *
     * The super method is overridden so that the security handler created by
     * this parser (if any) is set on the returned document.
     *
     * @return The document at the PD layer.
     *
     * @throws IOException If there is an error getting the document.
     */
    @Override
    public PDDocument getPDDocument() throws IOException {
        final PDDocument result = super.getPDDocument();
        if (securityHandler == null) {
            return result;
        }
        result.setSecurityHandler(securityHandler);
        return result;
    }

    // ------------------------------------------------------------------------
    /**
     * Returns the number of pages in a document as given by the COUNT entry
     * of the PAGES dictionary.
     *
     * @return the number of pages.
     *
     * @throws IOException  if PAGES or other needed object is missing or
     *                      the COUNT value is negative
     */
    public int getPageNumber() throws IOException {
        final int count = getPagesObject().getInt(COSName.COUNT);

        if (count >= 0) {
            return count;
        }
        throw new IOException("No page number specified.");
    }

    // ------------------------------------------------------------------------
    /**
     * Returns the page requested with all the objects loaded into it.
     *
     * <p>If minimal catalog parsing is active and not all pages were parsed yet,
     * the objects referenced from the page's own RESOURCES dictionary are parsed here.</p>
     *
     * @param pageNr starts from 0 to the number of pages.
     * @return the page with the given pagenumber.
     * @throws IOException If something went wrong.
     */
    public PDPage getPage(int pageNr) throws IOException {
        getPagesObject();

        // ---- get list of top level pages
        COSArray kids = (COSArray) pagesDictionary.getDictionaryObject(COSName.KIDS);

        if (kids == null) {
            throw new IOException("Missing 'Kids' entry in pages dictionary.");
        }

        // ---- get page we are looking for (possibly going recursively into subpages)
        COSObject pageObj = getPageObject(pageNr, kids, 0);

        if (pageObj == null) {
            throw new IOException("Page " + pageNr + " not found.");
        }

        // ---- parse all objects necessary to load page.
        COSDictionary pageDict = (COSDictionary) pageObj.getObject();

        if (parseMinimalCatalog && (!allPagesParsed)) {
            // parse page resources since we did not do this on start
            COSDictionary resDict = (COSDictionary) pageDict.getDictionaryObject(COSName.RESOURCES);
            // a page may have no RESOURCES entry of its own (the entry is inheritable from
            // an ancestor page tree node); parseDictObjects would dereference null then
            // NOTE(review): inherited resources are not parsed here - confirm whether
            // they need to be resolved via the parent chain
            if (resDict != null) {
                parseDictObjects(resDict);
            }
        }

        return new PDPage(pageDict);
    }

    /**
     * Returns the object for a specific page.
     * The page tree is walked via the KIDS arrays, whose entries are COSObjects that
     * are parsed dynamically when not yet loaded. Branches whose COUNT shows that
     * the requested page lies behind them are skipped without being descended into,
     * so only the minimum of objects is parsed.
     *
     * @param num  the requested page number; numbering starts with 0
     * @param startKids Kids array to start with looking up page number
     * @param startPageCount number of pages already passed before <code>startKids</code>
     *
     * @return  page object or <code>null</code> if no such page exists
     *
     * @throws IOException if a kid object cannot be parsed
     */
    private COSObject getPageObject(int num, COSArray startKids, int startPageCount) throws IOException {
        int pagesSeen = startPageCount;

        for (Iterator<COSBase> kidIt = startKids.iterator(); kidIt.hasNext();) {
            final COSObject kidRef = (COSObject) kidIt.next();

            // load the kid on demand
            COSBase kidBase = kidRef.getObject();
            if (kidBase == null) {
                kidBase = parseObjectDynamically(kidRef, false);
                kidRef.setObject(kidBase);
            }

            final COSDictionary kidDict = (COSDictionary) kidBase;
            final int subtreeCount = kidDict.getInt(COSName.COUNT);
            if ((subtreeCount >= 0) && ((pagesSeen + subtreeCount) <= num)) {
                // requested page comes after this branch
                pagesSeen += subtreeCount;
                continue;
            }

            final COSArray grandKids = (COSArray) kidDict.getDictionaryObject(COSName.KIDS);
            if (grandKids == null) {
                // leaf: either the page we want or one more page passed
                if (pagesSeen == num) {
                    return kidRef;
                }
                pagesSeen++;
            } else {
                // intermediate node: descend
                final COSObject found = getPageObject(num, grandKids, pagesSeen);
                if (found != null) {
                    return found;
                }
            }
        }
        return null;
    }

    /**
     * Creates a unique object id: the object number is placed in the upper 32 bits
     * and combined with the generation number
     * (requires object number &lt; 2^31).
     *
     * @param obj object to build the id for
     *
     * @return the combined id
     */
    private final long getObjectId(final COSObject obj) {
        final long number = obj.getObjectNumber().longValue();
        final long generation = obj.getGenerationNumber().longValue();
        return (number << 32) | generation;
    }

    /**
     * Adds every element of newObjects to toBeParsedList unless it is a COSObject
     * that was added before (tracked via addedObjects).
     *
     * @param toBeParsedList queue receiving the objects
     * @param newObjects     candidates to add
     * @param addedObjects   ids of COSObjects already added; updated by this call
     */
    private final void addNewToList(final Queue<COSBase> toBeParsedList, final Collection<COSBase> newObjects,
            final Set<Long> addedObjects) {
        for (COSBase candidate : newObjects) {
            // per-element rule is the one of the single-object variant
            addNewToList(toBeParsedList, candidate, addedObjects);
        }
    }

    /**
     * Adds newObject to toBeParsedList unless it is a COSObject that was
     * added before (tracked via addedObjects).
     *
     * @param toBeParsedList queue receiving the object
     * @param newObject      candidate to add
     * @param addedObjects   ids of COSObjects already added; updated by this call
     */
    private final void addNewToList(final Queue<COSBase> toBeParsedList, final COSBase newObject,
            final Set<Long> addedObjects) {
        // Set.add returns false when the id is already known
        final boolean alreadyAdded = (newObject instanceof COSObject)
                && !addedObjects.add(getObjectId((COSObject) newObject));
        if (!alreadyAdded) {
            toBeParsedList.add(newObject);
        }
    }

    /**
     * Will parse every object reachable from the given dictionary, e.g. everything
     * necessary to load a single page from the pdf document.
     * We try our best to order objects according to offset in file before reading
     * to minimize seek operations.
     *
     * @param dict the dictionary whose (transitively) referenced objects are parsed
     * @param excludeObjects dictionary object reference entries with these names will not be parsed
     *
     * @throws IOException if an object cannot be parsed or a compressed object
     *                     refers to an invalid object stream
     */
    private void parseDictObjects(COSDictionary dict, COSName... excludeObjects) throws IOException {
        // ---- create queue for objects waiting for further parsing
        final Queue<COSBase> toBeParsedList = new LinkedList<COSBase>();
        // offset ordered object map
        final TreeMap<Long, List<COSObject>> objToBeParsed = new TreeMap<Long, List<COSObject>>();
        // in case of compressed objects offset points to stmObj
        final Set<Long> parsedObjects = new HashSet<Long>();
        final Set<Long> addedObjects = new HashSet<Long>();

        // ---- add objects not to be parsed to list of already parsed objects
        if (excludeObjects != null) {
            for (COSName objName : excludeObjects) {
                COSBase baseObj = dict.getItem(objName);
                if (baseObj instanceof COSObject) {
                    parsedObjects.add(getObjectId((COSObject) baseObj));
                }
            }
        }

        addNewToList(toBeParsedList, dict.getValues(), addedObjects);

        // ---- go through objects to be parsed
        while (!(toBeParsedList.isEmpty() && objToBeParsed.isEmpty())) {
            // -- first get all COSObject from other kind of objects and
            //    put them in objToBeParsed; afterwards toBeParsedList is empty
            COSBase baseObj;
            while ((baseObj = toBeParsedList.poll()) != null) {
                if (baseObj instanceof COSStream) {
                    addNewToList(toBeParsedList, ((COSStream) baseObj).getValues(), addedObjects);
                } else if (baseObj instanceof COSDictionary) {
                    addNewToList(toBeParsedList, ((COSDictionary) baseObj).getValues(), addedObjects);
                } else if (baseObj instanceof COSArray) {
                    final Iterator<COSBase> arrIter = ((COSArray) baseObj).iterator();
                    while (arrIter.hasNext()) {
                        addNewToList(toBeParsedList, arrIter.next(), addedObjects);
                    }
                } else if (baseObj instanceof COSObject) {
                    COSObject obj = (COSObject) baseObj;
                    long objId = getObjectId(obj);
                    COSObjectKey objKey = new COSObjectKey(obj.getObjectNumber().intValue(),
                            obj.getGenerationNumber().intValue());

                    if (!(parsedObjects.contains(objId) /*|| document.hasObjectInPool( objKey ) */ )) {
                        Long fileOffset = xrefTrailerResolver.getXrefTable().get(objKey);
                        //  it is allowed that object references point to null, thus we have to test
                        if (fileOffset != null) {
                            if (fileOffset <= 0) {
                                // negative offset means we have a compressed object within object stream;
                                // get offset of object stream
                                fileOffset = xrefTrailerResolver.getXrefTable()
                                        .get(new COSObjectKey(-fileOffset, 0));
                                if ((fileOffset == null) || (fileOffset <= 0)) {
                                    throw new IOException(
                                            "Invalid object stream xref object reference: " + fileOffset);
                                }
                            }

                            // always collect into a mutable list per offset: a directly referenced
                            // object and compressed objects of an object stream may end up under the
                            // same offset, so neither may replace the other nor may the list be immutable
                            List<COSObject> pendingAtOffset = objToBeParsed.get(fileOffset);
                            if (pendingAtOffset == null) {
                                pendingAtOffset = new ArrayList<COSObject>();
                                objToBeParsed.put(fileOffset, pendingAtOffset);
                            }
                            pendingAtOffset.add(obj);
                        } else {
                            // NULL object
                            COSObject pdfObject = document.getObjectFromPool(objKey);
                            pdfObject.setObject(COSNull.NULL);
                        }
                    }
                }
            }

            // ---- read first COSObject with smallest offset;
            //      resulting object will be added to toBeParsedList
            if (objToBeParsed.isEmpty()) {
                break;
            }

            for (COSObject obj : objToBeParsed.remove(objToBeParsed.firstKey())) {
                COSBase parsedObj = parseObjectDynamically(obj, false);

                obj.setObject(parsedObj);
                addNewToList(toBeParsedList, parsedObj, addedObjects);

                parsedObjects.add(getObjectId(obj));
            }
        }
    }

    /**
     * Convenience overload: parses the indirect object identified by the
     * object number and generation number of the given {@link COSObject}.
     * Only these two numbers are taken from <code>obj</code>; all work is
     * delegated to {@link #parseObjectDynamically(int, int, boolean)}.
     *
     * @param  obj object to be parsed (used only for its object number and generation number)
     * @param  requireExistingNotCompressedObj  if <code>true</code> object to be parsed must
     *          not be contained within compressed stream
     * @return  the parsed object (which is also added to document object)
     *
     * @throws IOException If an IO error occurs.
     */
    private COSBase parseObjectDynamically(COSObject obj, boolean requireExistingNotCompressedObj)
            throws IOException {
        final int objectNumber = obj.getObjectNumber().intValue();
        final int generationNumber = obj.getGenerationNumber().intValue();
        return parseObjectDynamically(objectNumber, generationNumber, requireExistingNotCompressedObj);
    }

    /**
     * Parses the indirect object with the given object and generation number
     * and stores the result in the matching {@link COSObject} container taken
     * from the document's object pool.
     * This is taken from {@link PDFParser} and reduced to parsing
     * an indirect object.
     *
     * <p>If the pooled container already holds an object nothing is parsed.
     * Otherwise the xref table entry decides what happens:</p>
     * <ul>
     *   <li>no entry: the object is set to {@link COSNull#NULL};</li>
     *   <li>positive value: it is used as file offset; the object header is read
     *       and checked against the requested key, the object (and a following
     *       stream, if any) is parsed and, if a security handler is set, decrypted;</li>
     *   <li>other value: its negation is used as object number of the object stream
     *       containing the object; that object stream is parsed and all objects the
     *       xref data lists for it are registered in the pool.</li>
     * </ul>
     *
     * @param  objNr object number of object to be parsed
     * @param  objGenNr object generation number of object to be parsed
     * @param requireExistingNotCompressedObj  if <code>true</code> the object to be parsed must be defined
     *                                          in xref (comment: null objects may be missing from xref) and
     *                                          it must not be a compressed object within object stream
     *                                          (this is used to circumvent being stuck in a loop in a malicious PDF)
     *
     * @return  the content of the pooled object after parsing; this may still be <code>null</code>
     *          for a compressed object if the referenced object stream is not a stream or
     *          did not provide the object
     *
     * @throws IOException If an IO error occurs, the xref entry points to a different object,
     *                     the object is malformed or decryption fails (in the latter case the
     *                     {@link CryptographyException} is attached as cause).
     */
    private COSBase parseObjectDynamically(int objNr, int objGenNr, boolean requireExistingNotCompressedObj)
            throws IOException {
        // ---- create object key and get object (container) from pool
        final COSObjectKey objKey = new COSObjectKey(objNr, objGenNr);
        final COSObject pdfObject = document.getObjectFromPool(objKey);

        if (pdfObject.getObject() == null) {
            // not previously parsed
            // ---- read offset or object stream object number from xref table
            Long offsetOrObjstmObNr = xrefTrailerResolver.getXrefTable().get(objKey);

            // sanity test to circumvent loops with broken documents
            if (requireExistingNotCompressedObj && ((offsetOrObjstmObNr == null) || (offsetOrObjstmObNr <= 0))) {
                throw new IOException("Object must be defined and must not be compressed object: "
                        + objKey.getNumber() + ":" + objKey.getGeneration());
            }

            if (offsetOrObjstmObNr == null) {
                // not defined object -> NULL object (Spec. 1.7, chap. 3.2.9)
                pdfObject.setObject(COSNull.NULL);
            } else if (offsetOrObjstmObNr > 0) {
                // offset of indirect object in file
                // ---- go to object start
                setPdfSource(offsetOrObjstmObNr);

                // ---- we must have an indirect object
                final int readObjNr = readInt();
                final int readObjGen = readInt();
                readPattern(OBJ_MARKER);

                // ---- consistency check
                if ((readObjNr != objKey.getNumber()) || (readObjGen != objKey.getGeneration())) {
                    throw new IOException("XREF for " + objKey.getNumber() + ":" + objKey.getGeneration()
                            + " points to wrong object: " + readObjNr + ":" + readObjGen);
                }

                skipSpaces();
                COSBase pb = parseDirObject();
                String endObjectKey = readString();

                if (endObjectKey.equals("stream")) {
                    // push the keyword back so that parseCOSStream can read it itself
                    pdfSource.unread(endObjectKey.getBytes("ISO-8859-1"));
                    pdfSource.unread(' ');
                    if (pb instanceof COSDictionary) {
                        COSStream stream = parseCOSStream((COSDictionary) pb, getDocument().getScratchFile());

                        if (securityHandler != null) {
                            try {
                                securityHandler.decryptStream(stream, objNr, objGenNr);
                            } catch (CryptographyException ce) {
                                // IOException(String, Throwable) needs Java 1.6; initCause keeps the
                                // original failure attached without requiring that constructor
                                final IOException ioe = new IOException(
                                        "Error decrypting stream object " + objNr + ": " + ce.getMessage());
                                ioe.initCause(ce);
                                throw ioe;
                            }
                        }
                        pb = stream;
                    } else {
                        // this is not legal
                        // the combination of a dict and the stream/endstream forms a complete stream object
                        throw new IOException(
                                "Stream not preceded by dictionary (offset: " + offsetOrObjstmObNr + ").");
                    }
                    skipSpaces();
                    endObjectKey = readLine();

                    // we have case with a second 'endstream' before endobj
                    if (!endObjectKey.startsWith("endobj")) {
                        if (endObjectKey.startsWith("endstream")) {
                            // drop the extra keyword (9 == "endstream".length()) and look at the rest of the line
                            endObjectKey = endObjectKey.substring(9).trim();
                            if (endObjectKey.length() == 0) {
                                // no other characters in extra endstream line
                                endObjectKey = readLine(); // read next line
                            }
                        }
                    }
                } else if (securityHandler != null) {
                    // decrypt; only strings which are the object itself or direct
                    // entries of a dictionary/array are handled here
                    if (pb instanceof COSString) {
                        decrypt((COSString) pb, objNr, objGenNr);
                    } else if (pb instanceof COSDictionary) {
                        for (Entry<COSName, COSBase> entry : ((COSDictionary) pb).entrySet()) {
                            // TODO: specially handle 'Contents' entry of signature dictionary like in SecurityHandler#decryptDictionary
                            if (entry.getValue() instanceof COSString) {
                                decrypt((COSString) entry.getValue(), objNr, objGenNr);
                            }
                        }
                    } else if (pb instanceof COSArray) {
                        final COSArray array = (COSArray) pb;
                        for (int aIdx = 0, len = array.size(); aIdx < len; aIdx++) {
                            if (array.get(aIdx) instanceof COSString) {
                                decrypt((COSString) array.get(aIdx), objNr, objGenNr);
                            }
                        }
                    }
                }

                // the parsed content is stored before the 'endobj' check below
                pdfObject.setObject(pb);

                if (!endObjectKey.startsWith("endobj")) {
                    throw new IOException("Object (" + readObjNr + ":" + readObjGen + ") at offset "
                            + offsetOrObjstmObNr + " does not end with 'endobj'.");
                }

                releasePdfSourceInputStream();

            } else {
                // xref value is object nr of object stream containing object to be parsed;
                // since our object was not found it means object stream was not parsed so far
                final int objstmObjNr = (int) (-offsetOrObjstmObNr);
                // the object stream itself must exist and must not be compressed (loop protection)
                final COSBase objstmBaseObj = parseObjectDynamically(objstmObjNr, 0, true);
                if (objstmBaseObj instanceof COSStream) {
                    // parse object stream
                    PDFObjectStreamParser parser = new PDFObjectStreamParser((COSStream) objstmBaseObj, document,
                            forceParsing);
                    parser.parse();

                    // get set of object numbers referenced for this object stream
                    final Set<Long> refObjNrs = xrefTrailerResolver.getContainedObjectNumbers(objstmObjNr);

                    // register all objects which are referenced to be contained in object stream
                    for (COSObject next : parser.getObjects()) {
                        COSObjectKey stmObjKey = new COSObjectKey(next);
                        if (refObjNrs.contains(stmObjKey.getNumber())) {
                            COSObject stmObj = document.getObjectFromPool(stmObjKey);
                            stmObj.setObject(next.getObject());
                        }
                    }
                }
            }
        }
        return pdfObject.getObject();
    }

    // ------------------------------------------------------------------------
    /**
     * Decrypts given COSString by handing it to the security handler.
     *
     * @param str  string to be decrypted
     * @param objNr  object number passed on to the security handler
     * @param objGenNr  object generation number passed on to the security handler
     *
     * @throws IOException if the security handler throws a {@link CryptographyException};
     *                     that exception is attached as cause
     */
    private final void decrypt(COSString str, long objNr, long objGenNr) throws IOException {
        try {
            securityHandler.decryptString(str, objNr, objGenNr);
        } catch (CryptographyException ce) {
            // IOException(String, Throwable) needs Java 1.6; initCause keeps the
            // original failure attached without requiring that constructor
            final IOException ioe = new IOException("Error decrypting string: " + ce.getMessage());
            ioe.initCause(ce);
            throw ioe;
        }
    }

    // ------------------------------------------------------------------------
    /**
     * Set to <code>true</code> while {@link #getLength(COSBase)} is resolving a length;
     * a call to that method while the flag is set is rejected as a loop.
     */
    private boolean inGetLength = false;

    /**
     * Returns length value referred to or defined in given object.
     * A {@link COSNumber} is returned as is; a {@link COSObject} reference is
     * resolved (parsing the referenced object first if it was not read so far,
     * keeping the current source position) and must contain a {@link COSNumber}.
     *
     * @param lengthBaseObj  length entry of a stream dictionary; may be <code>null</code>
     *
     * @return the length or <code>null</code> if <code>lengthBaseObj</code> is <code>null</code>
     *
     * @throws IOException if this method is entered again while a length is being resolved,
     *                     if the given or the referenced object has the wrong type or
     *                     if the referenced object could not be read
     */
    private COSNumber getLength(final COSBase lengthBaseObj) throws IOException {
        if (lengthBaseObj == null) {
            return null;
        }

        if (inGetLength) {
            throw new IOException("Loop while reading length from " + lengthBaseObj);
        }

        inGetLength = true;
        try {
            // ---- length given directly
            if (lengthBaseObj instanceof COSNumber) {
                return (COSNumber) lengthBaseObj;
            }

            // ---- anything else has to be a reference to the length object
            if (!(lengthBaseObj instanceof COSObject)) {
                throw new IOException("Wrong type of length object: " + lengthBaseObj.getClass().getSimpleName());
            }

            final COSObject reference = (COSObject) lengthBaseObj;

            if (reference.getObject() == null) {
                // referenced object not read so far: remember where we are,
                // parse it and return to the remembered position afterwards
                final long savedOffset = getPdfSourceOffset();
                releasePdfSourceInputStream();

                parseObjectDynamically(reference, true);

                setPdfSource(savedOffset);

                if (reference.getObject() == null) {
                    throw new IOException("Length object content was not read.");
                }
            }

            final COSBase resolved = reference.getObject();
            if (resolved instanceof COSNumber) {
                return (COSNumber) resolved;
            }
            throw new IOException("Wrong type of referenced length object " + reference + ": "
                    + resolved.getClass().getSimpleName());
        } finally {
            // always leave the guard cleared, also on failure
            inGetLength = false;
        }
    }

    // ------------------------------------------------------------------------
    /** Size in bytes of {@link #streamCopyBuf}. */
    private final int streamCopyBufLen = 8192;
    /** Buffer through which parseCOSStream copies stream data chunk by chunk. */
    private final byte[] streamCopyBuf = new byte[streamCopyBufLen];

    /**
     * Reads a COSStream from the input using the length entry of the given
     * dictionary.
     * If the length entry is an indirect reference it is resolved first to get
     * the stream length. The stream data is then copied by length only, without
     * looking for 'endstream' or 'endobj', so these keywords may occur within
     * the data without causing problems.
     * After the data was copied the keyword 'endstream' is required.
     *
     * @param dic  dictionary that goes with this stream.
     * @param file  file to write the stream to when reading.
     *
     * @return parsed pdf stream.
     *
     * @throws IOException if an error occurred reading the stream, like problems
     *         with reading length attribute, stream does not end with 'endstream'
     *         after data read, stream too short etc.
     */
    @Override
    protected COSStream parseCOSStream(COSDictionary dic, RandomAccess file) throws IOException {
        final COSStream result = new COSStream(dic, file);
        OutputStream sink = null;
        try {
            // consume keyword 'stream'; the caller already checked that it is there
            readString();

            // ---- skip end-of-line marker in front of the data
            //      PDF Ref 1.7, chap. 3.2.7:
            //      'stream' should be followed by either a CRLF (0x0d 0x0a) or LF but nothing else.
            int next = pdfSource.read();

            // see brother_scan_cover.pdf: it has spaces between the keyword
            // and the start of the data, so read over them first
            while (next == 0x20) {
                next = pdfSource.read();
            }

            if (next == 0x0D) {
                // CR: look at the following byte, which should be the LF of a CRLF
                next = pdfSource.read();
            }
            if (next != 0x0A) {
                // either a lone CR (invalid according to the spec but seen in the
                // real world) or no end-of-line marker at all (spec says 'should',
                // so that is ok): the byte read belongs to the data
                pdfSource.unread(next);
            }

            // dic.getItem is needed here because while we are parsing
            // the underlying object might still be null
            final COSNumber length = getLength(dic.getItem(COSName.LENGTH));
            if (length == null) {
                throw new IOException("Missing length for stream.");
            }

            // ---- copy exactly 'length' bytes into the stream
            sink = result.createFilteredStream(length);

            long remaining = length.longValue();
            while (remaining > 0) {
                final int chunkSize = (int) Math.min(remaining, (long) streamCopyBufLen);
                final int count = pdfSource.read(streamCopyBuf, 0, chunkSize);
                if (count <= 0) {
                    throw new IOException("No more bytes from stream but expected: " + remaining);
                }
                sink.write(streamCopyBuf, 0, count);
                remaining -= count;
            }

            // ---- data has to be followed by 'endstream'
            final String trailer = readString();
            if (!trailer.equals("endstream")) {
                throw new IOException("Error reading stream using length value. Expected='endstream' actual='"
                        + trailer + "' ");
            }
        } finally {
            if (sink != null) {
                sink.close();
            }
        }
        return result;
    }
}