Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pdfbox.pdfparser; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.security.KeyStore; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Queue; import java.util.Set; import java.util.TreeMap; import java.util.Map.Entry; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.pdfbox.cos.COSArray; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSNull; import org.apache.pdfbox.cos.COSNumber; import org.apache.pdfbox.cos.COSObject; import org.apache.pdfbox.cos.COSStream; import org.apache.pdfbox.cos.COSString; import org.apache.pdfbox.exceptions.CryptographyException; import org.apache.pdfbox.io.PushBackInputStream; import org.apache.pdfbox.io.RandomAccess; import org.apache.pdfbox.io.RandomAccessBuffer; 
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.encryption.DecryptionMaterial; import org.apache.pdfbox.pdmodel.encryption.PDEncryptionDictionary; import org.apache.pdfbox.pdmodel.encryption.PublicKeyDecryptionMaterial; import org.apache.pdfbox.pdmodel.encryption.SecurityHandler; import org.apache.pdfbox.pdmodel.encryption.SecurityHandlersManager; import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial; import org.apache.pdfbox.persistence.util.COSObjectKey; /** * PDFParser which first reads startxref and xref tables in order to know valid * objects and parse only these objects. Thus it is closer to a conforming parser * than the sequential reading of {@link PDFParser}. * * This class can be used as a {@link PDFParser} replacement. First {@link #parse()} * must be called before page objects can be retrieved, e.g. {@link #getPDDocument()}. * * This class is a much enhanced version of <code>QuickParser</code> presented in * <a href="https://issues.apache.org/jira/browse/PDFBOX-1104">PDFBOX-1104</a> * by Jeremy Villalobos. 
*/ public class NonSequentialPDFParser extends PDFParser { public static final String SYSPROP_PARSEMINIMAL = "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.parseMinimal"; public static final String SYSPROP_EOFLOOKUPRANGE = "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.eofLookupRange"; private static final InputStream EMPTY_INPUT_STREAM = new ByteArrayInputStream(new byte[0]); private static final int DEFAULT_TRAIL_BYTECOUNT = 2048; private static final char[] EOF_MARKER = new char[] { '%', '%', 'E', 'O', 'F' }; private static final char[] STARTXREF_MARKER = new char[] { 's', 't', 'a', 'r', 't', 'x', 'r', 'e', 'f' }; private static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' }; private final File pdfFile; private final RandomAccessBufferedFileInputStream raStream; private SecurityHandler securityHandler = null; private String keyStoreFilename = null; private String alias = null; private String password = ""; private int readTrailBytes = DEFAULT_TRAIL_BYTECOUNT; // how many trailing bytes to read for EOF marker /** If <code>true</code> object references in catalog are not followed; * pro: page objects will be only parsed when needed; cons: some information of catalog * might not be available (e.g. outline). * Catalog parsing without pages is not an option since a number of entries will * also refer to page objects (like OpenAction). */ private boolean parseMinimalCatalog = "true".equals(System.getProperty(SYSPROP_PARSEMINIMAL)); private boolean initialParseDone = false; private boolean allPagesParsed = false; private static final Log LOG = LogFactory.getLog(NonSequentialPDFParser.class); // ------------------------------------------------------------------------ /** * Constructs parser for given file using memory buffer. * * @param filename the filename of the pdf to be parsed * * @throws IOException If something went wrong. 
*/ public NonSequentialPDFParser(String filename) throws IOException { this(new File(filename), null); } /** * Constructs parser for given file using given buffer for temporary storage. * * @param file the pdf to be parsed * @param raBuf the buffer to be used for parsing * * @throws IOException If something went wrong. */ /** * Constructs parser for given file using given buffer for temporary storage. * * @param file the pdf to be parsed * @param raBuf the buffer to be used for parsing * * @throws IOException If something went wrong. */ public NonSequentialPDFParser(File file, RandomAccess raBuf) throws IOException { this(file, raBuf, ""); } /** * Constructs parser for given file using given buffer for temporary storage. * * @param file the pdf to be parsed * @param raBuf the buffer to be used for parsing * * @throws IOException If something went wrong. */ /** * Constructs parser for given file using given buffer for temporary storage. * * @param file the pdf to be parsed * @param raBuf the buffer to be used for parsing * @param decryptionPassword password to be used for decryption * * @throws IOException If something went wrong. */ public NonSequentialPDFParser(File file, RandomAccess raBuf, String decryptionPassword) throws IOException { super(EMPTY_INPUT_STREAM, null, false); String eofLookupRangeStr = System.getProperty(SYSPROP_EOFLOOKUPRANGE); if (eofLookupRangeStr != null) { try { setEOFLookupRange(Integer.parseInt(eofLookupRangeStr)); } catch (NumberFormatException nfe) { LOG.warn("System property " + SYSPROP_EOFLOOKUPRANGE + " does not contain an integer value, but: '" + eofLookupRangeStr + "'"); } } pdfFile = file; raStream = new RandomAccessBufferedFileInputStream(pdfFile); setDocument( (raBuf == null) ? 
new COSDocument(new RandomAccessBuffer(), false) : new COSDocument(raBuf, false)); pdfSource = new PushBackInputStream(raStream, 4096); password = decryptionPassword; } // ------------------------------------------------------------------------ /** * Sets how many trailing bytes of PDF file are searched for * EOF marker and 'startxref' marker. * If not set we use default value {@link #DEFAULT_TRAIL_BYTECOUNT}. * * <p<We check that new value is at least 16. However for practical use * cases this value should not be lower than 1000; even 2000 * was found to not be enough in some cases where some trailing * garbage like HTML snippets followed the EOF marker.</p> * * <p>In case system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined * this value will be set on initialization but can be overwritten later.</p> * * @param byteCount number of trailing bytes */ public void setEOFLookupRange(int byteCount) { if (byteCount > 15) { readTrailBytes = byteCount; } } // ------------------------------------------------------------------------ /** * The initial parse will first parse only the trailer, the xrefstart and * all xref tables to have a pointer (offset) to all the pdf's objects. * It can handle linearized pdfs, which will have an xref at the * end pointing to an xref at the beginning of the file. * Last the root object is parsed. * * @throws IOException */ private void initialParse() throws IOException { final long startxrefOff = getStartxrefOffset(); // ---- parse startxref setPdfSource(startxrefOff); parseStartXref(); final long xrefOffset = document.getStartXref(); long prev = xrefOffset; // ---- parse whole chain of xref tables/object streams using PREV reference while (prev > -1) { // seek to xref table setPdfSource(prev); // -- parse xref if (pdfSource.peek() == 'x') { // xref table and trailer // use existing parser to parse xref table parseXrefTable(prev); // parse the last trailer. 
if (!parseTrailer()) { throw new IOException("Expected trailer object at position: " + pdfSource.getOffset()); } COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer(); prev = trailer.getInt(COSName.PREV); } else { // xref stream prev = parseXrefObjStream(prev); } } // ---- build valid xrefs out of the xref chain xrefTrailerResolver.setStartxref(xrefOffset); document.setTrailer(xrefTrailerResolver.getTrailer()); // ---- prepare encryption if necessary COSBase trailerEncryptItem = document.getTrailer().getItem(COSName.ENCRYPT); if (trailerEncryptItem != null) { if (trailerEncryptItem instanceof COSObject) { COSObject trailerEncryptObj = (COSObject) trailerEncryptItem; parseObjectDynamically(trailerEncryptObj, true); } try { PDEncryptionDictionary encParameters = new PDEncryptionDictionary( document.getEncryptionDictionary()); DecryptionMaterial decryptionMaterial = null; if (keyStoreFilename != null) { KeyStore ks = KeyStore.getInstance("PKCS12"); ks.load(new FileInputStream(keyStoreFilename), password.toCharArray()); decryptionMaterial = new PublicKeyDecryptionMaterial(ks, alias, password); } else { decryptionMaterial = new StandardDecryptionMaterial(password); } securityHandler = SecurityHandlersManager.getInstance() .getSecurityHandler(encParameters.getFilter()); securityHandler.prepareForDecryption(encParameters, document.getDocumentID(), decryptionMaterial); AccessPermission permission = securityHandler.getCurrentAccessPermission(); if (!permission.canExtractContent()) { LOG.warn("PDF file '" + pdfFile.getPath() + "' does not allow extracting content."); } } catch (Exception e) { throw new IOException("Error (" + e.getClass().getSimpleName() + ") while creating security handler for decryption: " + e.getMessage() /*, e // TODO: remove remark with Java 1.6 */); } } // ---- parse catalog or root object COSObject root = (COSObject) xrefTrailerResolver.getTrailer().getItem(COSName.ROOT); if (root == null) { throw new IOException("Missing root object 
specification in trailer."); } parseObjectDynamically(root, false); // ---- resolve all objects (including pages) if (!parseMinimalCatalog) { COSObject catalogObj = document.getCatalog(); if (catalogObj != null) { if (catalogObj.getObject() instanceof COSDictionary) { parseDictObjects((COSDictionary) catalogObj.getObject(), (COSName[]) null); allPagesParsed = true; document.setDecrypted(); } } } initialParseDone = true; } // ------------------------------------------------------------------------ /** Parses an xref object stream starting with indirect object id. * * @return value of PREV item in dictionary or <code>-1</code> if no such item exists */ private long parseXrefObjStream(long objByteOffset) throws IOException { // ---- parse indirect object head readInt(); readInt(); readPattern(OBJ_MARKER); COSDictionary dict = parseCOSDictionary(); COSStream xrefStream = parseCOSStream(dict, getDocument().getScratchFile()); parseXrefStream(xrefStream, (int) objByteOffset); return dict.getLong(COSName.PREV); } // ------------------------------------------------------------------------ /** Get current offset in file at which next byte would be read. */ private final long getPdfSourceOffset() { return pdfSource.getOffset(); } /** Sets {@link #pdfSource} to start next parsing at given file offset. */ private final void setPdfSource(long fileOffset) throws IOException { pdfSource.seek(fileOffset); // alternative using 'old fashioned' input stream // if ( pdfSource != null ) // pdfSource.close(); // // pdfSource = new PushBackInputStream( // new BufferedInputStream( // new FileInputStream( file ), 16384), 4096); // pdfSource.skip( _fileOffset ); } /** Enable handling of alternative pdfSource implementation. 
*/ private final void releasePdfSourceInputStream() throws IOException { // if ( pdfSource != null ) // pdfSource.close(); } private final void closeFileStream() throws IOException { if (pdfSource != null) { pdfSource.close(); } } // ------------------------------------------------------------------------ /** Looks for and parses startxref. We first look for last '%%EOF' marker * (within last {@link #DEFAULT_TRAIL_BYTECOUNT} bytes (or range set via * {@link #setEOFLookupRange(int)}) and go back to find <code>startxref</code>. */ private final long getStartxrefOffset() throws IOException { byte[] buf; long skipBytes; // ---- read trailing bytes into buffer final long fileLen = pdfFile.length(); FileInputStream fIn = null; try { fIn = new FileInputStream(pdfFile); final int trailByteCount = (fileLen < readTrailBytes) ? (int) fileLen : readTrailBytes; buf = new byte[trailByteCount]; fIn.skip(skipBytes = fileLen - trailByteCount); int off = 0; int readBytes; while (off < trailByteCount) { readBytes = fIn.read(buf, off, trailByteCount - off); // in order to not get stuck in a loop we check readBytes (this should never happen) if (readBytes < 1) { throw new IOException( "No more bytes to read for trailing buffer, but expected: " + (trailByteCount - off)); } off += readBytes; } } finally { if (fIn != null) { try { fIn.close(); } catch (IOException ioe) { } } } // ---- find last '%%EOF' int bufOff = lastIndexOf(EOF_MARKER, buf, buf.length); if (bufOff < 0) { throw new IOException("Missing end of file marker '" + (new String(EOF_MARKER)) + "'"); } // ---- find last startxref preceding EOF marker bufOff = lastIndexOf(STARTXREF_MARKER, buf, bufOff); if (bufOff < 0) { throw new IOException("Missing 'startxref' marker."); } return skipBytes + bufOff; } // ------------------------------------------------------------------------ /** Searches last appearance of pattern within buffer. Lookup before _lastOff * and goes back until 0. 
* * @param pattern pattern to search for * @param buf buffer to search pattern in * @param endOff offset (exclusive) where lookup starts at * * @return start offset of pattern within buffer or <code>-1</code> if pattern could not be found */ private final int lastIndexOf(final char[] pattern, final byte[] buf, final int endOff) { final int lastPatternChOff = pattern.length - 1; int bufOff = endOff; int patOff = lastPatternChOff; char lookupCh = pattern[patOff]; while (--bufOff >= 0) { if (buf[bufOff] == lookupCh) { if (--patOff < 0) { // whole pattern matched return bufOff; } // matched current char, advance to preceding one lookupCh = pattern[patOff]; } else if (patOff < lastPatternChOff) { // no char match but already matched some chars; reset lookupCh = pattern[patOff = lastPatternChOff]; } } return -1; } // ------------------------------------------------------------------------ /** Reads given pattern from {@link #pdfSource}. Skipping whitespace at start and end. * * @throws IOException if pattern could not be read */ private final void readPattern(final char[] pattern) throws IOException { skipSpaces(); for (char c : pattern) { if (pdfSource.read() != c) { throw new IOException( "Expected pattern '" + new String(pattern) + " but missed at character '" + c + "'"); } } skipSpaces(); } // ------------------------------------------------------------------------ private COSDictionary pagesDictionary = null; /** Returns PAGES {@link COSDictionary} object or throws {@link IOException} * if PAGES dictionary does not exist. 
*/ private COSDictionary getPagesObject() throws IOException { if (pagesDictionary != null) { return pagesDictionary; } COSObject pages = (COSObject) document.getCatalog().getItem(COSName.PAGES); if (pages == null) { throw new IOException("Missing PAGES entry in document catalog."); } COSBase object = parseObjectDynamically(pages, false); if (!(object instanceof COSDictionary)) { throw new IOException("PAGES not a dictionary object, but: " + object.getClass().getSimpleName()); } pagesDictionary = (COSDictionary) object; return pagesDictionary; } // ------------------------------------------------------------------------ /** Parses all objects needed by pages and closes input stream. */ /** * {@inheritDoc} */ @Override public void parse() throws IOException { boolean exceptionOccurred = true; // set to false if all is processed try { if (!initialParseDone) { initialParse(); } final int pageCount = getPageNumber(); if (!allPagesParsed) { for (int pNr = 0; pNr < pageCount; pNr++) { getPage(pNr); } allPagesParsed = true; document.setDecrypted(); } exceptionOccurred = false; } finally { try { closeFileStream(); } catch (IOException ioe) { } if (exceptionOccurred && (document != null)) { try { document.close(); } catch (IOException ioe) { } } } } // ------------------------------------------------------------------------ /** * Returns security handler of the document or <code>null</code> if document * is not encrypted or {@link #parse()} wasn't called before. * * @return the security handler. */ public SecurityHandler getSecurityHandler() { return securityHandler; } // ------------------------------------------------------------------------ /** * This will get the PD document that was parsed. When you are done with * this document you must call close() on it to release resources. * * Overwriting super method was necessary in order to set security handler. * * @return The document at the PD layer. * * @throws IOException If there is an error getting the document. 
*/ @Override public PDDocument getPDDocument() throws IOException { PDDocument pdDocument = super.getPDDocument(); if (securityHandler != null) pdDocument.setSecurityHandler(securityHandler); return pdDocument; } // ------------------------------------------------------------------------ /** * Returns the number of pages in a document. * * @return the number of pages. * * @throws IOException if PAGES or other needed object is missing */ public int getPageNumber() throws IOException { int pageCount = getPagesObject().getInt(COSName.COUNT); if (pageCount < 0) { throw new IOException("No page number specified."); } return pageCount; } // ------------------------------------------------------------------------ /** * Returns the page requested with all the objects loaded into it. * * @param pageNr starts from 0 to the number of pages. * @return the page with the given pagenumber. * @throws IOException If something went wrong. */ public PDPage getPage(int pageNr) throws IOException { getPagesObject(); // ---- get list of top level pages COSArray kids = (COSArray) pagesDictionary.getDictionaryObject(COSName.KIDS); if (kids == null) { throw new IOException("Missing 'Kids' entry in pages dictionary."); } // ---- get page we are looking for (possibly going recursively into subpages) COSObject pageObj = getPageObject(pageNr, kids, 0); if (pageObj == null) { throw new IOException("Page " + pageNr + " not found."); } // ---- parse all objects necessary to load page. COSDictionary pageDict = (COSDictionary) pageObj.getObject(); if (parseMinimalCatalog && (!allPagesParsed)) { // parse page resources since we did not do this on start COSDictionary resDict = (COSDictionary) pageDict.getDictionaryObject(COSName.RESOURCES); parseDictObjects(resDict); } return new PDPage(pageDict); } /** * Returns the object for a specific page. * The page tree is made up of kids. The kids have COSArray with COSObjects * inside of them. 
The COSObject can be parsed using the dynamic parsing method * We want to only parse the minimum COSObjects and still return a complete page. * ready to be used. * * @param num the requested page number; numbering starts with 0 * @param startKids Kids array to start with looking up page number * @param startPageCount * * @return page object or <code>null</code> if no such page exists * * @throws IOException */ private COSObject getPageObject(int num, COSArray startKids, int startPageCount) throws IOException { int curPageCount = startPageCount; Iterator<COSBase> kidsIter = startKids.iterator(); while (kidsIter.hasNext()) { COSObject obj = (COSObject) kidsIter.next(); COSBase base = obj.getObject(); if (base == null) { base = parseObjectDynamically(obj, false); obj.setObject(base); } COSDictionary dic = (COSDictionary) base; int count = dic.getInt(COSName.COUNT); if (count >= 0) { // skip this branch if requested page comes later if ((curPageCount + count) <= num) { curPageCount += count; continue; } } COSArray kids = (COSArray) dic.getDictionaryObject(COSName.KIDS); if (kids != null) { // recursively scan subpages COSObject ans = getPageObject(num, kids, curPageCount); // if ans is not null, we got what we were looking for if (ans != null) { return ans; } } else { // found page? if (curPageCount == num) { return obj; } // page has no kids and it is not the page we are looking for curPageCount++; } } return null; } /** Creates a unique object id using object number and object generation number. * (requires object number < 2^31)) */ private final long getObjectId(final COSObject obj) { return (obj.getObjectNumber().longValue() << 32) | obj.getGenerationNumber().longValue(); } /** Adds all from newObjects to toBeParsedList if it is not an COSObject * or we didn't add this COSObject already (checked via addedObjects). 
*/ private final void addNewToList(final Queue<COSBase> toBeParsedList, final Collection<COSBase> newObjects, final Set<Long> addedObjects) { for (COSBase newObject : newObjects) { if (newObject instanceof COSObject) { final long objId = getObjectId((COSObject) newObject); if (!addedObjects.add(objId)) { continue; } } toBeParsedList.add(newObject); } } /** Adds newObject to toBeParsedList if it is not an COSObject * or we didn't add this COSObject already (checked via addedObjects). */ private final void addNewToList(final Queue<COSBase> toBeParsedList, final COSBase newObject, final Set<Long> addedObjects) { if (newObject instanceof COSObject) { final long objId = getObjectId((COSObject) newObject); if (!addedObjects.add(objId)) { return; } } toBeParsedList.add(newObject); } /** * Will parse every object necessary to load a single page from the pdf document. * We try our best to order objects according to offset in file before reading * to minimize seek operations. * * @param dict the COSObject from the parent pages. * @param excludeObjects dictionary object reference entries with these names will not be parsed * * @throws IOException */ private void parseDictObjects(COSDictionary dict, COSName... 
excludeObjects) throws IOException { // ---- create queue for objects waiting for further parsing final Queue<COSBase> toBeParsedList = new LinkedList<COSBase>(); // offset ordered object map final TreeMap<Long, List<COSObject>> objToBeParsed = new TreeMap<Long, List<COSObject>>(); // in case of compressed objects offset points to stmObj final Set<Long> parsedObjects = new HashSet<Long>(); final Set<Long> addedObjects = new HashSet<Long>(); // ---- add objects not to be parsed to list of already parsed objects if (excludeObjects != null) { for (COSName objName : excludeObjects) { COSBase baseObj = dict.getItem(objName); if (baseObj instanceof COSObject) { parsedObjects.add(getObjectId((COSObject) baseObj)); } } } addNewToList(toBeParsedList, dict.getValues(), addedObjects); // ---- go through objects to be parsed while (!(toBeParsedList.isEmpty() && objToBeParsed.isEmpty())) { // -- first get all COSObject from other kind of objects and // put them in objToBeParsed; afterwards toBeParsedList is empty COSBase baseObj; while ((baseObj = toBeParsedList.poll()) != null) { if (baseObj instanceof COSStream) { addNewToList(toBeParsedList, ((COSStream) baseObj).getValues(), addedObjects); } else if (baseObj instanceof COSDictionary) { addNewToList(toBeParsedList, ((COSDictionary) baseObj).getValues(), addedObjects); } else if (baseObj instanceof COSArray) { final Iterator<COSBase> arrIter = ((COSArray) baseObj).iterator(); while (arrIter.hasNext()) { addNewToList(toBeParsedList, arrIter.next(), addedObjects); } } else if (baseObj instanceof COSObject) { COSObject obj = (COSObject) baseObj; long objId = getObjectId(obj); COSObjectKey objKey = new COSObjectKey(obj.getObjectNumber().intValue(), obj.getGenerationNumber().intValue()); if (!(parsedObjects.contains(objId) /*|| document.hasObjectInPool( objKey ) */ )) { Long fileOffset = xrefTrailerResolver.getXrefTable().get(objKey); // it is allowed that object references point to null, thus we have to test if (fileOffset != 
null) { if (fileOffset > 0) { objToBeParsed.put(fileOffset, Collections.singletonList(obj)); } else { // negative offset means we have a compressed object within object stream; // get offset of object stream fileOffset = xrefTrailerResolver.getXrefTable() .get(new COSObjectKey(-fileOffset, 0)); if ((fileOffset == null) || (fileOffset <= 0)) { throw new IOException( "Invalid object stream xref object reference: " + fileOffset); } List<COSObject> stmObjects = objToBeParsed.get(fileOffset); if (stmObjects == null) { objToBeParsed.put(fileOffset, stmObjects = new ArrayList<COSObject>()); } stmObjects.add(obj); } } else { // NULL object COSObject pdfObject = document.getObjectFromPool(objKey); pdfObject.setObject(COSNull.NULL); } } } } // ---- read first COSObject with smallest offset; // resulting object will be added to toBeParsedList if (objToBeParsed.isEmpty()) { break; } for (COSObject obj : objToBeParsed.remove(objToBeParsed.firstKey())) { COSBase parsedObj = parseObjectDynamically(obj, false); obj.setObject(parsedObj); addNewToList(toBeParsedList, parsedObj, addedObjects); parsedObjects.add(getObjectId(obj)); } } } /** * This will parse the next object from the stream and add it to * the local state. * This is taken from {@link PDFParser} and reduced to parsing * an indirect object. * * @param obj object to be parsed (we only take object number and generation number for lookup start offset) * @param requireExistingNotCompressedObj if <code>true</code> object to be parsed must * not be contained within compressed stream * @return the parsed object (which is also added to document object) * * @throws IOException If an IO error occurs. 
*/ private COSBase parseObjectDynamically(COSObject obj, boolean requireExistingNotCompressedObj) throws IOException { return parseObjectDynamically(obj.getObjectNumber().intValue(), obj.getGenerationNumber().intValue(), requireExistingNotCompressedObj); } /** * This will parse the next object from the stream and add it to * the local state. * This is taken from {@link PDFParser} and reduced to parsing * an indirect object. * * @param objNr object number of object to be parsed * @param objGenNr object generation number of object to be parsed * @param requireExistingNotCompressedObj if <code>true</code> the object to be parsed must be defined * in xref (comment: null objects may be missing from xref) and * it must not be a compressed object within object stream * (this is used to circumvent being stuck in a loop in a malicious PDF) * * @return the parsed object (which is also added to document object) * * @throws IOException If an IO error occurs. */ private COSBase parseObjectDynamically(int objNr, int objGenNr, boolean requireExistingNotCompressedObj) throws IOException { // ---- create object key and get object (container) from pool final COSObjectKey objKey = new COSObjectKey(objNr, objGenNr); final COSObject pdfObject = document.getObjectFromPool(objKey); if (pdfObject.getObject() == null) { // not previously parsed // ---- read offset or object stream object number from xref table Long offsetOrObjstmObNr = xrefTrailerResolver.getXrefTable().get(objKey); // sanity test to circumvent loops with broken documents if (requireExistingNotCompressedObj && ((offsetOrObjstmObNr == null) || (offsetOrObjstmObNr <= 0))) { throw new IOException("Object must be defined and must not be compressed object: " + objKey.getNumber() + ":" + objKey.getGeneration()); } if (offsetOrObjstmObNr == null) { // not defined object -> NULL object (Spec. 1.7, chap. 
3.2.9) pdfObject.setObject(COSNull.NULL); } else if (offsetOrObjstmObNr > 0) { // offset of indirect object in file // ---- go to object start setPdfSource(offsetOrObjstmObNr); // ---- we must have an indirect object final int readObjNr = readInt(); final int readObjGen = readInt(); readPattern(OBJ_MARKER); // ---- consistency check if ((readObjNr != objKey.getNumber()) || (readObjGen != objKey.getGeneration())) { throw new IOException("XREF for " + objKey.getNumber() + ":" + objKey.getGeneration() + " points to wrong object: " + readObjNr + ":" + readObjGen); } skipSpaces(); COSBase pb = parseDirObject(); String endObjectKey = readString(); if (endObjectKey.equals("stream")) { pdfSource.unread(endObjectKey.getBytes("ISO-8859-1")); pdfSource.unread(' '); if (pb instanceof COSDictionary) { COSStream stream = parseCOSStream((COSDictionary) pb, getDocument().getScratchFile()); if (securityHandler != null) { try { securityHandler.decryptStream(stream, objNr, objGenNr); } catch (CryptographyException ce) { throw new IOException( "Error decrypting stream object " + objNr + ": " + ce.getMessage() /*, ce // TODO: remove remark with Java 1.6 */ ); } } pb = stream; } else { // this is not legal // the combination of a dict and the stream/endstream forms a complete stream object throw new IOException( "Stream not preceded by dictionary (offset: " + offsetOrObjstmObNr + ")."); } skipSpaces(); endObjectKey = readLine(); // we have case with a second 'endstream' before endobj if (!endObjectKey.startsWith("endobj")) { if (endObjectKey.startsWith("endstream")) { endObjectKey = endObjectKey.substring(9).trim(); if (endObjectKey.length() == 0) { // no other characters in extra endstream line endObjectKey = readLine(); // read next line } } } } else if (securityHandler != null) { // decrypt if (pb instanceof COSString) { decrypt((COSString) pb, objNr, objGenNr); } else if (pb instanceof COSDictionary) { for (Entry<COSName, COSBase> entry : ((COSDictionary) pb).entrySet()) { // TODO: 
specially handle 'Contents' entry of signature dictionary like in SecurityHandler#decryptDictionary if (entry.getValue() instanceof COSString) { decrypt((COSString) entry.getValue(), objNr, objGenNr); } } } else if (pb instanceof COSArray) { final COSArray array = (COSArray) pb; for (int aIdx = 0, len = array.size(); aIdx < len; aIdx++) { if (array.get(aIdx) instanceof COSString) { decrypt((COSString) array.get(aIdx), objNr, objGenNr); } } } } pdfObject.setObject(pb); if (!endObjectKey.startsWith("endobj")) { throw new IOException("Object (" + readObjNr + ":" + readObjGen + ") at offset " + offsetOrObjstmObNr + " does not end with 'endobj'."); } releasePdfSourceInputStream(); } else { // xref value is object nr of object stream containing object to be parsed; // since our object was not found it means object stream was not parsed so far final int objstmObjNr = (int) (-offsetOrObjstmObNr); final COSBase objstmBaseObj = parseObjectDynamically(objstmObjNr, 0, true); if (objstmBaseObj instanceof COSStream) { // parse object stream PDFObjectStreamParser parser = new PDFObjectStreamParser((COSStream) objstmBaseObj, document, forceParsing); parser.parse(); // get set of object numbers referenced for this object stream final Set<Long> refObjNrs = xrefTrailerResolver.getContainedObjectNumbers(objstmObjNr); // register all objects which are referenced to be contained in object stream for (COSObject next : parser.getObjects()) { COSObjectKey stmObjKey = new COSObjectKey(next); if (refObjNrs.contains(stmObjKey.getNumber())) { COSObject stmObj = document.getObjectFromPool(stmObjKey); stmObj.setObject(next.getObject()); } } } } } return pdfObject.getObject(); } // ------------------------------------------------------------------------ /** Decrypts given COSString. 
*/
    private final void decrypt(COSString str, long objNr, long objGenNr) throws IOException
    {
        try
        {
            securityHandler.decryptString(str, objNr, objGenNr);
        }
        catch (CryptographyException ce)
        {
            // Keep the original failure attached as the cause. initCause() is used
            // instead of the (String, Throwable) constructor so that no Java 6 API
            // is required.
            final IOException ioe = new IOException("Error decrypting string: " + ce.getMessage());
            ioe.initCause(ce);
            throw ioe;
        }
    }

    // ------------------------------------------------------------------------

    /**
     * Set while {@link #getLength(COSBase)} is running; a nested invocation made
     * during that time is rejected with an exception.
     */
    private boolean inGetLength = false;

    /**
     * Returns length value referred to or defined in given object.
     *
     * @param lengthBaseObj either a {@link COSNumber} holding the length directly
     *        or a {@link COSObject} referencing it; may be <code>null</code>
     *
     * @return the length, or <code>null</code> if <code>lengthBaseObj</code> is <code>null</code>
     *
     * @throws IOException if this method is entered again while it is still running,
     *         if the referenced object could not be read, or if the given or the
     *         referenced object is not of a supported type
     */
    private COSNumber getLength(final COSBase lengthBaseObj) throws IOException
    {
        if (lengthBaseObj == null)
        {
            return null;
        }

        if (inGetLength)
        {
            throw new IOException("Loop while reading length from " + lengthBaseObj);
        }

        COSNumber retVal = null;

        try
        {
            inGetLength = true;

            // ---- maybe length was given directly
            if (lengthBaseObj instanceof COSNumber)
            {
                retVal = (COSNumber) lengthBaseObj;
            }
            // ---- length in referenced object
            else if (lengthBaseObj instanceof COSObject)
            {
                COSObject lengthObj = (COSObject) lengthBaseObj;

                if (lengthObj.getObject() == null)
                {
                    // not read so far

                    // keep current stream position
                    final long curFileOffset = getPdfSourceOffset();
                    releasePdfSourceInputStream();

                    parseObjectDynamically(lengthObj, true);

                    // reset current stream position
                    setPdfSource(curFileOffset);

                    if (lengthObj.getObject() == null)
                    {
                        throw new IOException("Length object content was not read.");
                    }
                }

                if (!(lengthObj.getObject() instanceof COSNumber))
                {
                    throw new IOException("Wrong type of referenced length object " + lengthObj + ": "
                            + lengthObj.getObject().getClass().getSimpleName());
                }

                retVal = (COSNumber) lengthObj.getObject();
            }
            else
            {
                throw new IOException("Wrong type of length object: "
                        + lengthBaseObj.getClass().getSimpleName());
            }
        }
        finally
        {
            // always clear the flag so that later calls are not rejected
            inGetLength = false;
        }
        return retVal;
    }

    // ------------------------------------------------------------------------

    /** Size of the buffer used to copy stream data. */
    private final int streamCopyBufLen = 8192;

    /** Buffer reused by {@link #parseCOSStream(COSDictionary, RandomAccess)} for copying stream data. */
    private final byte[] streamCopyBuf = new byte[streamCopyBufLen];

    /**
     * This will read a COSStream from the input stream using length attribute
     * within dictionary.
     * If length attribute is an indirect reference it is first resolved to get
     * the stream length. This means we copy stream data without testing for
     * 'endstream' or 'endobj' and thus it is no problem if these keywords
     * occur within stream.
     * We require 'endstream' to be found after stream data is read.
     *
     * @param dic dictionary that goes with this stream.
     * @param file file to write the stream to when reading.
     *
     * @return parsed pdf stream.
     *
     * @throws IOException if an error occurred reading the stream, like problems
     *         with reading length attribute, stream does not end with 'endstream'
     *         after data read, stream too short etc.
     */
    @Override
    protected COSStream parseCOSStream(COSDictionary dic, RandomAccess file) throws IOException
    {
        final COSStream stream = new COSStream(dic, file);
        OutputStream out = null;
        try
        {
            readString(); // read 'stream'; this was already tested in parseObjectsDynamically()

            // ---- skip whitespaces before start of data
            // PDF Ref 1.7, chap. 3.2.7:
            // 'stream' should be followed by either a CRLF (0x0d 0x0a) or LF but nothing else.
            {
                int whitespace = pdfSource.read();

                // see brother_scan_cover.pdf, it adds whitespaces
                // after the stream but before the start of the
                // data, so just read those first
                while (whitespace == 0x20)
                {
                    whitespace = pdfSource.read();
                }

                if (whitespace == 0x0D)
                {
                    whitespace = pdfSource.read();
                    // the spec says a lone CR is invalid but it happens in the real
                    // world so we must support it; never push back the EOF marker (-1)
                    // since that would show up as a spurious data byte
                    if (whitespace != 0x0A && whitespace != -1)
                    {
                        pdfSource.unread(whitespace);
                    }
                }
                else if (whitespace != 0x0A && whitespace != -1)
                {
                    // no whitespace after 'stream'; PDF ref. says 'should' so that is ok
                    pdfSource.unread(whitespace);
                }
            }

            /* This needs to be dic.getItem because when we are parsing, the underlying object
             * might still be null.
             */
            COSNumber streamLengthObj = getLength(dic.getItem(COSName.LENGTH));
            if (streamLengthObj == null)
            {
                throw new IOException("Missing length for stream.");
            }

            // ---- get output stream to copy data to
            out = stream.createFilteredStream(streamLengthObj);

            long remainBytes = streamLengthObj.longValue();

            while (remainBytes > 0)
            {
                // never request more than the copy buffer can hold
                final int chunkLen = (int) Math.min(remainBytes, streamCopyBufLen);
                final int readBytes = pdfSource.read(streamCopyBuf, 0, chunkLen);
                if (readBytes <= 0)
                {
                    throw new IOException("No more bytes from stream but expected: " + remainBytes);
                }

                out.write(streamCopyBuf, 0, readBytes);

                remainBytes -= readBytes;
            }

            String endStream = readString();
            if (!endStream.equals("endstream"))
            {
                throw new IOException("Error reading stream using length value. Expected='endstream' actual='"
                        + endStream + "' ");
            }
        }
        finally
        {
            if (out != null)
            {
                out.close();
            }
        }
        return stream;
    }
}