org.apache.pdfbox.io.RandomAccessBufferedFileInputStream.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.pdfbox.io.RandomAccessBufferedFileInputStream.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.io;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.util.LinkedHashMap;
import java.util.Map;

/**
 * Provides {@link InputStream} access to portions of a file combined with
 * buffered reading of content. Start of next bytes to read can be set via seek
 * method.
 * 
 * File is accessed via {@link RandomAccessFile} and is read in byte chunks
 * which are cached.
 * 
 * @author Timo Boehme
 */
public class RandomAccessBufferedFileInputStream extends InputStream implements RandomAccessRead {
    /**
     * The prefix for the temp file being used. 
     */
    private static final String TMP_FILE_PREFIX = "tmpPDFBox";

    private int pageSizeShift = 12;
    private int pageSize = 1 << pageSizeShift;
    private long pageOffsetMask = -1L << pageSizeShift;
    private int maxCachedPages = 1000;
    private File tempFile;

    private byte[] lastRemovedCachePage = null;

    /** Create a LRU page cache. */
    private final Map<Long, byte[]> pageCache = new LinkedHashMap<Long, byte[]>(maxCachedPages, 0.75f, true) {
        private static final long serialVersionUID = -6302488539257741101L;

        @Override
        protected boolean removeEldestEntry(Map.Entry<Long, byte[]> eldest) {
            final boolean doRemove = size() > maxCachedPages;
            if (doRemove) {
                lastRemovedCachePage = eldest.getValue();
            }
            return doRemove;
        }
    };

    private long curPageOffset = -1;
    private byte[] curPage = new byte[pageSize];
    private int offsetWithinPage = 0;

    private final RandomAccessFile raFile;
    private final long fileLength;
    private long fileOffset = 0;
    private boolean isClosed;

    /**
     * Create a random access input stream instance for the file with the given name.
     *
     * @param filename the filename of the file to be read.
     * @throws IOException if something went wrong while accessing the given file.
     */
    public RandomAccessBufferedFileInputStream(String filename) throws IOException {
        this(new File(filename));
    }

    /** 
     * Create a random access input stream instance for the given file.
     *
     * @param file the file to be read.
     * @throws IOException if something went wrong while accessing the given file.
     */
    public RandomAccessBufferedFileInputStream(File file) throws IOException {
        raFile = new RandomAccessFile(file, "r");
        fileLength = file.length();
        seek(0);
    }

    /**
     * Create a random access input stream for the given input stream by copying the data to a
     * temporary file.
     *
     * @param input the input stream to be read. It will be closed by this method.
     * @throws IOException if something went wrong while creating the temporary file.
     */
    public RandomAccessBufferedFileInputStream(InputStream input) throws IOException {
        tempFile = createTmpFile(input);
        fileLength = tempFile.length();
        raFile = new RandomAccessFile(tempFile, "r");
        seek(0);
    }

    private File createTmpFile(InputStream input) throws IOException {
        File tmpFile = File.createTempFile(TMP_FILE_PREFIX, ".pdf");
        try (FileOutputStream fos = new FileOutputStream(tmpFile)) {
            IOUtils.copy(input, fos);
            return tmpFile;
        } finally {
            IOUtils.closeQuietly(input);
        }
    }

    /**
     * Remove the temporary file. A temporary file is created if this class is instantiated with an InputStream
     */
    private void deleteTempFile() {
        if (tempFile != null) {
            tempFile.delete();
        }
    }

    /** Returns offset in file at which next byte would be read. */
    @Override
    public long getPosition() {
        return fileOffset;
    }

    /**
     * Seeks to new position. If new position is outside of current page the new page is either
     * taken from cache or read from file and added to cache.
     *
     * @param newOffset the position to seek to.
     * @throws java.io.IOException if something went wrong.
     */
    @Override
    public void seek(final long newOffset) throws IOException {
        final long newPageOffset = newOffset & pageOffsetMask;
        if (newPageOffset != curPageOffset) {
            byte[] newPage = pageCache.get(newPageOffset);
            if (newPage == null) {
                raFile.seek(newPageOffset);
                newPage = readPage();
                pageCache.put(newPageOffset, newPage);
            }
            curPageOffset = newPageOffset;
            curPage = newPage;
        }

        offsetWithinPage = (int) (newOffset - curPageOffset);
        fileOffset = newOffset;
    }

    /**
     * Reads a page with data from current file position. If we have a
     * previously removed page from cache the buffer of this page is reused.
     * Otherwise a new byte buffer is created.
     */
    private byte[] readPage() throws IOException {
        byte[] page;

        if (lastRemovedCachePage != null) {
            page = lastRemovedCachePage;
            lastRemovedCachePage = null;
        } else {
            page = new byte[pageSize];
        }

        int readBytes = 0;
        while (readBytes < pageSize) {
            int curBytesRead = raFile.read(page, readBytes, pageSize - readBytes);
            if (curBytesRead < 0) {
                // EOF
                break;
            }
            readBytes += curBytesRead;
        }

        return page;
    }

    @Override
    public int read() throws IOException {
        if (fileOffset >= fileLength) {
            return -1;
        }

        if (offsetWithinPage == pageSize) {
            seek(fileOffset);
        }

        fileOffset++;
        return curPage[offsetWithinPage++] & 0xff;
    }

    @Override
    public int read(byte[] b) throws IOException {
        return read(b, 0, b.length);
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if (fileOffset >= fileLength) {
            return -1;
        }

        if (offsetWithinPage == pageSize) {
            seek(fileOffset);
        }

        int commonLen = Math.min(pageSize - offsetWithinPage, len);
        if ((fileLength - fileOffset) < pageSize) {
            commonLen = Math.min(commonLen, (int) (fileLength - fileOffset));
        }

        System.arraycopy(curPage, offsetWithinPage, b, off, commonLen);

        offsetWithinPage += commonLen;
        fileOffset += commonLen;

        return commonLen;
    }

    @Override
    public int available() throws IOException {
        return (int) Math.min(fileLength - fileOffset, Integer.MAX_VALUE);
    }

    @Override
    public long skip(long n) throws IOException {
        // test if we have to reduce skip count because of EOF
        long toSkip = n;

        if (fileLength - fileOffset < toSkip) {
            toSkip = fileLength - fileOffset;
        }

        if ((toSkip < pageSize) && ((offsetWithinPage + toSkip) <= pageSize)) {
            // we can skip within current page
            offsetWithinPage += toSkip;
            fileOffset += toSkip;
        } else {
            // seek to the page we will get after skipping
            seek(fileOffset + toSkip);
        }

        return toSkip;
    }

    @Override
    public long length() throws IOException {
        return fileLength;
    }

    @Override
    public void close() throws IOException {
        raFile.close();
        deleteTempFile();
        pageCache.clear();
        isClosed = true;
    }

    @Override
    public boolean isClosed() {
        return isClosed;
    }

    @Override
    public int peek() throws IOException {
        int result = read();
        if (result != -1) {
            rewind(1);
        }
        return result;
    }

    @Override
    public void rewind(int bytes) throws IOException {
        seek(getPosition() - bytes);
    }

    @Override
    public byte[] readFully(int length) throws IOException {
        byte[] b = new byte[length];
        int bytesRead = read(b);
        while (bytesRead < length) {
            bytesRead += read(b, bytesRead, length - bytesRead);
        }
        return b;
    }

    @Override
    public boolean isEOF() throws IOException {
        int peek = peek();
        return peek == -1;
    }
}