de.offis.health.icardea.cied.pdf.extractor.PDFiText5Extractor.java Source code

Introduction

Here is the source code for de.offis.health.icardea.cied.pdf.extractor.PDFiText5Extractor.java
Source

/**
 * 
 */
package de.offis.health.icardea.cied.pdf.extractor;

import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Logger;

import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfSmartCopy;
import com.itextpdf.text.pdf.PdfString;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;

import de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor;
import de.offis.health.icardea.tools.GlobalTools;

/**
 * <p>
 * This class allows opening a PDF file and can extract information from
 * it using the iText v5+ library (PdfReaderContentParser and PdfTextExtractor).
 * </p>
 * <p>
 * <b>Note:</b> This library is using the GNU AGPL license so it is only an 
 * <i>optional</i> and currently experimental package due to the problems
 * with the nature of the <a href="http://itextpdf.com/terms-of-use/index.php?page=AGPL">Affero
 * General Public License (AGPL)</a>.
 * <br />
 * <b>In other words, this class is currently not used due to licensing issues.</b>
 * </p>
 * 
 * @see <a href="http://itextpdf.com/">iText (Java PDF library) - Website</a>
 * 
 *
 */
public class PDFiText5Extractor extends AbstractPDFExtractor implements PDFExtractor {
    /**
     * Logger used for the debug, trace and error output of this class.
     */
    private static Logger logger = Logger.getLogger(PDFiText5Extractor.class);

    /**
     * The full (absolute) path of the PDF file that was opened last;
     * <code>null</code> as long as no document was opened and again
     * after the extractor has been disposed.
     */
    private String fullPDFFilePath = null;

    /**
     * The full PDF directory path with ending file separator. It serves as
     * the default export location whenever an export method is called
     * without an explicit export directory.
     */
    private String fullPDFDirectoryPath = null;

    /**
     * The iText PDF reader all extraction methods work on;
     * <code>null</code> until a document was opened.
     */
    private PdfReader pdfReader = null;

    /**
     * PDF text bookmark entries in a List. The list is created and filled
     * the first time the bookmark titles are requested and is reset when
     * the extractor is disposed.
     */
    @SuppressWarnings("unchecked")
    private List bookmarkTextList = null;

    /* (non-Javadoc)
     * Exports every image of the opened PDF by delegating to the private
     * image extractor; the given directory is passed on unchanged.
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#exportAllImagesToFileSystem(String)
     */
    public void exportAllImagesToFileSystem(String fullExportDirectoryPath) throws IOException, Exception {
        this.imageExtractor(fullExportDirectoryPath);
    }

    /* (non-Javadoc)
     * Not implemented for the iText 5 based extractor yet: whatever page
     * number is requested, the result is always null.
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#getPage(int)
     */
    public Object getPage(int pageNumber) {
        // TODO Provide a page object once the iText 5 variant supports it.
        final Object notImplementedPage = null;
        return notImplementedPage;
    }

    /* (non-Javadoc)
     * Returns a single page as a PDF document of its own. A single page is
     * handled as a page range that starts and ends on the same page.
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#getPDFPage(int)
     */
    public byte[] getPDFPage(int pageNumber) {
        final byte[] singlePagePdf = getPDFPages(pageNumber, pageNumber);
        return singlePagePdf;
    }

    /* (non-Javadoc)
     * Copies the pages of the given (inclusive) range into a new PDF
     * document and returns that document as byte array. The two page
     * numbers may be given in any order. The result is null if no PDF is
     * open, if one of the page numbers is outside 1..numberOfPages or if
     * copying the pages failed (the failure is logged).
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#getPDFPages(int, int)
     */
    public byte[] getPDFPages(int fromPageNumber, int toPageNumber) {
        if (pdfReader == null) {
            return null;
        }

        // Both borders have to address an existing page.
        final int pageCount = getNumberOfPages();
        final boolean fromIsValid = fromPageNumber > 0 && fromPageNumber <= pageCount;
        final boolean toIsValid = toPageNumber > 0 && toPageNumber <= pageCount;
        if (!fromIsValid || !toIsValid) {
            return null;
        }

        // Accept the borders in any order by sorting them.
        final int firstPage = Math.min(fromPageNumber, toPageNumber);
        final int lastPage = Math.max(fromPageNumber, toPageNumber);

        final Document targetDocument = new Document();
        final ByteArrayOutputStream pdfBytes = new ByteArrayOutputStream();
        try {
            final PdfSmartCopy pageCopier = new PdfSmartCopy(targetDocument, pdfBytes);
            targetDocument.open();
            int page = firstPage;
            while (page <= lastPage) {
                pageCopier.addPage(pageCopier.getImportedPage(pdfReader, page));
                page++;
            }
            pageCopier.flush();
            pageCopier.close();
            targetDocument.close();
            return pdfBytes.toByteArray();
        } catch (DocumentException ex) {
            // TODO: Create an own exception for PDF processing errors.
            logger.error("An exception occurred while extracting " + "pages from the input PDF file.", ex);
            return null;
        } catch (IOException ex) {
            // TODO: Create an own exception for PDF processing errors.
            logger.error("An exception occurred while extracting " + "pages from the input PDF file.", ex);
            return null;
        }
    }

    /* (non-Javadoc)
     * Extracts the text of one page. The page start marker is put in front
     * of the extracted text; a page without any non-whitespace text yields
     * null. An exception is thrown if no PDF is open or if the page number
     * is outside 1..numberOfPages.
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#getText(int)
     */
    public String getText(int pageNumber) throws IOException, Exception {
        if (pdfReader == null) {
            // TODO: Add own exception.
            throw new Exception("There is no open PDF to work with.");
        }

        final int pageCount = getNumberOfPages();
        if (pageNumber <= 0 || pageNumber > pageCount) {
            // TODO: Add own exception.
            throw new Exception("The given page number (" + pageNumber + ") "
                    + "is not in the range of valid pages (1.." + pageCount + ").");
        }

        final PdfReaderContentParser contentParser = new PdfReaderContentParser(pdfReader);
        final TextExtractionStrategy pageStrategy = contentParser.processContent(pageNumber,
                new SimpleTextExtractionStrategy());
        if (pageStrategy == null) {
            return null;
        }

        // Pages that only consist of whitespace count as "no text".
        final String extractedText = pageStrategy.getResultantText();
        if (extractedText.trim().length() == 0) {
            return null;
        }
        return PAGE_START_MARKER + extractedText;
    }

    /* (non-Javadoc)
     * Extracts the text of all pages and returns it as one String in page
     * order. Pages without text are skipped (and reported on debug level);
     * if no page delivers any text the result is null. An exception is
     * thrown if no PDF is open.
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#getText()
     */
    public String getText() throws Exception {
        if (pdfReader == null) {
            // TODO: Add own exception.
            throw new Exception("There is no open PDF to work with.");
        }

        final int pageCount = getNumberOfPages();
        final StringBuilder documentText = new StringBuilder();

        int page = 1;
        while (page <= pageCount) {
            final String textOfPage = getText(page);
            if (textOfPage == null) {
                logger.debug("The call of getText(" + page + ") returned: null");
            } else {
                documentText.append(textOfPage);
            }
            page++;
        }

        return documentText.length() > 0 ? documentText.toString() : null;
    }

    /* (non-Javadoc)
     * Opens the given PDF file for all further extraction calls.
     *
     * The method remembers the absolute file path and the directory the
     * file is stored in (with ending file separator); the directory is the
     * default export location of the export methods. A reader that is still
     * open from a previous call is closed and the cached bookmark titles
     * are discarded, but only after the new file was opened successfully,
     * so a failing call leaves the previous state untouched.
     *
     * @param fullPDFFilePath The path of the PDF file to open.
     * @return Always true; every failure is reported by an exception.
     * @throws IOException If the PDF file can not be read by iText.
     * @throws Exception If no path is given or the path does not point to
     * a readable file.
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#openDocument()
     */
    public boolean openDocument(String fullPDFFilePath) throws IOException, Exception {
        if (fullPDFFilePath == null) {
            throw new Exception("There is no full path to a file given.");
        } // end if

        File pdfFile = new File(fullPDFFilePath).getAbsoluteFile();
        if (!pdfFile.isFile() || !pdfFile.canRead()) {
            throw new Exception("The given PDF file is not a file or not readable (check permissions).");
        } // end if

        // Open the PDF file first, so that a failure does not destroy the current state.
        PdfReader newPdfReader = new PdfReader(pdfFile.getPath());

        // Release the previously opened document and everything cached for it.
        if (this.pdfReader != null) {
            this.pdfReader.close();
        }
        this.pdfReader = newPdfReader;
        this.bookmarkTextList = null;

        /*
         * The directory path has to be the parent directory of the file
         * (not the file path itself) and has to end with a file separator,
         * because the export methods append the file name directly to it.
         */
        String parentDirectoryPath = pdfFile.getParent();
        if (parentDirectoryPath == null) {
            parentDirectoryPath = "";
        } else if (!parentDirectoryPath.endsWith(File.separator)) {
            parentDirectoryPath = parentDirectoryPath + File.separator;
        }
        this.fullPDFFilePath = pdfFile.getPath();
        this.fullPDFDirectoryPath = parentDirectoryPath;

        logger.debug("FilePath.....: " + this.fullPDFFilePath);
        logger.debug("DirectoryPath: " + this.fullPDFDirectoryPath);
        logger.debug("PDF contains pages: " + this.pdfReader.getNumberOfPages());

        return true;
    }

    /**
     * <p>
     * This method searches for all image objects from the currently processed
     * PDF file and stores them using the correct extension in the given export
     * directory or in the same directory where the original PDF file is stored.
     * </p>
     * <p>
     * The filename of the images is build based on the original PDF filename
     * (without extension) and additional details like page number, image
     * number and if available the internal image name.
     * </p>
     * @param fullExportDirectoryPath The optional full export path where the images
     * should be stored. If not given, the location of the original PDF file is used.
     * The directory is created if it does not exist yet.
     * @throws IOException If the export directory can not be created.
     * @throws Exception If there is no open PDF to work with or the
     * processing of a page fails.
     */
    private void imageExtractor(String fullExportDirectoryPath) throws Exception {
        if (pdfReader == null) {
            // Same reaction as the text methods instead of a NullPointerException.
            // TODO: Add own exception.
            throw new Exception("There is no open PDF to work with.");
        } // end if

        if (fullExportDirectoryPath != null) {
            fullExportDirectoryPath = GlobalTools.checkDirectoryPath(fullExportDirectoryPath);
            File exportDirectory = new File(fullExportDirectoryPath);
            /*
             * mkdirs() only reports a failure by its return value. The
             * second isDirectory() check accepts a directory that was
             * created by somebody else in the meantime.
             */
            if (!exportDirectory.isDirectory() && !exportDirectory.mkdirs() && !exportDirectory.isDirectory()) {
                throw new IOException(
                        "The export directory '" + fullExportDirectoryPath + "' could not be created.");
            } // end if
        } // end if

        String baseExportDirectoryPath = fullExportDirectoryPath != null ? fullExportDirectoryPath
                : this.fullPDFDirectoryPath;
        String baseFileNameWithoutExtension = GlobalTools.getFileNameWithoutExtension(this.fullPDFFilePath);

        PDFiText5PageReaderContentParser contentParser = new PDFiText5PageReaderContentParser(pdfReader);
        PDFiText5ImageRenderListener imageRenderListener = new PDFiText5ImageRenderListener(baseExportDirectoryPath,
                baseFileNameWithoutExtension);

        // The same listener is used for all pages.
        int numberOfPages = pdfReader.getNumberOfPages();
        for (int currentPage = 1; currentPage <= numberOfPages; currentPage++) {
            contentParser.processContent(currentPage, imageRenderListener);
        } // end for
    }

    /* (non-Javadoc)
     * Exports the text of all pages by delegating to the private text
     * extractor; the given directory is passed on unchanged.
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#exportAllPagesAsTextToFileSystem(java.lang.String)
     */
    public void exportAllPagesAsTextToFileSystem(String fullExportDirectoryPath) throws Exception {
        this.textExtractor(fullExportDirectoryPath);
    }

    /**
     * This method will write the text extracted from the PDF document into
     * a file with the extension <code>.txt</code>. The file is written
     * using the UTF-8 encoding and is named like the PDF file (without its
     * extension). No file is created if the document contains no text.
     * 
     * @param fullExportDirectoryPath The optional full export path where the text file should be stored. If not given, the location of the original PDF file is used.
     * The directory is created if it does not exist yet.
     * @throws IOException If the export directory can not be created or
     * the text file can not be written.
     * @throws Exception If there is no open PDF to work with or the text
     * extraction fails.
     */
    private void textExtractor(String fullExportDirectoryPath) throws Exception {
        if (pdfReader == null) {
            // Fail before any directory is created for a document that is not there.
            // TODO: Add own exception.
            throw new Exception("There is no open PDF to work with.");
        } // end if

        if (fullExportDirectoryPath != null) {
            fullExportDirectoryPath = GlobalTools.checkDirectoryPath(fullExportDirectoryPath);
            File exportDirectory = new File(fullExportDirectoryPath);
            /*
             * mkdirs() only reports a failure by its return value. The
             * second isDirectory() check accepts a directory that was
             * created by somebody else in the meantime.
             */
            if (!exportDirectory.isDirectory() && !exportDirectory.mkdirs() && !exportDirectory.isDirectory()) {
                throw new IOException(
                        "The export directory '" + fullExportDirectoryPath + "' could not be created.");
            } // end if
        } // end if

        String baseExportDirectoryPath = fullExportDirectoryPath != null ? fullExportDirectoryPath
                : this.fullPDFDirectoryPath;
        String baseFileNameWithoutExtension = GlobalTools.getFileNameWithoutExtension(this.fullPDFFilePath);
        String fullExportFileNameWithExtension = baseExportDirectoryPath + baseFileNameWithoutExtension + ".txt";

        // getText() collects the text of all pages and returns null if there is none.
        String documentText = getText();
        if (documentText == null) {
            logger.debug("Nothing to export to file '"
                    + GlobalTools.getFileNameWithoutFullPath(fullExportFileNameWithExtension) + "'");
            return;
        } // end if

        logger.debug("Full export filename with extension: " + fullExportFileNameWithExtension);
        Writer out = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(fullExportFileNameWithExtension), "UTF8"));
        try {
            out.write(documentText);
            out.flush();
        } finally {
            // Release the file handle even if writing failed.
            out.close();
        } // end try..finally
    }

    /* (non-Javadoc)
     * Returns the number of pages of the opened PDF or -1 if there is no
     * PDF reader to ask.
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#getNumberOfPages()
     */
    public int getNumberOfPages() {
        return pdfReader != null ? pdfReader.getNumberOfPages() : -1;
    }

    /* (non-Javadoc)
     * Returns the titles of all bookmarks (outline items) of the opened PDF
     * as a List of Strings; nested bookmarks are indented.
     *
     * The list is built on the first call and the same list instance is
     * returned by all following calls. The result is null if no PDF is
     * open or if the PDF catalog has no outline dictionary.
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#getBookmarkTitlesAsText()
     */
    @SuppressWarnings("unchecked")
    public java.util.List getBookmarkTitlesAsText() {
        if (pdfReader == null) {
            return null;
        }

        /*
         * If it doesn't exist create the List and populate it,
         * otherwise just return the already existing List.
         */
        if (bookmarkTextList == null) {
            PdfDictionary catalog = pdfReader.getCatalog();
            if (catalog != null) {
                PdfObject rootPdfObject = PdfReader.getPdfObjectRelease(catalog.get(PdfName.OUTLINES));
                if (rootPdfObject != null && rootPdfObject.isDictionary()) {
                    bookmarkTextList = new ArrayList<String>();

                    // Populate the List
                    populateBookmarkTextList((PdfDictionary) rootPdfObject, "");
                } // end if
            } // end if
        } // end if

        // Return the populated list, not a variable that is never filled.
        return bookmarkTextList;
    }

    /**
     * This method will populate the text bookmark list.
     * <p>
     * It walks along the chain of outline items that starts at the
     * <code>First</code> entry of the given dictionary and follows the
     * <code>Next</code> entries. For every item the indented title is added
     * to the list before the method calls itself for the children of the
     * item, so the list ends up in document order.
     * </p>
     * <p>
     * Damaged outlines are tolerated: an item without a (string) title is
     * added with an empty title and the walk stops at the first entry that
     * is not a dictionary, instead of failing with a
     * <code>NullPointerException</code> or <code>ClassCastException</code>.
     * </p>
     * 
     * @param rootOutlinesPdfDictionary The node element for the bookmark item.
     * @param indentionString The base indention string to be used.
     */
    @SuppressWarnings("unchecked")
    private void populateBookmarkTextList(PdfDictionary rootOutlinesPdfDictionary, String indentionString) {
        if (rootOutlinesPdfDictionary == null) {
            return;
        }

        PdfObject currentPdfObject = PdfReader.getPdfObjectRelease(rootOutlinesPdfDictionary.get(PdfName.FIRST));
        while (currentPdfObject != null && currentPdfObject.isDictionary()) {
            PdfDictionary outlineItemPdfDictionary = (PdfDictionary) currentPdfObject;

            // The title is checked before it is used, because broken PDF files may omit it.
            PdfObject titlePdfObject = PdfReader.getPdfObjectRelease(outlineItemPdfDictionary.get(PdfName.TITLE));
            String bookmarkTitle = "";
            if (titlePdfObject != null && titlePdfObject.isString()) {
                bookmarkTitle = ((PdfString) titlePdfObject).toUnicodeString();
            }

            String bookmarkLine = indentionString + bookmarkTitle;
            bookmarkTextList.add(bookmarkLine);
            logger.trace(bookmarkLine);

            /*
             * Recursive call to fill List
             */
            populateBookmarkTextList(outlineItemPdfDictionary, indentionString + bookmarkIndentionString());

            /*
             * Get next outline item
             */
            currentPdfObject = PdfReader.getPdfObjectRelease(outlineItemPdfDictionary.get(PdfName.NEXT));
        } // end while
    }

    /**
     * Builds a textual dump of the PDF information entries (for example
     * title, subject, author, creator, keywords, producer as well as the
     * creation and modification date). The method only exists for
     * debugging purposes.
     * 
     * @return One <code>key=value</code> line per information entry, each
     * terminated by the system dependent newline; an empty String if there
     * is no PDF reader.
     */
    @SuppressWarnings("unused")
    private String getPdfInfo() {
        final StringBuilder infoLines = new StringBuilder();
        if (pdfReader == null) {
            return infoLines.toString();
        }

        final Map<?, ?> info = pdfReader.getInfo();
        for (Map.Entry<?, ?> infoEntry : info.entrySet()) {
            final String key = (String) infoEntry.getKey();
            final String value = (String) infoEntry.getValue();
            infoLines.append(key).append("=").append(value).append(GlobalTools.LINESEPARATOR);
        }
        return infoLines.toString();
    }

    /* (non-Javadoc)
     * Returns the value of the PDF information entry "Author" or null if
     * there is no PDF reader or no such entry.
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#getAuthor()
     */
    public String getAuthor() {
        if (pdfReader == null) {
            return null;
        }
        final Map<?, ?> pdfInformation = pdfReader.getInfo();
        return (String) pdfInformation.get("Author");
    }

    /* (non-Javadoc)
     * Returns the value of the PDF information entry "CreationDate" or
     * null if there is no PDF reader or no such entry. The value is
     * returned unchanged, samples of the format (key=value):
     *   CreationDate=D:20100415143436Z
     *   CreationDate=D:20100415174640+02'00'
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#getCreationDate()
     */
    public String getCreationDate() {
        if (pdfReader == null) {
            return null;
        }
        final Map<?, ?> pdfInformation = pdfReader.getInfo();
        // TODO: Fix the format of the date to ISO8601 timestamp
        return (String) pdfInformation.get("CreationDate");
    }

    /* (non-Javadoc)
     * Returns the value of the PDF information entry "Creator" or null if
     * there is no PDF reader or no such entry.
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#getCreator()
     */
    public String getCreator() {
        if (pdfReader == null) {
            return null;
        }
        final Map<?, ?> pdfInformation = pdfReader.getInfo();
        return (String) pdfInformation.get("Creator");
    }

    /* (non-Javadoc)
     * Returns the value of the PDF information entry "Keywords" or null if
     * there is no PDF reader or no such entry.
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#getKeywords()
     */
    public String getKeywords() {
        if (pdfReader == null) {
            return null;
        }
        final Map<?, ?> pdfInformation = pdfReader.getInfo();
        return (String) pdfInformation.get("Keywords");
    }

    /* (non-Javadoc)
     * Returns the value of the PDF information entry "ModDate" or null if
     * there is no PDF reader or no such entry. The value is returned
     * unchanged, samples of the format (key=value):
     *   ModDate=D:20100415143436Z
     *   ModDate=D:20100415174640+02'00'
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#getModificationDate()
     */
    public String getModificationDate() {
        if (pdfReader == null) {
            return null;
        }
        final Map<?, ?> pdfInformation = pdfReader.getInfo();
        // TODO: Fix the format of the date to ISO8601 timestamp
        return (String) pdfInformation.get("ModDate");
    }

    /* (non-Javadoc)
     * Returns the value of the PDF information entry "Producer" or null if
     * there is no PDF reader or no such entry. Sample of the format
     * (key=value):
     *   Producer=iText by lowagie.com (r0.94 - paulo 102)
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#getProducer()
     */
    public String getProducer() {
        if (pdfReader == null) {
            return null;
        }
        final Map<?, ?> pdfInformation = pdfReader.getInfo();
        return (String) pdfInformation.get("Producer");
    }

    /* (non-Javadoc)
     * Returns the value of the PDF information entry "Subject" or null if
     * there is no PDF reader or no such entry.
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#getSubject()
     */
    public String getSubject() {
        if (pdfReader == null) {
            return null;
        }
        final Map<?, ?> pdfInformation = pdfReader.getInfo();
        return (String) pdfInformation.get("Subject");
    }

    /* (non-Javadoc)
     * Returns the value of the PDF information entry "Title" or null if
     * there is no PDF reader or no such entry.
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#getTitle()
     */
    public String getTitle() {
        if (pdfReader == null) {
            return null;
        }
        final Map<?, ?> pdfInformation = pdfReader.getInfo();
        return (String) pdfInformation.get("Title");
    }

    /* (non-Javadoc)
     * Releases everything that is kept for the opened PDF: the cached
     * bookmark titles, the remembered paths and the PDF reader.
     *
     * The reader is not only closed, the reference is dropped as well.
     * All other methods decide by "pdfReader != null" whether a PDF is
     * open, so keeping the reference would make them work on a closed
     * reader. Calling the method more than once is harmless.
     *
     * The explicit garbage collector call was removed: it is only a hint
     * to the virtual machine and not needed to release the reader.
     *
     * @see de.offis.health.icardea.cied.pdf.interfaces.PDFExtractor#dispose()
     */
    public void dispose() {
        this.bookmarkTextList = null;
        this.fullPDFDirectoryPath = null;
        this.fullPDFFilePath = null;

        if (this.pdfReader != null) {
            try {
                this.pdfReader.close();
            } finally {
                // Forget the reader even if closing it failed.
                this.pdfReader = null;
            }
        }
    }
}