de.mat.utils.pdftools.PdfExtractEmptyPages.java Source code

Introduction

Here is the source code for de.mat.utils.pdftools.PdfExtractEmptyPages.java
Source

/**
 * <h4>FeatureDomain:</h4>
 *     Publishing
 *
 * <h4>FeatureDescription:</h4>
 *     software for publishing<br>
 *     pdftools
 * 
 * @author Michael Schreiner <michael.schreiner@your-it-fellow.de>
 * @category collaboration
 * @copyright Copyright (c) 2011-2014, Michael Schreiner
 * @license http://mozilla.org/MPL/2.0/ Mozilla Public License 2.0
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */
package de.mat.utils.pdftools;

import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.log4j.Logger;

import com.itextpdf.text.Document;
import com.itextpdf.text.pdf.PdfCopy;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfImportedPage;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PdfReader;

/**
 * <h4>FeatureDomain:</h4>
 *     Publishing
 * <h4>FeatureDescription:</h4>
 *     extract empty pages
 * 
 * @package de.mat.utils.pdftools
 * @author Michael Schreiner <michael.schreiner@your-it-fellow.de>
 * @category Publishing
 * @copyright Copyright (c) 2011-2014, Michael Schreiner
 * @license http://mozilla.org/MPL/2.0/ Mozilla Public License 2.0
 */
public class PdfExtractEmptyPages extends CmdLineJob {

    public PdfExtractEmptyPages(String[] args) {
        super(args);
    }

    // identify empty pages, set minsize
    public static int blankPdfsize = 15635; // siehe \tmp\buch\meine-3000er-076-tour-paternkofel_-_paternkofelsteig-dontshowtourentalort.pdf alt:1400;
    public static int blankPdfsize_v5 = 800;

    // Logger
    private static final Logger LOGGER = Logger.getLogger(PdfExtractEmptyPages.class);

    @Override
    protected boolean validateCmdLine(CommandLine cmdLine) throws Throwable {
        if (cmdLine.getArgList().size() < 3) {
            return false;
        }
        return true;
    }

    @Override
    protected void printUsage() throws Throwable {
        HelpFormatter formatter = new HelpFormatter();
        formatter
                .printHelp(this.getJobName() + " SrcPdfFile DestPdfFile EmptyPagesPdfFile",
                        "Desc: reads pdfSourceFile and adds pages to pdfRemovedFile "
                                + "if empty, or to pdfDestinationFile if not empty",
                        this.availiableCmdLineOptions, "");
    }

    @Override
    protected void doJob() throws Throwable {
        // get parameter
        String srcFile = this.cmdLine.getArgs()[0];
        String destFile = this.cmdLine.getArgs()[1];
        String emptyFile = this.cmdLine.getArgs()[1];

        // start
        PdfExtractEmptyPages.removeBlankPdfPages(srcFile, destFile, emptyFile);
    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        PdfExtractEmptyPages me = new PdfExtractEmptyPages(args);
        me.startJobProcessing();
    }

    /**
     * <h4>FeatureDomain:</h4>
     *     PublishingTools
     * <h4>FeatureDescription:</h4>
     *     reads pdfSourceFile and adds pages to pdfRemovedFile if empty, or to 
     *     pdfDestinationFile if not empty
     * <h4>FeatureResult:</h4>
     *   <ul>
     *     <li>updates pdfDestinationFile - add all pages which are not empty
     *     <li>updates pdfRemovedFile - add all empty pages
     *   </ul> 
     * <h4>FeatureKeywords:</h4>
     *     PDF Publishing
     * @param pdfSourceFile - source pdf-file
     * @param pdfDestinationFile - pdf with all not empty pages
     * @param pdfRemovedFile - pdf with all empty pages
     * @throws Exception
     */
    public static void removeBlankPdfPages(String pdfSourceFile, String pdfDestinationFile, String pdfRemovedFile)
            throws Exception {
        // create readerOrig
        PdfReader readerOrig = new PdfReader(pdfSourceFile);

        // create writerTrimmed which bases on readerOrig
        Document documentTrimmed = new Document(readerOrig.getPageSizeWithRotation(1));
        PdfCopy writerTrimmed = new PdfCopy(documentTrimmed, new FileOutputStream(pdfDestinationFile));
        documentTrimmed.open();

        // create writerRemoved which bases on readerOrig
        Document documentRemoved = new Document(readerOrig.getPageSizeWithRotation(1));
        PdfCopy writerRemoved = new PdfCopy(documentRemoved, new FileOutputStream(pdfRemovedFile));
        documentRemoved.open();

        // extract and copy empty pages
        addTrimmedPages(pdfSourceFile, readerOrig, writerTrimmed, writerRemoved, true);

        // close everything
        documentTrimmed.close();
        writerTrimmed.close();
        documentRemoved.close();
        writerRemoved.close();
        readerOrig.close();
    }

    /**
     * <h4>FeatureDomain:</h4>
     *     PublishingTools
     * <h4>FeatureDescription:</h4>
     *     reads readerOrig and adds pages to writerRemoved if empty, or to 
     *     writerTrimmed if not empty
     * <h4>FeatureResult:</h4>
     *   <ul>
     *     <li>updates writerTrimmed - add all pages which are not empty
     *     <li>updates writerRemoved - add all empty pages
     *   </ul> 
     * <h4>FeatureKeywords:</h4>
     *     PDF Publishing
     * @param origFileName - orig filename of the sourcepdf
     * @param readerOrig - reader of source
     * @param writerTrimmed - writer for trimmed pages
     * @param writerRemoved - writer for empty pages
     * @param flgTrim - ??
     * @return - count of trimmed pages
     * @throws Exception
     */
    public static int addTrimmedPages(String origFileName, PdfReader readerOrig, PdfCopy writerTrimmed,
            PdfCopy writerRemoved, boolean flgTrim) throws Exception {
        PdfImportedPage page = null;
        int countTrimmedPages = 0;

        //loop each page
        for (int i = 1; i <= readerOrig.getNumberOfPages(); i++) {
            boolean flgIsEmpty = true;

            // get dictionary
            PdfDictionary pageDict = readerOrig.getPageN(i);

            // every pdf-version has its own way :-(
            char version = readerOrig.getPdfVersion();

            if (version == '3') {
                // PDF-Version: 3

                // examine the resource dictionary for /Font or
                // /XObject keys.  If either are present, they're almost
                // certainly actually used on the page -> not blank.
                PdfObject myObj = pageDict.get(PdfName.RESOURCES);
                PdfDictionary resDict = null;
                if (myObj instanceof PdfDictionary) {
                    resDict = (PdfDictionary) myObj;
                } else {
                    resDict = (PdfDictionary) PdfReader.getPdfObject(myObj);
                }
                if (resDict != null) {
                    flgIsEmpty = resDict.get(PdfName.FONT) == null && resDict.get(PdfName.XOBJECT) == null;
                    if (LOGGER.isInfoEnabled()) {
                        if (flgIsEmpty) {
                            LOGGER.info("probably empty page " + i + " Version: 1." + version
                                    + " FONT/XOBJECT found in File:" + origFileName);
                        } else {
                            LOGGER.info("normal page " + i + " Version: 1." + version
                                    + " no FONT/XOBJECT found in File:" + origFileName);
                        }
                    }
                }
            } else if (version == '4') {
                // PDF-Version: 4
                // check the contentsize.

                // get the page content
                byte bContent[] = readerOrig.getPageContent(i);
                ByteArrayOutputStream bs = new ByteArrayOutputStream();
                // write the content to an output stream
                bs.write(bContent);

                flgIsEmpty = true;
                if (bs.size() > blankPdfsize) {
                    if (LOGGER.isInfoEnabled())
                        LOGGER.info("normal page " + i + " Version: 1." + version + " BS:" + bs.size() + " File:"
                                + origFileName);
                    flgIsEmpty = false;
                } else {
                    if (LOGGER.isInfoEnabled())
                        LOGGER.info("probably empty page " + i + " Version: 1." + version + " BS:" + bs.size()
                                + " File:" + origFileName);
                }
            } else if (version == '5') {
                // PDF-Version: 5
                // check the contentsize.

                // get the page content
                byte bContent[] = readerOrig.getPageContent(i);
                ByteArrayOutputStream bs = new ByteArrayOutputStream();
                // write the content to an output stream
                bs.write(bContent);

                flgIsEmpty = true;
                if (bs.size() > blankPdfsize_v5) {
                    if (LOGGER.isInfoEnabled())
                        LOGGER.info("normal page " + i + " Version: 1." + version + " BS:" + bs.size() + " File:"
                                + origFileName);
                    flgIsEmpty = false;
                } else {
                    if (LOGGER.isInfoEnabled())
                        LOGGER.info("probably empty page " + i + " Version: 1." + version + " BS:" + bs.size()
                                + " File:" + origFileName);
                }
            }

            // add page to removed or trimmed document
            if (!flgIsEmpty || !flgTrim) {
                if (LOGGER.isInfoEnabled())
                    LOGGER.info("add page " + i);
                page = writerTrimmed.getImportedPage(readerOrig, i);
                writerTrimmed.addPage(page);
                countTrimmedPages++;
            } else {
                if (LOGGER.isInfoEnabled())
                    LOGGER.info("skip page " + i + " Version: 1." + version + " File:" + origFileName);
                if (writerRemoved != null) {
                    page = writerRemoved.getImportedPage(readerOrig, i);
                    writerRemoved.addPage(page);
                }
            }
        }

        return countTrimmedPages;
    }
}