org.mnsoft.pdfocr.Wrapper.java Source code

Introduction

Here is the source code for org.mnsoft.pdfocr.Wrapper.java
Source

package org.mnsoft.pdfocr;

import com.lowagie.text.DocumentException;
import com.lowagie.text.pdf.PdfContentByte;
import com.lowagie.text.pdf.PdfImportedPage;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.PdfStamper;

import org.apache.commons.io.FileUtils;

import org.apache.log4j.Logger;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.TreeMap;

/**
 * Provides the wrapper for the OCR engine.<p>
 *
 * This program is a free software available under the GNU
 * Lesser General Public License.
 *
 * This code is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * @author (c) 2010, Matthias Nott
 */
public class Wrapper {
    /**
     * Logger for this class
     */
    private static final Logger log = Logger.getLogger(Wrapper.class);

    /**
     * Default config file name.
     */
    public static final String CONFIG_FILE_NAME = "pdfocr.properties";

    /**
     * Prefix marking a parameter as a Java system property; the remainder
     * of the parameter name is the system property name.
     */
    private static final String SYSTEM_PROPERTY_PREFIX = "SYS.";

    /**
     * The command to run the OCR Engine,
     * with place holders for the Input and Output file names.
     */
    public String OCR_COMMAND = "abbyyocr -ic Uncompressed -if ###IF### -f PDF -pem ImageOnText -pfq 100% -pfc ###CREATOR### -rtn -of ###OF###";

    /**
     * The creator field entry. If detected,
     * file is not treated (again).
     */
    public String OCR_CREATOR = "ocr";

    /**
     * Temporary extension added to the file
     * name when running the OCR.
     */
    public String TMP_EXTENSION = ".ocr";

    /**
     * Directory for temporary files
     */
    public String TMP_DIR = ".";

    /**
     * The program parameters, keyed by upper-cased parameter name.
     */
    private HashMap<String, String> parameters = new HashMap<String, String>();

    /**
     * System properties requested on the command line, keyed by the property
     * name exactly as it was given (prefix removed). Kept separately because
     * {@link #parameters} upper-cases its keys, while Java system property
     * names are case sensitive.
     */
    private final TreeMap<String, String> commandLineSystemProperties = new TreeMap<String, String>();

    /**
     * The configuration file name, if any.
     */
    private String propertiesFile = null;

    /**
     * Working directory.
     */
    private String wd = ".";

    /**
     * Constructor.
     *
     * @param wd The directory that is searched recursively for PDF files;
     *           <code>null</code> falls back to the current directory.
     */
    public Wrapper(String wd) {
        this.wd = (wd == null) ? "." : wd;
    }

    /**
     * The main loop. The first argument, if any, is used as the working
     * directory; all arguments are additionally parsed as parameters.
     *
     * @param args The command line parameters.
     * @throws Exception If processing fails.
     */
    public static void main(String[] args) throws Exception {
        final Wrapper w = new Wrapper((args.length == 0) ? "." : args[0]);

        w.init(args);

        w.run();
    }

    /**
     * Run the Wrapper: walk the working directory and run every PDF file
     * that does not yet carry the OCR creator marker through the OCR engine.
     *
     * @throws IOException If a temporary file cannot be created or the result cannot be moved into place.
     * @throws InterruptedException If the thread is interrupted while waiting for the OCR engine.
     * @throws DocumentException Declared for backward compatibility with existing callers.
     */
    public void run() throws IOException, InterruptedException, DocumentException {
        RecursiveFileListIterator it = new RecursiveFileListIterator(new File(wd), new FileFilter(".pdf"));

        while (it.hasNext()) {
            final File originalFile = it.next();

            processFile(originalFile);
        }
    }

    /**
     * Process a single PDF file: read its meta data, run the OCR engine,
     * merge the result and replace the original file.<p>
     *
     * The original file is only replaced once the merged document has been
     * written successfully; on any failure the original is left untouched.
     *
     * @param originalFile The PDF file to work on.
     * @throws IOException If a temporary file cannot be created or the result cannot be moved into place.
     * @throws InterruptedException If the thread is interrupted while waiting for the OCR engine.
     */
    @SuppressWarnings("rawtypes")
    private void processFile(File originalFile) throws IOException, InterruptedException {
        final String originalFilePath = originalFile.getAbsolutePath();

        /*
         * Get the document time stamp so that we can set it later.
         */
        final Date doc_timestamp = new Date(originalFile.lastModified());

        /*
         * Open the reader on the original File
         */
        PdfReader readerOnOriginalFile;

        try {
            readerOnOriginalFile = new PdfReader(originalFilePath);
        } catch (Exception e) {
            log.error("! ERROR: " + e.getMessage() + " File: " + originalFilePath);

            return;
        }

        String doc_title;
        String doc_subject;
        String doc_keywords;
        String doc_author;
        int nOri;

        try {
            /*
             * Get the document information
             */
            final Map info = readerOnOriginalFile.getInfo();

            /*
             * If the document has already been worked on,
             * skip it.
             */
            if ((info != null) && this.OCR_CREATOR.equals(info.get("Creator"))) {
                log.debug(
                        "+ INFO: File " + originalFilePath + " had already been run trough OCR engine. Skipping.");

                return;
            }

            nOri = readerOnOriginalFile.getNumberOfPages();

            /*
             * Get the remaining meta data
             */
            doc_title = metadataValue(info, "Title");
            doc_subject = metadataValue(info, "Subject");
            doc_keywords = metadataValue(info, "Keywords");
            doc_author = metadataValue(info, "Author");
        } finally {
            /*
             * Release the reader on every path, including the
             * "already processed" early return above.
             */
            readerOnOriginalFile.close();
        }

        log.debug("+ Working on: " + originalFilePath + " (" + nOri + " pages).");

        /*
         * Run the OCR Engine. An interruption aborts the whole
         * run; any other failure only skips this file.
         */
        File outputFileFromOCR = null;
        try {
            outputFileFromOCR = ocr(originalFile);
        } catch (InterruptedException e) {
            throw e;
        } catch (Exception e) {
            log.error("! ERROR: " + e.getMessage());

            return;
        }

        /*
         * Check for the result of the OCR Engine
         */
        if ((outputFileFromOCR == null) || !outputFileFromOCR.exists()) {
            return;
        }

        log.debug("+ " + outputFileFromOCR.getAbsolutePath() + " has come out of the OCR engine.");

        /*
         * Create the final output in a temporary file first, so
         * that the original is never touched by a failed merge.
         */
        final File temp = File.createTempFile("ocr", ".pdf", new File(this.TMP_DIR));
        temp.deleteOnExit();

        try {
            final boolean merged = mergePDFs(originalFile, outputFileFromOCR, temp, doc_title, doc_subject,
                    doc_keywords, doc_author, this.OCR_CREATOR);

            if (!merged) {
                log.error("! ERROR: Merging failed, leaving original file untouched. File: " + originalFilePath);

                return;
            }

            FileUtils.deleteQuietly(originalFile);

            FileUtils.moveFile(temp, new File(originalFilePath));

            /*
             * Set the file access time
             */
            if ("true".equals(getAttribute("KEEPTS")) && originalFile.exists()) {
                if (!originalFile.setLastModified(doc_timestamp.getTime() + 1000)) {
                    log.warn("! Could not restore time stamp of " + originalFilePath);
                }
            }
        } finally {
            /*
             * Finally, remove the temporary documents
             */
            FileUtils.deleteQuietly(temp);
            FileUtils.deleteQuietly(outputFileFromOCR);
        }
    }

    /**
     * Read one entry of a PDF info map.
     *
     * @param info The info map of the document, may be <code>null</code>.
     * @param key The name of the entry.
     * @return The entry as a string, or the empty string if it is missing.
     */
    @SuppressWarnings("rawtypes")
    private static String metadataValue(Map info, String key) {
        final Object value = (info == null) ? null : info.get(key);

        return (value == null) ? "" : value.toString();
    }

    /**
     * Run the OCR command
     *
     * @param originalFile The file to run the command on
     * @return The file that was created
     * @throws IOException If copying, running the engine or moving its output fails.
     * @throws InterruptedException If the thread is interrupted while waiting for the engine.
     */
    private File ocr(File originalFile) throws IOException, InterruptedException {
        /*
         * Create a temporary file and copy the source
         * file to it, to avoid UTF-8 encoding problems
         * on the filename confusing the OCR engine
         */
        log.debug("> Creating Temporary Source File");

        final File sourceFileForOCR = File.createTempFile("ocr", ".pdf", new File(this.TMP_DIR));
        sourceFileForOCR.deleteOnExit();

        final File engineOutput = new File(sourceFileForOCR.getAbsolutePath() + this.TMP_EXTENSION);

        try {
            log.debug("< Created Temporary Source File: " + sourceFileForOCR.getAbsolutePath());

            FileUtils.copyFile(originalFile, sourceFileForOCR, true);

            log.debug("+ Copied " + originalFile.getAbsolutePath() + " to " + sourceFileForOCR.getAbsolutePath());

            /*
             * Create the command line. Fall back to the defaults
             * if the configuration did not provide a value.
             */
            String command = getAttribute("cmd");
            if (command == null) {
                command = this.OCR_COMMAND;
            }

            String creator = getAttribute("creator");
            if (creator == null) {
                creator = this.OCR_CREATOR;
            }

            final String[] cmd = StringUtility.split(command, " ");
            final StringBuilder commandLine = new StringBuilder();

            for (int i = 0; i < cmd.length; i++) {
                if ("###IF###".equals(cmd[i])) {
                    cmd[i] = sourceFileForOCR.getAbsolutePath();
                } else if ("###OF###".equals(cmd[i])) {
                    cmd[i] = engineOutput.getAbsolutePath();
                } else if ("###CREATOR###".equals(cmd[i])) {
                    cmd[i] = creator;
                }

                commandLine.append(cmd[i]);
                commandLine.append(" ");
            }

            log.debug("> Calling OCR Engine: " + commandLine);

            callOCREngine(cmd);

            /*
             * Move temporary output file to output file
             */
            final File targetFile = new File(this.TMP_DIR, originalFile.getName() + this.TMP_EXTENSION);
            FileUtils.deleteQuietly(targetFile);

            FileUtils.moveFile(engineOutput, targetFile);

            log.debug("< Calling OCR Engine. Output file is: " + targetFile.getAbsolutePath());

            return targetFile;
        } finally {
            /*
             * Delete the temporary source copy right away instead of
             * waiting for JVM exit, and any engine output left behind
             * by a failed run.
             */
            FileUtils.deleteQuietly(sourceFileForOCR);
            FileUtils.deleteQuietly(engineOutput);
        }
    }

    /**
     * Call an external program and log everything it prints.<p>
     *
     * The error stream is merged into the standard output and the output is
     * drained <em>before</em> waiting for the process; waiting first would
     * block forever as soon as the process fills the pipe buffer.
     *
     * @param cmd The command and its arguments.
     * @throws IOException If the process cannot be started or its output cannot be read.
     * @throws InterruptedException If the thread is interrupted while waiting for the process.
     */
    private void callOCREngine(String[] cmd) throws IOException, InterruptedException {
        final ProcessBuilder builder = new ProcessBuilder(cmd);
        builder.redirectErrorStream(true);

        final Process pr = builder.start();
        final BufferedReader buf = new BufferedReader(new InputStreamReader(pr.getInputStream()));

        boolean finished = false;
        try {
            String line;
            while ((line = buf.readLine()) != null) {
                log.info(line);
            }

            final int exitCode = pr.waitFor();
            finished = true;

            if (exitCode != 0) {
                log.warn("! OCR engine exited with code " + exitCode);
            }
        } finally {
            try {
                buf.close();
            } catch (IOException ignored) {
                /* Nothing sensible can be done if closing the pipe fails. */
            }

            /*
             * Do not leave the engine running if we bail out early.
             */
            if (!finished) {
                pr.destroy();
            }
        }
    }

    /**
     * Merge the original document with the document that came out of the
     * OCR engine and write the result, including the meta data, to a new file.<p>
     *
     * Pages of the OCR output are only put underneath the original pages if
     * the parameter <code>mergefiles</code> is <code>true</code> and the OCR
     * page is sufficiently larger than the original page (parameter
     * <code>THRESHOLD</code>, default 2).
     *
     * @param foreground The original document.
     * @param background The document created by the OCR engine.
     * @param newFile The file to write the merged document to.
     * @param title The title to set, ignored if <code>null</code>.
     * @param subject The subject to set, ignored if <code>null</code>.
     * @param keywords The keywords to set, ignored if <code>null</code>.
     * @param author The author to set, ignored if <code>null</code>.
     * @param creator The creator to set, ignored if <code>null</code>.
     * @return <code>true</code> if the merged document was written completely,
     *         <code>false</code> if anything went wrong.
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    private boolean mergePDFs(File foreground, File background, File newFile, String title, String subject,
            String keywords, String author, String creator) {
        log.debug("Merge " + foreground + " (FG) and " + background + " (BG) to " + newFile);

        final double threshold = ((Integer) StringUtility.StringToInteger(getAttribute("THRESHOLD"), 2))
                .doubleValue();

        /*
         * If we're not explicitly merging, we're merging
         * the document with itself only anyway.
         */
        final boolean mergeFiles = "true".equals(getAttribute("mergefiles"));

        PdfReader fg = null;
        PdfReader bg = null;
        FileOutputStream out = null;
        PdfStamper fg_writer = null;
        boolean success = false;

        try {
            /*
             * Foreground: Original Image.
             * Background: OCR'd Text
             */
            fg = new PdfReader(foreground.getAbsolutePath());
            bg = new PdfReader(background.getAbsolutePath());

            /*
             * Count pages for foreground and background
             */
            final int fg_num_pages = fg.getNumberOfPages();
            final int bg_num_pages = bg.getNumberOfPages();

            if (fg_num_pages != bg_num_pages) {
                log.error(
                        "! Foreground and background have different number of pages. This should really not happen.");
            }

            /*
             *  The output document
             */
            out = new FileOutputStream(newFile);
            fg_writer = new PdfStamper(fg, out);

            for (int i = 1; i <= fg_num_pages; i++) {
                System.out.print(" [" + i + "]");

                if (!mergeFiles) {
                    continue;
                }

                /*
                 * The OCR output may be shorter than the original;
                 * there is nothing to put underneath such pages.
                 */
                if (i > bg_num_pages) {
                    log.debug("! Not adding background for page " + i + " since the background has only "
                            + bg_num_pages + " pages.");

                    continue;
                }

                final int fg_size = fg.getPageContent(i).length;
                final int bg_size = bg.getPageContent(i).length;

                /*
                 * Modification 20130904
                 *
                 * When we run an OCR on a "pdf+text" file, collating the
                 * result with the original yields overlapping text which
                 * is not pixel correct. So if the background is not more
                 * than threshold times as large as the foreground, we
                 * assume the page was plain text already and don't add
                 * the background.
                 */
                if (!exceedsThreshold(bg_size, fg_size, threshold)) {
                    log.debug("! Not adding background for page " + i + " since background size (" + bg_size
                            + ") not different enough from foreground size (" + fg_size + ").");

                    continue;
                }

                final PdfImportedPage bg_page = fg_writer.getImportedPage(bg, i);
                final PdfContentByte contentByte = fg_writer.getUnderContent(i);

                contentByte.addTemplate(bg_page, 0, 0);
            }

            HashMap map = fg_writer.getMoreInfo();
            if (map == null) {
                map = new HashMap();
            }

            if (title != null) {
                map.put("Title", title);
            }

            if (subject != null) {
                map.put("Subject", subject);
            }

            if (keywords != null) {
                map.put("Keywords", keywords);
            }

            if (author != null) {
                map.put("Author", author);
            }

            if (creator != null) {
                map.put("Creator", creator);
            }

            fg_writer.setMoreInfo(map);

            fg_writer.close();
            fg_writer = null;

            System.out.println("");

            success = true;
        } catch (Exception e) {
            log.error("! ERROR: Could not merge " + foreground + " and " + background + ": " + e.getMessage(), e);
        } finally {
            /*
             * Best effort clean up; the stamper is only still set
             * here if the merge failed before it was closed.
             */
            if (fg_writer != null) {
                try {
                    fg_writer.close();
                } catch (Exception ignored) {
                    /* The output is discarded by the caller anyway. */
                }
            }

            if (out != null) {
                try {
                    out.close();
                } catch (IOException ignored) {
                    /* Closing an already closed stream is harmless. */
                }
            }

            closeQuietly(fg);
            closeQuietly(bg);
        }

        return success;
    }

    /**
     * Decide whether a background page is large enough, compared to the
     * foreground page, to be added. The ratio is computed in floating
     * point so that it is not truncated before the comparison.
     *
     * @param bgSize Size of the background page content in bytes.
     * @param fgSize Size of the foreground page content in bytes.
     * @param threshold The ratio that has to be exceeded.
     * @return <code>true</code> if <code>bgSize / fgSize</code> is larger than the threshold;
     *         for an empty foreground, <code>true</code> if the background has any content.
     */
    private static boolean exceedsThreshold(int bgSize, int fgSize, double threshold) {
        if (fgSize <= 0) {
            return bgSize > 0;
        }

        return (((double) bgSize) / fgSize) > threshold;
    }

    /**
     * Close a reader, ignoring <code>null</code> and any failure.
     *
     * @param reader The reader to close, may be <code>null</code>.
     */
    private static void closeQuietly(PdfReader reader) {
        if (reader == null) {
            return;
        }

        try {
            reader.close();
        } catch (RuntimeException ignored) {
            /* Nothing sensible can be done if closing fails. */
        }
    }

    /**
     * Initialize the program.
     *
     * @param args The command line parameters.
     */
    private void init(String[] args) {
        /*
         * Get the command line Parameters.
         */
        getCommandLineParameters(args);

        /*
         * The name of the properties file can be given
         * by the "cfg" command line parameter; otherwise
         * the default name is used.
         */
        this.propertiesFile = getAttribute("cfg");
        if ((this.propertiesFile == null) || "".equals(this.propertiesFile)) {
            this.propertiesFile = Wrapper.CONFIG_FILE_NAME;
        }

        readConfiguration(this.propertiesFile);
    }

    /**
     * Read the configuration properties file and merge it with the
     * command line parameters.<p>
     *
     * Command line parameters take precedence over the configuration file,
     * which in turn takes precedence over the built-in defaults. This also
     * holds if there is no configuration file at all. Parameters whose name
     * starts with <code>SYS.</code> are additionally set as Java system
     * properties, using the remainder of the name as written.
     *
     * @param filename The configuration file name.
     */
    private void readConfiguration(String filename) {
        /*
         * Read the configuration file
         */
        log.debug("> readConfiguration");
        log.debug("> Loading configuration from " + filename);

        Properties p = FileLoader.loadProperties(filename);
        if (p == null) {
            log.info("! No configuration file specified. Using default values.");

            p = new Properties();
        }

        /*
         * Sort the configuration parameters by name. The original
         * names are kept, since system property names are case sensitive.
         */
        final TreeMap<String, String> configurationParameters = new TreeMap<String, String>();
        for (final Iterator<Object> it = p.keySet().iterator(); it.hasNext();) {
            final String pName = (String) it.next();
            final String pValue = (String) p.get(pName);
            configurationParameters.put(pName, pValue);
        }

        /*
         * We save the parameters that already are in the
         * HashMap, i.e. which have been passed in as command
         * line parameters
         */
        final TreeMap<String, String> commandLineParameters = new TreeMap<String, String>(this.parameters);

        final HashMap<String, String> upperCaseParameters = new HashMap<String, String>();

        for (final Map.Entry<String, String> entry : configurationParameters.entrySet()) {
            final String name = entry.getKey();
            final String key = name.toUpperCase();
            final String value = entry.getValue();

            upperCaseParameters.put(key, value);
            log.debug("+ Configuration Parameter " + key + "=" + value);

            if (isSystemProperty(name)) {
                final String propertyName = name.substring(SYSTEM_PROPERTY_PREFIX.length());

                System.setProperty(propertyName, value);
                log.debug("+ Java System   Parameter " + propertyName + "=" + value);
            }
        }

        /*
         * Command line parameters come last, so they win.
         */
        for (final Map.Entry<String, String> entry : commandLineParameters.entrySet()) {
            final String key = entry.getKey().toUpperCase();
            final String value = entry.getValue();

            upperCaseParameters.put(key, value);
            log.debug("+ Command Line  Parameter " + key + "=" + value);
        }

        for (final Map.Entry<String, String> entry : this.commandLineSystemProperties.entrySet()) {
            System.setProperty(entry.getKey(), entry.getValue());
            log.debug("+ Java System   Parameter " + entry.getKey() + "=" + entry.getValue());
        }

        this.parameters = upperCaseParameters;

        /*
         * Take over configured values; fall back to
         * the defaults for everything that is missing.
         */
        this.OCR_COMMAND = resolveSetting("cmd", this.OCR_COMMAND);
        this.OCR_CREATOR = resolveSetting("creator", this.OCR_CREATOR);
        this.TMP_EXTENSION = resolveSetting("tmpext", this.TMP_EXTENSION);
        this.TMP_DIR = resolveSetting("tmpdir", this.TMP_DIR);

        log.debug("< readConfiguration");
    }

    /**
     * Determine the effective value of a setting. If the setting has been
     * configured, that value is used; otherwise the default value is
     * stored as the parameter so that later lookups find it.
     *
     * @param name The parameter name, not case sensitive.
     * @param defaultValue The value to use if the parameter is not configured.
     * @return The effective value.
     */
    private String resolveSetting(String name, String defaultValue) {
        String value = getAttribute(name);

        if (value == null) {
            value = defaultValue;
            setAttribute(name, defaultValue);
        }

        log.debug("+ Configuration Parameter " + name.toUpperCase() + "=" + value);

        return value;
    }

    /**
     * Check whether a parameter name denotes a Java system property,
     * i.e. starts with <code>SYS.</code> (in any case) followed by at
     * least one more character.
     *
     * @param name The parameter name.
     * @return <code>true</code> if the parameter is a system property.
     */
    private static boolean isSystemProperty(String name) {
        final int prefixLength = SYSTEM_PROPERTY_PREFIX.length();

        return (name.length() > prefixLength)
                && name.regionMatches(true, 0, SYSTEM_PROPERTY_PREFIX, 0, prefixLength);
    }

    /**
     * Get the command line parameters into the
     * parameters HashMap.
     *
     * @param args The command line parameters.
     */
    private void getCommandLineParameters(String[] args) {
        for (int i = 0; i < args.length; i++) {
            final String parametername = StringUtility.getParameter(args[i], true, false);
            final String parametervalue = StringUtility.getParameter(args[i], false, false);

            if ((parametername != null) && (parametervalue != null)) {
                setAttribute(parametername, parametervalue);

                /*
                 * Remember system properties under the name as returned by
                 * the parser, before setAttribute's upper-casing loses it.
                 * NOTE(review): assumes StringUtility.getParameter keeps
                 * the case of the name - confirm.
                 */
                if (isSystemProperty(parametername)) {
                    this.commandLineSystemProperties
                            .put(parametername.substring(SYSTEM_PROPERTY_PREFIX.length()), parametervalue);
                }
            }
        }
    }

    /**
     * Get an Application Attribute.
     * @param par The Attribute parameter name, not case sensitive.
     * @return The Attribute parameter value, or <code>null</code> if it is not set.
     */
    private String getAttribute(String par) {
        return this.parameters.get(par.toUpperCase());
    }

    /**
     * Set an Application Attribute.
     *
     * Use with great care. This can open a can of worms
     * in terms of concurrency.
     *
     * @param par The Attribute parameter name, not case sensitive.
     * @param val The Attribute parameter value. Nothing is stored if
     *            either the name or the value is <code>null</code>.
     */
    private void setAttribute(String par, String val) {
        if ((null != par) && (null != val)) {
            this.parameters.put(par.toUpperCase(), val);
        }
    }
}