net.ionescu.jhocr2pdf.Jhocr2pdf.java Source code

Introduction

Here is the source code for net.ionescu.jhocr2pdf.Jhocr2pdf.java
Source

/**
 * Copyright (C) 2012 Ovidiu-Laurian Ionescu
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 */

package net.ionescu.jhocr2pdf;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import static java.util.logging.Level.FINE;
import static java.util.logging.Level.FINER;
import static java.util.logging.Level.WARNING;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Element;
import com.itextpdf.text.PageSize;
import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.BaseFont;
import com.itextpdf.text.pdf.PdfContentByte;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfStamper;

/**
 * Combines a PDF file containing scanned TIFF images with the OCR text generated by Tesseract.
 * <p>
 * The class is a SAX handler for Tesseract's hOCR output: every {@code ocr_page} div selects the
 * matching page of the PDF being stamped, and every OCR word span is drawn onto that page at the
 * position given by its {@code bbox}, scaled from image pixels to PDF points.
 * <p>
 * Instances keep parsing state in fields and are therefore not safe for concurrent use.
 *
 * @author Ovidiu Ionescu
 */
public class Jhocr2pdf implements ContentHandler, ErrorHandler {
    private static final Logger log = Logger.getLogger(Jhocr2pdf.class.getName());

    /** A (possibly fractional, possibly negative) number inside an hOCR bbox. */
    private static final String NUMBER = "(-?\\d+(?:\\.\\d+)?)";

    /**
     * Matches the {@code bbox x0 y0 x1 y1} property of an hOCR title attribute, regardless of which
     * other properties ({@code image}, {@code ppageno}, {@code x_wconf}, ...) surround it.
     */
    private static final Pattern BBOX = Pattern
            .compile("\\bbbox\\s+" + NUMBER + "\\s+" + NUMBER + "\\s+" + NUMBER + "\\s+" + NUMBER);

    PdfStamper stamper;
    PdfContentByte canvas;
    BaseFont bf;
    float xScaling;
    float yScaling;

    boolean visible;
    int siglePageOCRNum = -1;

    StringBuilder text = new StringBuilder();
    String coords;

    /** Height in points of the page currently being stamped; A4 until the first page is loaded. */
    private float pageHeight = PageSize.A4.getHeight();

    /** Number of currently open elements nested inside the OCR word span being collected. */
    private int wordNesting;

    @Override
    public void setDocumentLocator(Locator locator) {
        // not needed
    }

    @Override
    public void startDocument() throws SAXException {
        // not needed
    }

    @Override
    public void endDocument() throws SAXException {
        // not needed
    }

    @Override
    public void startPrefixMapping(String prefix, String uri) throws SAXException {
        // not needed
    }

    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
        // not needed
    }

    /**
     * Reacts to the two hOCR elements of interest: word spans (their bbox is remembered until the
     * span closes) and page divs (the matching PDF page is loaded and the scaling recomputed).
     *
     * @throws SAXException if a page div carries no usable page number or bounding box
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
        if (coords != null) {
            // Markup inside a word (<strong>, <em>, an inner confidence span, ...): keep the
            // word's bbox and keep collecting its text instead of throwing both away.
            wordNesting++;
            return;
        }
        text.setLength(0);

        String klass = atts.getValue("class");
        if ("span".equals(localName)) {
            // Older Tesseract versions tag words "ocr_word", newer ones "ocrx_word".
            if ("ocr_word".equals(klass) || "ocrx_word".equals(klass)) {
                log.finer("start of span, we encountered an OCR word");
                coords = atts.getValue("title");
                wordNesting = 0;
            }
        } else if ("div".equals(localName) && "ocr_page".equals(klass)) {
            startPage(atts);
        }
    }

    /**
     * Selects the PDF page that corresponds to an {@code ocr_page} div and computes the factors
     * that translate Tesseract image coordinates into PDF coordinates.
     */
    private void startPage(Attributes atts) throws SAXException {
        log.finer("start div, start of page, load the corresponding page of the PDF file");
        int pageNum = siglePageOCRNum;
        if (pageNum < 0) {
            pageNum = pageNumberFromId(atts.getValue("id"));
            log.log(FINE, "extracted page number from the div id attribute: {0}", pageNum);
        }

        String title = atts.getValue("title");
        float[] imageBox = parseBbox(title);
        if (imageBox == null) {
            throw new SAXException("ocr_page " + pageNum + " has no bbox in its title attribute: " + title);
        }
        float imgWidth = imageBox[2] - imageBox[0];
        float imgHeight = imageBox[3] - imageBox[1];
        if (!(imgWidth > 0) || !(imgHeight > 0)) {
            throw new SAXException("ocr_page " + pageNum + " has an empty bbox: " + title);
        }

        canvas = visible ? stamper.getOverContent(pageNum) : stamper.getUnderContent(pageNum);

        /*
         * Calculate the scaling of the image.
         * Tesseract coordinates differ from the PDF coordinates so we can't use them directly.
         * But Tesseract gives us the dimension of the scanned image and we also know that the
         * image is as large as the page. So we calculate a scaling factor to translate from
         * Tesseract coordinates into PDF coordinates, using the real size of the page rather
         * than assuming A4. The rotated size is used because PdfStamper, by default, rotates
         * the content it adds to match the page rotation.
         */
        Rectangle pageSize = stamper.getReader().getPageSizeWithRotation(pageNum);
        xScaling = pageSize.getWidth() / imgWidth;
        yScaling = pageSize.getHeight() / imgHeight;
        pageHeight = pageSize.getHeight();
        log.log(FINE, "Load page {0} with scaling {1}x{2}", new Object[] { pageNum, xScaling, yScaling });
    }

    /** Extracts the page number from a div id of the form {@code page_N}. */
    private static int pageNumberFromId(String id) throws SAXException {
        String[] parts = (id == null) ? new String[0] : id.split("_");
        if (parts.length < 2) {
            throw new SAXException("Cannot extract the page number from the ocr_page id: " + id);
        }
        try {
            return Integer.parseInt(parts[1], 10);
        } catch (NumberFormatException e) {
            throw new SAXException("Cannot extract the page number from the ocr_page id: " + id, e);
        }
    }

    /**
     * Finds the bbox in an hOCR title attribute.
     *
     * @return {@code {x0, y0, x1, y1}} in image coordinates, or {@code null} if there is no bbox
     */
    private static float[] parseBbox(String title) {
        if (title == null) {
            return null;
        }
        Matcher m = BBOX.matcher(title);
        if (!m.find()) {
            return null;
        }
        float[] box = new float[4];
        for (int i = 0; i < box.length; i++) {
            box[i] = Float.parseFloat(m.group(i + 1));
        }
        return box;
    }

    /**
     * Adds the collected text to the PDF when the element that closes is an OCR word span.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (coords == null) {
            return;
        }
        if (wordNesting > 0) {
            // closing markup nested inside the word, the word span itself is still open
            wordNesting--;
            return;
        }
        if (text.length() > 0) {
            log.log(FINER, "OCRed word span closed, coords: {0} {1}", new Object[] { coords, text });
            addSpan(coords, text.toString());
        }
        coords = null;
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        text.append(ch, start, length);
    }

    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
        // not needed
    }

    @Override
    public void processingInstruction(String target, String data) throws SAXException {
        // not needed
    }

    @Override
    public void skippedEntity(String name) throws SAXException {
        // not needed
    }

    /** Logs parser warnings; parsing continues. */
    @Override
    public void warning(SAXParseException exception) throws SAXException {
        log.log(WARNING, "hOCR parser warning at line {0}: {1}",
                new Object[] { exception.getLineNumber(), exception.getMessage() });
    }

    /** Logs recoverable parser errors; parsing continues on a best-effort basis. */
    @Override
    public void error(SAXParseException exception) throws SAXException {
        log.log(WARNING, "hOCR parser error at line {0}: {1}",
                new Object[] { exception.getLineNumber(), exception.getMessage() });
    }

    /** The document is unusable after a fatal error, so the error is propagated to the caller. */
    @Override
    public void fatalError(SAXParseException exception) throws SAXException {
        throw exception;
    }

    /**
     * Draws one OCRed word on the current page, sized so that it spans the width of its bbox.
     * Words that cannot be placed (no bbox, no page loaded yet, zero width) are skipped.
     *
     * @param coords hOCR title attribute of the word, containing {@code bbox x0 y0 x1 y1}
     * @param text the recognized word
     */
    public void addSpan(String coords, String text) {
        float[] box = parseBbox(coords);
        if (box == null) {
            log.log(WARNING, "Skipping word without a usable bbox: {0}", coords);
            return;
        }
        if (canvas == null) {
            log.log(WARNING, "Skipping word found before any ocr_page: {0}", coords);
            return;
        }

        // calculate the font size that makes the text exactly as wide as the bbox
        float width = (box[2] - box[0]) * xScaling;
        float tenWidth = bf.getWidthPointKerned(text, 10);
        if (!(width > 0) || !(tenWidth > 0)) {
            // nothing measurable to draw; dividing would yield an infinite or NaN font size
            log.log(FINER, "Skipping zero width word, coords: {0}", coords);
            return;
        }
        float fontSize = 10 * width / tenWidth;

        // hOCR has its origin at the top left, PDF at the bottom left: flip the y axis and put
        // the lowest descender (the descent is negative) on the bottom edge of the bbox
        float x = box[0] * xScaling;
        float y = pageHeight - box[3] * yScaling - bf.getDescentPoint(text, fontSize);

        canvas.saveState();
        canvas.beginText();
        canvas.setTextRenderingMode(PdfContentByte.TEXT_RENDER_MODE_FILL);
        canvas.setRGBColorFill(0xFF, 0, 0);
        canvas.setLineWidth(0);
        canvas.setFontAndSize(bf, fontSize);
        canvas.showTextAlignedKerned(Element.ALIGN_LEFT, text, x, y, 0);
        canvas.endText();
        canvas.restoreState();
    }

    @SuppressWarnings("unused")
    private Jhocr2pdf() {

    }

    /**
     * @param stamper stamper of the PDF that receives the text
     * @param visible {@code true} to draw the text over the page content, {@code false} under it
     * @param fontPath path of the font file to embed, or {@code null} for the default font
     */
    public Jhocr2pdf(PdfStamper stamper, boolean visible, String fontPath) throws DocumentException, IOException {
        this.stamper = stamper;
        this.visible = visible;
        if (fontPath != null) {
            bf = BaseFont.createFont(fontPath, BaseFont.WINANSI, BaseFont.EMBEDDED);
        } else {
            bf = BaseFont.createFont();
        }
    }

    /**
     * Parses one hOCR file and stamps its words onto the PDF.
     *
     * @param fileName path of the hOCR file, read as UTF-8
     * @param pageNum page number if the OCR file corresponds to just one page, -1 for multi page
     * @throws SAXException if the file is not well formed or a page div is unusable
     * @throws IOException if the file cannot be read
     */
    public void parse(String fileName, int pageNum) throws SAXException, IOException {
        siglePageOCRNum = pageNum;
        // start clean even if a previous parse was aborted in the middle of a word
        coords = null;
        wordNesting = 0;
        text.setLength(0);

        XMLReader reader = XMLReaderFactory.createXMLReader();
        reader.setContentHandler(this);
        reader.setErrorHandler(this);
        // Never fetch DTDs: hOCR files may declare a DOCTYPE after the first line, and resolving
        // it would require network access to a DTD that may not even be valid XML.
        reader.setEntityResolver(new EntityResolver() {
            @Override
            public InputSource resolveEntity(String publicId, String systemId) {
                return new InputSource(new StringReader(""));
            }
        });

        try (BufferedReader characterStream = new BufferedReader(
                new InputStreamReader(new FileInputStream(fileName), StandardCharsets.UTF_8))) {
            // skip past the DOCTYPE line
            characterStream.readLine();
            reader.parse(new InputSource(new XmlCharFilterReader(characterStream)));
        }
    }

    /**
     * Tells whether a single UTF-16 code unit is, on its own, a character allowed in XML 1.0.
     * Surrogates are not: they are only valid as a high/low pair.
     */
    static boolean isXmlChar(char c) {
        return c == 0x0009 || c == 0x000A || c == 0x000D || (c >= 0x0020 && c <= 0xD7FF)
                || (c >= 0xE000 && c <= 0xFFFD);
    }

    /**
     * Reader that replaces every character not allowed in XML 1.0 with a space, so that stray
     * control characters in the OCR output do not abort the parse. Well-formed surrogate pairs
     * (characters above U+FFFF) are kept, unpaired surrogates are replaced.
     */
    private static final class XmlCharFilterReader extends Reader {
        private final BufferedReader source;

        /** Set when the previous chunk ended with a high surrogate whose low half comes next. */
        private boolean lowSurrogateExpected;

        XmlCharFilterReader(BufferedReader source) {
            this.source = source;
        }

        @Override
        public int read(char[] cbuf, int off, int len) throws IOException {
            int count = source.read(cbuf, off, len);
            if (count <= 0) {
                return count;
            }
            int end = off + count;
            for (int i = off; i < end; i++) {
                char c = cbuf[i];
                boolean continuesPair = lowSurrogateExpected;
                lowSurrogateExpected = false;

                if (Character.isHighSurrogate(c)) {
                    if (i + 1 < end) {
                        if (Character.isLowSurrogate(cbuf[i + 1])) {
                            i++; // valid pair, keep both halves
                        } else {
                            cbuf[i] = ' ';
                        }
                    } else if (nextIsLowSurrogate()) {
                        // the pair is split across two reads, the low half arrives next
                        lowSurrogateExpected = true;
                    } else {
                        cbuf[i] = ' ';
                    }
                } else if (Character.isLowSurrogate(c)) {
                    if (!continuesPair) {
                        cbuf[i] = ' ';
                    }
                } else if (!isXmlChar(c)) {
                    cbuf[i] = ' ';
                }
            }
            return count;
        }

        /** Peeks at the next character of the source without consuming it. */
        private boolean nextIsLowSurrogate() throws IOException {
            source.mark(1);
            int next = source.read();
            source.reset();
            return next >= 0 && Character.isLowSurrogate((char) next);
        }

        @Override
        public void close() throws IOException {
            source.close();
        }
    }

    /** Returns the value following the flag at {@code flagIndex}, exiting if there is none. */
    private static String flagValue(String[] args, int flagIndex) {
        if (flagIndex + 1 >= args.length) {
            System.err.println("Missing value for parameter: " + args[flagIndex]);
            System.exit(1);
        }
        return args[flagIndex + 1];
    }

    /**
     * @param args
     * The arguments are:
     *  [options] output_pdf
     * The name of the output PDF file with the added OCR information must be the last argument
     * Other
     * flags:
     *   -i pdf_file PDF file containing the TIFF images (required)
     *   -hocr html_file hOCR file generated by Tesseract
     *   -visible render the text above the image (mostly for debugging)
     *   -font font_path  path to the font file to use
     *   -hocrnameformat string to use to construct the name of the hocr file depending on page number
     * One of -hocr or -hocrnameformat is required; -hocrnameformat wins if both are given.
     *
     * Usage example:
     * java -jar jhocr2pdf.jar -font /usr/share/fonts/truetype/ttf-dejavu/DejaVuSans.ttf -i input.pdf -hocr input.html output.pdf
     *
     * @throws SAXException
     * @throws IOException
     * @throws DocumentException
     */
    public static void main(String[] args) throws SAXException, IOException, DocumentException {
        /*
         * process input parameters
         */
        boolean visible = false;
        String fontPath = null;
        String fileNameFormat = null;
        String inputPDF = null;
        String inputHOCR = null;
        String outputPDF = null;

        for (int i = 0; i < args.length; i++) {
            if (args[i].startsWith("-")) {
                switch (args[i]) {

                case "-visible":
                    visible = true;
                    break;

                case "-font":
                    fontPath = flagValue(args, i++);
                    break;

                case "-hocrnameformat":
                    fileNameFormat = flagValue(args, i++);
                    break;

                case "-i":
                    inputPDF = flagValue(args, i++);
                    break;

                case "-hocr":
                    inputHOCR = flagValue(args, i++);
                    break;

                default:
                    System.err.println("Invalid parameter: " + args[i]);
                    System.exit(1);
                }
            } else {
                if (i < args.length - 1) {
                    System.err.println("Ouput file name should be the last argument");
                    System.exit(1);
                }
                outputPDF = args[i];
            }
        }

        if (inputPDF == null || outputPDF == null || (inputHOCR == null && fileNameFormat == null)) {
            System.err.println("Usage: [-visible] [-font font_path] -i input_pdf"
                    + " (-hocr hocr_file | -hocrnameformat format) output_pdf");
            System.exit(1);
        }

        log.fine("load the PDF file, initialize the Stamper");
        PdfReader reader = new PdfReader(inputPDF);
        try (FileOutputStream out = new FileOutputStream(outputPDF)) {
            PdfStamper stamper = new PdfStamper(reader, out);

            Map<String, String> info = reader.getInfo();
            info.put("Title", new File(outputPDF).getName());
            stamper.setMoreInfo(info);

            Jhocr2pdf ocrStamp = new Jhocr2pdf(stamper, visible, fontPath);
            if (null != fileNameFormat) {
                log.fine("iterate through all the pages looking for separate hocr files");
                for (int i = 1, maxPages = reader.getNumberOfPages() + 1; i < maxPages; i++) {
                    String hocrFileName = String.format(fileNameFormat, i);
                    try {
                        ocrStamp.parse(hocrFileName, i);
                    } catch (SAXException saxException) {
                        System.err.println("Error processing hocr file: " + hocrFileName);
                        throw saxException;
                    }
                }
            } else {
                ocrStamp.parse(inputHOCR, -1);
            }
            stamper.close();
        } finally {
            reader.close();
        }
    }
}