pl.marcinmilkowski.hocrtopdf.Main.java Source code

Introduction

Here is the source code for pl.marcinmilkowski.hocrtopdf.Main.java
Source

package pl.marcinmilkowski.hocrtopdf;

/**
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 * Copyright 2007
 * @author Florian Hackenberger <florian@hackenberger.at>
 * Modified by Marcin Miłkowski <http://marcinmilkowski.pl>
 */

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;

import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Font;
import com.itextpdf.text.Image;
import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.PdfContentByte;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.text.pdf.codec.TiffImage;
import com.itextpdf.text.pdf.RandomAccessFileOrArray;
import com.itextpdf.text.pdf.BaseFont;

/**
 * Command-line converter from hOCR to PDF.
 *
 * <p>For every {@code ocr_page} element of the hOCR input the scanned image named in
 * the element's {@code title} attribute becomes one PDF page, and the text of every
 * {@code ocrx_word} element is written as invisible text over its bounding box.
 *
 * @author fhackenberger
 */
public class Main {

    /** Matches the hOCR class name {@code ocr_line}. */
    public static final Pattern OCRLINE = Pattern.compile("ocr_line");
    /** Matches the hOCR class name {@code ocr_page}. */
    public static final Pattern OCRPAGE = Pattern.compile("ocr_page");
    /** Despite its name, matches {@code ocrx_word} or {@code ocr_page} (not {@code ocr_line}). */
    public static final Pattern OCRPAGEORLINE = Pattern.compile("ocrx_word|ocr_page");
    /** Matches the hOCR class name {@code ocrx_word}. */
    public static final Pattern OCRXWORD = Pattern.compile("ocrx_word");

    /** Extracts the value of the {@code image} property from a page's title attribute. */
    private static final Pattern IMAGE_PATTERN = Pattern.compile("image\\s+([^;]+)");
    /** Extracts the four integer coordinates of the {@code bbox} property. */
    private static final Pattern BBOX_PATTERN = Pattern
            .compile("bbox\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)");
    /** The resolution of a PDF file (using iText) is 72pt per inch. */
    private static final float POINTS_PER_INCH = 72.0f;
    /** Resolution assumed for images that do not declare one. */
    private static final int DEFAULT_DPI = 300;

    /** Signals a problem with the input that is reported as a plain message, without a stack trace. */
    private static final class ConversionException extends Exception {
        private static final long serialVersionUID = 1L;

        ConversionException(String message) {
            super(message);
        }

        ConversionException(String message, Throwable cause) {
            super(message, cause);
        }
    }

    /**
     * Program entry point.
     *
     * <p>Prints the usage text and exits with status 0 when the first argument is
     * {@code --help} or {@code -h}; prints it and exits with status -1 when fewer than
     * two arguments are given. Any conversion failure ends the process with status -1.
     *
     * @param args {@code args[0]} is the path of the hOCR input file, {@code args[1]}
     *             the path of the PDF file to write
     */
    public static void main(String[] args) {
        // Strings must be compared with equals(); "==" never matches command-line input.
        boolean helpRequested = args.length > 0 && ("--help".equals(args[0]) || "-h".equals(args[0]));
        if (helpRequested || args.length < 2) {
            System.out.print("Usage: java pl.marcinmilkowski.hocrtopdf.Main INPUTURL.html OUTPUTURL.pdf\n"
                    + "\n" + "Converts hOCR files into PDF\n" + "\n"
                    + "Example: java pl.marcinmilkowski.hocrtopdf.Main hocr.html output.pdf\n");
            System.exit(helpRequested ? 0 : -1);
        }

        URL inputHOCRFile = null;
        try {
            inputHOCRFile = new File(args[0]).toURI().toURL();
        } catch (MalformedURLException e) {
            System.out.println("The first parameter has to be a valid file.");
            System.out.println("We got an error: " + e.getMessage());
            System.exit(-1);
        }

        FileOutputStream outputPDFStream = null;
        try {
            outputPDFStream = new FileOutputStream(args[1]);
        } catch (FileNotFoundException e) {
            System.out.println("The second parameter has to be a valid URL");
            System.out.println("We got an error: " + e.getMessage());
            System.exit(-1);
        }

        int exitStatus = 0;
        try {
            // Using the jericho library to parse the HTML file
            convert(new Source(inputHOCRFile), outputPDFStream);
        } catch (ConversionException e) {
            System.out.println(e.getMessage());
            if (e.getCause() != null) {
                System.out.println("We got an error: " + e.getCause().getMessage());
            }
            exitStatus = -1;
        } catch (DocumentException e) {
            e.printStackTrace();
            exitStatus = -1;
        } catch (IOException e) {
            e.printStackTrace();
            exitStatus = -1;
        } finally {
            try {
                // Harmless if the stream was already closed together with the document.
                outputPDFStream.close();
            } catch (IOException ignored) {
                // Nothing sensible can be done about a failing close at this point.
            }
        }
        if (exitStatus != 0) {
            System.exit(exitStatus);
        }
    }

    /**
     * Writes one PDF page per {@code ocr_page} element found in {@code source}.
     *
     * <p>The PDF document is only closed when all pages were processed; after a
     * failure the output written so far is left incomplete.
     *
     * @param source          the parsed hOCR document
     * @param outputPDFStream the stream receiving the PDF
     * @throws ConversionException if no page element, no image property or no loadable image is found
     * @throws DocumentException   if iText rejects the document content
     * @throws IOException         if reading an image or writing the PDF fails
     */
    private static void convert(Source source, FileOutputStream outputPDFStream)
            throws ConversionException, DocumentException, IOException {
        // Find the tag of class ocr_page in order to load the scanned image
        StartTag pageTag = source.getNextStartTag(0, "class", OCRPAGE);
        if (pageTag == null) {
            throw new ConversionException("Could not find a tag of class \"ocr_page\", aborting.");
        }

        // A standard font for the text, shared by all pages
        BaseFont baseFont = BaseFont.createFont(BaseFont.HELVETICA, BaseFont.CP1250, BaseFont.EMBEDDED);

        Document pdfDocument = null;
        PdfWriter pdfWriter = null;
        PdfContentByte cb = null;
        RandomAccessFileOrArray ra = null;
        int pageCounter = 1;
        try {
            while (pageTag != null) {
                String imagePath = extractImagePath(pageTag.getElement().getAttributeValue("title"));
                if (imagePath == null) {
                    throw new ConversionException("Could not find the \"image\" property in the title of the "
                            + "\"ocr_page\" tag number " + pageCounter + ", aborting.");
                }
                File imageFile = new File(imagePath);

                // Load the image
                Image pageImage;
                try {
                    pageImage = Image.getInstance(imageFile.toURI().toURL());
                } catch (IOException e) {
                    // Also covers MalformedURLException and missing/unreadable files.
                    throw new ConversionException("Could not load the scanned image from: " + "file://"
                            + imagePath + ", aborting.", e);
                }
                if (pageImage.getOriginalType() == Image.ORIGINAL_TIFF) {
                    // This might be a multipage tiff: pick the page matching the hOCR page number.
                    // NOTE(review): the reader opened for the first TIFF is reused for all later
                    // pages, which presumes every page refers to the same file - confirm.
                    if (ra == null) {
                        ra = new RandomAccessFileOrArray(imageFile.toURI().toURL());
                    }
                    int nPages = TiffImage.getNumberOfPages(ra);
                    if (nPages > 0 && pageCounter <= nPages) {
                        pageImage = TiffImage.getTiffImage(ra, pageCounter);
                    }
                }

                float dotsPerPointX = dpiOrDefault(pageImage.getDpiX()) / POINTS_PER_INCH;
                float dotsPerPointY = dpiOrDefault(pageImage.getDpiY()) / POINTS_PER_INCH;
                float pageImagePixelHeight = pageImage.getHeight();
                float pageWidthPt = pageImage.getWidth() / dotsPerPointX;
                float pageHeightPt = pageImage.getHeight() / dotsPerPointY;

                if (pdfDocument == null) {
                    pdfDocument = new Document(new Rectangle(pageWidthPt, pageHeightPt));
                    pdfWriter = PdfWriter.getInstance(pdfDocument, outputPDFStream);
                    pdfDocument.open();
                    // The text goes on the upper layer; it is rendered invisibly (see drawWord).
                    cb = pdfWriter.getDirectContent();
                } else {
                    pdfDocument.setPageSize(new Rectangle(pageWidthPt, pageHeightPt));
                    pdfDocument.newPage();
                }

                // Must be 1.0 before the first word is measured on this page (see drawWord).
                cb.setHorizontalScaling(1.0f);

                pageImage.scaleToFit(pageWidthPt, pageHeightPt);
                pageImage.setAbsolutePosition(0, 0);
                // The image goes on the lower layer, underneath the text.
                pdfWriter.getDirectContentUnder().addImage(pageImage);

                // Walk the tags following this page: words are drawn, the next page tag ends the page.
                StartTag nextPageTag = null;
                StartTag ocrTag = source.getNextStartTag(pageTag.getEnd(), "class", OCRPAGEORLINE);
                while (ocrTag != null) {
                    String cssClass = ocrTag.getAttributeValue("class");
                    if ("ocrx_word".equalsIgnoreCase(cssClass)) {
                        drawWord(cb, baseFont, ocrTag.getElement(), dotsPerPointX, dotsPerPointY,
                                pageImagePixelHeight);
                    } else if ("ocr_page".equalsIgnoreCase(cssClass)) {
                        nextPageTag = ocrTag;
                        break;
                    }
                    ocrTag = source.getNextStartTag(ocrTag.getEnd(), "class", OCRPAGEORLINE);
                }
                if (nextPageTag != null) {
                    pageCounter++;
                }
                pageTag = nextPageTag;
            }
            pdfDocument.close();
        } finally {
            if (ra != null) {
                try {
                    ra.close();
                } catch (IOException ignored) {
                    // The TIFF reader is only read from; a failing close loses nothing.
                }
            }
        }
    }

    /**
     * Returns the file name given by the {@code image} property of a page title.
     *
     * <p>Surrounding whitespace and one pair of enclosing double quotes are removed,
     * since hOCR producers may write the name as {@code image "file.png"}.
     *
     * @param title the title attribute of an {@code ocr_page} element, may be {@code null}
     * @return the image file name, or {@code null} if the title holds no usable image property
     */
    private static String extractImagePath(String title) {
        if (title == null) {
            return null;
        }
        Matcher imageMatcher = IMAGE_PATTERN.matcher(title);
        if (!imageMatcher.find()) {
            return null;
        }
        String path = imageMatcher.group(1).trim();
        if (path.length() >= 2 && path.charAt(0) == '"' && path.charAt(path.length() - 1) == '"') {
            path = path.substring(1, path.length() - 1);
        }
        return path.length() == 0 ? null : path;
    }

    /** Returns {@code dpi}, or 300 for images that don't set the resolution (reported as 0). */
    private static int dpiOrDefault(int dpi) {
        return dpi == 0 ? DEFAULT_DPI : dpi;
    }

    /**
     * Writes the text of one {@code ocrx_word} element as invisible text, sized to the
     * height and stretched to the width of the element's bounding box. Elements without
     * a title attribute or without a {@code bbox} property are skipped.
     *
     * @param cb                   the content layer receiving the text
     * @param font                 the font used for the text
     * @param word                 the {@code ocrx_word} element
     * @param dotsPerPointX        horizontal image pixels per PDF point
     * @param dotsPerPointY        vertical image pixels per PDF point
     * @param pageImagePixelHeight height of the page image in pixels
     */
    private static void drawWord(PdfContentByte cb, BaseFont font, net.htmlparser.jericho.Element word,
            float dotsPerPointX, float dotsPerPointY, float pageImagePixelHeight) {
        String title = word.getAttributeValue("title");
        if (title == null) {
            return;
        }
        Matcher bboxMatcher = BBOX_PATTERN.matcher(title);
        if (!bboxMatcher.find()) {
            return;
        }
        // bbox coordinates in image pixels: left, top, right, bottom
        int left = Integer.parseInt(bboxMatcher.group(1));
        int top = Integer.parseInt(bboxMatcher.group(2));
        int right = Integer.parseInt(bboxMatcher.group(3));
        int bottom = Integer.parseInt(bboxMatcher.group(4));

        String text = word.getContent().getTextExtractor().toString();
        float bboxWidthPt = (right - left) / dotsPerPointX;
        float bboxHeightPt = (bottom - top) / dotsPerPointY;

        // Put the text into the PDF
        cb.beginText();
        // Comment the next line to debug the PDF output (visible Text)
        cb.setTextRenderingMode(PdfContentByte.TEXT_RENDER_MODE_INVISIBLE);
        // height
        cb.setFontAndSize(font, Math.max(Math.round(bboxHeightPt), 1));
        // width
        // NOTE(review): the statement order matters. The scaling is 1.0 whenever the width is
        // measured, and the measured width presumably depends on the current scaling - confirm
        // against the iText documentation before reordering.
        float textWidth = cb.getEffectiveStringWidth(text, false);
        if (textWidth > 0) {
            // Empty or unmeasurable text would otherwise produce an infinite scaling factor.
            cb.setHorizontalScaling(bboxWidthPt / textWidth);
        }
        // PDF coordinates start at the bottom left, hOCR coordinates at the top left.
        cb.moveText(left / dotsPerPointX, (pageImagePixelHeight - bottom) / dotsPerPointY);
        cb.showText(text);
        cb.endText();
        cb.setHorizontalScaling(1.0f);
    }

}