Example usage for com.itextpdf.text Image getOriginalType

Introduction

In this page you can find the example usage for com.itextpdf.text Image getOriginalType.

Prototype

public int getOriginalType()

Source Link

Document

Getter for property originalType.

Usage

From source file:pl.marcinmilkowski.hocrtopdf.Main.java

License:Open Source License

/**
 * @param args//from w w  w.ja  v a 2 s  .  c  o m
 */
public static void main(String[] args) {
    try {
        if (args.length < 1 || args[0] == "--help" || args[0] == "-h") {
            System.out.print("Usage: java pl.marcinmilkowski.hocrtopdf.Main INPUTURL.html OUTPUTURL.pdf\n"
                    + "\n" + "Converts hOCR files into PDF\n" + "\n"
                    + "Example: java pl.marcinmilkowski.hocrtopdf.Main hocr.html output.pdf\n");
            if (args.length < 1)
                System.exit(-1);
            else
                System.exit(0);
        }
        URL inputHOCRFile = null;
        FileOutputStream outputPDFStream = null;
        try {
            File file = new File(args[0]);
            inputHOCRFile = file.toURI().toURL();
        } catch (MalformedURLException e) {
            System.out.println("The first parameter has to be a valid file.");
            System.out.println("We got an error: " + e.getMessage());
            System.exit(-1);
        }
        try {
            outputPDFStream = new FileOutputStream(args[1]);
        } catch (FileNotFoundException e) {
            System.out.println("The second parameter has to be a valid URL");
            System.exit(-1);
        }

        // The resolution of a PDF file (using iText) is 72pt per inch
        float pointsPerInch = 72.0f;

        // Using the jericho library to parse the HTML file
        Source source = new Source(inputHOCRFile);

        int pageCounter = 1;

        Document pdfDocument = null;
        PdfWriter pdfWriter = null;
        PdfContentByte cb = null;
        RandomAccessFileOrArray ra = null;

        // Find the tag of class ocr_page in order to load the scanned image
        StartTag pageTag = source.getNextStartTag(0, "class", OCRPAGE);
        while (pageTag != null) {
            int prevPos = pageTag.getEnd();
            Pattern imagePattern = Pattern.compile("image\\s+([^;]+)");
            Matcher imageMatcher = imagePattern.matcher(pageTag.getElement().getAttributeValue("title"));
            if (!imageMatcher.find()) {
                System.out.println("Could not find a tag of class \"ocr_page\", aborting.");
                System.exit(-1);
            }
            // Load the image
            Image pageImage = null;
            try {
                File file = new File(imageMatcher.group(1));
                pageImage = Image.getInstance(file.toURI().toURL());
            } catch (MalformedURLException e) {
                System.out.println("Could not load the scanned image from: " + "file://" + imageMatcher.group(1)
                        + ", aborting.");
                System.exit(-1);
            }
            if (pageImage.getOriginalType() == Image.ORIGINAL_TIFF) { // this might
                                                                      // be
                                                                      // multipage
                                                                      // tiff!
                File file = new File(imageMatcher.group(1));
                if (pageCounter == 1 || ra == null) {
                    ra = new RandomAccessFileOrArray(file.toURI().toURL());
                }
                int nPages = TiffImage.getNumberOfPages(ra);
                if (nPages > 0 && pageCounter <= nPages) {
                    pageImage = TiffImage.getTiffImage(ra, pageCounter);
                }
            }
            int dpiX = pageImage.getDpiX();
            if (dpiX == 0) { // for images that don't set the resolution we assume
                             // 300 dpi
                dpiX = 300;
            }
            int dpiY = pageImage.getDpiY();
            if (dpiY == 0) { // as above for dpiX
                dpiY = 300;
            }
            float dotsPerPointX = dpiX / pointsPerInch;
            float dotsPerPointY = dpiY / pointsPerInch;
            float pageImagePixelHeight = pageImage.getHeight();
            if (pdfDocument == null) {
                pdfDocument = new Document(new Rectangle(pageImage.getWidth() / dotsPerPointX,
                        pageImage.getHeight() / dotsPerPointY));
                pdfWriter = PdfWriter.getInstance(pdfDocument, outputPDFStream);
                pdfDocument.open();
                // Put the text behind the picture (reverse for debugging)
                // cb = pdfWriter.getDirectContentUnder();
                cb = pdfWriter.getDirectContent();
            } else {
                pdfDocument.setPageSize(new Rectangle(pageImage.getWidth() / dotsPerPointX,
                        pageImage.getHeight() / dotsPerPointY));
                pdfDocument.newPage();
            }
            // first define a standard font for our text
            BaseFont base = BaseFont.createFont(BaseFont.HELVETICA, BaseFont.CP1250, BaseFont.EMBEDDED);
            Font defaultFont = new Font(base, 8);
            // FontFactory.getFont(FontFactory.HELVETICA, 8, Font.BOLD,
            // CMYKColor.BLACK);

            cb.setHorizontalScaling(1.0f);

            pageImage.scaleToFit(pageImage.getWidth() / dotsPerPointX, pageImage.getHeight() / dotsPerPointY);
            pageImage.setAbsolutePosition(0, 0);
            // Put the image in front of the text (reverse for debugging)
            // pdfWriter.getDirectContent().addImage(pageImage);
            pdfWriter.getDirectContentUnder().addImage(pageImage);

            // In order to place text behind the recognised text snippets we are
            // interested in the bbox property
            Pattern bboxPattern = Pattern.compile("bbox(\\s+\\d+){4}");
            // This pattern separates the coordinates of the bbox property
            Pattern bboxCoordinatePattern = Pattern.compile("(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)");
            // Only tags of the ocr_line class are interesting
            StartTag ocrTag = source.getNextStartTag(prevPos, "class", OCRPAGEORLINE);
            while (ocrTag != null) {
                prevPos = ocrTag.getEnd();
                if ("ocrx_word".equalsIgnoreCase(ocrTag.getAttributeValue("class"))) {
                    net.htmlparser.jericho.Element lineElement = ocrTag.getElement();
                    Matcher bboxMatcher = bboxPattern.matcher(lineElement.getAttributeValue("title"));
                    if (bboxMatcher.find()) {
                        // We found a tag of the ocr_line class containing a bbox property
                        Matcher bboxCoordinateMatcher = bboxCoordinatePattern.matcher(bboxMatcher.group());
                        bboxCoordinateMatcher.find();
                        int[] coordinates = { Integer.parseInt((bboxCoordinateMatcher.group(1))),
                                Integer.parseInt((bboxCoordinateMatcher.group(2))),
                                Integer.parseInt((bboxCoordinateMatcher.group(3))),
                                Integer.parseInt((bboxCoordinateMatcher.group(4))) };
                        String line = lineElement.getContent().getTextExtractor().toString();
                        float bboxWidthPt = (coordinates[2] - coordinates[0]) / dotsPerPointX;
                        float bboxHeightPt = (coordinates[3] - coordinates[1]) / dotsPerPointY;

                        // Put the text into the PDF
                        cb.beginText();
                        // Comment the next line to debug the PDF output (visible Text)
                        cb.setTextRenderingMode(PdfContentByte.TEXT_RENDER_MODE_INVISIBLE);
                        // height
                        cb.setFontAndSize(defaultFont.getBaseFont(), Math.max(Math.round(bboxHeightPt), 1));
                        // width
                        cb.setHorizontalScaling(bboxWidthPt / cb.getEffectiveStringWidth(line, false));
                        cb.moveText((coordinates[0] / dotsPerPointX),
                                ((pageImagePixelHeight - coordinates[3]) / dotsPerPointY));
                        cb.showText(line);
                        cb.endText();
                        cb.setHorizontalScaling(1.0f);
                    }
                } else {
                    if ("ocr_page".equalsIgnoreCase(ocrTag.getAttributeValue("class"))) {
                        pageCounter++;
                        pageTag = ocrTag;
                        break;
                    }
                }
                ocrTag = source.getNextStartTag(prevPos, "class", OCRPAGEORLINE);
            }
            if (ocrTag == null) {
                pdfDocument.close();
                break;
            }
        }
    } catch (DocumentException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}