List of usage examples for com.itextpdf.text Image ORIGINAL_TIFF
int ORIGINAL_TIFF
To view the source code for com.itextpdf.text Image ORIGINAL_TIFF.
Click Source Link
From source file:pl.marcinmilkowski.hocrtopdf.Main.java
License:Open Source License
/** * @param args/*from w w w. jav a2 s .c o m*/ */ public static void main(String[] args) { try { if (args.length < 1 || args[0] == "--help" || args[0] == "-h") { System.out.print("Usage: java pl.marcinmilkowski.hocrtopdf.Main INPUTURL.html OUTPUTURL.pdf\n" + "\n" + "Converts hOCR files into PDF\n" + "\n" + "Example: java pl.marcinmilkowski.hocrtopdf.Main hocr.html output.pdf\n"); if (args.length < 1) System.exit(-1); else System.exit(0); } URL inputHOCRFile = null; FileOutputStream outputPDFStream = null; try { File file = new File(args[0]); inputHOCRFile = file.toURI().toURL(); } catch (MalformedURLException e) { System.out.println("The first parameter has to be a valid file."); System.out.println("We got an error: " + e.getMessage()); System.exit(-1); } try { outputPDFStream = new FileOutputStream(args[1]); } catch (FileNotFoundException e) { System.out.println("The second parameter has to be a valid URL"); System.exit(-1); } // The resolution of a PDF file (using iText) is 72pt per inch float pointsPerInch = 72.0f; // Using the jericho library to parse the HTML file Source source = new Source(inputHOCRFile); int pageCounter = 1; Document pdfDocument = null; PdfWriter pdfWriter = null; PdfContentByte cb = null; RandomAccessFileOrArray ra = null; // Find the tag of class ocr_page in order to load the scanned image StartTag pageTag = source.getNextStartTag(0, "class", OCRPAGE); while (pageTag != null) { int prevPos = pageTag.getEnd(); Pattern imagePattern = Pattern.compile("image\\s+([^;]+)"); Matcher imageMatcher = imagePattern.matcher(pageTag.getElement().getAttributeValue("title")); if (!imageMatcher.find()) { System.out.println("Could not find a tag of class \"ocr_page\", aborting."); System.exit(-1); } // Load the image Image pageImage = null; try { File file = new File(imageMatcher.group(1)); pageImage = Image.getInstance(file.toURI().toURL()); } catch (MalformedURLException e) { System.out.println("Could not load the scanned image from: " + "file://" + imageMatcher.group(1) + ", aborting."); System.exit(-1); } if (pageImage.getOriginalType() == Image.ORIGINAL_TIFF) { // this might // be // multipage // tiff! File file = new File(imageMatcher.group(1)); if (pageCounter == 1 || ra == null) { ra = new RandomAccessFileOrArray(file.toURI().toURL()); } int nPages = TiffImage.getNumberOfPages(ra); if (nPages > 0 && pageCounter <= nPages) { pageImage = TiffImage.getTiffImage(ra, pageCounter); } } int dpiX = pageImage.getDpiX(); if (dpiX == 0) { // for images that don't set the resolution we assume // 300 dpi dpiX = 300; } int dpiY = pageImage.getDpiY(); if (dpiY == 0) { // as above for dpiX dpiY = 300; } float dotsPerPointX = dpiX / pointsPerInch; float dotsPerPointY = dpiY / pointsPerInch; float pageImagePixelHeight = pageImage.getHeight(); if (pdfDocument == null) { pdfDocument = new Document(new Rectangle(pageImage.getWidth() / dotsPerPointX, pageImage.getHeight() / dotsPerPointY)); pdfWriter = PdfWriter.getInstance(pdfDocument, outputPDFStream); pdfDocument.open(); // Put the text behind the picture (reverse for debugging) // cb = pdfWriter.getDirectContentUnder(); cb = pdfWriter.getDirectContent(); } else { pdfDocument.setPageSize(new Rectangle(pageImage.getWidth() / dotsPerPointX, pageImage.getHeight() / dotsPerPointY)); pdfDocument.newPage(); } // first define a standard font for our text BaseFont base = BaseFont.createFont(BaseFont.HELVETICA, BaseFont.CP1250, BaseFont.EMBEDDED); Font defaultFont = new Font(base, 8); // FontFactory.getFont(FontFactory.HELVETICA, 8, Font.BOLD, // CMYKColor.BLACK); cb.setHorizontalScaling(1.0f); pageImage.scaleToFit(pageImage.getWidth() / dotsPerPointX, pageImage.getHeight() / dotsPerPointY); pageImage.setAbsolutePosition(0, 0); // Put the image in front of the text (reverse for debugging) // pdfWriter.getDirectContent().addImage(pageImage); pdfWriter.getDirectContentUnder().addImage(pageImage); // In order to place text behind the recognised text snippets we are // interested in the bbox property Pattern bboxPattern = Pattern.compile("bbox(\\s+\\d+){4}"); // This pattern separates the coordinates of the bbox property Pattern bboxCoordinatePattern = Pattern.compile("(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)"); // Only tags of the ocr_line class are interesting StartTag ocrTag = source.getNextStartTag(prevPos, "class", OCRPAGEORLINE); while (ocrTag != null) { prevPos = ocrTag.getEnd(); if ("ocrx_word".equalsIgnoreCase(ocrTag.getAttributeValue("class"))) { net.htmlparser.jericho.Element lineElement = ocrTag.getElement(); Matcher bboxMatcher = bboxPattern.matcher(lineElement.getAttributeValue("title")); if (bboxMatcher.find()) { // We found a tag of the ocr_line class containing a bbox property Matcher bboxCoordinateMatcher = bboxCoordinatePattern.matcher(bboxMatcher.group()); bboxCoordinateMatcher.find(); int[] coordinates = { Integer.parseInt((bboxCoordinateMatcher.group(1))), Integer.parseInt((bboxCoordinateMatcher.group(2))), Integer.parseInt((bboxCoordinateMatcher.group(3))), Integer.parseInt((bboxCoordinateMatcher.group(4))) }; String line = lineElement.getContent().getTextExtractor().toString(); float bboxWidthPt = (coordinates[2] - coordinates[0]) / dotsPerPointX; float bboxHeightPt = (coordinates[3] - coordinates[1]) / dotsPerPointY; // Put the text into the PDF cb.beginText(); // Comment the next line to debug the PDF output (visible Text) cb.setTextRenderingMode(PdfContentByte.TEXT_RENDER_MODE_INVISIBLE); // height cb.setFontAndSize(defaultFont.getBaseFont(), Math.max(Math.round(bboxHeightPt), 1)); // width cb.setHorizontalScaling(bboxWidthPt / cb.getEffectiveStringWidth(line, false)); cb.moveText((coordinates[0] / dotsPerPointX), ((pageImagePixelHeight - coordinates[3]) / dotsPerPointY)); cb.showText(line); cb.endText(); cb.setHorizontalScaling(1.0f); } } else { if ("ocr_page".equalsIgnoreCase(ocrTag.getAttributeValue("class"))) { pageCounter++; pageTag = ocrTag; break; } } ocrTag = source.getNextStartTag(prevPos, "class", OCRPAGEORLINE); } if (ocrTag == null) { pdfDocument.close(); break; } } } catch (DocumentException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }