List of usage examples for org.apache.pdfbox.rendering.PDFRenderer#renderImage
public BufferedImage renderImage(int pageIndex, float scale, ImageType imageType) throws IOException
From source file: org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
License:Apache License
void doOCROnCurrentPage() throws IOException, TikaException, SAXException { if (config.getOcrStrategy().equals(NO_OCR)) { return;/*w w w . j a va 2 s. co m*/ } TesseractOCRConfig tesseractConfig = context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG); TesseractOCRParser tesseractOCRParser = new TesseractOCRParser(); if (!tesseractOCRParser.hasTesseract(tesseractConfig)) { throw new TikaException("Tesseract is not available. " + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly"); } PDFRenderer renderer = new PDFRenderer(pdDocument); TemporaryResources tmp = new TemporaryResources(); try { BufferedImage image = renderer.renderImage(pageIndex, 2.0f, config.getOcrImageType()); Path tmpFile = tmp.createTempFile(); try (OutputStream os = Files.newOutputStream(tmpFile)) { //TODO: get output format from TesseractConfig /*ImageIOUtil.writeImage(image, config.getOcrImageFormatName(), os, config.getOcrDPI());*/ } try (InputStream is = TikaInputStream.get(tmpFile)) { tesseractOCRParser.parseInline(is, xhtml, tesseractConfig); } } catch (IOException e) { handleCatchableIOE(e); } catch (SAXException e) { throw new IOExceptionWithCause("error writing OCR content from PDF", e); } finally { tmp.dispose(); } }
From source file: ve.zoonosis.utils.PDFCreator.java
License:Apache License
/**
 * Renders one page of the document as an RGB image at scale 1.
 *
 * <p>The content stream is closed before rendering (presumably so that any
 * pending drawing operations are part of the rendered page; confirm against
 * the rest of the class).
 *
 * @param index index of the page to render, passed straight to
 *              {@code PDFRenderer.renderImage}
 * @return the rendered page image
 * @throws IOException if closing the content stream or rendering fails
 */
public BufferedImage getImagePage(int index) throws IOException {
    contentStream.close();
    return new PDFRenderer(document).renderImage(index, 1.0f, ImageType.RGB);
}