List of usage examples for org.apache.pdfbox.pdmodel PDPage getResources
@Override
public PDResources getResources()
From source file:com.openkm.extractor.PdfTextExtractor.java
License:Open Source License
/** * {@inheritDoc}// w w w. j a v a 2 s .com */ @SuppressWarnings("rawtypes") public String extractText(InputStream stream, String type, String encoding) throws IOException { try { PDFParser parser = new PDFParser(new BufferedInputStream(stream)); try { parser.parse(); PDDocument document = parser.getPDDocument(); if (document.isEncrypted()) { try { document.decrypt(""); document.setAllSecurityToBeRemoved(true); } catch (Exception e) { throw new IOException("Unable to extract text: document encrypted", e); } } CharArrayWriter writer = new CharArrayWriter(); PDFTextStripper stripper = new PDFTextStripper(); stripper.setLineSeparator("\n"); stripper.writeText(document, writer); String st = writer.toString().trim(); log.debug("TextStripped: '{}'", st); if (Config.SYSTEM_PDF_FORCE_OCR || st.length() <= 1) { log.warn("PDF does not contains text layer"); // Extract images from PDF StringBuilder sb = new StringBuilder(); if (!Config.SYSTEM_PDFIMAGES.isEmpty()) { File tmpPdf = FileUtils.createTempFile("pdf"); File tmpDir = new File(EnvironmentDetector.getTempDir()); String baseName = FileUtils.getFileName(tmpPdf.getName()); document.save(tmpPdf); int pgNum = 1; try { for (PDPage page : (List<PDPage>) document.getDocumentCatalog().getAllPages()) { HashMap<String, Object> hm = new HashMap<String, Object>(); hm.put("fileIn", tmpPdf.getPath()); hm.put("firstPage", pgNum); hm.put("lastPage", pgNum++); hm.put("imageRoot", tmpDir + File.separator + baseName); String cmd = TemplateUtils.replace("SYSTEM_PDFIMAGES", Config.SYSTEM_PDFIMAGES, hm); ExecutionUtils.runCmd(cmd); for (File tmp : tmpDir.listFiles()) { if (tmp.getName().startsWith(baseName + "-")) { if (page.findRotation() > 0) { ImageUtils.rotate(tmp, tmp, page.findRotation()); } try { String txt = doOcr(tmp); sb.append(txt).append(" "); log.debug("OCR Extracted: {}", txt); } finally { FileUtils.deleteQuietly(tmp); } } } } } finally { FileUtils.deleteQuietly(tmpPdf); } } else { for (PDPage page : (List<PDPage>) 
document.getDocumentCatalog().getAllPages()) { PDResources resources = page.getResources(); Map<String, PDXObject> images = resources.getXObjects(); if (images != null) { for (String key : images.keySet()) { PDXObjectImage image = (PDXObjectImage) images.get(key); String prefix = "img-" + key + "-"; File pdfImg = null; try { pdfImg = File.createTempFile(prefix, ".png"); log.debug("Writing image: {}", pdfImg.getPath()); // Won't work until PDFBox 1.8.9 ImageIO.write(image.getRGBImage(), "png", pdfImg); if (page.findRotation() > 0) { ImageUtils.rotate(pdfImg, pdfImg, page.findRotation()); } // Do OCR String txt = doOcr(pdfImg); sb.append(txt).append(" "); log.debug("OCR Extracted: {}", txt); } finally { FileUtils.deleteQuietly(pdfImg); } } } } } return sb.toString(); } else { return writer.toString(); } } finally { try { PDDocument doc = parser.getPDDocument(); if (doc != null) { doc.close(); } } catch (IOException e) { // ignore } } } catch (Exception e) { // it may happen that PDFParser throws a runtime // exception when parsing certain pdf documents log.warn("Failed to extract PDF text content", e); throw new IOException(e.getMessage(), e); } finally { stream.close(); } }
From source file:com.shmsoft.dmass.ocr.PDFImageExtractor.java
License:Apache License
/**
 * Writes the images embedded in {@code file} into the directory returned by
 * {@code conf.getPdfImageExtractionDir()} and returns the resulting file names.
 *
 * <p>
 * Extraction stops as soon as the limit given by
 * {@code Project.getProject().getOcrMaxImagesPerPDF()} is reached. An
 * {@link IOException} aborts the extraction; the names collected up to that
 * point are still returned.
 * </p>
 *
 * @return the names of the written image files (base name plus "." plus the
 *         image suffix); never {@code null}
 */
@SuppressWarnings("rawtypes")
@Override
public List<String> extractImages() {
    File extractionDir = new File(conf.getPdfImageExtractionDir());
    extractionDir.mkdirs();
    List<String> result = new ArrayList<String>();
    PDDocument document = null;

    try {
        document = PDDocument.load(file);
        List pages = document.getDocumentCatalog().getAllPages();
        int maxNumberOfImages = Project.getProject().getOcrMaxImagesPerPDF();
        // 1-based number of the next image to be written
        int nextImage = 1;

        for (Object pageObject : pages) {
            PDPage page = (PDPage) pageObject;
            PDResources resources = page.getResources();

            // A page may come without a resource dictionary of its own.
            if (resources == null) {
                continue;
            }

            Map pageImages = resources.getImages();

            if (pageImages == null) {
                continue;
            }

            for (Object keyObject : pageImages.keySet()) {
                if (nextImage > maxNumberOfImages) {
                    return result;
                }

                String key = (String) keyObject;
                PDXObjectImage image = (PDXObjectImage) pageImages.get(key);
                String fileName = conf.getPdfImageExtractionDir() + OCRUtil.createUniqueFileName("image");
                image.write2file(fileName);
                result.add(fileName + "." + image.getSuffix());
                nextImage++;
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        // The document was previously left open on every path, including the early return.
        if (document != null) {
            try {
                document.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }

    return result;
}
From source file:compressor.Compressor.java
void extract_images(String src, String dest, String img_name) throws IOException { PDDocument document = null;//w w w. j a va2 s.c om try { document = PDDocument.load(src); } catch (IOException ex) { System.out.println("" + ex); } List pages = document.getDocumentCatalog().getAllPages(); Iterator iter = pages.iterator(); int i = 1; String name = null; File file = new File(dest + "img"); if (!file.exists()) { if (file.mkdir()) { System.out.println("Directory is created!"); } else { System.out.println("Failed to create directory!"); } } dest = dest + "img/"; while (iter.hasNext()) { PDPage page = (PDPage) iter.next(); PDResources resources = page.getResources(); Map pageImages = resources.getImages(); if (pageImages != null) { Iterator imageIter = pageImages.keySet().iterator(); while (imageIter.hasNext()) { String key = (String) imageIter.next(); PDXObjectImage image = (PDXObjectImage) pageImages.get(key); image.write2file(dest + img_name + i); i++; } } } document.close(); }
From source file:cz.muni.pdfjbim.PdfImageExtractor.java
License:Apache License
/** * @deprecated -- do not use doesn't work properly yet * This method extracts images by going through PDF tree structure * @param pdfFile name of input PDF file * @param prefix /* w ww .j ava2 s . com*/ * @param password password for access to PDF if needed * @param pagesToProcess list of pages which should be processed if null given => processed all pages * -- not working yet // * @param silent -- if true error messages are not written to output otherwise they are * @param binarize -- enables processing of nonbitonal images as well (LZW is still not * processed because of output with inverted colors) * @throws PdfRecompressionException if problem to extract images from PDF */ public void extractImagesUsingPdfObjectAccess(String pdfFile, String prefix, String password, Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException { if (binarize == null) { binarize = false; } // checking arguments and setting appropriate variables if (pdfFile == null) { throw new IllegalArgumentException("pdfFile must be defined"); } InputStream inputStream = null; if (password != null) { try { log.debug("PDF probably encrypted, trying to decrypt using given password {}", password); ByteArrayOutputStream decryptedOutputStream = new ByteArrayOutputStream(); PdfReader reader = new PdfReader(pdfFile, password.getBytes(StandardCharsets.UTF_8)); PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream); stamper.close(); inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray()); } catch (DocumentException ex) { throw new PdfRecompressionException(ex); } catch (IOException ex) { throw new PdfRecompressionException("Reading file caused exception", ex); } } else { try { inputStream = new FileInputStream(pdfFile); } catch (FileNotFoundException ex) { throw new PdfRecompressionException("File wasn't found", ex); } } // if prefix is not set then prefix set to name of pdf without .pdf // if pdfFile has unconsistent name (without suffix .pdf) and 
name longer than 4 chars then last for chars are removed // and this string set as prefix if ((prefix == null) && (pdfFile.length() > 4)) { prefix = pdfFile.substring(0, pdfFile.length() - 4); } PDFParser parser = null; PDDocument doc = null; try { parser = new PDFParser(inputStream); parser.parse(); doc = parser.getPDDocument(); AccessPermission accessPermissions = doc.getCurrentAccessPermission(); if (!accessPermissions.canExtractContent()) { throw new PdfRecompressionException("Error: You do not have permission to extract images."); } // going page by page List pages = doc.getDocumentCatalog().getAllPages(); for (int pageNumber = 0; pageNumber < pages.size(); pageNumber++) { if ((pagesToProcess != null) && (!pagesToProcess.contains(pageNumber + 1))) { continue; } PDPage page = (PDPage) pages.get(pageNumber); PDResources resources = page.getResources(); Map xobjs = resources.getXObjects(); if (xobjs != null) { Iterator xobjIter = xobjs.entrySet().iterator(); while (xobjIter.hasNext()) { Map.Entry entry = (Map.Entry) xobjIter.next(); String key = (String) entry.getKey(); PDXObject xobj = (PDXObject) entry.getValue(); Map images; if (xobj instanceof PDXObjectForm) { PDXObjectForm xform = (PDXObjectForm) xobj; images = xform.getResources().getImages(); } else { images = resources.getImages(); } // reading images from each page and saving them to file if (images != null) { Iterator imageIter = images.entrySet().iterator(); while (imageIter.hasNext()) { Map.Entry imEntry = (Map.Entry) imageIter.next(); String imKey = (String) imEntry.getKey(); PDXObjectImage image = (PDXObjectImage) imEntry.getValue(); PDStream pdStr = new PDStream(image.getCOSStream()); List<COSName> filters = pdStr.getFilters(); if (image.getBitsPerComponent() > 1 && !binarize) { log.info("It is not a bitonal image => skipping"); continue; } // at this moment for preventing bad output (bad coloring) from LZWDecode filter if (filters.contains(COSName.LZW_DECODE)) { log.info("This is LZWDecoded => 
skipping"); continue; } if (filters.contains(COSName.JBIG2_DECODE)) { if (skipJBig2Images) { log.warn("Allready compressed according to JBIG2 standard => skipping"); continue; } else { log.debug("JBIG2 image detected"); } } // detection of unsupported filters by pdfBox library if (filters.contains(COSName.JPX_DECODE)) { log.info("Unsupported filter JPXDecode => skipping"); continue; } COSObject cosObj = new COSObject(image.getCOSObject()); int objectNum = cosObj.getObjectNumber().intValue(); int genNum = cosObj.getGenerationNumber().intValue(); log.debug(objectNum + " " + genNum + " obj"); String name = getUniqueFileName(prefix + imKey, image.getSuffix()); log.debug("Writing image:" + name); image.write2file(name); PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(), image.getHeight(), objectNum, genNum); originalImageInformations.add(pdfImageInfo); log.debug(pdfImageInfo.toString()); namesOfImages.add(name + "." + image.getSuffix()); } } } } } } catch (IOException ex) { Tools.deleteFilesFromList(namesOfImages); throw new PdfRecompressionException("Unable to parse PDF document", ex); } catch (RuntimeException ex) { Tools.deleteFilesFromList(namesOfImages); } finally { if (doc != null) { try { doc.close(); } catch (IOException ex) { throw new PdfRecompressionException(ex); } } } }
From source file:cz.muni.pdfjbim.PdfImageProcessor.java
License:Apache License
/** * @deprecated -- do not use doesn't work properly yet * This method extracts images by going through PDF tree structure * @param pdfFile name of input PDF file * @param password password for access to PDF if needed * @param pagesToProcess list of pages which should be processed if null given => processed all pages * -- not working yet/*w ww . j a v a2 s .com*/ * @param silent -- if true error messages are not written to output otherwise they are * @param binarize -- enables processing of nonbitonal images as well (LZW is still not * processed because of output with inverted colors) * @throws PdfRecompressionException if problem to extract images from PDF */ public void extractImagesUsingPdfObjectAccess(String pdfFile, String password, Set<Integer> pagesToProcess, Boolean silent, Boolean binarize) throws PdfRecompressionException { if (binarize == null) { binarize = false; } // checking arguments and setting appropriate variables if (pdfFile == null) { throw new IllegalArgumentException(pdfFile); } String prefix = null; InputStream inputStream = null; if (password != null) { try { ByteArrayOutputStream decryptedOutputStream = null; PdfReader reader = new PdfReader(pdfFile, password.getBytes()); PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream); stamper.close(); inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray()); } catch (DocumentException ex) { throw new PdfRecompressionException(ex); } catch (IOException ex) { throw new PdfRecompressionException("Reading file caused exception", ex); } } else { try { inputStream = new FileInputStream(pdfFile); } catch (FileNotFoundException ex) { throw new PdfRecompressionException("File wasn't found", ex); } } // if prefix is not set then prefix set to name of pdf without .pdf // if pdfFile has unconsistent name (without suffix .pdf) and name longer than 4 chars then last for chars are removed // and this string set as prefix if ((prefix == null) && (pdfFile.length() > 4)) { prefix = 
pdfFile.substring(0, pdfFile.length() - 4); } PDFParser parser = null; PDDocument doc = null; try { parser = new PDFParser(inputStream); parser.parse(); doc = parser.getPDDocument(); AccessPermission accessPermissions = doc.getCurrentAccessPermission(); if (!accessPermissions.canExtractContent()) { throw new PdfRecompressionException("Error: You do not have permission to extract images."); } // going page by page List pages = doc.getDocumentCatalog().getAllPages(); for (int pageNumber = 0; pageNumber < pages.size(); pageNumber++) { if ((pagesToProcess != null) && (!pagesToProcess.contains(pageNumber + 1))) { continue; } PDPage page = (PDPage) pages.get(pageNumber); PDResources resources = page.getResources(); Map xobjs = resources.getXObjects(); if (xobjs != null) { Iterator xobjIter = xobjs.keySet().iterator(); while (xobjIter.hasNext()) { String key = (String) xobjIter.next(); PDXObject xobj = (PDXObject) xobjs.get(key); Map images; if (xobj instanceof PDXObjectForm) { PDXObjectForm xform = (PDXObjectForm) xobj; images = xform.getResources().getImages(); } else { images = resources.getImages(); } // reading images from each page and saving them to file if (images != null) { Iterator imageIter = images.keySet().iterator(); while (imageIter.hasNext()) { String imKey = (String) imageIter.next(); PDXObjectImage image = (PDXObjectImage) images.get(imKey); PDStream pdStr = new PDStream(image.getCOSStream()); List filters = pdStr.getFilters(); if (image.getBitsPerComponent() > 1) { log.info("It is not a bitonal image => skipping"); continue; } // at this moment for preventing bad output (bad coloring) from LZWDecode filter if (filters.contains(COSName.LZW_DECODE.getName())) { log.info("This is LZWDecoded => skipping"); continue; } // detection of unsupported filters by pdfBox library if (filters.contains("JBIG2Decode")) { log.info("Allready compressed according to JBIG2 standard => skipping"); continue; } if (filters.contains("JPXDecode")) { log.info("Unsupported filter 
JPXDecode => skipping"); continue; } COSObject cosObj = new COSObject(image.getCOSObject()); int objectNum = cosObj.getObjectNumber().intValue(); int genNum = cosObj.getGenerationNumber().intValue(); log.debug(objectNum + " " + genNum + " obj"); String name = getUniqueFileName(prefix + imKey, image.getSuffix()); log.debug("Writing image:" + name); image.write2file(name); PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(), image.getHeight(), objectNum, genNum); originalImageInformations.add(pdfImageInfo); log.debug(pdfImageInfo.toString()); namesOfImages.add(name + "." + image.getSuffix()); } } } } } } catch (IOException ex) { throw new PdfRecompressionException("Unable to parse PDF document", ex); } finally { if (doc != null) { try { doc.close(); } catch (IOException ex) { throw new PdfRecompressionException(ex); } } } }
From source file:de.offis.health.icardea.cied.pdf.extractor.PDFApachePDFBoxExtractor.java
License:Apache License
/** * <p>/* www. j a v a 2 s . c o m*/ * This method searches for all image objects from the currently processed * PDF file and stores them using the correct extension in the given export * directory or in the same directory where the original PDF file is stored. * </p> * <p> * The filename of the images is build based on the original PDF filename * (without extension) and additional details like page number, image * number and if available the internal image name. * </p> * @param fullExportDirectoryPath The optional full export path where the images * should be stored. If not given, the location of the original PDF file is used. * @throws Exception */ @SuppressWarnings("unchecked") private void imageExtractor(String fullExportDirectoryPath) throws Exception { if (fullExportDirectoryPath != null) { fullExportDirectoryPath = GlobalTools.checkDirectoryPath(fullExportDirectoryPath); File exportDirectory = new File(fullExportDirectoryPath); if (!exportDirectory.exists()) { exportDirectory.mkdirs(); } // end if } // end if String baseExportDirectoryPath = fullExportDirectoryPath != null ? fullExportDirectoryPath : this.fullPDFDirectoryPath; String baseFileNameWithoutExtension = GlobalTools.getFileNameWithoutExtension(this.fullPDFFilePath); if (pdfDocument != null) { List<PDPage> pages = pdfDocument.getDocumentCatalog().getAllPages(); Iterator<PDPage> iterator = pages.iterator(); int currentPage = 0; int imageCounter = 0; while (iterator.hasNext()) { currentPage++; PDPage page = iterator.next(); PDResources resources = page.getResources(); Map imageMap = resources.getImages(); if (imageMap != null) { Iterator imageIterator = imageMap.keySet().iterator(); while (imageIterator.hasNext()) { imageCounter++; String key = (String) imageIterator.next(); PDXObjectImage pdfObjectImage = (PDXObjectImage) imageMap.get(key); String imageName = key; String fullExportFileNameWithoutExtension = baseExportDirectoryPath + baseFileNameWithoutExtension + "_(" + ((currentPage) > 0 ? 
"p" + pageNumberFormat.format((currentPage)) : "p" + pageNumberFormat.format(0)) + "_ref" + REF_NUMBER_FORMAT.format(imageCounter) + (imageName == null ? "_unk" : "_" + imageName) + ")"; logger.debug("Writing image as: " + fullExportFileNameWithoutExtension + "." + pdfObjectImage.getSuffix()); /* * The write2file method will automatically append the extension. */ // pdfObjectImage.write2file(fullExportFileNameWithoutExtension + "." + pdfObjectImage.getSuffix()); pdfObjectImage.write2file(fullExportFileNameWithoutExtension); } // end while } // end if } // end while } else { // TODO: Add own exception. throw new Exception("There is no open PDF to work with."); } }
From source file:fr.aviz.hybridvis.utils.PDF.MultiScalePDFViewer.java
License:Open Source License
/** * @param PDDocument//from w w w . j av a2 s . c o m * Extracts images embedded in a PDF and saves them in PNG * format, problem with embedded PDF images (PDF passed as * PDDocument) */ protected void pdfGetImages_PDFbox(PDDocument document) throws IOException { List<PDPage> pages = document.getDocumentCatalog().getAllPages(); int pageCounter = 0; for (PDPage page : pages) { ++pageCounter; // get pdf resources PDResources resources = page.getResources(); Map<String, PDXObjectImage> imageResources = resources.getImages(); resources.getGraphicsStates(); System.out.println(resources.getImages().size() + " images to be extracted"); // int imageCounter = 0; for (String key : imageResources.keySet()) { PDXObjectImage objectImage = imageResources.get(key); System.out.printf("image key '%s': %d x %d, type %s%n", key, objectImage.getHeight(), objectImage.getWidth(), objectImage.getSuffix()); } } }
From source file:impresionXML.impresion.pdf.JPDFManejo.java
/**
 * Writes every image found in the page resources of the current document
 * (as returned by {@code getDocumento()}) to the given directory.
 *
 * <p>
 * The files are named {@code image} followed by a running number that starts
 * at 1 and is shared by all pages. Pages without a resource dictionary or
 * without images are skipped.
 * </p>
 *
 * @param psRutaExtraccion directory the images are written to; {@code "/image"}
 *                         plus the running number is appended to it
 * @throws Exception if the pages cannot be read or an image cannot be written
 */
public void extraerImagenes(String psRutaExtraccion) throws Exception {
    List pages = getDocumento().getDocumentCatalog().getAllPages();
    int i = 1;

    for (Object pageObject : pages) {
        PDPage page = (PDPage) pageObject;
        PDResources resources = page.getResources();

        // A page may come without a resource dictionary of its own.
        if (resources == null) {
            continue;
        }

        Map pageImages = resources.getImages();

        if (pageImages == null) {
            continue;
        }

        for (Object keyObject : pageImages.keySet()) {
            String key = (String) keyObject;
            PDXObjectImage image = (PDXObjectImage) pageImages.get(key);
            image.write2file(psRutaExtraccion + "/image" + i);
            i++;
        }
    }
}
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDF2XHTML.java
License:Apache License
/** * * * @param pdf/* w w w.j a v a 2 s . co m*/ * * @throws SAXException */ private void extractImageText(PDDocument pdf) { List<?> pages = pdf.getDocumentCatalog().getAllPages(); Iterator<?> pageIterator = pages.iterator(); pageCount = pages.size(); imageCount = 0; int currentPage = 0; TikaImageHelper helper = new TikaImageHelper(this.metadata); try { while (pageIterator.hasNext()) { PDPage page = (PDPage) pageIterator.next(); PDResources resources = page.getResources(); processResources(resources, helper); helper.addTextToHandler(handler, ++currentPage, pageCount); } } catch (Exception e) { e.printStackTrace(); } finally { if (helper != null) { helper.close(); } } }
From source file:net.padaf.preflight.helpers.PagesValidationHelper.java
License:Apache License
/** * This method check the ExtGState entry of the resource dictionary. * /*from w ww . jav a 2 s. co m*/ * @param page * @param handler * @param result * @return * @throws ValidationException */ protected boolean validateTransparency(PDPage page, DocumentHandler handler, List<ValidationError> result) throws ValidationException { PDResources resources = page.getResources(); COSDocument cDoc = handler.getDocument().getDocument(); ExtGStateContainer extGStates = new ExtGStateContainer(resources.getCOSDictionary(), cDoc); return extGStates.validateTransparencyRules(result); // ---- Even if a Group entry is possible in the Page dictionary, No // restrictions are defined by PDF/A }