List of usage examples for org.apache.pdfbox.pdmodel PDPage getResources
@Override
public PDResources getResources()
From source file:net.padaf.preflight.helpers.PagesValidationHelper.java
License:Apache License
/** * This method check the Shading entry of the resource dictionary if exists. * /*w ww.j ava 2s .co m*/ * @param page * @param handler * @param result * @return * @throws ValidationException */ protected boolean validateShadingPattern(PDPage page, DocumentHandler handler, List<ValidationError> result) throws ValidationException { PDResources resources = page.getResources(); COSDictionary shadings = (COSDictionary) resources.getCOSDictionary() .getDictionaryObject(PATTERN_KEY_SHADING); boolean res = true; if (shadings != null) { for (Object key : shadings.keySet()) { COSDictionary aShading = (COSDictionary) shadings.getDictionaryObject((COSName) key); ShadingPattern sp = new ShadingPattern(handler, aShading); List<ValidationError> lErrors = sp.validate(); if (lErrors != null && !lErrors.isEmpty()) { result.addAll(lErrors); res = false; } } } return res; }
From source file:no.digipost.print.validate.PdfFontValidator.java
License:Apache License
public Collection<PDFont> getPageFonts(PDPage page) throws IOException { PDResources resources = page.getResources(); if (resources != null) { Map<String, PDFont> fontMap = resources.getFonts(); return fontMap.values(); }//from w w w.j a va2s.com return emptySet(); }
From source file:org.ala.harvester.ExtractPubfSciNamesAndImages.java
License:Apache License
private static void extractSciNameAndImages(PDDocument document) throws IOException { PDFTextStripperByArea stripper = new PDFTextStripperByArea(); stripper.setSortByPosition(true);//www. j a v a 2 s.co m Rectangle rect = new Rectangle(10, 60, 275, 20); stripper.addRegion("class1", rect); List allPages = document.getDocumentCatalog().getAllPages(); Writer writer = getSiteMapWriter("anic"); writeColumnHeaders(writer); for (int pageNum = 37; pageNum <= 249; pageNum++) { // for (int pageNum = 156; pageNum <= 156; pageNum++) { PDPage page = (PDPage) allPages.get(pageNum); PDResources resources = page.getResources(); Map images = resources.getImages(); stripper.extractRegions(page); String sciName = stripper.getTextForRegion("class1").trim(); System.out.println("Scientific Name: " + sciName); if (images != null) { Iterator imageIter = images.keySet().iterator(); while (imageIter.hasNext()) { String key = (String) imageIter.next(); PDXObjectImage image = (PDXObjectImage) images.get(key); String name = null; if ("jpg".equals(image.getSuffix())) { name = getUniqueFileName(sciName + "_" + key, image.getSuffix()); System.out.println("Writing image:" + name); image.write2file("/data/tmp/" + name); writer.write(sciName); writer.write(","); writer.write(name + "." + image.getSuffix()); writer.write("\n"); } } } } }
From source file:org.apache.fop.render.pdf.pdfbox.PDFBoxAdapter.java
License:Apache License
/**
 * Creates a stream (from FOP's PDF library) from a PDF page parsed with PDFBox.
 * <p>
 * Besides producing the returned string, this method overwrites {@code atdoc} with a transform
 * computed from {@code pos} and the page's crop/media box, and updates {@code currentMCID} when
 * the plain {@code PDFWriter} path is taken.
 *
 * @param sourceDoc the source PDF the given page to be copied belongs to
 * @param page the page to transform into a stream
 * @param key value to use as key for the stream
 * @param atdoc adjustment for stream; its previous transform is replaced by this method
 * @param fontinfo fonts
 * @param pos rectangle; its width and height (divided by 1000) give the target size
 * @return the media-box clip operators followed by the rewritten page content
 * @throws IOException if an I/O error occurs
 */
public String createStreamFromPDFBoxPage(PDDocument sourceDoc, PDPage page, String key,
        AffineTransform atdoc, FontInfo fontinfo, Rectangle pos) throws IOException {
    handleAnnotations(sourceDoc, page, atdoc);
    // If the target page's index is tracked, store the target page's reference as element 0 of its entry.
    if (pageNumbers.containsKey(targetPage.getPageIndex())) {
        pageNumbers.get(targetPage.getPageIndex()).set(0, targetPage.makeReference());
    }
    PDResources sourcePageResources = page.getResources();
    PDStream pdStream = getContents(page);
    COSDictionary fonts = (COSDictionary) sourcePageResources.getCOSObject().getDictionaryObject(COSName.FONT);
    // Copy of the font dictionary taken before the merge writer sees it; put back into the
    // source resources further down.
    COSDictionary fontsBackup = null;
    UniqueName uniqueName = new UniqueName(key, sourcePageResources);
    String newStream = null;
    if (fonts != null && pdfDoc.isMergeFontsEnabled()) {
        fontsBackup = new COSDictionary(fonts);
        MergeFontsPDFWriter m = new MergeFontsPDFWriter(fonts, fontinfo, uniqueName, parentFonts, currentMCID);
        newStream = m.writeText(pdStream);
        // if (newStream != null) {
        //     for (Object f : fonts.keySet().toArray()) {
        //         COSDictionary fontdata = (COSDictionary)fonts.getDictionaryObject((COSName)f);
        //         if (getUniqueFontName(fontdata) != null) {
        //             fonts.removeItem((COSName)f);
        //         }
        //     }
        // }
    }
    // Fallback: font merging was skipped or produced nothing, so rewrite the content with the
    // plain writer and take over its MCID counter.
    if (newStream == null) {
        PDFWriter writer = new PDFWriter(uniqueName, currentMCID);
        newStream = writer.writeText(pdStream);
        currentMCID = writer.getCurrentMCID();
    }
    // Wrap the rewritten content (encoded as ISO-8859-1) in a fresh PDStream.
    pdStream = new PDStream(sourceDoc, new ByteArrayInputStream(newStream.getBytes("ISO-8859-1")));
    mergeXObj(sourcePageResources.getCOSObject(), fontinfo, uniqueName);
    PDFDictionary pageResources = (PDFDictionary) 
            cloneForNewDocument(sourcePageResources.getCOSObject());
    PDFDictionary fontDict = (PDFDictionary) pageResources.get("Font");
    if (fontDict != null && pdfDoc.isMergeFontsEnabled()) {
        // Register every used FOPPDFFont in the cloned font dictionary, creating and numbering
        // its reference dictionary on first use.
        for (Map.Entry<String, Typeface> fontEntry : fontinfo.getUsedFonts().entrySet()) {
            Typeface font = fontEntry.getValue();
            if (font instanceof FOPPDFFont) {
                FOPPDFFont pdfFont = (FOPPDFFont) font;
                if (pdfFont.getRef() == null) {
                    pdfFont.setRef(new PDFDictionary());
                    pdfDoc.assignObjectNumber(pdfFont.getRef());
                }
                fontDict.put(fontEntry.getKey(), pdfFont.getRef());
            }
        }
    }
    updateXObj(sourcePageResources.getCOSObject(), pageResources);
    // Restore the font dictionary copy taken before merging.
    if (fontsBackup != null) {
        sourcePageResources.getCOSObject().setItem(COSName.FONT, fontsBackup);
    }
    COSStream originalPageContents = pdStream.getCOSObject();
    bindOptionalContent(sourceDoc);
    PDFStream pageStream;
    Set filter;
    // if (originalPageContents instanceof COSStreamArray) {
    //     COSStreamArray array = (COSStreamArray)originalPageContents;
    //     pageStream = new PDFStream();
    //     InputStream in = array.getUnfilteredStream();
    //     OutputStream out = pageStream.getBufferOutputStream();
    //     IOUtils.copyLarge(in, out);
    //     filter = FILTER_FILTER;
    // } else {
    pageStream = (PDFStream) cloneForNewDocument(originalPageContents);
    filter = Collections.EMPTY_SET;
    // }
    if (pageStream == null) {
        pageStream = new PDFStream();
    }
    if (originalPageContents != null) {
        transferDict(originalPageContents, pageStream, filter);
    }
    transferPageDict(fonts, uniqueName, sourcePageResources);
    PDRectangle mediaBox = page.getMediaBox();
    PDRectangle cropBox = page.getCropBox();
    // Prefer the crop box; fall back to the media box when the page has none.
    PDRectangle viewBox = cropBox != null ? 
            cropBox : mediaBox;
    //Handle the /Rotation entry on the page dict
    int rotation = PDFUtil.getNormalizedRotation(page);
    //Transform to FOP's user space
    float w = (float) pos.getWidth() / 1000f;
    float h = (float) pos.getHeight() / 1000f;
    // For quarter-turn rotations, width and height trade places.
    if (rotation == 90 || rotation == 270) {
        float tmp = w;
        w = h;
        h = tmp;
    }
    // Replace atdoc's transform: scale the view box to the target size, then flip and shift
    // so that the view box's lower-left corner becomes the origin.
    atdoc.setTransform(AffineTransform.getScaleInstance(w / viewBox.getWidth(), h / viewBox.getHeight()));
    atdoc.translate(0, viewBox.getHeight());
    atdoc.rotate(-Math.PI);
    atdoc.scale(-1, 1);
    atdoc.translate(-viewBox.getLowerLeftX(), -viewBox.getLowerLeftY());
    rotate(rotation, viewBox, atdoc);
    // Prefix the content with "x y w h re W n": a rectangle built from the media box that is
    // installed as the clipping path.
    StringBuilder boxStr = new StringBuilder();
    boxStr.append(PDFNumber.doubleOut(mediaBox.getLowerLeftX())).append(' ')
            .append(PDFNumber.doubleOut(mediaBox.getLowerLeftY())).append(' ')
            .append(PDFNumber.doubleOut(mediaBox.getWidth())).append(' ')
            .append(PDFNumber.doubleOut(mediaBox.getHeight())).append(" re W n\n");
    return boxStr.toString() + IOUtils.toString(pdStream.createInputStream(null), "ISO-8859-1");
}
From source file:org.apache.fop.render.pdf.PDFBoxAdapterTestCase.java
License:Apache License
/**
 * Looks up a font dictionary by its internal name in the Font resources of the document's
 * first page.
 *
 * @param doc the document whose first page is inspected
 * @param internalname the key of the font inside the page's Font dictionary
 * @return the dictionary object stored under {@code internalname}
 * @throws IOException declared by the signature
 */
private COSDictionary getFont(PDDocument doc, String internalname) throws IOException {
    PDPage firstPage = (PDPage) doc.getDocumentCatalog().getPages().get(0);
    COSDictionary fontDictionary = (COSDictionary) firstPage.getResources().getCOSObject()
            .getDictionaryObject(COSName.FONT);
    return (COSDictionary) fontDictionary.getDictionaryObject(internalname);
}
From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { try {// w ww .java 2s . c o m writeParagraphEnd(); extractImages(page.getResources()); EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { extractMultiOSPDEmbeddedFiles("", fileSpec, extractor); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null) { handler.startElement("div", "class", "annotation"); handler.startElement("a", "href", link); handler.endElement("a"); handler.endElement("div"); } } } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? 
if (title != null || subject != null || contents != null) { handler.startElement("div", "class", "annotation"); if (title != null) { handler.startElement("div", "class", "annotationTitle"); handler.characters(title); handler.endElement("div"); } if (subject != null) { handler.startElement("div", "class", "annotationSubject"); handler.characters(subject); handler.endElement("div"); } if (contents != null) { handler.startElement("div", "class", "annotationContents"); handler.characters(contents); handler.endElement("div"); } handler.endElement("div"); } } } } handler.endElement("div"); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a page", e); } }
From source file:org.apache.tika.parser.pdf.PDF2XHTML.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { try {/*www . j ava 2 s . c om*/ writeParagraphEnd(); extractImages(page.getResources(), new HashSet<COSBase>()); EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { extractMultiOSPDEmbeddedFiles("", fileSpec, extractor); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null) { handler.startElement("div", "class", "annotation"); handler.startElement("a", "href", link); handler.endElement("a"); handler.endElement("div"); } } } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? 
if (title != null || subject != null || contents != null) { handler.startElement("div", "class", "annotation"); if (title != null) { handler.startElement("div", "class", "annotationTitle"); handler.characters(title); handler.endElement("div"); } if (subject != null) { handler.startElement("div", "class", "annotationSubject"); handler.characters(subject); handler.endElement("div"); } if (contents != null) { handler.startElement("div", "class", "annotationContents"); handler.characters(contents); handler.endElement("div"); } handler.endElement("div"); } } } } handler.endElement("div"); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a page", e); } page.clear(); }
From source file:org.apache.tika.parser.pdf.PDF2XHTMLPureJava.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { try {//from w w w . java 2s .c om writeParagraphEnd(); try { extractImages(page.getResources(), new HashSet<COSBase>()); } catch (IOException e) { handleCatchableIOE(e); } super.endPage(page); } catch (SAXException e) { throw new IOException("Unable to end a page", e); } catch (IOException e) { exceptions.add(e); } }
From source file:org.argrr.extractor.gdrive.downloader.ChartsDownloader.java
License:Open Source License
/**
 * Writes every image found in the page resources of {@code path/fileName.pdf} to
 * {@code ChartsDownloader.rootOutputPathCharts}, naming the files {@code fileName-1},
 * {@code fileName-2}, ... in the order the images are encountered.
 * <p>
 * A failure to load the document is propagated to the caller; previously it was printed and
 * then followed by a {@code NullPointerException}. The document is always closed before this
 * method returns.
 *
 * @param path directory that contains the PDF
 * @param fileName name of the PDF without the {@code .pdf} extension; also used as the prefix of
 *        the written image files
 * @throws IOException if the document cannot be loaded or an image cannot be read or written
 */
public static void extractPictures(String path, String fileName) throws IOException {
    PDDocument document = PDDocument.load(path + "/" + fileName + ".pdf");
    try {
        List pages = document.getDocumentCatalog().getAllPages();
        int i = 1;
        for (Object pageObject : pages) {
            PDPage page = (PDPage) pageObject;
            PDResources resources = page.getResources();
            if (resources == null) {
                // No resource dictionary means no images on this page.
                continue;
            }
            Map pageImages = resources.getImages();
            if (pageImages == null) {
                continue;
            }
            for (Object imageObject : pageImages.values()) {
                PDXObjectImage image = (PDXObjectImage) imageObject;
                image.write2file(ChartsDownloader.rootOutputPathCharts + "/" + fileName + "-" + i);
                i++;
            }
        }
    } finally {
        document.close();
    }
}
From source file:org.example.extractimagesfrompdfpages.ExtractImagesFromPDFPagesMain.java
/**
 * Exports every image XObject of every page of a PDF as a PNG file.
 * <p>
 * Images of page {@code i} are written to {@code <pdf directory>/temp_images/i/j.png}, where
 * {@code j} counts the images of that page starting at 1. The page folder is only created once
 * the first image of the page is found. The path of each written file is printed to standard
 * output.
 *
 * @param args {@code args[0]} is the path of the PDF file to read
 * @throws RuntimeException wrapping the {@code IOException} if reading the PDF or writing an
 *         image fails
 */
public static void main(String[] args) {
    if (args.length < 1) {
        System.err.println("Usage: ExtractImagesFromPDFPagesMain <pdf-file>");
        return;
    }
    try {
        File thePDFFile = new File(args[0]);
        // Resolve to an absolute file first: a bare relative file name has no parent directory.
        File thePDFFileDirectory = thePDFFile.getAbsoluteFile().getParentFile();
        PDDocument document = PDDocument.load(thePDFFile);
        try {
            int i = 1;
            for (PDPage page : document.getPages()) {
                File thePDFPageFolder = new File(
                        thePDFFileDirectory.getAbsolutePath() + "/temp_images" + "/" + i);
                PDResources pdResources = page.getResources();
                if (pdResources != null) {
                    boolean alreadyCreatedFolderForThisPage = false;
                    int j = 1;
                    for (COSName c : pdResources.getXObjectNames()) {
                        PDXObject o = pdResources.getXObject(c);
                        if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
                            if (!alreadyCreatedFolderForThisPage) {
                                thePDFPageFolder.mkdirs();
                                alreadyCreatedFolderForThisPage = true;
                            }
                            File file = new File(thePDFPageFolder.getAbsolutePath() + "/" + j + ".png");
                            ImageIO.write(
                                    ((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) o).getImage(),
                                    "png", file);
                            System.out.println(thePDFPageFolder.getAbsolutePath() + "/" + j + ".png");
                            j++;
                        }
                    }
                }
                i++;
            }
        } finally {
            // Release the document's resources even when an image fails to export.
            document.close();
        }
    } catch (IOException ex) {
        Logger.getLogger(ExtractImagesFromPDFPagesMain.class.getName()).log(Level.SEVERE, null, ex);
        throw new RuntimeException(ex);
    }
}