List of usage examples for org.apache.pdfbox.pdmodel PDPage getContents
@Override public InputStream getContents() throws IOException
From source file:PrintImageLocations.java
License:Apache License
/** * This will print the documents data./*from w w w . j a v a2 s . com*/ * * @param args The command line arguments. * * @throws Exception If there is an error parsing the document. */ public static void main2() throws Exception { if (flag) { usage(); } else { PDDocument document = null; try { document = PDDocument.load(PrintTextLocations.INPUTFILE); if (document.isEncrypted()) { try { document.decrypt(""); } catch (InvalidPasswordException e) { System.err.println("Error: Document is encrypted with a password."); System.exit(1); } } PrintImageLocations printer = new PrintImageLocations(); List allPages = document.getDocumentCatalog().getAllPages(); for (int i = 0; i < allPages.size(); i++) { PDPage page = (PDPage) allPages.get(i); System.out.println("Processing page: " + i); printer.processStream(page, page.findResources(), page.getContents().getStream()); } } finally { if (document != null) { document.close(); } } } }
From source file:aplicacion.sistema.indexer.test.PDFTextStripperOrg.java
License:Apache License
/** * This will process all of the pages and the text that is in them. * * @param pages The pages object in the document. * * @throws IOException If there is an error parsing the text. */// w ww . j av a 2 s .c om protected void processPages(List pages) throws IOException { if (startBookmark != null) { startBookmarkPageNumber = getPageNumber(startBookmark, pages); } if (endBookmark != null) { endBookmarkPageNumber = getPageNumber(endBookmark, pages); } if (startBookmarkPageNumber == -1 && startBookmark != null && endBookmarkPageNumber == -1 && endBookmark != null && startBookmark.getCOSObject() == endBookmark.getCOSObject()) { //this is a special case where both the start and end bookmark //are the same but point to nothing. In this case //we will not extract any text. startBookmarkPageNumber = 0; endBookmarkPageNumber = 0; } Iterator pageIter = pages.iterator(); while (pageIter.hasNext()) { PDPage nextPage = (PDPage) pageIter.next(); PDStream contentStream = nextPage.getContents(); currentPageNo++; if (contentStream != null) { COSStream contents = contentStream.getStream(); processPage(nextPage, contents); } } }
From source file:at.gv.egiz.pdfas.lib.impl.pdfbox.placeholder.SignaturePlaceholderExtractor.java
License:EUPL
/** * Extracts all placeholders (with placeholder identifier * {@linkplain at.gv.egiz.pdfas.lib.impl.placeholder.PlaceholderExtractorConstants#QR_PLACEHOLDER_IDENTIFIER * QR_PLACEHOLDER_IDENTIFIER}).//from ww w .j a va 2 s . c om * * @param doc * The pdfbox document object. * @return A (unmodifiable) list of signature place holders (never {@code null}). * @throws IOException * Thrown in case of I/O error reading/parsing the pdf document. */ @SuppressWarnings("unchecked") public static List<SignaturePlaceholderData> extract(PDDocument doc) throws IOException { Objects.requireNonNull(doc, "Pdfbox document must not be null."); SignaturePlaceholderExtractor extractor = new SignaturePlaceholderExtractor(QR_PLACEHOLDER_IDENTIFIER, // is ignored anyway PLACEHOLDER_MATCH_MODE_MODERATE // is ignored anyway , doc); int pageNr = 0; for (PDPage page : (Iterable<PDPage>) doc.getDocumentCatalog().getAllPages()) { extractor.setCurrentPage(++pageNr); PDStream contents; PDResources resources; if ((contents = page.getContents()) != null && contents.getStream() != null && (resources = page.findResources()) != null) { extractor.processStream(page, resources, contents.getStream()); } } return ListUtils.unmodifiableList(new ArrayList<SignaturePlaceholderData>(extractor.placeholders)); }
From source file:at.gv.egiz.pdfas.lib.impl.pdfbox.placeholder.SignaturePlaceholderExtractor.java
License:EUPL
/** * Search the document for placeholder images and possibly included * additional info.<br/>/*from ww w. j a v a2 s. c o m*/ * Searches only for the first placeholder page after page from top. * * @param inputStream * @return all available info from the first found placeholder. * @throws PDFDocumentException * if the document could not be read. * @throws PlaceholderExtractionException * if STRICT matching mode was requested and no suitable * placeholder could be found. */ public static SignaturePlaceholderData extract(PDDocument doc, String placeholderId, int matchMode) throws PdfAsException { SignaturePlaceholderContext.setSignaturePlaceholderData(null); SignaturePlaceholderExtractor extractor; try { extractor = new SignaturePlaceholderExtractor(placeholderId, matchMode, doc); } catch (IOException e2) { throw new PDFIOException("error.pdf.io.04", e2); } List<?> pages = doc.getDocumentCatalog().getAllPages(); Iterator<?> iter = pages.iterator(); int pageNr = 0; while (iter.hasNext()) { pageNr++; PDPage page = (PDPage) iter.next(); try { extractor.setCurrentPage(pageNr); if (page.getContents() != null && page.findResources() != null && page.getContents().getStream() != null) { extractor.processStream(page, page.findResources(), page.getContents().getStream()); } SignaturePlaceholderData ret = matchPlaceholderPage(extractor.placeholders, placeholderId, matchMode); if (ret != null) { SignaturePlaceholderContext.setSignaturePlaceholderData(ret); return ret; } } catch (IOException e1) { throw new PDFIOException("error.pdf.io.04", e1); } catch (Throwable e) { throw new PDFIOException("error.pdf.io.04", e); } } if (extractor.placeholders.size() > 0) { SignaturePlaceholderData ret = matchPlaceholderDocument(extractor.placeholders, placeholderId, matchMode); SignaturePlaceholderContext.setSignaturePlaceholderData(ret); return ret; } // no placeholders found, apply strict mode if set if (matchMode == PLACEHOLDER_MATCH_MODE_STRICT) { throw new 
PlaceholderExtractionException("error.pdf.stamp.09"); } return null; }
From source file:at.gv.egiz.pdfas.lib.impl.pdfbox2.placeholder.SignaturePlaceholderExtractor.java
License:EUPL
/** * Search the document for placeholder images and possibly included * additional info.<br/>//from ww w . ja v a 2s . c om * Searches only for the first placeholder page after page from top. * * @param inputStream * @return all available info from the first found placeholder. * @throws PDFDocumentException * if the document could not be read. * @throws PlaceholderExtractionException * if STRICT matching mode was requested and no suitable * placeholder could be found. */ public static SignaturePlaceholderData extract(PDDocument doc, String placeholderId, int matchMode) throws PdfAsException { SignaturePlaceholderContext.setSignaturePlaceholderData(null); SignaturePlaceholderExtractor extractor; try { extractor = new SignaturePlaceholderExtractor(placeholderId, matchMode, doc); } catch (IOException | ClassNotFoundException | InstantiationException | IllegalAccessException e2) { throw new PDFIOException("error.pdf.io.04", e2); } int pageNr = 0; for (PDPage page : doc.getPages()) { pageNr++; try { extractor.setCurrentPage(pageNr); if (page.getContents() != null && page.getResources() != null && page.getContentStreams() != null) { extractor.processPage(page); //TODO: pdfbox2 - right? } SignaturePlaceholderData ret = matchPlaceholderPage(extractor.placeholders, placeholderId, matchMode); if (ret != null) { SignaturePlaceholderContext.setSignaturePlaceholderData(ret); return ret; } } catch (IOException e1) { throw new PDFIOException("error.pdf.io.04", e1); } catch (Throwable e) { throw new PDFIOException("error.pdf.io.04", e); } } if (extractor.placeholders.size() > 0) { SignaturePlaceholderData ret = matchPlaceholderDocument(extractor.placeholders, placeholderId, matchMode); SignaturePlaceholderContext.setSignaturePlaceholderData(ret); return ret; } // no placeholders found, apply strict mode if set if (matchMode == PLACEHOLDER_MATCH_MODE_STRICT) { throw new PlaceholderExtractionException("error.pdf.stamp.09"); } return null; }
From source file:at.knowcenter.wag.egov.egiz.pdf.PDFUtilities.java
License:EUPL
public static float calculatePageLength(PDPage page, float effectivePageHeight, boolean legacy32, boolean legacy40) throws PDFIOException { try {//w w w. ja v a 2s.c o m PDFPage my_page = new PDFPage(effectivePageHeight, legacy32, legacy40); PDResources resources = page.findResources(); if (page.getContents() != null) { COSStream stream = page.getContents().getStream(); // List<PDThreadBead> articles = page.getThreadBeads(); // my_page.processMyPage(page); my_page.processStream(page, resources, stream); } if (!legacy32) { if (page.getAnnotations() != null) { Iterator<PDAnnotation> annotationsIt = page.getAnnotations().iterator(); while (annotationsIt.hasNext()) { PDAnnotation annotation = annotationsIt.next(); if (!annotation.isInvisible()) { my_page.processAnnotation(annotation); } } } } return my_page.getMaxPageLength(); } catch (IOException e) { throw new PDFIOException("error.pdf.stamp.11", e); } }
From source file:chiliad.parser.pdf.extractor.image.ImageExtractor.java
License:Apache License
/**
 * Processes the content stream of the given page and adds the collected images to
 * {@code pageContent}.
 *
 * @param pageToExtract the page whose content stream is processed.
 * @param pageContent the result object that receives the images.
 * @return the same {@code pageContent} instance.
 * @throws IllegalStateException if the page has no content.
 * @throws ExtractorException if processing fails with an {@link IOException}.
 */
@Override
public MPage extract(PDPage pageToExtract, MPage pageContent) {
    try {
        if (pageToExtract.getContents() != null) {
            processStream(pageToExtract, pageToExtract.findResources(),
                    pageToExtract.getContents().getStream());
            pageContent.addImages(getMImages());
            return pageContent;
        }
        throw new IllegalStateException("The PDPage content is null.");
    } catch (IOException ex) {
        throw new ExtractorException("Failed to extract images.", ex);
    }
}
From source file:chiliad.parser.pdf.extractor.text.TextExtractor.java
License:Apache License
@Override public MPage extract(PDPage pageToExtract, MPage pageContent) { try {// w w w.ja v a 2 s . c o m if (pageToExtract.getContents() == null) { throw new IllegalStateException("Empty page content."); } textPositionProcessor = new TextPositionProcessor(); processStream(pageToExtract, pageToExtract.findResources(), pageToExtract.getContents().getStream()); tokens = textPositionProcessor.process(); pageContent.addTokens(getMTokens()); return pageContent; } catch (IOException ex) { throw new ExtractorException("Failed to extract the tokens.", ex); } }
From source file:chiliad.parser.pdf.extractor.vectorgraphics.VectorGraphicsExtractor.java
License:Apache License
@Override public MPage extract(PDPage pageToExtract, MPage pageContent) { try {/* w ww .j a v a 2s.c o m*/ if (pageToExtract.getContents() == null) { throw new ExtractorException("Contents is null."); } pageSize = pageToExtract.findMediaBox().createDimension(); graphics.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON); graphics.setRenderingHint(RenderingHints.KEY_FRACTIONALMETRICS, RenderingHints.VALUE_FRACTIONALMETRICS_ON); // initialize the used stroke with CAP_BUTT instead of CAP_SQUARE graphics.setStroke(new BasicStroke(1.0f, BasicStroke.CAP_BUTT, BasicStroke.JOIN_MITER)); // Only if there is some content, we have to process it. // Otherwise we are done here and we will produce an empty page PDResources resources = pageToExtract.findResources(); processStream(pageToExtract, resources, pageToExtract.getContents().getStream()); List<PDAnnotation> annotations = pageToExtract.getAnnotations(); for (PDAnnotation annotation : annotations) { PDAnnotation annot = (PDAnnotation) annotation; PDRectangle rect = annot.getRectangle(); String appearanceName = annot.getAppearanceStream(); PDAppearanceDictionary appearDictionary = annot.getAppearance(); if (appearDictionary != null) { if (appearanceName == null) { appearanceName = "default"; } Map<String, PDAppearanceStream> appearanceMap = appearDictionary.getNormalAppearance(); if (appearanceMap != null) { PDAppearanceStream appearance = (PDAppearanceStream) appearanceMap.get(appearanceName); if (appearance != null) { Point2D point = new Point2D.Float(rect.getLowerLeftX(), rect.getLowerLeftY()); Matrix matrix = appearance.getMatrix(); if (matrix != null) { // transform the rectangle using the given matrix AffineTransform at = matrix.createAffineTransform(); at.transform(point, point); } graphics.translate((int) point.getX(), -(int) point.getY()); processSubStream(pageToExtract, appearance.getResources(), appearance.getStream()); graphics.translate(-(int) point.getX(), (int) point.getY()); 
} } } } return handleResult(graphics, pageContent); } catch (IOException ex) { throw new ExtractorException("Failed to extract vector graphics.", ex); } }
From source file:com.amolik.misc.ExtractTextByArea.java
License:Apache License
/** * This will print the documents text in a certain area. * * @param args The command line arguments. * * @throws IOException If there is an error parsing the document. */// w w w . java 2s. com public static void main(String[] args) throws IOException { //args[0]= "E:\\Automation\\uphillit\\Fiscal_demo_data.pdf"; // if( args.length != 1 ) // { // usage(); // } // else // { PDDocument document = null; try { document = PDDocument.load(new File("E:\\Automation\\uphillit\\Fiscal_demo_data.pdf")); int numberOfPages = document.getNumberOfPages(); if (numberOfPages > 0) { PDPage page = (PDPage) document.getPages().get(0); System.out.println(page.getContents()); } PDFTextStripperByArea stripper = new PDFTextStripperByArea(); stripper.setSortByPosition(true); Rectangle rect = new Rectangle(3, 1, 600, 6000); stripper.addRegion("class1", rect); PDPage firstPage = document.getPage(0); stripper.extractRegions(firstPage); System.out.println("Text in the area:" + rect); System.out.println(stripper.getTextForRegion("class1")); } finally { if (document != null) { document.close(); } } // } }