Example usage for org.apache.pdfbox.pdmodel.PDPage.getContents()

Introduction

This page collects example usages of org.apache.pdfbox.pdmodel.PDPage.getContents().

Prototype

@Override
public InputStream getContents() throws IOException

Source Link

Document

Returns the content stream(s) of this page as a single input stream.

Usage

From source file:PrintImageLocations.java

License:Apache License

/**
 * This will print the documents data./*from w  w w  .  j  a  v  a2 s  . com*/
 *
 * @param args The command line arguments.
 *
 * @throws Exception If there is an error parsing the document.
 */
public static void main2() throws Exception {
    if (flag) {
        usage();
    } else {
        PDDocument document = null;
        try {
            document = PDDocument.load(PrintTextLocations.INPUTFILE);
            if (document.isEncrypted()) {
                try {
                    document.decrypt("");
                } catch (InvalidPasswordException e) {
                    System.err.println("Error: Document is encrypted with a password.");
                    System.exit(1);
                }
            }
            PrintImageLocations printer = new PrintImageLocations();
            List allPages = document.getDocumentCatalog().getAllPages();
            for (int i = 0; i < allPages.size(); i++) {
                PDPage page = (PDPage) allPages.get(i);
                System.out.println("Processing page: " + i);
                printer.processStream(page, page.findResources(), page.getContents().getStream());
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }
}

From source file:aplicacion.sistema.indexer.test.PDFTextStripperOrg.java

License:Apache License

/**
 * Resolves the start/end bookmarks to page numbers and then processes every page
 * that has a content stream.
 *
 * <p>{@code currentPageNo} is advanced for every page in the list, including pages
 * whose content is {@code null} and which are therefore not processed.
 *
 * @param pages The pages object in the document.
 *
 * @throws IOException If there is an error parsing the text.
 */
protected void processPages(List pages) throws IOException {
    if (startBookmark != null) {
        startBookmarkPageNumber = getPageNumber(startBookmark, pages);
    }
    if (endBookmark != null) {
        endBookmarkPageNumber = getPageNumber(endBookmark, pages);
    }

    boolean startUnresolved = startBookmarkPageNumber == -1 && startBookmark != null;
    boolean endUnresolved = endBookmarkPageNumber == -1 && endBookmark != null;
    if (startUnresolved && endUnresolved
            && startBookmark.getCOSObject() == endBookmark.getCOSObject()) {
        // Special case: start and end bookmark are the same object but point to
        // nothing. Both page numbers are set to 0 so that no text is extracted.
        startBookmarkPageNumber = 0;
        endBookmarkPageNumber = 0;
    }

    for (Object element : pages) {
        PDPage nextPage = (PDPage) element;
        PDStream contentStream = nextPage.getContents();
        currentPageNo++;
        if (contentStream == null) {
            // nothing to extract on this page
            continue;
        }
        processPage(nextPage, contentStream.getStream());
    }
}

From source file:at.gv.egiz.pdfas.lib.impl.pdfbox.placeholder.SignaturePlaceholderExtractor.java

License:EUPL

/**
 * Collects every placeholder (with placeholder identifier
 * {@linkplain at.gv.egiz.pdfas.lib.impl.placeholder.PlaceholderExtractorConstants#QR_PLACEHOLDER_IDENTIFIER
 * QR_PLACEHOLDER_IDENTIFIER}) found in the document.
 *
 * <p>Pages are visited in catalog order and numbered starting at 1. A page is only
 * processed when its contents, the underlying stream and its resources are all present.
 *
 * @param doc
 *            The pdfbox document object (must not be {@code null}).
 * @return A (unmodifiable) list of signature place holders (never {@code null}).
 * @throws IOException
 *             Thrown in case of I/O error reading/parsing the pdf document.
 */
@SuppressWarnings("unchecked")
public static List<SignaturePlaceholderData> extract(PDDocument doc) throws IOException {
    Objects.requireNonNull(doc, "Pdfbox document must not be null.");

    // The constructor needs an identifier and a match mode; per the original author's
    // note both values are ignored anyway on this code path.
    SignaturePlaceholderExtractor extractor = new SignaturePlaceholderExtractor(
            QR_PLACEHOLDER_IDENTIFIER, PLACEHOLDER_MATCH_MODE_MODERATE, doc);

    int pageNr = 0;
    for (PDPage page : (Iterable<PDPage>) doc.getDocumentCatalog().getAllPages()) {
        pageNr++;
        extractor.setCurrentPage(pageNr);

        PDStream contents = page.getContents();
        if (contents == null || contents.getStream() == null) {
            continue;
        }
        PDResources resources = page.findResources();
        if (resources == null) {
            continue;
        }
        extractor.processStream(page, resources, contents.getStream());
    }

    return ListUtils.unmodifiableList(new ArrayList<SignaturePlaceholderData>(extractor.placeholders));
}

From source file:at.gv.egiz.pdfas.lib.impl.pdfbox.placeholder.SignaturePlaceholderExtractor.java

License:EUPL

/**
 * Search the document for placeholder images and possibly included
 * additional info.<br/>/*from  ww  w.  j  a v  a2  s.  c o  m*/
 * Searches only for the first placeholder page after page from top.
 *
 * @param inputStream
 * @return all available info from the first found placeholder.
 * @throws PDFDocumentException
 *             if the document could not be read.
 * @throws PlaceholderExtractionException
 *             if STRICT matching mode was requested and no suitable
 *             placeholder could be found.
 */
public static SignaturePlaceholderData extract(PDDocument doc, String placeholderId, int matchMode)
        throws PdfAsException {
    SignaturePlaceholderContext.setSignaturePlaceholderData(null);

    SignaturePlaceholderExtractor extractor;
    try {
        extractor = new SignaturePlaceholderExtractor(placeholderId, matchMode, doc);
    } catch (IOException e2) {
        throw new PDFIOException("error.pdf.io.04", e2);
    }
    List<?> pages = doc.getDocumentCatalog().getAllPages();
    Iterator<?> iter = pages.iterator();
    int pageNr = 0;
    while (iter.hasNext()) {
        pageNr++;
        PDPage page = (PDPage) iter.next();
        try {
            extractor.setCurrentPage(pageNr);
            if (page.getContents() != null && page.findResources() != null
                    && page.getContents().getStream() != null) {
                extractor.processStream(page, page.findResources(), page.getContents().getStream());
            }
            SignaturePlaceholderData ret = matchPlaceholderPage(extractor.placeholders, placeholderId,
                    matchMode);
            if (ret != null) {
                SignaturePlaceholderContext.setSignaturePlaceholderData(ret);
                return ret;
            }
        } catch (IOException e1) {
            throw new PDFIOException("error.pdf.io.04", e1);
        } catch (Throwable e) {
            throw new PDFIOException("error.pdf.io.04", e);
        }

    }
    if (extractor.placeholders.size() > 0) {
        SignaturePlaceholderData ret = matchPlaceholderDocument(extractor.placeholders, placeholderId,
                matchMode);
        SignaturePlaceholderContext.setSignaturePlaceholderData(ret);
        return ret;
    }
    // no placeholders found, apply strict mode if set
    if (matchMode == PLACEHOLDER_MATCH_MODE_STRICT) {
        throw new PlaceholderExtractionException("error.pdf.stamp.09");
    }

    return null;
}

From source file:at.gv.egiz.pdfas.lib.impl.pdfbox2.placeholder.SignaturePlaceholderExtractor.java

License:EUPL

/**
 * Searches the document for placeholder images and possibly included additional info.
 *
 * <p>Pages are examined one after another from the top; as soon as a page yields a
 * matching placeholder that placeholder is returned. If no single page matched but
 * placeholders were collected, a document-wide match is attempted. The result (or
 * {@code null} at the start of the search) is also published through
 * {@code SignaturePlaceholderContext}.
 *
 * @param doc
 *            the pdfbox document to search.
 * @param placeholderId
 *            the placeholder identifier to match against.
 * @param matchMode
 *            the matching mode, e.g. {@code PLACEHOLDER_MATCH_MODE_STRICT}.
 * @return all available info from the first found placeholder, or {@code null} if
 *         nothing was found and the mode is not strict.
 * @throws PdfAsException
 *             a {@code PDFIOException} if the document could not be read or
 *             processed; a {@code PlaceholderExtractionException} if STRICT
 *             matching mode was requested and no placeholder could be found.
 */
public static SignaturePlaceholderData extract(PDDocument doc, String placeholderId, int matchMode)
        throws PdfAsException {
    SignaturePlaceholderContext.setSignaturePlaceholderData(null);

    SignaturePlaceholderExtractor extractor;
    try {
        extractor = new SignaturePlaceholderExtractor(placeholderId, matchMode, doc);
    } catch (IOException | ClassNotFoundException | InstantiationException | IllegalAccessException e2) {
        throw new PDFIOException("error.pdf.io.04", e2);
    }

    int pageNr = 0;
    for (PDPage page : doc.getPages()) {
        pageNr++;

        try {
            extractor.setCurrentPage(pageNr);

            // In this pdfbox version getContents() hands back an InputStream. It is only
            // needed here for the presence check, so close it right away instead of
            // leaking one open stream per page.
            boolean contentsPresent;
            try (java.io.InputStream contents = page.getContents()) {
                contentsPresent = contents != null;
            }

            if (contentsPresent && page.getResources() != null && page.getContentStreams() != null) {
                extractor.processPage(page); //TODO: pdfbox2 - right?
            }
            SignaturePlaceholderData ret = matchPlaceholderPage(extractor.placeholders, placeholderId,
                    matchMode);
            if (ret != null) {
                SignaturePlaceholderContext.setSignaturePlaceholderData(ret);
                return ret;
            }
        } catch (IOException e1) {
            throw new PDFIOException("error.pdf.io.04", e1);
        } catch (Throwable e) {
            // Any other failure while handling a page is reported with the same code.
            throw new PDFIOException("error.pdf.io.04", e);
        }
    }
    if (extractor.placeholders.size() > 0) {
        SignaturePlaceholderData ret = matchPlaceholderDocument(extractor.placeholders, placeholderId,
                matchMode);
        SignaturePlaceholderContext.setSignaturePlaceholderData(ret);
        return ret;
    }
    // no placeholders found, apply strict mode if set
    if (matchMode == PLACEHOLDER_MATCH_MODE_STRICT) {
        throw new PlaceholderExtractionException("error.pdf.stamp.09");
    }

    return null;
}

From source file:at.knowcenter.wag.egov.egiz.pdf.PDFUtilities.java

License:EUPL

/**
 * Computes the maximum page length of a page by feeding its content stream and,
 * unless {@code legacy32} is set, its visible annotations into a {@code PDFPage}
 * processor.
 *
 * @param page                the page to measure.
 * @param effectivePageHeight passed through to the {@code PDFPage} processor.
 * @param legacy32            passed to the processor; when {@code true} annotations are
 *                            not taken into account.
 * @param legacy40            passed through to the {@code PDFPage} processor.
 * @return the value reported by {@code PDFPage.getMaxPageLength()}.
 * @throws PDFIOException if reading the page fails with an {@code IOException}.
 */
public static float calculatePageLength(PDPage page, float effectivePageHeight, boolean legacy32,
        boolean legacy40) throws PDFIOException {
    try {
        PDFPage pageProcessor = new PDFPage(effectivePageHeight, legacy32, legacy40);
        PDResources resources = page.findResources();

        if (page.getContents() != null) {
            pageProcessor.processStream(page, resources, page.getContents().getStream());
        }

        if (!legacy32 && page.getAnnotations() != null) {
            for (Iterator<PDAnnotation> it = page.getAnnotations().iterator(); it.hasNext();) {
                PDAnnotation annotation = it.next();
                if (annotation.isInvisible()) {
                    // invisible annotations do not contribute to the page length
                    continue;
                }
                pageProcessor.processAnnotation(annotation);
            }
        }

        return pageProcessor.getMaxPageLength();
    } catch (IOException e) {
        throw new PDFIOException("error.pdf.stamp.11", e);
    }
}

From source file:chiliad.parser.pdf.extractor.image.ImageExtractor.java

License:Apache License

/**
 * Runs the page's content stream through this extractor and adds the images it
 * collected to {@code pageContent}.
 *
 * @param pageToExtract the PDF page to read.
 * @param pageContent   the page model that receives the extracted images.
 * @return the same {@code pageContent} instance that was passed in.
 * @throws IllegalStateException if the page has no contents.
 * @throws ExtractorException    if reading the page fails with an {@code IOException}.
 */
@Override
public MPage extract(PDPage pageToExtract, MPage pageContent) {
    try {
        if (pageToExtract.getContents() != null) {
            processStream(pageToExtract, pageToExtract.findResources(),
                    pageToExtract.getContents().getStream());
            pageContent.addImages(getMImages());
        } else {
            throw new IllegalStateException("The PDPage content is null.");
        }
    } catch (IOException e) {
        throw new ExtractorException("Failed to extract images.", e);
    }
    return pageContent;
}

From source file:chiliad.parser.pdf.extractor.text.TextExtractor.java

License:Apache License

/**
 * Runs the page's content stream through this extractor with a fresh
 * {@code TextPositionProcessor} and adds the resulting tokens to {@code pageContent}.
 *
 * @param pageToExtract the PDF page to read.
 * @param pageContent   the page model that receives the extracted tokens.
 * @return the same {@code pageContent} instance that was passed in.
 * @throws IllegalStateException if the page has no contents.
 * @throws ExtractorException    if reading the page fails with an {@code IOException}.
 */
@Override
public MPage extract(PDPage pageToExtract, MPage pageContent) {
    try {
        if (pageToExtract.getContents() != null) {
            textPositionProcessor = new TextPositionProcessor();
            processStream(pageToExtract, pageToExtract.findResources(),
                    pageToExtract.getContents().getStream());
            tokens = textPositionProcessor.process();
            pageContent.addTokens(getMTokens());
        } else {
            throw new IllegalStateException("Empty page content.");
        }
    } catch (IOException e) {
        throw new ExtractorException("Failed to extract the tokens.", e);
    }
    return pageContent;
}

From source file:chiliad.parser.pdf.extractor.vectorgraphics.VectorGraphicsExtractor.java

License:Apache License

/**
 * Renders the page's content stream and the normal appearance of each annotation
 * onto {@code graphics}, then hands the result to {@code handleResult}.
 *
 * @param pageToExtract the PDF page to read.
 * @param pageContent   the page model passed on to {@code handleResult}.
 * @return whatever {@code handleResult(graphics, pageContent)} returns.
 * @throws ExtractorException if the page has no contents or if reading the page
 *                            fails with an {@code IOException}.
 */
@Override
public MPage extract(PDPage pageToExtract, MPage pageContent) {
    try {
        if (pageToExtract.getContents() == null) {
            throw new ExtractorException("Contents is null.");
        }

        pageSize = pageToExtract.findMediaBox().createDimension();
        graphics.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON);
        graphics.setRenderingHint(RenderingHints.KEY_FRACTIONALMETRICS,
                RenderingHints.VALUE_FRACTIONALMETRICS_ON);
        // use CAP_BUTT as the initial stroke cap instead of CAP_SQUARE
        graphics.setStroke(new BasicStroke(1.0f, BasicStroke.CAP_BUTT, BasicStroke.JOIN_MITER));

        processStream(pageToExtract, pageToExtract.findResources(),
                pageToExtract.getContents().getStream());

        List<PDAnnotation> annotations = pageToExtract.getAnnotations();
        for (PDAnnotation annot : annotations) {
            PDRectangle rect = annot.getRectangle();
            String appearanceName = annot.getAppearanceStream();

            PDAppearanceDictionary appearDictionary = annot.getAppearance();
            if (appearDictionary == null) {
                continue;
            }
            Map<String, PDAppearanceStream> appearanceMap = appearDictionary.getNormalAppearance();
            if (appearanceMap == null) {
                continue;
            }
            // fall back to the "default" entry when the annotation names no appearance
            String appearanceKey = (appearanceName == null) ? "default" : appearanceName;
            PDAppearanceStream appearance = appearanceMap.get(appearanceKey);
            if (appearance == null) {
                continue;
            }

            Point2D point = new Point2D.Float(rect.getLowerLeftX(), rect.getLowerLeftY());
            Matrix matrix = appearance.getMatrix();
            if (matrix != null) {
                // transform the lower-left corner using the appearance matrix
                matrix.createAffineTransform().transform(point, point);
            }

            // shift the origin for the appearance stream, then undo the shift afterwards
            int dx = (int) point.getX();
            int dy = (int) point.getY();
            graphics.translate(dx, -dy);
            processSubStream(pageToExtract, appearance.getResources(), appearance.getStream());
            graphics.translate(-dx, dy);
        }
        return handleResult(graphics, pageContent);
    } catch (IOException e) {
        throw new ExtractorException("Failed to extract vector graphics.", e);
    }
}

From source file:com.amolik.misc.ExtractTextByArea.java

License:Apache License

/**
 * Prints the text found in a fixed rectangular area of the first page of a PDF.
 *
 * <p>The document path is taken from {@code args[0]} when supplied; otherwise the
 * original hard-coded sample path is used. Nothing is extracted from a document
 * without pages.
 *
 * @param args The command line arguments; an optional first element names the PDF file.
 *
 * @throws IOException If there is an error parsing the document.
 */
public static void main(String[] args) throws IOException {
    String path = (args != null && args.length > 0) ? args[0]
            : "E:\\Automation\\uphillit\\Fiscal_demo_data.pdf";

    PDDocument document = null;
    try {
        document = PDDocument.load(new File(path));
        if (document.getNumberOfPages() == 0) {
            // getPage(0) would fail on an empty document; report and stop instead.
            System.err.println("Document has no pages: " + path);
            return;
        }

        PDPage firstPage = document.getPage(0);

        // getContents() opens an input stream; close it once it has been printed.
        java.io.InputStream contents = firstPage.getContents();
        try {
            System.out.println(contents);
        } finally {
            if (contents != null) {
                contents.close();
            }
        }

        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        stripper.setSortByPosition(true);
        Rectangle rect = new Rectangle(3, 1, 600, 6000);
        stripper.addRegion("class1", rect);
        stripper.extractRegions(firstPage);
        System.out.println("Text in the area:" + rect);
        System.out.println(stripper.getTextForRegion("class1"));
    } finally {
        if (document != null) {
            document.close();
        }
    }
}