List of usage examples for org.apache.pdfbox.pdmodel PDPage getAnnotations
public List<PDAnnotation> getAnnotations() throws IOException
From source file:net.yacy.document.parser.pdfParser.java
License:Open Source License
/** * extract clickable links from pdf//from w w w. ja va 2 s .co m * @param pdf the document to parse * @return all detected links */ private Collection<AnchorURL>[] extractPdfLinks(final PDDocument pdf) { @SuppressWarnings("unchecked") List<PDPage> allPages = pdf.getDocumentCatalog().getAllPages(); @SuppressWarnings("unchecked") Collection<AnchorURL>[] linkCollections = (Collection<AnchorURL>[]) new Collection<?>[allPages.size()]; int pagecount = 0; for (PDPage page : allPages) { final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>(); try { List<PDAnnotation> annotations = page.getAnnotations(); if (annotations != null) { for (PDAnnotation pdfannotation : annotations) { if (pdfannotation instanceof PDAnnotationLink) { PDAction link = ((PDAnnotationLink) pdfannotation).getAction(); if (link != null && link instanceof PDActionURI) { PDActionURI pdflinkuri = (PDActionURI) link; String uristr = pdflinkuri.getURI(); AnchorURL url = new AnchorURL(uristr); pdflinks.add(url); } } } } } catch (IOException ex) { } linkCollections[pagecount++] = pdflinks; } return linkCollections; }
From source file:org.apache.fop.render.pdf.pdfbox.PDFBoxAdapter.java
License:Apache License
private void handleAnnotations(PDDocument sourceDoc, PDPage page, AffineTransform at) throws IOException { PDDocumentCatalog srcCatalog = sourceDoc.getDocumentCatalog(); PDAcroForm srcAcroForm = srcCatalog.getAcroForm(); List pageAnnotations = page.getAnnotations(); if (srcAcroForm == null && pageAnnotations.isEmpty()) { return;/*from w w w . ja v a2 s . co m*/ } moveAnnotations(page, pageAnnotations, at); //Pseudo-cache the target page in place of the original source page. //This essentially replaces the original page reference with the target page. COSObject cosPage = null; COSDictionary parentDic = (COSDictionary) page.getCOSObject().getDictionaryObject(COSName.PARENT, COSName.P); COSArray kids = (COSArray) parentDic.getDictionaryObject(COSName.KIDS); for (int i = 0; i < kids.size(); i++) { //Hopefully safe to cast, as kids need to be indirect objects COSObject kid = (COSObject) kids.get(i); if (!pageNumbers.containsKey(i)) { PDFArray a = new PDFArray(); a.add(null); pdfDoc.assignObjectNumber(a); pdfDoc.addTrailerObject(a); pageNumbers.put(i, a); } cacheClonedObject(kid, pageNumbers.get(i)); if (kid.getObject() == page.getCOSObject()) { cosPage = kid; } } if (cosPage == null) { throw new IOException("Illegal PDF. Page not part of parent page node."); } Set<COSObject> fields = copyAnnotations(page); boolean formAlreadyCopied = getCachedClone(srcAcroForm) != null; PDFRoot catalog = this.pdfDoc.getRoot(); PDFDictionary destAcroForm = (PDFDictionary) catalog.get(COSName.ACRO_FORM.getName()); if (formAlreadyCopied) { //skip, already copied } else if (destAcroForm == null) { if (srcAcroForm != null) { //With this, only the first PDF's AcroForm is copied over. If later AcroForms have //different properties besides the actual fields, these get lost. Only fields //get merged. Collection exclude = Collections.singletonList(COSName.FIELDS); destAcroForm = (PDFDictionary) cloneForNewDocument(srcAcroForm, srcAcroForm, exclude); } else { //Work-around for incorrectly split PDFs which lack an AcroForm but have widgets //on pages. This doesn't handle the case where field dicts have "C" entries //(for the "CO" entry), so this may produce problems, but we have almost no chance //to guess the calculation order. destAcroForm = new PDFDictionary(pdfDoc.getRoot()); } pdfDoc.registerObject(destAcroForm); catalog.put(COSName.ACRO_FORM.getName(), destAcroForm); } PDFArray clonedFields = (PDFArray) destAcroForm.get(COSName.FIELDS.getName()); if (clonedFields == null) { clonedFields = new PDFArray(); destAcroForm.put(COSName.FIELDS.getName(), clonedFields); } for (COSObject field : fields) { PDFDictionary clone = (PDFDictionary) cloneForNewDocument(field, field, Arrays.asList(COSName.KIDS)); clonedFields.add(clone); } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { try {// ww w.ja va 2 s. c om for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "source", "source", "CDATA", "annotation"); extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } catch (IOException e) { handleCatchableIOE(e); } } else if (annotation instanceof PDAnnotationWidget) { handleWidget((PDAnnotationWidget) annotation); } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { //can't currently associate link to text. //for now, extract link and repeat the link as if it //were the visible text PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null && link.trim().length() > 0) { xhtml.startElement("div", "class", "annotation"); xhtml.startElement("a", "href", link); xhtml.characters(link); xhtml.endElement("a"); xhtml.endElement("div"); } } } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? if (title != null || subject != null || contents != null) { xhtml.startElement("div", "class", "annotation"); if (title != null) { xhtml.startElement("div", "class", "annotationTitle"); xhtml.characters(title); xhtml.endElement("div"); } if (subject != null) { xhtml.startElement("div", "class", "annotationSubject"); xhtml.characters(subject); xhtml.endElement("div"); } if (contents != null) { xhtml.startElement("div", "class", "annotationContents"); xhtml.characters(contents); xhtml.endElement("div"); } xhtml.endElement("div"); } } } } if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { doOCROnCurrentPage(); } PDPageAdditionalActions pageActions = page.getActions(); if (pageActions != null) { handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE); handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN); } xhtml.endElement("div"); } catch (SAXException | TikaException e) { throw new IOExceptionWithCause("Unable to end a page", e); } catch (IOException e) { exceptions.add(e); } finally { pageIndex++; } }
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTMLPureJava.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { try {// w w w . ja v a2 s . c o m for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "source", "source", "CDATA", "annotation"); extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } catch (IOException e) { handleCatchableIOE(e); } } else if (annotation instanceof PDAnnotationWidget) { handleWidget((PDAnnotationWidget) annotation); } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { //can't currently associate link to text. //for now, extract link and repeat the link as if it //were the visible text PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null && link.trim().length() > 0) { xhtml.startElement("div", "class", "annotation"); xhtml.startElement("a", "href", link); xhtml.characters(link); xhtml.endElement("a"); xhtml.endElement("div"); } } } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? if (title != null || subject != null || contents != null) { xhtml.startElement("div", "class", "annotation"); if (title != null) { xhtml.startElement("div", "class", "annotationTitle"); xhtml.characters(title); xhtml.endElement("div"); } if (subject != null) { xhtml.startElement("div", "class", "annotationSubject"); xhtml.characters(subject); xhtml.endElement("div"); } if (contents != null) { xhtml.startElement("div", "class", "annotationContents"); xhtml.characters(contents); xhtml.endElement("div"); } xhtml.endElement("div"); } } } } PDPageAdditionalActions pageActions = page.getActions(); if (pageActions != null) { handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE); handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN); } xhtml.endElement("div"); } catch (SAXException | TikaException e) { throw new IOExceptionWithCause("Unable to end a page", e); } catch (IOException e) { exceptions.add(e); } finally { pageIndex++; } }
From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { try {//from w w w .jav a2s.c o m writeParagraphEnd(); extractImages(page.getResources()); EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { extractMultiOSPDEmbeddedFiles("", fileSpec, extractor); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null) { handler.startElement("div", "class", "annotation"); handler.startElement("a", "href", link); handler.endElement("a"); handler.endElement("div"); } } } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? if (title != null || subject != null || contents != null) { handler.startElement("div", "class", "annotation"); if (title != null) { handler.startElement("div", "class", "annotationTitle"); handler.characters(title); handler.endElement("div"); } if (subject != null) { handler.startElement("div", "class", "annotationSubject"); handler.characters(subject); handler.endElement("div"); } if (contents != null) { handler.startElement("div", "class", "annotationContents"); handler.characters(contents); handler.endElement("div"); } handler.endElement("div"); } } } } handler.endElement("div"); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a page", e); } }
From source file:org.apache.tika.parser.pdf.PDF2XHTML.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { try {//from w ww .j av a 2s. c o m writeParagraphEnd(); extractImages(page.getResources(), new HashSet<COSBase>()); EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { extractMultiOSPDEmbeddedFiles("", fileSpec, extractor); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null) { handler.startElement("div", "class", "annotation"); handler.startElement("a", "href", link); handler.endElement("a"); handler.endElement("div"); } } } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? if (title != null || subject != null || contents != null) { handler.startElement("div", "class", "annotation"); if (title != null) { handler.startElement("div", "class", "annotationTitle"); handler.characters(title); handler.endElement("div"); } if (subject != null) { handler.startElement("div", "class", "annotationSubject"); handler.characters(subject); handler.endElement("div"); } if (contents != null) { handler.startElement("div", "class", "annotationContents"); handler.characters(contents); handler.endElement("div"); } handler.endElement("div"); } } } } handler.endElement("div"); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a page", e); } page.clear(); }
From source file:org.data2semantics.annotate.D2S_SampleAnnotation.java
License:Apache License
/** * This will create a doucument showing various annotations. * /*from w ww . j av a 2 s . c o m*/ * @param args * The command line arguments. * * @throws Exception * If there is an error parsing the document. */ public static void main(String[] args) throws Exception { PDDocument document = new PDDocument(); try { PDPage page = new PDPage(); document.addPage(page); List annotations = page.getAnnotations(); // Setup some basic reusable objects/constants // Annotations themselves can only be used once! float inch = 72; PDGamma colourRed = new PDGamma(); colourRed.setR(1); PDGamma colourBlue = new PDGamma(); colourBlue.setB(1); PDGamma colourBlack = new PDGamma(); PDBorderStyleDictionary borderThick = new PDBorderStyleDictionary(); borderThick.setWidth(inch / 12); // 12th inch PDBorderStyleDictionary borderThin = new PDBorderStyleDictionary(); borderThin.setWidth(inch / 72); // 1 point PDBorderStyleDictionary borderULine = new PDBorderStyleDictionary(); borderULine.setStyle(PDBorderStyleDictionary.STYLE_UNDERLINE); borderULine.setWidth(inch / 72); // 1 point float pw = page.getMediaBox().getUpperRightX(); float ph = page.getMediaBox().getUpperRightY(); // First add some text, two lines we'll add some annotations to this // later PDFont font = PDType1Font.HELVETICA_BOLD; PDPageContentStream contentStream = new PDPageContentStream(document, page); contentStream.beginText(); contentStream.setFont(font, 18); contentStream.moveTextPositionByAmount(inch, ph - inch - 18); contentStream.drawString("PDFBox"); contentStream.moveTextPositionByAmount(0, -(inch / 2)); contentStream.drawString("Click Here"); contentStream.endText(); contentStream.close(); // Now add the markup annotation, a highlight to PDFBox text PDAnnotationTextMarkup txtMark = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT); txtMark.setColour(colourBlue); txtMark.setConstantOpacity((float) 0.2); // Make the highlight 20% // transparent // Set the rectangle containing the markup float textWidth = (font.getStringWidth("PDFBox") / 1000) * 18; PDRectangle position = new PDRectangle(); position.setLowerLeftX(inch); position.setLowerLeftY(ph - inch - 18); position.setUpperRightX(72 + textWidth); position.setUpperRightY(ph - inch); txtMark.setRectangle(position); // work out the points forming the four corners of the annotations // set out in anti clockwise form (Completely wraps the text) // OK, the below doesn't match that description. // It's what acrobat 7 does and displays properly! float[] quads = new float[8]; quads[0] = position.getLowerLeftX(); // x1 quads[1] = position.getUpperRightY() - 2; // y1 quads[2] = position.getUpperRightX(); // x2 quads[3] = quads[1]; // y2 quads[4] = quads[0]; // x3 quads[5] = position.getLowerLeftY() - 2; // y3 quads[6] = quads[2]; // x4 quads[7] = quads[5]; // y5 txtMark.setQuadPoints(quads); txtMark.setContents("Highlighted since it's important"); annotations.add(txtMark); // Now add the link annotation, so the clickme works PDAnnotationLink txtLink = new PDAnnotationLink(); txtLink.setBorderStyle(borderULine); // Set the rectangle containing the link textWidth = (font.getStringWidth("Click Here") / 1000) * 18; position = new PDRectangle(); position.setLowerLeftX(inch); position.setLowerLeftY(ph - (float) (1.5 * inch) - 20); // down a // couple of // points position.setUpperRightX(72 + textWidth); position.setUpperRightY(ph - (float) (1.5 * inch)); txtLink.setRectangle(position); // add an action PDActionURI action = new PDActionURI(); action.setURI("http://www.pdfbox.org"); txtLink.setAction(action); annotations.add(txtLink); // Now draw a few more annotations PDAnnotationSquareCircle aCircle = new PDAnnotationSquareCircle( PDAnnotationSquareCircle.SUB_TYPE_CIRCLE); aCircle.setContents("Circle Annotation"); aCircle.setInteriorColour(colourRed); // Fill in circle in red aCircle.setColour(colourBlue); // The border itself will be blue aCircle.setBorderStyle(borderThin); // Place the annotation on the page, we'll make this 1" round // 3" down, 1" in on the page position = new PDRectangle(); position.setLowerLeftX(inch); position.setLowerLeftY(ph - (3 * inch) - inch); // 1" height, 3" // down position.setUpperRightX(2 * inch); // 1" in, 1" width position.setUpperRightY(ph - (3 * inch)); // 3" down aCircle.setRectangle(position); // add to the annotations on the page annotations.add(aCircle); // Now a square annotation PDAnnotationSquareCircle aSquare = new PDAnnotationSquareCircle( PDAnnotationSquareCircle.SUB_TYPE_SQUARE); aSquare.setContents("Square Annotation"); aSquare.setColour(colourRed); // Outline in red, not setting a fill aSquare.setBorderStyle(borderThick); // Place the annotation on the page, we'll make this 1" (72points) // square // 3.5" down, 1" in from the right on the page position = new PDRectangle(); // Reuse the variable, but note it's a // new object! position.setLowerLeftX(pw - (2 * inch)); // 1" in from right, 1" // wide position.setLowerLeftY(ph - (float) (3.5 * inch) - inch); // 1" height, 3.5" // down position.setUpperRightX(pw - inch); // 1" in from right position.setUpperRightY(ph - (float) (3.5 * inch)); // 3.5" down aSquare.setRectangle(position); // add to the annotations on the page annotations.add(aSquare); // Now we want to draw a line between the two, one end with an open // arrow PDAnnotationLine aLine = new PDAnnotationLine(); aLine.setEndPointEndingStyle(PDAnnotationLine.LE_OPEN_ARROW); aLine.setContents("Circle->Square"); aLine.setCaption(true); // Make the contents a caption on the line // Set the rectangle containing the line position = new PDRectangle(); // Reuse the variable, but note it's a // new object! position.setLowerLeftX(2 * inch); // 1" in + width of circle position.setLowerLeftY(ph - (float) (3.5 * inch) - inch); // 1" height, 3.5" // down position.setUpperRightX(pw - inch - inch); // 1" in from right, and // width of square position.setUpperRightY(ph - (3 * inch)); // 3" down (top of circle) aLine.setRectangle(position); // Now set the line position itself float[] linepos = new float[4]; linepos[0] = 2 * inch; // x1 = rhs of circle linepos[1] = ph - (float) (3.5 * inch); // y1 halfway down circle linepos[2] = pw - (2 * inch); // x2 = lhs of square linepos[3] = ph - (4 * inch); // y2 halfway down square aLine.setLine(linepos); aLine.setBorderStyle(borderThick); aLine.setColour(colourBlack); // add to the annotations on the page annotations.add(aLine); // Finally all done document.save("testAnnotation.pdf"); } finally { document.close(); } }
From source file:org.nuxeo.pdf.PDFLinks.java
License:Apache License
protected void loadAndPreflightPdf() throws NuxeoException { if (pdfDoc == null) { pdfDoc = PDFUtils.load(pdfBlob, password); @SuppressWarnings("unchecked") List<PDPage> allPages = pdfDoc.getDocumentCatalog().getAllPages(); try {/*from w ww .j a v a 2 s . c o m*/ stripper = new PDFTextStripperByArea(); for (PDPage page : allPages) { List<PDAnnotation> annotations = page.getAnnotations(); for (int j = 0; j < annotations.size(); j++) { PDAnnotation annot = (PDAnnotation) annotations.get(j); if (annot instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annot; PDRectangle rect = link.getRectangle(); // need to reposition link rectangle to match text space float x = rect.getLowerLeftX(); float y = rect.getUpperRightY(); float width = rect.getWidth(); float height = rect.getHeight(); int rotation = page.findRotation(); if (rotation == 0) { PDRectangle pageSize = page.findMediaBox(); y = pageSize.getHeight() - y; } else if (rotation == 90) { // do nothing } Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height); stripper.addRegion("" + j, awtRect); } } } } catch (IOException e) { throw new NuxeoException("Cannot prefilght and prepare regions", e); } } }
From source file:org.nuxeo.pdf.PDFLinks.java
License:Apache License
@SuppressWarnings("unchecked") protected ArrayList<LinkInfo> parseForLinks(String inSubType) throws IOException { PDActionRemoteGoTo goTo;/* ww w.ja v a 2 s. c o m*/ PDActionLaunch launch; PDActionURI uri; PDFileSpecification fspec; ArrayList<LinkInfo> li = new ArrayList<LinkInfo>(); List<PDPage> allPages; allPages = pdfDoc.getDocumentCatalog().getAllPages(); int pageNum = 0; for (PDPage page : allPages) { pageNum += 1; stripper.extractRegions(page); List<PDAnnotation> annotations = page.getAnnotations(); for (int j = 0; j < annotations.size(); j++) { PDAnnotation annot = annotations.get(j); if (annot instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annot; PDAction action = link.getAction(); if (action.getSubType().equals(inSubType)) { String urlText = stripper.getTextForRegion("" + j); String urlValue = null; switch (inSubType) { case PDActionRemoteGoTo.SUB_TYPE: goTo = (PDActionRemoteGoTo) action; fspec = goTo.getFile(); urlValue = fspec.getFile(); break; case PDActionLaunch.SUB_TYPE: launch = (PDActionLaunch) action; fspec = launch.getFile(); urlValue = fspec.getFile(); break; case PDActionURI.SUB_TYPE: uri = (PDActionURI) action; urlValue = uri.getURI(); break; // . . . Others . . . } if (StringUtils.isNotBlank(urlValue)) { li.add(new LinkInfo(pageNum, inSubType, urlText, urlValue)); } } } } } return li; }
From source file:org.paxle.parser.pdf.impl.PdfParser.java
License:Open Source License
/** * A function to extract embedded URIs from the PDF-document. * //from ww w. ja va2 s . com */ protected void extractURLs(IParserDocument parserDoc, PDDocument pddDoc) throws IOException { final PDDocumentCatalog pddDocCatalog = pddDoc.getDocumentCatalog(); if (pddDocCatalog == null) return; @SuppressWarnings("unchecked") final List<PDPage> allPages = pddDocCatalog.getAllPages(); if (allPages == null || allPages.isEmpty()) return; for (int i = 0; i < allPages.size(); i++) { final PDFTextStripperByArea stripper = new PDFTextStripperByArea(); final PDPage page = (PDPage) allPages.get(i); @SuppressWarnings("unchecked") final List<PDAnnotation> annotations = page.getAnnotations(); if (annotations == null || annotations.isEmpty()) return; //first setup text extraction regions for (int j = 0; j < annotations.size(); j++) { final PDAnnotation annot = (PDAnnotation) annotations.get(j); if (annot instanceof PDAnnotationLink) { final PDAnnotationLink link = (PDAnnotationLink) annot; final PDRectangle rect = link.getRectangle(); //need to reposition link rectangle to match text space float x = rect.getLowerLeftX(); float y = rect.getUpperRightY(); float width = rect.getWidth(); float height = rect.getHeight(); int rotation = page.findRotation(); if (rotation == 0) { PDRectangle pageSize = page.findMediaBox(); y = pageSize.getHeight() - y; } else if (rotation == 90) { //do nothing } Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height); stripper.addRegion("" + j, awtRect); } } stripper.extractRegions(page); for (int j = 0; j < annotations.size(); j++) { final PDAnnotation annot = (PDAnnotation) annotations.get(j); if (annot instanceof PDAnnotationLink) { final PDAnnotationLink link = (PDAnnotationLink) annot; final PDAction action = link.getAction(); final String urlText = stripper.getTextForRegion("" + j); if (action instanceof PDActionURI) { final PDActionURI embeddedUri = (PDActionURI) action; final URI temp = URI.create(embeddedUri.getURI()); parserDoc.addReference(temp, urlText, Constants.SERVICE_PID + ":" + PID); } } } } }