List of usage examples for org.apache.pdfbox.pdmodel PDPage getAnnotations
public List<PDAnnotation> getAnnotations() throws IOException
From source file:org.pdfmetamodifier.IOHelper.java
License:Apache License
/** * Save all Attached (embedded) files to some directory. * /*from ww w .ja v a2 s . c om*/ * @param pdfFile * Source PDF file. * @param outputDir * Target directory. * @throws IOException */ /* * See: * https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java?view=markup */ public static void saveAttachments(final File pdfFile, final File outputDir) throws IOException { PDDocument document = null; try { // Read PDF file. document = PDDocument.load(pdfFile); if (document.isEncrypted()) { throw new IOException("Document is encrypted."); } // Extract Embedded (attached) files. final PDDocumentNameDictionary documentNameDictionary = new PDDocumentNameDictionary( document.getDocumentCatalog()); final PDEmbeddedFilesNameTreeNode embeddedFilesNameTree = documentNameDictionary.getEmbeddedFiles(); if (embeddedFilesNameTree != null) { extractFiles(outputDir, embeddedFilesNameTree.getNames()); final List<PDNameTreeNode<PDComplexFileSpecification>> kids = embeddedFilesNameTree.getKids(); if (kids != null) { for (PDNameTreeNode<PDComplexFileSpecification> nameTreeNode : kids) { extractFiles(outputDir, nameTreeNode.getNames()); } } } // Extract Embedded (attached) from annotations. for (PDPage page : document.getPages()) { for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { final PDAnnotationFileAttachment fileAttach = (PDAnnotationFileAttachment) annotation; final PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fileAttach .getFile(); extractFile(outputDir, fileSpec); } } } } finally { if (document != null) { document.close(); } } }
From source file:org.xwiki.test.misc.PDFTest.java
License:Open Source License
/** * Code adapted from http://www.docjar.com/html/api/org/apache/pdfbox/examples/pdmodel/PrintURLs.java.html */// w ww .j a v a 2 s. c o m private Map<String, PDAction> extractLinks(PDPage page) throws Exception { Map<String, PDAction> links = new HashMap<String, PDAction>(); PDFTextStripperByArea stripper = new PDFTextStripperByArea(); List<PDAnnotation> annotations = page.getAnnotations(); // First setup the text extraction regions. for (int j = 0; j < annotations.size(); j++) { PDAnnotation annotation = annotations.get(j); if (annotation instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annotation; PDRectangle rect = link.getRectangle(); // Need to reposition link rectangle to match text space. float x = rect.getLowerLeftX(); float y = rect.getUpperRightY(); float width = rect.getWidth(); float height = rect.getHeight(); int rotation = page.getRotation(); if (rotation == 0) { PDRectangle pageSize = page.getMediaBox(); y = pageSize.getHeight() - y; } else if (rotation == 90) { // Do nothing. } Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height); stripper.addRegion(String.valueOf(j), awtRect); } } stripper.extractRegions(page); for (int j = 0; j < annotations.size(); j++) { PDAnnotation annotation = annotations.get(j); if (annotation instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annotation; String label = stripper.getTextForRegion(String.valueOf(j)).trim(); links.put(label, link.getAction()); } } return links; }
From source file:uk.ac.leeds.ccg.andyt.rdl.web.RDL_ParsePDF.java
/** * https://svn.apache.org/viewvc/pdfbox/trunk/examples/ Based on * https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/PrintURLs.java?view=markup&pathrev=1703066 * * @param f/*from w w w.j ava 2s . c o m*/ * @param filter * @param fis * @return * @throws IOException * @throws TikaException * @throws SAXException */ public static ArrayList<String[]> parseForLinks(File f, String filter, FileInputStream fis) throws IOException, TikaException, SAXException { ArrayList<String[]> result; result = new ArrayList<String[]>(); PDDocument doc = PDDocument.load(f); int pageNum = 0; for (PDPage page : doc.getPages()) { pageNum++; // if (pageNum == 11) { //Degug test hack System.out.println("Parsing page " + pageNum); PDFTextStripperByArea stripper = new PDFTextStripperByArea(); List<PDAnnotation> annotations = page.getAnnotations(); //first setup text extraction regions for (int j = 0; j < annotations.size(); j++) { PDAnnotation annot = annotations.get(j); if (annot instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annot; PDRectangle rect = link.getRectangle(); //need to reposition link rectangle to match text space float x = rect.getLowerLeftX(); float y = rect.getUpperRightY(); float width = rect.getWidth(); float height = rect.getHeight(); int rotation = page.getRotation(); if (rotation == 0) { PDRectangle pageSize = page.getMediaBox(); y = pageSize.getHeight() - y; } else if (rotation == 90) { //do nothing } //Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height); // Rounding here could be a problem! 
Rectangle2D.Double awtRect = new Rectangle2D.Double(x, y, width, height); stripper.addRegion("" + j, awtRect); } } stripper.extractRegions(page); for (int j = 0; j < annotations.size(); j++) { PDAnnotation annot = annotations.get(j); if (annot instanceof PDAnnotationLink) { PDAnnotationLink link = (PDAnnotationLink) annot; PDAction action = link.getAction(); if (action == null) { System.out.println(link.getContents()); System.out.println(annot.getClass().getName()); System.out.println(annot.getAnnotationName()); //System.out.println(annot.getNormalAppearanceStream().toString()); System.out.println(annot.getContents()); System.out.println(annot.getSubtype()); } else { String urlText = stripper.getTextForRegion("" + j); if (action instanceof PDActionURI) { PDActionURI uri = (PDActionURI) action; String url; url = uri.getURI(); if (url.contains(filter)) { String[] partResult; partResult = new String[3]; partResult[0] = "Page " + pageNum; partResult[1] = "urlText " + urlText; partResult[2] = "URL " + uri.getURI(); System.out.println(partResult[0]); System.out.println(partResult[1]); System.out.println(partResult[2]); System.out.println("URL " + uri.getURI()); result.add(partResult); } else { System.out.println("URL " + uri.getURI()); } } else { System.out.println(action.getType()); } } } else { System.out.println(annot.getClass().getName()); System.out.println(annot.getAnnotationName()); System.out.println(annot.getContents()); System.out.println(annot.getSubtype()); } } //} } // PDDocument doc = PDDocument.load(f); // int pageNum = 0; // for (PDPage page : doc.getPages()) { // pageNum++; // List<PDAnnotation> annotations = page.getAnnotations(); // // for (PDAnnotation annotation : annotations) { // PDAnnotation annot = annotation; // if (annot instanceof PDAnnotationLink) { // PDAnnotationLink link = (PDAnnotationLink) annot; // PDAction action = link.getAction(); // if (action instanceof PDActionURI) { // PDActionURI uri = (PDActionURI) action; // String oldURI = 
uri.getURI(); // String name = annot.getAnnotationName(); // String contents = annot.getContents(); // PDAppearanceStream a = annot.getNormalAppearanceStream(); // //String newURI = "http://pdfbox.apache.org"; // System.out.println(oldURI + " " + name + " " + contents); // //uri.setURI(newURI); // } // } // } // } // result = parseWithTika(fis); //XMPSchema schema; //schema = new XMPSchema(); //List<String> XMPBagOrSeqList; //XMPBagOrSeqList = getXMPBagOrSeqList(XMPSchema schema, String name) { // PDDocument tPDDocument; // tPDDocument = PDDocument.load(f); // COSDocument tCOSDocument; // tCOSDocument = tPDDocument.getDocument(); // String header; // header = tCOSDocument.getHeaderString(); // System.out.println(header); // PDDocumentCatalog tPDDocumentCatalog; // tPDDocumentCatalog = tPDDocument.getDocumentCatalog(); // PDDocumentNameDictionary tPDDocumentNameDictionary; // tPDDocumentNameDictionary = tPDDocumentCatalog.getNames(); // COSDictionary tCOSDictionary; // tCOSDictionary = tPDDocumentNameDictionary.getCOSDictionary(); //tCOSDictionary. // PDPageNode tPDPageNode; // tPDPageNode = tPDDocumentCatalog.getPages(); // List<COSObject> tCOSObjects; // tCOSObjects = tCOSDocument.getObjects(); // int n; // n = tCOSObjects.size(); // System.out.println(n); // COSObject aCOSObject; // String s; // for (int i = 0; i < n; i++) { // aCOSObject = tCOSObjects.get(i); // s = aCOSObject.toString(); // System.out.println(s); // } // XMPMetadata tXMPMetadata; // tXMPMetadata = getXMPMetadata(tPDDocument); // Document XMPDocument; // XMPDocument = tXMPMetadata.getXMPDocument(); // Node n; // n = XMPDocument.getFirstChild(); // parseNode(n); return result; }
From source file:uk.bl.wa.tika.parser.pdf.pdfbox.PDF2XHTML.java
License:Apache License
@Override protected void endPage(PDPage page) throws IOException { try {/*w ww . j a v a 2 s .c o m*/ writeParagraphEnd(); // TODO: remove once PDFBOX-1143 is fixed: if (extractAnnotationText) { for (Object o : page.getAnnotations()) { if ((o instanceof PDAnnotation) && PDAnnotationMarkup.SUB_TYPE_FREETEXT.equals(((PDAnnotation) o).getSubtype())) { // It's a text annotation: PDAnnotationMarkup annot = (PDAnnotationMarkup) o; String title = annot.getTitlePopup(); String subject = annot.getTitlePopup(); String contents = annot.getContents(); // TODO: maybe also annot.getRichContents()? if (title != null || subject != null || contents != null) { handler.startElement("div", "class", "annotation"); if (title != null) { handler.startElement("div", "class", "annotationTitle"); handler.characters(title); handler.endElement("div"); } if (subject != null) { handler.startElement("div", "class", "annotationSubject"); handler.characters(subject); handler.endElement("div"); } if (contents != null) { handler.startElement("div", "class", "annotationContents"); handler.characters(contents); handler.endElement("div"); } handler.endElement("div"); } } } } handler.endElement("div"); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a page", e); } }
From source file:vortext.TextHighlight.java
License:Apache License
/**
 * Adds a text markup annotation for every match of a pattern on the pages between the
 * configured start page and end page (bounded by the number of pages in the document).
 * <p>
 * For each match that has at least one text bounding box, a new annotation of the given
 * subtype is created: its rectangle is the first bounding box, its quad points are
 * derived from all bounding boxes, and its contents are the matched string. The
 * annotation is appended to the page's annotation list and to the returned list.
 *
 * @param pattern the pattern handed to the text aggregate for matching
 * @param subType the subtype passed to each new {@code PDAnnotationTextMarkup}
 * @return all annotations created by this call, in creation order
 * @throws IOException declared by the method; presumably raised by the PDF or matching calls
 * @throws IllegalArgumentException if the text aggregate or the document is {@code null}
 */
@SuppressWarnings("unchecked")
public List<PDAnnotationTextMarkup> highlight(final Pattern pattern, final String subType)
        throws IOException {
    if (document == null || textAggregate == null) {
        throw new IllegalArgumentException("TextAggregate was not initilized");
    }

    final List<PDPage> allPages = document.getDocumentCatalog().getAllPages();
    final ArrayList<PDAnnotationTextMarkup> created = new ArrayList<PDAnnotationTextMarkup>();

    // Page numbers are 1-based; list indices are 0-based.
    int index = getStartPage() - 1;
    while (index < getEndPage() && index < allPages.size()) {
        final PDPage currentPage = allPages.get(index);
        final List<PDAnnotation> pageAnnotations = currentPage.getAnnotations();

        for (final Match hit : textAggregate.match(index + 1, pattern)) {
            final List<PDRectangle> boxes = getTextBoundingBoxes(hit.positions);
            if (boxes.isEmpty()) {
                // Nothing to anchor the annotation to.
                continue;
            }

            final PDAnnotationTextMarkup markup = new PDAnnotationTextMarkup(subType);
            markup.setRectangle(boxes.get(0));
            markup.setQuadPoints(this.getQuads(boxes));
            markup.setContents(hit.str);

            pageAnnotations.add(markup);
            created.add(markup);
        }
        index++;
    }
    return created;
}