org.xwiki.test.misc.PDFTest.java Source code

Introduction

Here is the source code for org.xwiki.test.misc.PDFTest.java
Source

/*
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package org.xwiki.test.misc;

import java.awt.geom.Rectangle2D;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionGoTo;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageXYZDestination;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.pdfbox.tools.PDFText2HTML;

import junit.framework.TestCase;

/**
 * Functional tests for the PDF export feature. Each test downloads an exported PDF over HTTP from the XWiki instance
 * addressed by {@link #XWIKI_BIN_URL} and inspects the result with PDFBox.
 */
public class PDFTest extends TestCase {
    /** Common prefix of every XWiki URL requested or expected by these tests. */
    private static final String XWIKI_BIN_URL = "http://localhost:8080/xwiki/bin/";

    /** Label and file name of the Sandbox.WebHome attachment link that is looked up in the exported PDF. */
    private static final String LOGO_FILE_NAME = "XWikiLogo.png";

    /**
     * Verify that the PDF export feature works on a single simple page by downloading the PDF and parsing it using
     * PDFBox.
     * 
     * @see "XWIKI-7048: PDF export templates can display properties of other objects if the XWiki.PDFClass object is
     *      missing"
     */
    public void testExportSingleSimplePageAsPDF() throws Exception {
        // We're using Dashboard.WebHome page because it has objects of type XWiki.GadgetClass and they have a title
        // property which was mistaken with the title property of XWiki.PDFClass before XWIKI-7048 was fixed. The gadget
        // title contains Velocity code that isn't wrapped in a Velocity macro so it is printed as is if not rendered in
        // the right context.
        String text = getPDFContent(new URL(XWIKI_BIN_URL + "export/Dashboard/WebHome?format=pdf"));
        // Note: This is the title of the Pages gadget when it's working
        assertTrue("Invalid content", text.contains("Pages"));
        // Note: This is the title of the Pages gadget before XWIKI-7048 was fixed
        assertFalse("Invalid content", text.contains("$services.localization.render("));
    }

    /**
     * Verify that we can export content having links to attachments.
     * 
     * @see "XWIKI-8978: PDF Export does not handle XWiki links to attached files properly"
     */
    public void testExportContentWithAttachmentLink() throws Exception {
        URL pdfExportURL = new URL(XWIKI_BIN_URL + "export/Sandbox/WebHome?format=pdf");
        Map<String, String> urls = extractURLs(pdfExportURL);
        assertTrue("No link labelled [" + LOGO_FILE_NAME + "] found, link labels were " + urls.keySet(),
                urls.containsKey(LOGO_FILE_NAME));
        assertEquals(XWIKI_BIN_URL + "download/Sandbox/WebHome/" + LOGO_FILE_NAME, urls.get(LOGO_FILE_NAME));

        // Ideally we should be asserting for a value of 1 (for the embedded XWikiLogo.png image) but it seems the PDF
        // contains 2 image objects (for some reason I don't understand ATM - they seem to be variations of the same
        // image - the logo - in color, in black and white, etc).
        assertEquals(2, getImages(pdfExportURL).size());
    }

    /**
     * Verify the PDF export with table of contents.
     * 
     * @see "XWIKI-9370: PDF Export doesn't list the Table of Contents under certain circumstances"
     */
    public void testTableOfContents() throws Exception {
        Map<String, String> internalLinks = extractToLinks(
                new URL(XWIKI_BIN_URL + "export/Sandbox/WebHome?format=pdf&pdftoc=1&attachments=1&pdfcover=0"), 0);
        // Make sure we have a Table of Contents.
        assertTrue("Table of contents entry [Mixed list] not found, entries were " + internalLinks.keySet(),
                internalLinks.containsKey("Mixed list"));
        // Make sure the Table of Contents links point to their corresponding heading.
        for (Map.Entry<String, String> entry : internalLinks.entrySet()) {
            assertTrue("Table of contents entry [" + entry.getKey() + "] points to [" + entry.getValue() + "]",
                    entry.getValue().contains(entry.getKey()));
        }
    }

    /**
     * Downloads the PDF served at the given URL and extracts its content with {@link PDFText2HTML}.
     * 
     * @param url the URL to download the PDF from
     * @return the output produced by {@link PDFText2HTML} for the whole document
     * @throws Exception if the download or the PDF parsing fails
     */
    private String getPDFContent(URL url) throws Exception {
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        // Both resources are declared in the try header so that the HTTP stream is closed even when the PDF cannot be
        // loaded. They are closed in reverse order: the document first, then the stream.
        try (InputStream is = connection.getInputStream(); PDDocument pdd = PDDocument.load(is)) {
            return new PDFText2HTML().getText(pdd);
        }
    }

    /**
     * Collects the image XObjects declared in the resources of every page of the PDF served at the given URL.
     * 
     * @param url the URL to download the PDF from
     * @return the images keyed by resource name; since the name is the key, images declared under the same name on
     *         several pages end up as a single entry
     * @throws Exception if the download or the PDF parsing fails
     */
    private Map<String, PDImageXObject> getImages(URL url) throws Exception {
        Map<String, PDImageXObject> results = new HashMap<>();
        try (PDDocument document = loadDocument(url)) {
            for (PDPage page : document.getDocumentCatalog().getPages()) {
                PDResources pdResources = page.getResources();
                if (pdResources == null) {
                    // A page without any resource dictionary cannot declare images.
                    continue;
                }
                for (COSName name : pdResources.getXObjectNames()) {
                    if (pdResources.isImageXObject(name)) {
                        results.put(name.getName(), (PDImageXObject) pdResources.getXObject(name));
                    }
                }
            }
        }
        return results;
    }

    /**
     * Extracts the external (URI) links found on all the pages of the PDF served at the given URL.
     * 
     * @param url the URL to download the PDF from
     * @return the target URI of each URI link, keyed by the text found in the link area
     * @throws Exception if the download or the PDF parsing fails
     */
    private Map<String, String> extractURLs(URL url) throws Exception {
        Map<String, String> urls = new HashMap<>();
        try (PDDocument document = loadDocument(url)) {
            for (Map.Entry<String, PDAction> entry : extractLinks(document).entrySet()) {
                if (entry.getValue() instanceof PDActionURI) {
                    urls.put(entry.getKey(), ((PDActionURI) entry.getValue()).getURI());
                }
            }
        }
        return urls;
    }

    /**
     * Extracts the internal (GoTo) links found on the table of contents page of the PDF served at the given URL.
     * 
     * @param url the URL to download the PDF from
     * @param tocPageIndex the index, as understood by the document's page tree, of the page holding the table of
     *            contents
     * @return for each GoTo link, the text describing its destination (see
     *         {@link #getDestinationText(PDDestination)}), keyed by the text found in the link area
     * @throws Exception if the download or the PDF parsing fails
     */
    private Map<String, String> extractToLinks(URL url, int tocPageIndex) throws Exception {
        Map<String, String> internalLinks = new HashMap<>();
        try (PDDocument document = loadDocument(url)) {
            PDPage tocPage = document.getDocumentCatalog().getPages().get(tocPageIndex);
            for (Map.Entry<String, PDAction> entry : extractLinks(tocPage).entrySet()) {
                if (entry.getValue() instanceof PDActionGoTo) {
                    PDActionGoTo anchor = (PDActionGoTo) entry.getValue();
                    internalLinks.put(entry.getKey(), getDestinationText(anchor.getDestination()));
                }
            }
        }
        return internalLinks;
    }

    /**
     * Downloads the whole PDF served at the given URL and parses it.
     * 
     * @param url the URL to download the PDF from
     * @return the parsed document, which the caller is responsible for closing
     * @throws Exception if the download or the PDF parsing fails
     */
    private PDDocument loadDocument(URL url) throws Exception {
        return PDDocument.load(IOUtils.toByteArray(url));
    }

    /**
     * Extracts the link annotations of all the pages of the given document.
     * 
     * @param document the document to look for links in
     * @return the action of each link, keyed by the text found in the link area; when several links share the same
     *         text only one of them is kept
     * @throws Exception if the annotations or the text cannot be read
     */
    private Map<String, PDAction> extractLinks(PDDocument document) throws Exception {
        Map<String, PDAction> links = new HashMap<>();
        for (PDPage page : document.getDocumentCatalog().getPages()) {
            links.putAll(extractLinks(page));
        }
        return links;
    }

    /**
     * Extracts the link annotations of the given page. The label of a link is the text found inside the rectangle of
     * its annotation.
     * <p>
     * Code adapted from http://www.docjar.com/html/api/org/apache/pdfbox/examples/pdmodel/PrintURLs.java.html
     * 
     * @param page the page to look for links in
     * @return the action of each link (as returned by {@link PDAnnotationLink#getAction()}), keyed by the trimmed text
     *         found in the link area; when several links share the same text the last one wins
     * @throws Exception if the annotations or the text cannot be read
     */
    private Map<String, PDAction> extractLinks(PDPage page) throws Exception {
        Map<String, PDAction> links = new HashMap<>();
        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        List<PDAnnotation> annotations = page.getAnnotations();
        // The Y axis is flipped only for non-rotated pages; for any other rotation the annotation rectangle is used
        // as is.
        boolean flipY = page.getRotation() == 0;
        float pageHeight = page.getMediaBox().getHeight();

        // First setup the text extraction regions, one per link, named after the index of the annotation.
        for (int j = 0; j < annotations.size(); j++) {
            PDAnnotation annotation = annotations.get(j);
            if (annotation instanceof PDAnnotationLink) {
                PDRectangle rect = ((PDAnnotationLink) annotation).getRectangle();
                // Need to reposition link rectangle to match text space.
                float x = rect.getLowerLeftX();
                float y = rect.getUpperRightY();
                if (flipY) {
                    y = pageHeight - y;
                }
                stripper.addRegion(String.valueOf(j),
                        new Rectangle2D.Float(x, y, rect.getWidth(), rect.getHeight()));
            }
        }

        stripper.extractRegions(page);

        // Then associate the text extracted from each region with the action of the corresponding link.
        for (int j = 0; j < annotations.size(); j++) {
            PDAnnotation annotation = annotations.get(j);
            if (annotation instanceof PDAnnotationLink) {
                String label = stripper.getTextForRegion(String.valueOf(j)).trim();
                links.put(label, ((PDAnnotationLink) annotation).getAction());
            }
        }

        return links;
    }

    /**
     * Computes a textual description of the given destination.
     * 
     * @param destination the destination of an internal link, possibly {@code null}
     * @return the text found below the destination for an XYZ destination, {@code "Page "} followed by the page number
     *         for any other page destination, the string representation of the destination for any other type and
     *         the empty string when there is no destination
     * @throws Exception if the text of the target page cannot be read
     */
    private String getDestinationText(PDDestination destination) throws Exception {
        if (destination == null) {
            // A link without destination has no text to offer; let the caller's assertion report it.
            return "";
        }
        if (destination instanceof PDPageXYZDestination) {
            return getDestinationText((PDPageXYZDestination) destination);
        } else if (destination instanceof PDPageDestination) {
            return "Page " + ((PDPageDestination) destination).getPageNumber();
        }
        return destination.toString();
    }

    /**
     * Extracts the text located below the given destination, on the page the destination points to.
     * 
     * @param destination the destination to extract the text for
     * @return the trimmed text found in the area computed by
     *         {@link #getRectangleBelowDestination(PDPageXYZDestination)}
     * @throws Exception if the text of the target page cannot be read
     */
    private String getDestinationText(PDPageXYZDestination destination) throws Exception {
        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        stripper.addRegion("destination", getRectangleBelowDestination(destination));
        stripper.extractRegions(destination.getPage());
        return stripper.getTextForRegion("destination").trim();
    }

    /**
     * Computes the text extraction area associated with the given destination.
     * 
     * @param destination the destination to compute the area for
     * @return a rectangle whose origin is (destination left, page height - destination top), as wide as the page and
     *         as tall as the destination top coordinate
     */
    private Rectangle2D getRectangleBelowDestination(PDPageXYZDestination destination) {
        PDPage page = destination.getPage();
        PDRectangle pageSize = page.getMediaBox();
        float x = destination.getLeft();
        float y = pageSize.getHeight() - destination.getTop();
        float width = pageSize.getWidth();
        float height = destination.getTop();
        return new Rectangle2D.Float(x, y, width, height);
    }
}