org.paxle.parser.pdf.impl.PdfParser.java Source code

Introduction

Here is the source code for org.paxle.parser.pdf.impl.PdfParser.java
Source

/**
 * This file is part of the Paxle project.
 * Visit http://www.paxle.net for more information.
 * Copyright 2007-2010 the original author or authors.
 *
 * Licensed under the terms of the Common Public License 1.0 ("CPL 1.0").
 * Any use, reproduction or distribution of this program constitutes the recipient's acceptance of this agreement.
 * The full license text is available under http://www.opensource.org/licenses/cpl1.0.txt
 * or in the file LICENSE.txt in the root directory of the Paxle distribution.
 *
 * Unless required by applicable law or agreed to in writing, this software is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

package org.paxle.parser.pdf.impl;

import java.awt.geom.Rectangle2D;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.URI;
import java.util.Arrays;
import java.util.Calendar;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.PDFTextStripperByArea;
import org.osgi.framework.Constants;
import org.paxle.core.doc.ICommandProfile;
import org.paxle.core.doc.IParserDocument;
import org.paxle.parser.IParserContext;
import org.paxle.parser.IParserContextLocal;
import org.paxle.parser.ISubParser;
import org.paxle.parser.ParserException;

/**
 * {@link ISubParser} implementation for PDF documents, based on PDFBox.
 * <p>
 * The component is registered for the mime-type <code>application/pdf</code>.
 * For each document it extracts the metadata, the plain text, the URIs of link
 * annotations and the content of embedded files (the latter by delegating to
 * the sub-parser registered for the embedded file's mime-type).
 */
@Component(name = PdfParser.PID, metatype = false)
@Service(ISubParser.class)
@Property(name = ISubParser.PROP_MIMETYPES, value = { "application/pdf" })
public class PdfParser implements ISubParser {
    static final String PID = "org.paxle.parser.pdf.impl.PdfParser";

    /**
     * Name of the command-profile property that may carry the password
     * required to open an encrypted document.
     */
    private static final String PROP_DECRYPTION_PASSWORD = "org.paxle.parser.pdf.impl.decryptionPassword";

    /**
     * Regular expression matching one or more keyword separators
     * (comma, semicolon or whitespace).
     */
    private static final String KEYWORD_SEPARATORS = "[,;\\s]+";

    /**
     * For logging
     */
    private final Log logger = LogFactory.getLog(this.getClass());

    @Reference
    protected IParserContextLocal contextLocal;

    /**
     * Parses a PDF document read from the given stream.
     * <p>
     * If the document is encrypted and cannot be opened, or its permissions do
     * not allow content extraction, the returned document carries the status
     * {@link IParserDocument.Status#FAILURE} instead of any content.
     * This method does not close <code>fileIn</code> itself.
     *
     * @param location the location of the document; used for log messages and
     *        passed on to the sub-parsers of embedded files
     * @param charset not used by this implementation
     * @param fileIn the stream to read the PDF data from
     * @return the parser document created via the current parser context
     * @throws ParserException wraps every error raised while parsing
     */
    public IParserDocument parse(URI location, String charset, InputStream fileIn)
            throws ParserException, UnsupportedEncodingException, IOException {
        PDDocument pddDoc = null;

        try {
            final IParserContext pc = this.contextLocal.getCurrentContext();
            final ICommandProfile cmdProfile = pc.getCommandProfile();

            // create an empty document
            final IParserDocument parserDoc = pc.createDocument();

            // parse it
            final PDFParser parser = new PDFParser(fileIn);
            parser.parse();
            pddDoc = parser.getPDDocument();

            // check document encryption; the failure status is already set if decryption fails
            if (pddDoc.isEncrypted() && !this.decrypt(location, cmdProfile, parserDoc, pddDoc)) {
                return parserDoc;
            }

            // extract metadata
            this.extractMetaData(parserDoc, pddDoc);

            // extract text
            final PDFTextStripper stripper = new PDFTextStripper();

            // XXX: we could limit the amount of parsed pages via crawling-profile properties?
            // stripper.setStartPage(startPageValue);
            // stripper.setEndPage(endPageValue);

            final Writer pdocWriter = parserDoc.getTextWriter();
            stripper.writeText(pddDoc, pdocWriter);
            pdocWriter.flush();

            // extracting URIs
            this.extractURLs(parserDoc, pddDoc);

            // extracting embedded files
            this.extractEmbeddedFiles(location, parserDoc, pddDoc);

            parserDoc.setStatus(IParserDocument.Status.OK);
            return parserDoc;
        } catch (Throwable e) {
            throw new ParserException("Error parsing pdf document. " + e.getMessage(), e);
        } finally {
            if (pddDoc != null) {
                try {
                    pddDoc.close();
                } catch (Exception e) {
                    // pass the exception as cause so that the stack trace is not lost
                    this.logger.error(String.format("Unable to close PDF document '%s'.", location), e);
                }
            }
        }
    }

    /**
     * Tries to open an encrypted document and checks whether its content may
     * be extracted. On failure the status of <code>parserDoc</code> is set to
     * {@link IParserDocument.Status#FAILURE}.
     *
     * @return <code>true</code> if the document was opened and content
     *         extraction is permitted, <code>false</code> otherwise
     */
    private boolean decrypt(URI location, ICommandProfile cmdProfile, IParserDocument parserDoc,
            PDDocument pddDoc) {
        if (this.logger.isDebugEnabled()) {
            this.logger.debug(String.format("Document '%s' is encrypted.", location));
        }

        // determine the decryption password
        final String pwd = this.getDecryptionPassword(cmdProfile);

        // try to open document with the given password
        try {
            pddDoc.openProtection(new StandardDecryptionMaterial(pwd));

            final AccessPermission accessPermission = pddDoc.getCurrentAccessPermission();
            if (accessPermission != null && accessPermission.canExtractContent()) {
                return true;
            }

            // guard and log call now use the same level
            if (this.logger.isDebugEnabled()) {
                this.logger.debug(String.format("No permission to extract content of document '%s'.", location));
            }
            parserDoc.setStatus(IParserDocument.Status.FAILURE, "PDF Document is encrypted.");
            return false;
        } catch (Throwable e) {
            this.logger.error(String.format("Unable to decrypt document '%s'.", location), e);
            parserDoc.setStatus(IParserDocument.Status.FAILURE,
                    String.format("Unable to decrypt document. %s: %s", e.getClass().getName(), e.getMessage()));
            return false;
        }
    }

    /**
     * Reads the decryption password from the command profile.
     *
     * @return the configured password, or the empty string if no profile is
     *         available or the property does not hold a string
     */
    private String getDecryptionPassword(ICommandProfile cmdProfile) {
        if (cmdProfile != null) {
            final Object value = cmdProfile.getProperty(PROP_DECRYPTION_PASSWORD);
            if (value instanceof String) {
                return (String) value;
            }
        }
        return "";
    }

    /**
     * A function to extract metadata from the PDF-document.
     * <p>
     * Title, author, subject (stored as summary), keywords and the last
     * modification date are copied to <code>parserDoc</code>; values that are
     * missing or empty are skipped.
     */
    protected void extractMetaData(IParserDocument parserDoc, PDDocument pddDoc) throws IOException {
        final PDDocumentInformation metadata = pddDoc.getDocumentInformation();
        if (metadata == null) {
            return;
        }

        // document title
        final String title = metadata.getTitle();
        if (title != null && title.length() > 0) {
            parserDoc.setTitle(title);
        }

        // document author(s)
        final String author = metadata.getAuthor();
        if (author != null && author.length() > 0) {
            parserDoc.setAuthor(author);
        }

        // subject
        final String summary = metadata.getSubject();
        if (summary != null && summary.length() > 0) {
            parserDoc.setSummary(summary);
        }

        // keywords
        final String keywords = metadata.getKeywords();
        if (keywords != null) {
            // String.split() only discards trailing empty tokens, therefore
            // leading separators are removed up front to avoid an empty keyword
            final String trimmed = keywords.replaceFirst("^" + KEYWORD_SEPARATORS, "");
            if (trimmed.length() > 0) {
                parserDoc.setKeywords(Arrays.asList(trimmed.split(KEYWORD_SEPARATORS)));
            }
        }

        // last modification date
        final Calendar lastMod = metadata.getModificationDate();
        if (lastMod != null) {
            parserDoc.setLastChanged(lastMod.getTime());
        }
    }

    /**
     * A function to extract embedded URIs from the PDF-document.
     * <p>
     * For every link annotation with an URI action the URI is added as
     * reference to <code>parserDoc</code>. The text located inside the
     * rectangle of the annotation is used as name of the reference. Links with
     * a missing or malformed URI are skipped.
     */
    protected void extractURLs(IParserDocument parserDoc, PDDocument pddDoc) throws IOException {
        final PDDocumentCatalog pddDocCatalog = pddDoc.getDocumentCatalog();
        if (pddDocCatalog == null) {
            return;
        }

        @SuppressWarnings("unchecked")
        final List<PDPage> allPages = pddDocCatalog.getAllPages();
        if (allPages == null || allPages.isEmpty()) {
            return;
        }

        for (final PDPage page : allPages) {
            if (page == null) {
                continue;
            }

            @SuppressWarnings("unchecked")
            final List<PDAnnotation> annotations = page.getAnnotations();
            if (annotations == null || annotations.isEmpty()) {
                // a page without annotations must not stop the scan of the remaining pages
                continue;
            }

            final PDFTextStripperByArea stripper = new PDFTextStripperByArea();

            // remembers for which annotation index a text region was registered
            final boolean[] hasRegion = new boolean[annotations.size()];
            boolean anyRegion = false;

            // first setup text extraction regions
            for (int j = 0; j < annotations.size(); j++) {
                final PDAnnotation annot = annotations.get(j);
                if (!(annot instanceof PDAnnotationLink)) {
                    continue;
                }

                final PDRectangle rect = ((PDAnnotationLink) annot).getRectangle();
                if (rect == null) {
                    // annotation without rectangle: there is no area to read the link text from
                    continue;
                }

                // need to reposition link rectangle to match text space
                final float x = rect.getLowerLeftX();
                float y = rect.getUpperRightY();
                final float width = rect.getWidth();
                final float height = rect.getHeight();
                if (page.findRotation() == 0) {
                    final PDRectangle pageSize = page.findMediaBox();
                    y = pageSize.getHeight() - y;
                }
                // other rotations: the rectangle is used as is

                stripper.addRegion(String.valueOf(j), new Rectangle2D.Float(x, y, width, height));
                hasRegion[j] = true;
                anyRegion = true;
            }

            // processing the page is only needed if there is a region to fill
            if (anyRegion) {
                stripper.extractRegions(page);
            }

            for (int j = 0; j < annotations.size(); j++) {
                final PDAnnotation annot = annotations.get(j);
                if (!(annot instanceof PDAnnotationLink)) {
                    continue;
                }

                final PDAction action = ((PDAnnotationLink) annot).getAction();
                if (!(action instanceof PDActionURI)) {
                    continue;
                }

                final URI uri = this.toURI(((PDActionURI) action).getURI());
                if (uri == null) {
                    continue;
                }

                final String urlText = hasRegion[j] ? stripper.getTextForRegion(String.valueOf(j)) : "";
                parserDoc.addReference(uri, urlText, Constants.SERVICE_PID + ":" + PID);
            }
        }
    }

    /**
     * Converts the URI string of a link action into an {@link URI}.
     *
     * @return the parsed URI, or <code>null</code> if the string is
     *         <code>null</code> or not a valid URI
     */
    private URI toURI(String uriString) {
        if (uriString == null) {
            return null;
        }

        try {
            return URI.create(uriString);
        } catch (IllegalArgumentException e) {
            // a single broken link must not abort the parsing of the whole document
            this.logger.warn(String.format("Ignoring malformed link URI '%s': %s", uriString, e.getMessage()));
            return null;
        }
    }

    /**
     * A function to extract the content of embedded files from a PDF document.
     * <p>
     * Each embedded file with a known mime-type is parsed by the sub-parser
     * that the current parser context returns for this mime-type. The result
     * is added as sub-document to <code>parserDoc</code>. Files without
     * mime-type or without a suitable parser are skipped with a warning.
     */
    protected void extractEmbeddedFiles(URI location, IParserDocument parserDoc, PDDocument pddDoc)
            throws IOException {
        final PDDocumentCatalog pddDocCatalog = pddDoc.getDocumentCatalog();
        if (pddDocCatalog == null) {
            return;
        }

        final PDDocumentNameDictionary nameDic = pddDocCatalog.getNames();
        if (nameDic == null) {
            return;
        }

        final PDEmbeddedFilesNameTreeNode embeddedFiles = nameDic.getEmbeddedFiles();
        if (embeddedFiles == null) {
            return;
        }

        @SuppressWarnings("unchecked")
        final Map<String, Object> names = embeddedFiles.getNames();
        if (names == null || names.isEmpty()) {
            return;
        }

        final IParserContext context = this.contextLocal.getCurrentContext();

        for (final Object fileObj : names.values()) {
            // null values fail the instanceof check as well
            if (!(fileObj instanceof PDComplexFileSpecification)) {
                continue;
            }

            final PDComplexFileSpecification embeddedFileSpec = (PDComplexFileSpecification) fileObj;
            final String fileName = embeddedFileSpec.getFile();

            final PDEmbeddedFile embeddedFile = embeddedFileSpec.getEmbeddedFile();
            if (embeddedFile == null) {
                this.logger.warn(String.format("No content found for embedded file '%s#%s'.", location, fileName));
                continue;
            }

            // getting the embedded file mime-type
            final String fileMimeType = embeddedFile.getSubtype();
            if (fileMimeType == null) {
                this.logger.warn(String.format("No mime-type specified form embedded file '%s#%s'.", location,
                        fileName));
                continue;
            }

            // getting a parser to parse the content
            final ISubParser sp = context.getParser(fileMimeType);
            if (sp == null) {
                this.logger.warn(String.format("No parser found to parse embedded file '%s#%s' with type '%s'.",
                        location, fileName, fileMimeType));
                continue;
            }

            // parsing content
            InputStream embeddedFileStream = null;
            try {
                embeddedFileStream = embeddedFile.createInputStream();
                final IParserDocument subParserDoc = sp.parse(location, "UTF-8", embeddedFileStream);
                if (subParserDoc.getMimeType() == null) {
                    subParserDoc.setMimeType(fileMimeType);
                }

                parserDoc.addSubDocument(fileName, subParserDoc);
            } catch (ParserException e) {
                // a broken embedded file is logged (including its cause) but does not fail the parent document
                this.logger.error(String.format(
                        "Unexpected error while parsing parse embedded file '%s#%s' with type '%s': %s",
                        location, fileName, fileMimeType, e.getMessage()), e);
            } finally {
                if (embeddedFileStream != null) {
                    try {
                        embeddedFileStream.close();
                    } catch (Exception e) {
                        this.logger.error(String.format("Unable to close stream of embedded file '%s#%s'.",
                                location, fileName), e);
                    }
                }
            }
        }
    }

    /**
     * Parses a PDF document stored in a file.
     * <p>
     * The file is opened as buffered stream, passed to
     * {@link #parse(URI, String, InputStream)} and closed afterwards.
     *
     * @param location the location of the document
     * @param charset not used by this implementation
     * @param content the file containing the PDF data
     * @return the parser document
     */
    public IParserDocument parse(URI location, String charset, File content)
            throws ParserException, UnsupportedEncodingException, IOException {
        InputStream fileIn = null;
        try {
            // open file
            fileIn = new BufferedInputStream(new FileInputStream(content));
            return parse(location, charset, fileIn);
        } finally {
            if (fileIn != null) {
                try {
                    fileIn.close();
                } catch (Exception e) {
                    this.logger.error(String.format("Unable to close file '%s'.", content), e);
                }
            }
        }
    }

}