disko.PDFDocument.java Source code

Java tutorial

Introduction

Here is the source code for disko.PDFDocument.java

Source

/*******************************************************************************
 * Copyright (c) 2005, Kobrix Software, Inc.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser Public License v2.1
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
 * 
 * Contributors:
 *     Borislav Iordanov - initial API and implementation
 *     Murilo Saraiva de Queiroz - initial API and implementation
 ******************************************************************************/
package disko;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.lang.ref.WeakReference;
import java.net.URL;
import java.util.Locale;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDPage;
import org.pdfbox.util.PDFTextStripper;

public class PDFDocument extends DefaultTextDocument {
    public static final String PAGE_ANN = "page";

    private static Log log = LogFactory.getLog(PDFDocument.class);

    protected synchronized String load() {
        final StringWriter stringOutput = new StringWriter();
        PDDocument document = null;
        try {
            document = PDDocument.load(ContentDownloader.getInstance().getInputStream(getUrlString()));
            title = document.getDocumentInformation().getTitle();
            PDFTextStripper stripper = new PDFTextStripper() {
                int startPage = 0;

                private int getIndex() {
                    return stringOutput.getBuffer().length();
                }

                @Override
                protected void startPage(PDPage page) throws IOException {
                    startPage = getIndex();
                    log.debug("START PAGE " + getCurrentPageNo() + " AT " + startPage);
                }

                @Override
                protected void endPage(PDPage page) throws IOException {
                    int endPage = getIndex();
                    DocumentAnn ann = new DocumentAnn(startPage, endPage, PAGE_ANN);
                    annotations.add(ann);
                    log.debug("END PAGE " + getCurrentPageNo() + " AT " + endPage);
                }

            };
            //            stripper.setSortByPosition(false);
            //            String maxPageSetting = System.getProperty("disco.pdf.max.pages", 
            //                                             Integer.toString(Integer.MAX_VALUE));
            //            stripper.setEndPage(Integer.parseInt(maxPageSetting));
            stripper.writeText(document, stringOutput);
        } catch (Throwable t) {
            throw new RuntimeException("Unable to read resource at URL '" + url + "'", t);
        } finally {
            if (stringOutput != null) {
                try {
                    stringOutput.close();
                } catch (IOException e) {
                }
            }
            if (document != null) {
                try {
                    document.close();
                } catch (IOException e) {
                }
            }
        }

        String s = DU.replaceUnicodePunctuation(stringOutput.toString());
        ParagraphDetector.detectParagraphs(s, this.annotations);

        return (fullText = new WeakReference<String>(s)).get();
    }

    public PDFDocument() {
    }

    public PDFDocument(URL url) {
        super(url);
    }

    public PDFDocument(File f) {
        super(f);
    }

    public static void main(String[] args) {
        File dir = new File("/var/tmp/muriloq/mdc/selected");

        File[] pdfFiles = dir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.toLowerCase().endsWith(".pdf");
            }
        });

        for (File f : pdfFiles) {
            try {
                System.out.println(f.getName());
                PDFDocument pdf = new PDFDocument(f);
                pdf.getFullText();
                System.out.println(pdf.getFullText());
            } catch (Throwable t) {
                t.printStackTrace();
            }
        }
    }

}