ome.services.fulltext.PdfParser.java Source code

Java tutorial

Introduction

Here is the source code for ome.services.fulltext.PdfParser.java

Source

/*
 *   $Id$
 *
 *   Copyright 2008 Glencoe Software, Inc. All rights reserved.
 *   Use is subject to license terms supplied in LICENSE.txt
 */
package ome.services.fulltext;

import java.io.File;
import java.io.IOException;
import java.io.PipedReader;
import java.io.PipedWriter;
import java.io.Reader;

import ome.services.messages.RegisterServiceCleanupMessage;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

/**
 * {@link FileParser} implementation for "application/pdf" files. The text
 * extraction itself is performed by <a href="http://pdfbox.org/">PDFBox</a>
 * on a separate {@link PdfThread}; this class only wires that thread up and
 * hands its reader back to the caller.
 * 
 * @author Josh Moore, josh at glencoesoftware.com
 */
public class PdfParser extends FileParser {

    private final static Log log = LogFactory.getLog(PdfParser.class);

    /**
     * Creates a {@link PdfThread} for the given file, publishes a
     * {@link RegisterServiceCleanupMessage} whose {@code close()} closes that
     * thread, starts the thread and returns its reader passed through
     * {@code wrap}.
     * 
     * @param file
     *            the PDF file to be parsed
     * @return the result of {@code wrap} applied to the thread's reader
     * @throws Exception
     *             if the thread (and its pipe) cannot be created or the
     *             event cannot be published
     */
    @Override
    public Iterable<Reader> doParse(File file) throws Exception {

        final PdfThread worker = new PdfThread(file);

        // The cleanup hook is published before the worker is started, so the
        // thread is never running without a registered way to close it.
        final RegisterServiceCleanupMessage cleanup = new RegisterServiceCleanupMessage(
                this, worker) {
            @Override
            public void close() {
                try {
                    worker.close();
                } catch (Exception e) {
                    log.warn("Error closing PdfThread " + worker, e);
                }
            }
        };
        this.context.publishEvent(cleanup);

        final Reader pipeEnd = worker.getReader();
        worker.start();
        return wrap(pipeEnd);
    }
}

/**
 * Worker thread which loads a PDF file with PDFBox and writes its text to a
 * {@link PipedWriter}. The connected {@link PipedReader} is available through
 * {@link #getReader()} so that the text can be consumed while it is being
 * produced.
 * <p>
 * {@link #run()} always finishes by calling {@link #close()}, whether the
 * file could not be loaded, is encrypted (in which case no text is written),
 * or was processed normally. {@link #close()} may additionally be called from
 * outside this thread.
 */
class PdfThread extends Thread {

    private final static Log log = LogFactory.getLog(PdfThread.class);

    final File file;
    final PipedWriter writer;
    final PipedReader reader;

    // Assigned by run() on this thread but read by close(), which can be
    // invoked from another thread; volatile makes the assignment visible.
    volatile PDDocument document = null;

    /**
     * Creates the thread together with a connected reader/writer pipe.
     * 
     * @param file
     *            the PDF file whose text is to be extracted
     * @throws IOException
     *             if the piped writer cannot be connected to the reader
     */
    PdfThread(File file) throws IOException {
        this.file = file;
        reader = new PipedReader();
        writer = new PipedWriter(reader);
    }

    /**
     * @return the reading end of the pipe which {@link #run()} writes to
     */
    Reader getReader() {
        return reader;
    }

    /**
     * Loads the PDF and, unless it is encrypted, writes its text to the
     * pipe. Failures are logged rather than propagated. The writer and the
     * document are closed on every exit path so that the reading side is not
     * left waiting on a writer that will never be closed.
     */
    @Override
    public void run() {
        try {
            PDDocument loaded;
            try {
                loaded = PDDocument.load(file);
            } catch (IOException e) {
                log.warn("Could not load Pdf " + file, e);
                return; // the finally block below still closes the writer
            }
            document = loaded;

            // Encrypted documents are skipped: nothing is written for them.
            if (loaded != null && !loaded.isEncrypted()) {
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.writeText(loaded, writer);
            }
        } catch (IOException e) {
            log.warn("Error reading pdf file", e);
        } finally {
            close();
        }
    }

    /**
     * Closes the writing end of the pipe and, if one has been loaded, the
     * PDF document. Errors are logged and never thrown. The method is called
     * at the end of {@link #run()} and may be called again by external
     * cleanup code.
     */
    public void close() {
        try {
            writer.close();
        } catch (Exception e) {
            log.warn("Error closing writer", e);
        }

        // Read the field once so the null check and the close() call operate
        // on the same reference.
        final PDDocument doc = document;
        if (doc != null) {
            try {
                doc.close();
            } catch (Exception e) {
                log.warn("Error closing PDF document", e);
            }
        }

    }

}