uk.bl.dpt.qa.flint.wrappers.iTextWrapper.java Source code

Java tutorial

Introduction

Here is the source code for uk.bl.dpt.qa.flint.wrappers.iTextWrapper.java.

Source

/*
 * Copyright 2014 The British Library/SCAPE Project Consortium
 * Author: William Palmer (William.Palmer@bl.uk)
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package uk.bl.dpt.qa.flint.wrappers;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

import com.itextpdf.text.ExceptionConverter;
import com.itextpdf.text.exceptions.BadPasswordException;
import com.itextpdf.text.exceptions.InvalidPdfException;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class to wrap iText for the purposes of extracting text from PDFs 
 * Initial testing seems to show PDFBox/Calibre doing a better job
 * <p>
 * All methods report failure through their boolean return value and the
 * class logger rather than by throwing checked exceptions.
 * @author wpalmer
 *
 */
public class iTextWrapper {

    private static final Logger LOGGER = LoggerFactory.getLogger(iTextWrapper.class);

    /**
     * Creates a new wrapper; the wrapper itself holds no per-instance state.
     */
    public iTextWrapper() {
    }

    /**
     * Extracts text from a PDF, writing the text of each page (one
     * {@code println} per page) to the output file.
     * <p>
     * If a page fails with an iText {@link ExceptionConverter}, a line of the
     * form {@code "iText Exception: Page N: ..."} is written in place of that
     * page's text, extraction continues with the next page, and the method
     * returns false.
     * <p>
     * Note: the output is written with {@link FileWriter}, i.e. using the
     * platform default character encoding.
     * @param pFile input file
     * @param pOutput output file
     * @param pOverwrite whether or not to overwrite an existing output file
     * @return true if converted ok, otherwise false (including when the output
     *         file already exists and pOverwrite is false, when the PDF cannot
     *         be read, or when writing the output fails)
     */
    public boolean extractTextFromPDF(File pFile, File pOutput, boolean pOverwrite) {
        if (pOutput.exists() && !pOverwrite) {
            return false;
        }

        boolean ret = true;

        PrintWriter pw = null;
        PdfReader reader = null;

        try {
            pw = new PrintWriter(new FileWriter(pOutput));
            reader = new PdfReader(pFile.getAbsolutePath());
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            final int pageCount = reader.getNumberOfPages();
            //page numbers start at 1
            for (int page = 1; page <= pageCount; page++) {
                try {
                    TextExtractionStrategy strategy =
                            parser.processContent(page, new SimpleTextExtractionStrategy());
                    //write text out to file
                    pw.println(strategy.getResultantText());
                } catch (ExceptionConverter e) {
                    LOGGER.warn("iText exception on page {} of {}", page, pFile.getName(), e);
                    ret = false;
                    pw.println("iText Exception: Page " + page + ": " + e.getClass().getName() + ": "
                            + e.getMessage());
                }
            }
            // PrintWriter never throws on write errors; it only records them.
            // Without this check a failed write would still be reported as success.
            if (pw.checkError()) {
                LOGGER.warn("Error writing extracted text to {}", pOutput.getAbsolutePath());
                ret = false;
            }
        } catch (IOException e) {
            ret = false;
            LOGGER.warn("IOException while extracting text from {}", pFile.getName(), e);
        } finally {
            if (pw != null) {
                pw.close();
            }
            if (reader != null) {
                reader.close();
            }
        }

        return ret;
    }

    /**
     * Check if a PDF file is valid or not. The file is considered valid when
     * iText can open it and extract text from every page without throwing.
     * @param pFile file to check
     * @return whether the file is valid or not
     */
    public boolean isValid(File pFile) {

        boolean ret = false;

        PdfReader reader = null;
        try {
            reader = new PdfReader(pFile.getAbsolutePath());
            final int pageCount = reader.getNumberOfPages();
            LOGGER.debug("validating through {} pages of {}", pageCount, pFile.getName());
            //page numbers start at 1
            for (int page = 1; page <= pageCount; page++) {
                PdfTextExtractor.getTextFromPage(reader, page);
            }
            ret = true;
        } catch (BadPasswordException e) {
            //actually an error??? - for now the file is reported as not valid
            LOGGER.debug("BadPasswordException for {}, reporting as not valid", pFile.getName(), e);
        } catch (InvalidPdfException e) {
            // message fills the placeholder; the trailing exception gives the stack trace
            LOGGER.warn("InvalidPdfException leads to invalidity: {}", e.getMessage(), e);
        } catch (IOException e) {
            LOGGER.warn("IOException leads to invalidity: {}", e.getMessage(), e);
        } catch (Exception e) {
            // deliberately broad: any failure while parsing counts as "not valid"
            LOGGER.warn("Exception leads to invalidity: {}", e.getMessage(), e);
        } finally {
            if (reader != null) {
                reader.close();
            }
        }

        return ret;
    }

    /**
     * Check if a PDF file has DRM or not. A file is reported as having DRM
     * when iText reports it as encrypted, or when opening it fails with a
     * {@link BadPasswordException}.
     * @param pFile file to check
     * @return whether the file has DRM or not; false if the file could not be
     *         read for any other reason
     */
    public boolean hasDRM(File pFile) {

        boolean drm = false;

        PdfReader reader = null;
        try {
            reader = new PdfReader(pFile.getAbsolutePath());
            drm = reader.isEncrypted();
        } catch (BadPasswordException e) {
            //assume drm
            drm = true;
        } catch (IOException e) {
            // unreadable file: cannot tell, so report no DRM but leave a trace in the log
            LOGGER.warn("IOException while checking {} for DRM: {}", pFile.getName(), e.getMessage(), e);
        } catch (Exception e) {
            LOGGER.warn("Exception while checking {} for DRM: {}", pFile.getName(), e.getMessage(), e);
        } finally {
            if (reader != null) {
                reader.close();
            }
        }

        return drm;
    }

}