de.mpg.escidoc.services.extraction.ExtractionChain.java Source code

Introduction

Here is the source code for de.mpg.escidoc.services.extraction.ExtractionChain.java
Source

/*
*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at license/ESCIDOC.LICENSE
* or http://www.escidoc.org/license.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at license/ESCIDOC.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/

/*
* Copyright 2006-2012 Fachinformationszentrum Karlsruhe Gesellschaft
* fr wissenschaftlich-technische Information mbH and Max-Planck-
* Gesellschaft zur Frderung der Wissenschaft e.V.
* All rights reserved. Use is subject to license terms.
*/

package de.mpg.escidoc.services.extraction;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Date;
import java.util.Properties;

import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;

/**
 * TODO Description
 *
 * @author franke (initial creation)
 * @author $Author$ (last modification)
 * @version $Revision$ $LastChangedDate$
 *
 */
public class ExtractionChain {
    private static final int TIKA_CONTENT_SIZE = 10 * 2048 * 2048;

    public static boolean verbose = false;

    private static Logger logger = Logger.getLogger(ExtractionChain.class);

    private String pdftotext = System.getenv("extract.pdftotext.path");
    private String pdfboxAppJar = System.getenv("extract.pdfbox-app-jar.path");

    private OutputStreamWriter outputStreamWriter;

    public enum ExtractionResult {
        OK, FAILURE;
    }

    public ExtractionChain() {
        this.pdftotext = System.getenv("extract.pdftotext.path");
        this.pdfboxAppJar = System.getenv("extract.pdfbox-app-jar.path");

        return;
    }

    // if called by Runtime.exec(cmd, envp) the following properties are got from the environment parameter. 
    // Otherwise the properties are expected to be set by the calling instance explicitly using this method.

    public void setProperties(Properties p, Logger logger) {
        if (pdftotext == null) {
            pdftotext = p.getProperty("extract.pdftotext.path");
        }
        if (pdfboxAppJar == null) {
            pdfboxAppJar = p.getProperty("extract.pdfbox-app-jar.path");
        }

        this.logger = logger;
    }

    public ExtractionResult doExtract(String infileName, String outfileName) {
        File outfile = new File(outfileName);

        Date stepStart = new Date();
        Date current;

        logger.info("Extracting PDF content ----------------------------------------");
        logger.info("Infile: " + infileName);
        logger.info("Outfile: " + outfileName);

        logger.info(stepStart + " -- started");

        // xPDF

        try {
            logger.info("Extracting with xPDF");

            StringBuffer command = new StringBuffer(2048);
            command.append(System.getProperty("os.name").contains("Windows") ? pdftotext + " -enc UTF-8 "
                    : "/usr/bin/pdftotext -enc UTF-8 ");
            command.append(infileName);
            command.append(" ");
            command.append(outfileName);

            Process proc = Runtime.getRuntime().exec(command.toString());

            StreamGobbler inputGobbler = new StreamGobbler(proc.getInputStream(), "xPDF");
            StreamGobbler errorGobbler = new StreamGobbler(proc.getErrorStream(), "xPDF");

            inputGobbler.start();
            errorGobbler.start();

            int exitCode = proc.waitFor();

            if (proc.exitValue() == 0) {

                if (verbose) {
                    BufferedReader bufferedReader = new BufferedReader(
                            new InputStreamReader(new FileInputStream(outfile), "UTF-8"));
                    String line;
                    while ((line = bufferedReader.readLine()) != null) {
                        logger.info(line);
                    }
                    bufferedReader.close();
                }
                current = new Date();
                logger.info(current + " -- finished successfully");
                logger.info("Extraction took " + (current.getTime() - stepStart.getTime()));

                return ExtractionResult.OK;
            }
        } catch (Exception e) {
            logger.warn("Error extracting PDF with xPDF:");
            logger.warn(e.getStackTrace());
        }

        current = new Date();
        logger.info(current + " -- finished unsuccessfully");
        logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime()));

        // PDFBox
        try {
            logger.info("Extracting with PDFBox");
            stepStart = new Date();

            StringBuffer command = new StringBuffer(1024);
            command.append(System.getProperty("os.name").contains("Windows")
                    ? "java -Dfile.encoding=UTF-8 -jar " + pdfboxAppJar + " ExtractText "
                    : "/usr/bin/java -Dfile.encoding=UTF-8 -jar " + pdfboxAppJar + " ExtractText ");
            command.append(infileName);
            command.append(" ");
            command.append(outfileName);

            Process proc = Runtime.getRuntime().exec(command.toString());
            StreamGobbler inputGobbler = new StreamGobbler(proc.getInputStream(), "PDFBox");
            StreamGobbler errorGobbler = new StreamGobbler(proc.getErrorStream(), "PDFBox");

            inputGobbler.start();
            errorGobbler.start();

            int exitCode = proc.waitFor();

            if (exitCode == 0) {

                if (verbose) {
                    BufferedReader bufferedReader = new BufferedReader(
                            new InputStreamReader(new FileInputStream(outfile), "UTF-8"));
                    String line;
                    while ((line = bufferedReader.readLine()) != null) {
                        logger.info(line);
                    }
                    bufferedReader.close();
                }
                current = new Date();
                logger.info(current + " -- finished successfully");
                logger.info("Extraction took " + (current.getTime() - stepStart.getTime()));

                return ExtractionResult.OK;
            }
        } catch (Exception e) {
            logger.warn("Error extracting PDF with PDFBox:");
            logger.warn(e.getStackTrace());
        }

        current = new Date();
        logger.info(current + " -- finished unsuccessfully");
        logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime()));

        // iText
        try {
            logger.info("Extracting with iText");
            stepStart = new Date();

            PdfReader reader = new PdfReader(infileName);
            int numberOfPages = reader.getNumberOfPages();

            outputStreamWriter = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
            for (int i = 0; i < numberOfPages; i++) {
                outputStreamWriter.write(PdfTextExtractor.getTextFromPage(reader, i + 1));
            }

            if (verbose) {
                BufferedReader bufferedReader = new BufferedReader(
                        new InputStreamReader(new FileInputStream(outfile), "UTF-8"));
                String line;
                while ((line = bufferedReader.readLine()) != null) {
                    logger.info(line);
                }
                bufferedReader.close();
            }

            current = new Date();
            logger.info(current + " -- finished successfully");
            logger.info("Extraction took " + (current.getTime() - stepStart.getTime()));

            return ExtractionResult.OK;

        } catch (Exception e) {
            logger.warn("Error extracting PDF with iText:", e);
        }

        // tika

        InputStream stream = null;

        try {
            logger.info("Extracting with Tika");
            stepStart = new Date();

            stream = TikaInputStream.get(new File(infileName));

            ContentHandler handler = new BodyContentHandler(TIKA_CONTENT_SIZE);

            new AutoDetectParser().parse(stream, handler, new Metadata(), new ParseContext());

            String content = handler.toString();

            FileUtils.writeStringToFile(outfile, content);

            stream.close();

            if (verbose) {
                BufferedReader bufferedReader = new BufferedReader(
                        new InputStreamReader(new FileInputStream(outfile), "UTF-8"));
                String line;
                while ((line = bufferedReader.readLine()) != null) {
                    logger.info(line);
                }
                bufferedReader.close();
            }

            current = new Date();
            logger.info(current + " -- finished successfully");
            logger.info("Extraction took " + (current.getTime() - stepStart.getTime()));

            return ExtractionResult.OK;

        } catch (Exception e) {
            logger.warn("Error extracting Tika:", e);
            try {
                stream.close();
            } catch (IOException e1) {
                e1.printStackTrace();
            }
        }

        current = new Date();
        logger.warn(current + " -- finished unsuccessfully");
        logger.info("Extraction attempt took " + (current.getTime() - stepStart.getTime()));

        logger.info("... giving up");

        return ExtractionResult.FAILURE;
    }

    class StreamGobbler extends Thread {
        InputStream is;
        String name;

        StreamGobbler(InputStream is, String name) {
            this.is = is;
            this.name = name;
        }

        public void run() {
            try {
                InputStreamReader isr = new InputStreamReader(is, "UTF-8");
                BufferedReader br = new BufferedReader(isr);
                String line = null;
                while ((line = br.readLine()) != null)
                    logger.info("[" + name + "] " + line);
            } catch (IOException ioe) {
                ioe.printStackTrace();
            }
        }
    }

    /**
     * @param args
     */
    public static void main(String[] args) {
        ExtractionResult ret = ExtractionResult.FAILURE;

        if (args.length < 2) {
            System.out.println("usage: ExctractionChain [-v[erbose]] <infile> <outfile>");
        } else {
            int offset = 0;

            if ("-v".equals(args[0]) || "-verbose".equals(args[0])) {
                offset = 1;
                verbose = true;
            }
            ExtractionChain exctractionChain = new ExtractionChain();
            ret = exctractionChain.doExtract(args[0 + offset], args[1 + offset]);

            System.out.println("ret in main " + ret);
        }

        System.exit(ret.ordinal());
    }
}