com.odc.pdfextractor.parser.CleanPdfParser.java Source code

Java tutorial

Introduction

Here is the source code for com.odc.pdfextractor.parser.CleanPdfParser.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.odc.pdfextractor.parser;

import org.apache.pdfbox.exceptions.InvalidPasswordException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;

import com.odc.pdfextractor.comparator.TopToBottomComparator;
import com.odc.pdfextractor.model.CharacterLocation;
import com.odc.pdfextractor.model.DocumentLocation;
import com.odc.pdfextractor.model.builder.DocumentBuilder;

import java.io.IOException;

import java.util.List;

/**
 * This is an example on how to get some x/y coordinates of text.
 *
 * Usage: java org.apache.pdfbox.examples.util.PrintTextLocations <input-pdf>
 *
 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
 * @version $Revision: 1.7 $
 */
public class CleanPdfParser extends PDFTextStripper implements PdfParser {
    private DocumentBuilder docBuilder = new DocumentBuilder(3, new TopToBottomComparator());

    /**
     * Default constructor.
     *
     * @throws IOException If there is an error loading text stripper properties.
     */
    public CleanPdfParser() throws IOException {
        super.setSortByPosition(false);
    }

    /**
     * This will print the documents docBuilder.
     *
     * @param args The command line arguments.
     *
     * @throws Exception If there is an error parsing the document.
     */
    public DocumentLocation processPdf(String filename) throws Exception {

        PDDocument document = null;
        try {
            document = PDDocument.load(filename);
            if (document.isEncrypted()) {
                try {
                    document.decrypt("");
                } catch (InvalidPasswordException e) {
                    System.err.println("Error: Document is encrypted with a password.");
                    System.exit(1);
                }
            }
            List allPages = document.getDocumentCatalog().getAllPages();
            System.out.print("Extracting text from PDF");
            for (int i = 0; i < allPages.size(); i++) {
                PDPage page = (PDPage) allPages.get(i);
                System.out.print(".");
                PDStream contents = page.getContents();
                if (contents != null) {
                    this.processStream(page, page.findResources(), page.getContents().getStream());
                }
                docBuilder.incrementPage();
            }
        } finally {
            System.out.println();
            if (document != null) {
                document.close();
            }
        }
        return docBuilder.getDoc();
    }

    /**
     * A method provided as an event interface to allow a subclass to perform
     * some specific functionality when text needs to be processed.
     *
     * @param docBuilder.text The text to be processed
     */
    protected void processTextPosition(TextPosition textPos) {
        char character = textPos.getCharacter().charAt(0);
        int x = Math.round(textPos.getX());
        int width = Math.round(textPos.getWidth());
        int y = Math.round(textPos.getY());
        int height = Math.round(textPos.getHeight());
        int bottom = y + height;
        int right = x + width;
        CharacterLocation charLoc = new CharacterLocation(x, right, y, bottom, docBuilder.getPage(), character);
        docBuilder.addCharacter(charLoc);
    }

    /**
     * This will print the usage for this document.
     */
    private static void usage() {
        System.err.println("Usage: java org.apache.pdfbox.examples.pdmodel.PrintTextLocations <input-pdf>");
    }

}