edu.uwm.jiaoduan.lab.ExtractTextByArea.java Source code

Java tutorial

Introduction

Here is the source code for edu.uwm.jiaoduan.lab.ExtractTextByArea.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.uwm.jiaoduan.lab;

import org.apache.pdfbox.exceptions.InvalidPasswordException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.util.PDFTextStripperByArea;
import org.apache.pdfbox.util.TextPosition;

import java.awt.Rectangle;
import java.awt.geom.Rectangle2D;

import java.util.List;

/**
 * Example showing how to extract the text that lies inside a rectangular area of the
 * first page of a PDF document.
 *
 * The area is given as an annotation-style rectangle (left, bottom, right, top) and is
 * converted to the rectangle form expected by {@link PDFTextStripperByArea} before
 * extraction.
 *
 * Usage: java edu.uwm.jiaoduan.lab.ExtractTextByArea &lt;input-pdf&gt;
 *
 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
 * @version $Revision: 1.2 $
 */
public class ExtractTextByArea {
    /** Name under which the single extraction region is registered with the stripper. */
    private static final String REGION_NAME = "class1";

    /**
     * Sample area to extract, as { left, bottom, right, top }.
     * NOTE(review): presumably taken from an XFDF annotation "rect" attribute (the original
     * comment says "convert xfdf coordinate to rectangle") - confirm against the data source.
     */
    private static final double[] SAMPLE_AREA = { 58.50615, 500.847504, 302.919073, 552.419312 };

    private ExtractTextByArea() {
        //utility class and should not be constructed.
    }

    /**
     * This will print the documents text in a certain area.
     *
     * Prints the usage message when the argument count is not exactly one. Terminates the
     * JVM with status 1 when the document cannot be processed (password protected, or it
     * has no pages).
     *
     * @param args The command line arguments; exactly one is expected, the path of the PDF.
     *
     * @throws Exception If there is an error parsing the document.
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 1) {
            usage();
            return;
        }
        // Exit only after extract() has returned, so its finally block has already
        // closed the document; System.exit() inside the try would skip that cleanup.
        int status = extract(args[0]);
        if (status != 0) {
            System.exit(status);
        }
    }

    /**
     * Loads the PDF, extracts the text inside {@link #SAMPLE_AREA} on the first page and
     * prints it to standard output. The document is always closed before returning.
     *
     * @param pdfPath Path of the PDF file to read.
     *
     * @return 0 on success, 1 when the document could not be processed.
     *
     * @throws Exception If there is an error parsing the document.
     */
    private static int extract(String pdfPath) throws Exception {
        PDDocument document = null;
        try {
            document = PDDocument.load(pdfPath);
            if (document.isEncrypted()) {
                try {
                    // Try the empty user password; many "encrypted" PDFs only restrict permissions.
                    document.decrypt("");
                } catch (InvalidPasswordException e) {
                    System.err.println("Error: Document is encrypted with a password.");
                    return 1;
                }
            }

            List<?> allPages = document.getDocumentCatalog().getAllPages();
            if (allPages.isEmpty()) {
                System.err.println("Error: Document contains no pages.");
                return 1;
            }
            PDPage firstPage = (PDPage) allPages.get(0);

            // findMediaBox() also looks at parent page-tree nodes, so a page that
            // inherits its MediaBox does not yield null here.
            double pageHeight = firstPage.findMediaBox().getHeight();
            Rectangle2D.Double rect = toRegion(SAMPLE_AREA, pageHeight);
            System.out.println(rect);

            PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            stripper.setSortByPosition(true);
            stripper.addRegion(REGION_NAME, rect);
            stripper.extractRegions(firstPage);

            System.out.println("Text in the area:" + rect);
            System.out.println(stripper.getTextForRegion(REGION_NAME));
            return 0;
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }

    /**
     * Converts a { left, bottom, right, top } rectangle into an x/y/width/height rectangle
     * whose y is measured downwards from the top of the page.
     * NOTE(review): assumes the input is measured upwards from the bottom edge of the page
     * and that the page box starts at y = 0 - confirm for pages with an offset MediaBox.
     *
     * @param coords The area as { left, bottom, right, top }.
     * @param pageHeight The height of the page the area lies on.
     *
     * @return The equivalent rectangle with a top-based y coordinate.
     */
    private static Rectangle2D.Double toRegion(double[] coords, double pageHeight) {
        double width = coords[2] - coords[0];
        double height = coords[3] - coords[1];
        // Flip the vertical axis: the rectangle's top edge becomes its y origin.
        double y = pageHeight - coords[1] - height;
        return new Rectangle2D.Double(coords[0], y, width, height);
    }

    /**
     * This will print the usage for this document.
     */
    private static void usage() {
        System.err.println("Usage: java " + ExtractTextByArea.class.getName() + " <input-pdf>");
    }

}