net.bookinaction.TextInfoExtractor.java Source code

Introduction

Here is the source code for net.bookinaction.TextInfoExtractor.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.bookinaction;

import net.bookinaction.listener.ImageLocationListener;
import net.bookinaction.listener.TextLocationListener;

import net.bookinaction.model.StripLine;
import net.bookinaction.model.StripString;
import net.bookinaction.model.StripperParam;
import net.bookinaction.model.Token;
import net.bookinaction.utils.*;
import net.bookinaction.analyzer.PatternAnalyzer;
import net.bookinaction.analyzer.SimpleTokenizer;

import javafx.scene.control.ProgressIndicator;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;

import java.awt.geom.Rectangle2D;
import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
 * Records where text lines, tokens and images sit on the pages of a PDF document.
 *
 * <p>For each page the page size, every text line, every token within a line and every image
 * rectangle is handed to a {@code Stamper} together with a {@link PrintWriter}. In test mode the
 * corresponding bounding boxes are additionally drawn onto the page, and
 * {@link #doTextPosition(String, String, StripperParam)} saves the annotated document next to the
 * source file with a {@code -new.pdf} suffix.
 */
public final class TextInfoExtractor {

    /** Path of the coordinate output file; {@code null} when none was configured. */
    private String textCoordFile;

    /**
     * Patterns passed, one by one, to {@code PatternAnalyzer.getTokensByPattern} for every line;
     * the tokens they yield are recorded in addition to those of the {@code SimpleTokenizer}.
     *
     * <p>NOTE(review): several character classes below look empty or truncated (for example
     * {@code []} and {@code [-]}); presumably the original symbols were lost to an encoding
     * problem. Verify these literals against the original source before relying on them.
     */
    static String[] circles_patterns = { "([]{1})", "([]{1})", "([-]{1})", // "([-]{1})",
            "([-?]{1})", "([?-]{1})", "([-]{1})", "([-]{1})", "([-]{1})", "([0-9]+)" };

    /**
     * Creates an extractor without a coordinate output file. With such an instance
     * {@link #doTextPosition(String, StripperParam)} does nothing.
     */
    public TextInfoExtractor() {
    }

    /**
     * Creates an extractor that writes its records to the given coordinate file.
     *
     * @param textCoordFile path of the coordinate output file
     */
    public TextInfoExtractor(String textCoordFile) {
        this.textCoordFile = textCoordFile;
    }

    /**
     * Processes {@code source} using the coordinate file given at construction time. Returns
     * without doing anything when no coordinate file was configured.
     *
     * @param source path of the PDF document to read
     * @param sParam parameters forwarded to the text location listener
     * @throws IOException if the document cannot be read, annotated or saved
     */
    public void doTextPosition(String source, StripperParam sParam) throws IOException {
        if (this.textCoordFile == null) {
            return;
        }
        doTextPosition(source, this.textCoordFile, sParam);
    }

    /**
     * Extracts the text and image locations of one page and records them through a
     * {@code Stamper}.
     *
     * <p>If the text stripper fails with an {@link IOException} the page is skipped: a message is
     * printed to {@code System.err} and nothing is recorded for that page.
     *
     * @param document      the loaded PDF document
     * @param stripperParam parameters forwarded to the text location listener
     * @param pageNum       1-based page number
     * @param writer        writer handed to the {@code Stamper} record methods
     * @param testMode      when {@code true}, bounding boxes are also drawn onto the page
     * @throws IOException if the page content cannot be processed or drawn
     */
    public static void getTextPositionFromPage(PDDocument document, StripperParam stripperParam, int pageNum,
            PrintWriter writer, boolean testMode) throws IOException {

        PDPage page = document.getPage(pageNum - 1); // pdfbox uses the 0-base index
        PDRectangle cropBox = page.getCropBox();

        // collect the image rectangles of the page
        ImageLocationListener imageLocationsListener = new ImageLocationListener();
        List<Rectangle2D> imageRects = new ArrayList<Rectangle2D>();
        imageLocationsListener.setImageRects(imageRects);
        imageLocationsListener.processPage(page);

        // collect the text of the page; the listener fills stripString
        StripString stripString = new StripString();
        TextLocationListener stripper = new TextLocationListener(stripperParam, stripString);
        stripper.setSortByPosition(true);
        stripper.setStartPage(pageNum);
        stripper.setEndPage(pageNum);

        try {
            // the written text itself is discarded; only the listener's side effects are wanted
            stripper.writeText(document, new OutputStreamWriter(new ByteArrayOutputStream()));
        } catch (IOException e) {
            // best effort: skip this page, but say so instead of failing silently
            System.err.println("Skipping page " + pageNum + ": text extraction failed: " + e);
            return;
        }

        // NOTE(review): writeText was already restricted to this page; confirm that this
        // additional processPage call is required and does not visit the page a second time.
        if (page.getContents() != null) {
            stripper.processPage(page);
        }

        // declare canvas and keep this position
        PDPageContentStream canvas = new PDPageContentStream(document, page, true, true, true);
        try {
            Stamper s = new Stamper(); // utility class

            if (testMode) {
                // draw the bounding box of each character
                for (int i = 0; i < stripString.size(); i++) {
                    s.showBox(canvas, stripString.boundingRect(i), cropBox, Color.GRAY80);
                }
            }

            s.recordPageSize(writer, pageNum, cropBox);
            recordLines(s, canvas, writer, stripString, cropBox, pageNum, testMode);
            recordImages(s, canvas, writer, imageRects, cropBox, pageNum, testMode);
        } finally {
            // release the content stream even when stamping or recording fails
            canvas.close();
        }
    }

    /**
     * Splits the page text on {@code "\r"} and records every non-empty line and its tokens.
     */
    private static void recordLines(Stamper s, PDPageContentStream canvas, PrintWriter writer,
            StripString stripString, PDRectangle cropBox, int pageNum, boolean testMode) throws IOException {

        SimpleTokenizer simpleTokenizer = new SimpleTokenizer();
        int lineStart = 0;

        for (String lineText : stripString.toString().split("\r")) {

            if (lineText.length() > 0) {
                int lineEnd = lineStart + lineText.length();

                Rectangle2D lineRect = stripString.boundingRect(lineStart, lineEnd - 1);
                String sub = stripString.substring(lineStart, lineEnd);

                if (testMode) {
                    s.showBox(canvas, lineRect, cropBox, Color.GREEN);
                }
                s.recordTextPosition(writer, sub, pageNum, lineRect, "LINE");

                recordTokens(s, canvas, writer, stripString, simpleTokenizer, sub, lineStart, cropBox, pageNum,
                        testMode);
            }

            // Always step over the segment and its separator, including empty segments;
            // otherwise every offset after a blank line would be one character short.
            lineStart += lineText.length() + 1;
        }
    }

    /**
     * Records the tokens of one line: those of the tokenizer plus those matched by
     * {@link #circles_patterns}. Token offsets are relative to the line and are shifted by
     * {@code lineStart} to address the page text.
     */
    private static void recordTokens(Stamper s, PDPageContentStream canvas, PrintWriter writer,
            StripString stripString, SimpleTokenizer simpleTokenizer, String sub, int lineStart,
            PDRectangle cropBox, int pageNum, boolean testMode) throws IOException {

        List<Token> tokens = simpleTokenizer.getTokens(sub);
        for (String pattern : circles_patterns) {
            tokens.addAll(PatternAnalyzer.getTokensByPattern(sub, pattern));
        }

        for (Token t : tokens) {
            Rectangle2D tokenRect = stripString.boundingRect(lineStart + t.getStart(), lineStart + t.getEnd() - 1);

            s.recordTextPosition(writer, t.getStem(), pageNum, tokenRect, "TEXT");

            if (testMode) {
                s.showBox(canvas, tokenRect, cropBox, Color.RED);
            }
        }
    }

    /**
     * Records every image rectangle of the page under the label {@code [imageN]}, N counting
     * from 1.
     */
    private static void recordImages(Stamper s, PDPageContentStream canvas, PrintWriter writer,
            List<Rectangle2D> imageRects, PDRectangle cropBox, int pageNum, boolean testMode) throws IOException {

        int imageNum = 1;
        for (Rectangle2D imRect : imageRects) {
            if (testMode) {
                s.showBox(canvas, imRect, cropBox, Color.YELLOW);
            }
            s.recordTextPosition(writer, "[image" + imageNum + "]", pageNum, imRect, "IMAGE");
            imageNum++;
        }
    }

    /**
     * Processes every page of {@code source} in test mode, writes the records to
     * {@code coord_text} and saves the annotated document as {@code <source without
     * extension>-new.pdf}.
     *
     * <p>The document and the writer are closed even when processing fails; the annotated
     * document is only saved when all pages were processed without an exception.
     *
     * @param source        path of the PDF document to read
     * @param coord_text    path of the coordinate output file
     * @param stripperParam parameters forwarded to the text location listener
     * @throws IOException if the document cannot be read, annotated or saved
     */
    public void doTextPosition(String source, String coord_text, StripperParam stripperParam) throws IOException {

        String newFile = annotatedPdfPath(source);

        PDDocument document = PDDocument.load(new File(source));
        try {
            PrintWriter writer = new PrintWriter(new File(coord_text));
            try {
                int pageCount = document.getNumberOfPages();
                for (int pageNum = 1; pageNum <= pageCount; pageNum++) {
                    getTextPositionFromPage(document, stripperParam, pageNum, writer, true);
                }
            } finally {
                writer.close();
            }
            document.save(newFile);
        } finally {
            document.close();
        }
    }

    /**
     * Derives the output path by replacing the final extension of the file name with
     * {@code -new.pdf}. Dots in directory names and a leading dot of the file name are left
     * alone; a name without extension simply gets the suffix appended.
     */
    private static String annotatedPdfPath(String source) {
        int lastSeparator = Math.max(source.lastIndexOf('/'), source.lastIndexOf('\\'));
        int lastDot = source.lastIndexOf('.');
        // the dot must belong to the file name and must not be its first character
        String base = (lastDot > lastSeparator + 1) ? source.substring(0, lastDot) : source;
        return base + "-new.pdf";
    }

}