net.timendum.pdf.Images2HTML.java Source code

Java tutorial

Introduction

Here is the source code for net.timendum.pdf.Images2HTML.java

Source

/*
   This file is part of pdf2html.
    
pdf2html is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
    
pdf2html is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License
along with pdf2html.  If not, see <http://www.gnu.org/licenses/>.
    
Copyright by timendum.
Based on org.apache.pdfbox.examples.util.PrintImageLocations by Apache Software Foundation
 */
package net.timendum.pdf;

import java.awt.geom.AffineTransform;
import java.io.File;
import java.io.IOException;
import java.util.AbstractMap;
import java.util.AbstractMap.SimpleEntry;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import net.timendum.pdf.beans.Image;

import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDGraphicsState;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectForm;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.PDFOperator;
import org.apache.pdfbox.util.PDFStreamEngine;
import org.apache.pdfbox.util.ResourceLoader;

import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;

public class Images2HTML extends PDFStreamEngine {

    private static final String INVOKE_OPERATOR = "Do";

    public Images2HTML() throws IOException {
        super(ResourceLoader.loadProperties("org/apache/pdfbox/resources/PDFTextStripper.properties", true));
    }

    public void processDocument(PDDocument document) throws IOException {
        List allPages = document.getDocumentCatalog().getAllPages();
        for (int i = 0; i < allPages.size(); i++) {
            PDPage page = (PDPage) allPages.get(i);
            processStream(page, page.findResources(), page.getContents().getStream());
        }
    }

    private Table<PDPage, Float, Image> images = HashBasedTable.create();
    private boolean lazyImages = false;
    private File basePath = null;
    private String relativePath = null;
    private String prefix = "image";

    public void setLazyImages(boolean lazyImages) {
        this.lazyImages = lazyImages;
    }

    public void setBasePath(File basePath) {
        this.basePath = basePath;
    }

    public void setRelativePath(String relativePath) {
        this.relativePath = relativePath;
    }

    public void setPrefix(String prefix) {
        this.prefix = prefix;
    }

    @Override
    protected void processOperator(PDFOperator operator, List arguments) throws IOException {
        String operation = operator.getOperation();
        if (INVOKE_OPERATOR.equals(operation)) {
            COSName objectName = (COSName) arguments.get(0);
            Map<String, PDXObject> xobjects = getResources().getXObjects();
            PDXObject xobject = xobjects.get(objectName.getName());
            if (xobject instanceof PDXObjectImage) {
                PDXObjectImage image = (PDXObjectImage) xobject;
                PDPage page = getCurrentPage();
                int imageWidth = image.getWidth();
                int imageHeight = image.getHeight();
                double pageHeight = page.getMediaBox().getHeight();

                Matrix ctmNew = getGraphicsState().getCurrentTransformationMatrix();
                float yScaling = ctmNew.getYScale();
                float angle = (float) Math.acos(ctmNew.getValue(0, 0) / ctmNew.getXScale());
                if (ctmNew.getValue(0, 1) < 0 && ctmNew.getValue(1, 0) > 0) {
                    angle = (-1) * angle;
                }
                ctmNew.setValue(2, 1, (float) (pageHeight - ctmNew.getYPosition() - Math.cos(angle) * yScaling));
                ctmNew.setValue(2, 0, (float) (ctmNew.getXPosition() - Math.sin(angle) * yScaling));
                // because of the moved 0,0-reference, we have to shear in the opposite direction
                ctmNew.setValue(0, 1, (-1) * ctmNew.getValue(0, 1));
                ctmNew.setValue(1, 0, (-1) * ctmNew.getValue(1, 0));
                AffineTransform ctmAT = ctmNew.createAffineTransform();
                ctmAT.scale(1f / imageWidth, 1f / imageHeight);

                Image entry = new Image();
                entry.x = ctmNew.getXPosition();
                entry.image = image;
                entry.name = objectName.getName();
                images.put(page, ctmNew.getYPosition(), entry);

            } else if (xobject instanceof PDXObjectForm) {
                // save the graphics state
                getGraphicsStack().push((PDGraphicsState) getGraphicsState().clone());
                PDPage page = getCurrentPage();

                PDXObjectForm form = (PDXObjectForm) xobject;
                COSStream invoke = (COSStream) form.getCOSObject();
                PDResources pdResources = form.getResources();
                if (pdResources == null) {
                    pdResources = page.findResources();
                }
                // if there is an optional form matrix, we have to
                // map the form space to the user space
                Matrix matrix = form.getMatrix();
                if (matrix != null) {
                    Matrix xobjectCTM = matrix.multiply(getGraphicsState().getCurrentTransformationMatrix());
                    getGraphicsState().setCurrentTransformationMatrix(xobjectCTM);
                }
                processSubStream(page, pdResources, invoke);

                // restore the graphics state
                setGraphicsState(getGraphicsStack().pop());
            }

        } else {
            super.processOperator(operator, arguments);
        }
    }

    public Table<PDPage, Float, Image> getImages() {
        return images;
    }

    protected Map<String, String> imageCache = new HashMap<String, String>();

    public String printImage(Image image) throws IOException {
        PDXObjectImage pdfImage = image.image;
        String key = image.name;

        if (lazyImages) {
            String name = imageCache.get(key);
            if (name != null) {
                return name;
            }
        }

        String suffix = pdfImage.getSuffix();
        String name = getUniqueFileName(prefix + "_" + key, suffix);
        if (!"jpg".equals(suffix) && !"png".equals(suffix)) {
            System.err.println("Image format: " + suffix);
        }
        pdfImage.write2file(basePath.getAbsolutePath() + File.separator + name);
        String path = relativePath + File.separator + name + "." + suffix;

        if (lazyImages) {
            imageCache.put(key, path);
        }
        return path;
    }

    private int imageCounter = 1;

    protected String getUniqueFileName(String prefix, String suffix) {
        String uniqueName = null;
        File f = null;
        while (f == null || f.exists()) {
            uniqueName = prefix + "-" + imageCounter;
            f = new File(basePath, uniqueName + "." + suffix);
            imageCounter++;
        }
        return uniqueName;
    }

}