Java tutorial
/*
 * PDFCorpusAnalyser
 *
 * Copyright (c) 2012, E&E information consultants AG. All rights reserved.
 * Authors:
 *   Peter Jentsch
 *   Nico Hezel
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301 USA
 */
package de.ee.hezel;

import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.FilenameFilter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.io.IOCase;
import org.apache.commons.io.filefilter.AndFileFilter;
import org.apache.commons.io.filefilter.PrefixFileFilter;
import org.apache.commons.io.filefilter.SuffixFileFilter;
import org.apache.log4j.Logger;
import org.icepdf.core.pobjects.Document;
import org.icepdf.core.pobjects.PDimension;
import org.icepdf.core.pobjects.Page;
import org.icepdf.core.pobjects.graphics.Shapes;
import org.icepdf.core.pobjects.graphics.text.LineText;
import org.icepdf.core.pobjects.graphics.text.PageText;
import org.icepdf.core.pobjects.graphics.text.WordText;
import org.icepdf.core.util.GraphicsRenderingHints;

import de.ee.hezel.logger.ICompareLogger;
import de.ee.hezel.model.PDFHolder;
import de.ee.hezel.model.PDFInfoHolder;
import de.ee.hezel.model.PDFPageHolder;
import de.ee.hezel.model.PDFInfoHolder.DifferenceType;
import de.ee.hezel.model.pdfelemente.PDFImageHolder;
import de.ee.hezel.model.pdfelemente.PDFTextHolder;

/**
 * This class analyses the structure of the PDF files.
 * It creates the PDFxxxHolder objects, which represent
 * the structure of a PDF.
 *
 * All data gets extracted with the icePDF lib.
 *
 * @author hezeln
 *
 */
public class PDFCorpusAnalyser extends AbstractPDFCompare {

    static Logger log = Logger.getLogger(PDFCorpusAnalyser.class.getName());

    // should be always 1
    static final float ZOOM = 1.0f; //2.77f;

    /**
     * Creates an analyser which reports found differences to the given logger.
     *
     * @param diffLog logger that receives the difference messages
     */
    public PDFCorpusAnalyser(ICompareLogger diffLog) {
        setDifferenceLogger(diffLog);
    }

    /**
     * Analyses the structure of both PDF files of the given pair and stores
     * a simple structure (PDFHolder) for each of them in the info holder.
     * If the number of pages differs, the pair is marked as
     * {@link DifferenceType#MISSINGPAGE} and the difference is logged.
     *
     * Nothing happens if the pair is already marked as
     * {@link DifferenceType#MISSINGDOCUMENT}.
     *
     * @param pdfInfoHolder contains the PDFs to compare
     * @throws Exception if the structure of one of the documents could not be
     *         loaded; the original exception is attached as cause
     */
    public void analyse(PDFInfoHolder pdfInfoHolder) throws Exception {

        // can't analyse if there is no document
        if (pdfInfoHolder.getDifferent() == DifferenceType.MISSINGDOCUMENT)
            return;

        Document pdfFile1 = new Document();
        Document pdfFile2 = new Document();

        // the page counts are remembered here, because the documents
        // are already disposed when the difference gets logged
        int pageCount1;
        int pageCount2;

        try {
            // get pdf document -> create PDFHolder objects, which contain the entire structure of the document
            pdfFile1.setFile(pdfInfoHolder.getPDFFile1().getAbsolutePath());
            pdfInfoHolder.setPDFStructure1(analysePDF(pdfFile1, pdfInfoHolder));

            pdfFile2.setFile(pdfInfoHolder.getPDFFile2().getAbsolutePath());
            pdfInfoHolder.setPDFStructure2(analysePDF(pdfFile2, pdfInfoHolder));

            pageCount1 = pdfFile1.getNumberOfPages();
            pageCount2 = pdfFile2.getNumberOfPages();
        } catch (Exception e) {
            throw new Exception(
                    pdfInfoHolder.getFilename() + ": Could not load PDF Structure. Reason: " + e.getMessage(), e);
        } finally {
            // free the documents in every case, not only after a failure;
            // the extracted holders keep no reference to them
            pdfFile1.dispose();
            pdfFile2.dispose();
        }

        // check for different page amount
        if (pdfInfoHolder.getPDF1().getNumPages() != pdfInfoHolder.getPDF2().getNumPages()) {
            pdfInfoHolder.setDifferent(DifferenceType.MISSINGPAGE);
            diff.log(pdfInfoHolder.getFilename() + ": Different amount of pages: "
                    + pageCount1 + " to " + pageCount2);
        }
    }

    /**
     * Extracts the structure of the PDF document and saves it in a PDFHolder object.
     *
     * @param pdfFile the opened document
     * @param pdfInfoHolder the pair this document belongs to
     * @return PDFHolder with one PDFPageHolder per page of the document
     */
    private PDFHolder analysePDF(Document pdfFile, PDFInfoHolder pdfInfoHolder) {

        int numPgs = pdfFile.getNumberOfPages();
        PDFHolder pdfHolder = new PDFHolder(numPgs);

        // run through all pages of this document
        for (int pageNumber = 0; pageNumber < numPgs; pageNumber++) {

            PDFPageHolder pdfPageHolder;
            Page page = pdfFile.getPageTree().getPage(pageNumber, this);
            try {
                // dimension of the page
                PDimension sz = page.getSize(0.0f, ZOOM);
                float pageWidth = sz.getWidth();
                float pageHeight = sz.getHeight();

                // analyse the structure of this page
                pdfPageHolder = analysePDFPage(pdfInfoHolder, page, pageNumber, pageWidth, pageHeight);
            } finally {
                // release the page resource, even if the analysis failed
                pdfFile.getPageTree().releasePage(pageNumber, this);
            }

            // add the analysed page to the pdf holder
            pdfHolder.addPageHolders(pdfPageHolder);
        }

        return pdfHolder;
    }

    /**
     * Extracts the structure of the PDF page and saves it in a PDFPageHolder object.
     * Every non-whitespace word becomes a PDFTextHolder, every image boundary
     * becomes a PDFImageHolder.
     *
     * @param pdfInfoHolder the pair the page belongs to
     * @param page the page to analyse
     * @param pageNumber index of the page inside the document
     * @param pageWidth width of the page
     * @param pageHeight height of the page
     * @return holder with all found text and image elements of this page
     */
    private PDFPageHolder analysePDFPage(PDFInfoHolder pdfInfoHolder, Page page, int pageNumber,
            float pageWidth, float pageHeight) {

        // create a holder for this page
        PDFPageHolder pdfPageHolder = new PDFPageHolder(pageNumber, pageWidth, pageHeight);

        // get all text elements for this page, from the icePDF lib
        PageText pt = page.getText();

        // Defensive, like the shapes check below: a page for which the lib
        // delivers no text object is treated as a page without text.
        // NOTE(review): whether getText() can return null depends on the icePDF version - confirm
        if (pt != null) {

            // create a structure holder for every text element
            for (LineText lt : pt.getPageLines()) {
                for (WordText wt : lt.getWords()) {

                    // ignore white spaces
                    if (wt.isWhiteSpace())
                        continue;

                    // get the dimension for this element
                    Rectangle2D.Float rect = wt.getBounds();

                    // pdf documents have their coordinate origin in the lower left corner
                    double y = (pageHeight - rect.y) - rect.height;

                    // create the text holder
                    PDFTextHolder pdfTextHolder = new PDFTextHolder(rect.x, y, rect.width, rect.height, wt.getText());
                    pdfPageHolder.addElement(pdfTextHolder);
                }
            }
        }

        // get all shapes for this page (e.g. images)
        Shapes shapes = page.getShapes(); // modified lib

        // create a structure holder for every image
        if (shapes != null) {

            // get the image position and dimension (best results with zoom 1.0)
            Set<Rectangle2D.Double> imageBoundaries = shapes.getBoundingBoxesForImages((int) pageWidth,
                    (int) pageHeight, GraphicsRenderingHints.PRINT, Page.BOUNDARY_CROPBOX, 0.0f, ZOOM, page); // modified library

            // add all found images to the page holder
            for (Rectangle2D.Double rect : imageBoundaries) {
                PDFImageHolder pdfImageHolder = new PDFImageHolder(rect.x, rect.y, rect.width, rect.height);
                pdfPageHolder.addElement(pdfImageHolder);
            }
        }

        return pdfPageHolder;
    }

    /**
     * Runs through the first folder and pairs every PDF document found there
     * with the file of the same name in the second folder.
     * For every pair, a PDFInfoHolder object gets created.
     *
     * If one of the paths is null or not a directory, or the first directory
     * can not be listed, an error is logged and an empty set is returned.
     *
     * @param pdfs1 the 1st directory
     * @param pdfs2 the 2nd directory
     * @param prefix only files starting with this prefix are used; null or empty for all files
     * @return set of all pdf pairs, never null
     */
    public static Set<PDFInfoHolder> getSimplePDFInfoHolders(File pdfs1, File pdfs2, String prefix) {
        Set<PDFInfoHolder> pdfInfoHolders = new HashSet<PDFInfoHolder>();

        // are those valid paths
        if (pdfs1 != null && pdfs2 != null && pdfs1.isDirectory() && pdfs2.isDirectory()) {

            // create a filter to only get pdf files
            List<FilenameFilter> filters = new ArrayList<FilenameFilter>();
            if (null != prefix && prefix.length() > 0) {
                PrefixFileFilter prefixFilter = new PrefixFileFilter(prefix, IOCase.SYSTEM);
                filters.add(prefixFilter);
            }
            filters.add(new SuffixFileFilter(".pdf", IOCase.INSENSITIVE));
            FilenameFilter filter = new AndFileFilter(filters);

            // get all pdf files in this folder
            String[] pdfDocuments1 = pdfs1.list(filter);

            // File.list returns null if an I/O error occurs
            if (pdfDocuments1 == null) {
                log.error("Could not list the files in: " + pdfs1);
                return pdfInfoHolders;
            }

            for (String pdfFilename1 : pdfDocuments1) {

                // get the path for both pdf files
                File pdfFile1 = new File(pdfs1, pdfFilename1);
                File pdfFile2 = new File(pdfs2, pdfFilename1);

                // bind them together in a PDFInfoHolder object
                PDFInfoHolder newPDFInfoHolder = new PDFInfoHolder(pdfFile1, pdfFile2);

                // remember them all
                pdfInfoHolders.add(newPDFInfoHolder);
            }

            // TODO what should happen if there are less reference documents than new generated ones
        } else {
            log.error("The path is not valid.");
        }

        return pdfInfoHolders;
    }
}