com.jaeksoft.searchlib.util.pdfbox.PDFBoxUtils.java Source code

Java tutorial

Introduction

Here is the source code for com.jaeksoft.searchlib.util.pdfbox.PDFBoxUtils.java

Source

/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2012-2014 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.util.pdfbox;

import java.io.IOException;
import java.util.Map;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Static helpers and small extensions built on top of the PDFBox API.
 */
public class PDFBoxUtils {

    /**
     * A {@link PDFTextStripper} whose {@link #getText(PDDocument)} does not
     * give up on the whole document when a {@link RuntimeException} is thrown
     * during extraction: whatever has been written to the inherited
     * {@code output} writer at that point is returned instead.
     */
    public static class TolerantPDFTextStripper extends PDFTextStripper {

        /**
         * Creates a stripper, passing {@code "UTF-8"} as the encoding to the
         * superclass constructor.
         * 
         * @throws IOException
         *             if the superclass constructor fails
         */
        public TolerantPDFTextStripper() throws IOException {
            super("UTF-8");
        }

        /**
         * Extracts the text of the document, falling back to the content of
         * the {@code output} writer if a {@link RuntimeException} interrupts
         * the extraction.
         * 
         * @param doc
         *            the document to extract the text from
         * @return the text returned by the superclass, or the non-empty
         *         content of {@code output} if the extraction failed with a
         *         runtime exception
         * @throws IOException
         *             if the superclass reports an I/O error
         * @throws RuntimeException
         *             the original exception, when there is no writer or the
         *             writer content is empty
         */
        @Override
        public String getText(PDDocument doc) throws IOException {
            try {
                return super.getText(doc);
            } catch (RuntimeException e) {
                // The failure may happen before the superclass has set up
                // its writer; in that case there is nothing to salvage and
                // the original exception must not be masked by an NPE.
                if (output == null) {
                    throw e;
                }
                String text = output.toString();
                if (text == null || text.length() == 0) {
                    throw e;
                }
                // Deliberate best-effort: return what has been extracted.
                return text;
            }
        }
    }

    /**
     * Counts the XObjects of the given page that are images.
     * 
     * @param page
     *            the page to inspect
     * @return the number of {@link PDXObjectImage} instances found in the
     *         XObjects of the page resources; 0 if the page has no resources
     *         or no XObjects
     * @throws IOException
     *             if the XObjects cannot be read
     */
    public static final int countCheckImage(PDPage page) throws IOException {
        PDResources resources = page.getResources();
        // A page without its own resources dictionary yields null here.
        if (resources == null) {
            return 0;
        }
        Map<String, PDXObject> objects = resources.getXObjects();
        if (objects == null) {
            return 0;
        }
        int count = 0;
        for (PDXObject object : objects.values()) {
            if (object instanceof PDXObjectImage) {
                count++;
            }
        }
        return count;
    }

}