net.sf.regain.crawler.preparator.PdfItextPreparator.java Source code

Introduction

Here is the source code for net.sf.regain.crawler.preparator.PdfItextPreparator.java
Source

/*
 * regain - A file search engine providing plenty of formats
 * Copyright (C) 2004  Til Schneider
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * Contact: Til Schneider, info@murfman.de
 *
 * CVS information:
 *  $RCSfile$
 *   $Source$
 *     $Date: 2011-01-02 15:09:46 -0200 (Sun, 02 Jan 2011) $
 *   $Author: thtesche $
 * $Revision: 477 $
 */
package net.sf.regain.crawler.preparator;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import lombok.extern.slf4j.Slf4j;
import net.sf.regain.RegainException;
import net.sf.regain.crawler.document.AbstractPreparator;
import net.sf.regain.crawler.document.RawDocument;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;

/**
 * Prepares a PDF document for indexing, using iText.
 * <p/>
 * The raw data of the document is stripped of its formatting information so
 * that only plain text remains, and the title plus some further meta data
 * (page count, author, creator, subject, keywords) are extracted.
 *
 * @author Til Schneider, www.murfman.de
 */
@Slf4j
public class PdfItextPreparator extends AbstractPreparator {
    /**
     * Owner password used when an encrypted document is re-opened.
     * <p/>
     * NOTE(review): this field is public and non-final, so other code may
     * assign it; it is deliberately left that way to stay backward-compatible.
     */
    public static byte[] OWNER_PASSWORD = "".getBytes();

    /** Size of the chunks used when copying the document content into memory. */
    private static final int COPY_BUFFER_SIZE = 8192;

    /**
     * Creates a new instance of PdfItextPreparator.
     *
     * @throws net.sf.regain.RegainException If creating the preparator failed.
     */
    public PdfItextPreparator() throws RegainException {
        super("application/pdf");
    }

    /**
     * Prepares a document for indexing.
     * <p/>
     * Sets the cleaned content (text of all pages), the title (if the document
     * declares one) and the cleaned meta data of this preparator.
     *
     * @param rawDocument The document to prepare.
     * @throws net.sf.regain.RegainException If the preparation failed.
     */
    @SuppressWarnings("unchecked")
    public void prepare(RawDocument rawDocument) throws RegainException {
        String url = rawDocument.getUrl();

        InputStream stream = null;
        PdfReader reader = null;

        try {
            // Buffer the content once: an encrypted document has to be opened a
            // second time, and the stream itself can only be read once.
            stream = rawDocument.getContentAsStream();
            byte[] content = readFully(stream);

            // Parse the content
            reader = openReader(content);
            setCleanedContent(extractText(reader));

            // Get metadata
            Map<String, String> info = reader.getInfo();

            String title = info.get("Title");
            if (title != null) {
                setTitle(title);
            }

            setCleanedMetaData(buildMetaData(reader.getNumberOfPages(), info));
            // The guard avoids calling getCleanedMetaData() when debug logging is off.
            if (log.isDebugEnabled()) {
                log.debug("Extracted meta data ::{}:: from {}", getCleanedMetaData(), url);
            }

        } catch (IOException exc) {
            throw new RegainException("Error reading document: " + url, exc);
        } catch (Exception exc) {
            // They didn't supply a password and the default of "" was wrong.
            throw new RegainException("Unknown error parsing document: " + url, exc);

        } finally {
            // Closing is best effort: a failure here must not mask the result
            // (or the exception) of the preparation itself, so it is only logged.
            if (stream != null) {
                try {
                    stream.close();
                } catch (Exception exc) {
                    log.debug("Could not close content stream of " + url, exc);
                }
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception exc) {
                    log.debug("Could not close PDF reader of " + url, exc);
                }
            }
        }
    }

    /**
     * Reads the given stream until its end and returns everything that was read.
     * The stream is not closed.
     *
     * @param in The stream to read.
     * @return The bytes read from the stream.
     * @throws IOException If reading failed.
     */
    private static byte[] readFully(InputStream in) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        byte[] chunk = new byte[COPY_BUFFER_SIZE];
        int count;
        while ((count = in.read(chunk)) != -1) {
            buffer.write(chunk, 0, count);
        }
        return buffer.toByteArray();
    }

    /**
     * Opens a reader on the given PDF data. If the document turns out to be
     * encrypted, the first reader is closed and the document is opened again
     * using {@link #OWNER_PASSWORD}.
     *
     * @param content The complete PDF data.
     * @return The opened reader; the caller is responsible for closing it.
     * @throws IOException If the data could not be read as a PDF document.
     */
    private static PdfReader openReader(byte[] content) throws IOException {
        PdfReader reader = new PdfReader(content);
        if (reader.isEncrypted()) {
            // Release the first reader before replacing it, otherwise it leaks.
            reader.close();
            reader = new PdfReader(content, OWNER_PASSWORD);
        }
        return reader;
    }

    /**
     * Extracts the text of all pages of the document.
     *
     * @param reader The reader of the document.
     * @return The text of all pages, each page followed by a line break.
     * @throws IOException If processing a page failed.
     */
    private static String extractText(PdfReader reader) throws IOException {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        int pageCount = reader.getNumberOfPages();

        StringBuilder text = new StringBuilder();
        for (int page = 1; page <= pageCount; page++) {
            TextExtractionStrategy strategy
                    = parser.processContent(page, new SimpleTextExtractionStrategy());
            text.append(strategy.getResultantText());
            // Separate the pages, so the last word of one page and the first
            // word of the next one are not indexed as a single token.
            text.append('\n');
        }
        return text.toString();
    }

    /**
     * Builds the meta data text: "p." followed by the page count, then author,
     * creator, subject and keywords, as far as the document declares them.
     *
     * @param pageCount The number of pages of the document.
     * @param info The info entries of the document.
     * @return The meta data text; every entry is followed by a blank.
     */
    private static String buildMetaData(int pageCount, Map<String, String> info) {
        StringBuilder metaData = new StringBuilder();
        metaData.append("p.").append(pageCount).append(" ");

        appendIfPresent(metaData, info.get("Author"));
        appendIfPresent(metaData, info.get("Creator"));
        appendIfPresent(metaData, info.get("Subject"));
        appendIfPresent(metaData, info.get("Keywords"));

        return metaData.toString();
    }

    /**
     * Appends the value followed by a blank, unless the value is null.
     *
     * @param target The builder to append to.
     * @param value The value to append; may be null.
     */
    private static void appendIfPresent(StringBuilder target, String value) {
        if (value != null) {
            target.append(value).append(" ");
        }
    }
}