Java tutorial
/*
 * regain - A file search engine providing plenty of formats
 * Copyright (C) 2004 Til Schneider
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Contact: Til Schneider, info@murfman.de
 *
 * CVS information:
 * $RCSfile$
 * $Source$
 * $Date: 2011-01-02 15:09:46 -0200 (Sun, 02 Jan 2011) $
 * $Author: thtesche $
 * $Revision: 477 $
 */
package net.sf.regain.crawler.preparator;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import lombok.extern.slf4j.Slf4j;
import net.sf.regain.RegainException;
import net.sf.regain.crawler.document.AbstractPreparator;
import net.sf.regain.crawler.document.RawDocument;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;

/**
 * Prepares a PDF document for indexing.
 * <p>
 * The raw document data is stripped of formatting information and the title
 * is extracted.
 *
 * @author Til Schneider, www.murfman.de
 */
@Slf4j
public class PdfItextPreparator extends AbstractPreparator {

    /**
     * Owner password used to re-open a PDF that reports itself as encrypted.
     * Defaults to the empty password. Left public and non-final so that
     * existing code assigning a different password keeps compiling.
     */
    public static byte[] OWNER_PASSWORD = "".getBytes();

    /** Size of the chunks used while buffering the document content. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Creates a new instance of PdfItextPreparator.
     *
     * @throws net.sf.regain.RegainException If creating the preparator failed.
     */
    public PdfItextPreparator() throws RegainException {
        super("application/pdf");
    }

    /**
     * Prepares a document for indexing: extracts the text of every page as
     * cleaned content, sets the title from the PDF info dictionary and builds
     * the cleaned meta data from page count, author, creator, subject and
     * keywords.
     *
     * @param rawDocument The document to prepare.
     * @throws net.sf.regain.RegainException If the preparation failed.
     */
    @SuppressWarnings("unchecked")
    public void prepare(RawDocument rawDocument) throws RegainException {
        String url = rawDocument.getUrl();
        InputStream stream = null;
        PdfReader reader = null;
        try {
            // Buffer the content once. A stream can only be consumed a single
            // time, but an encrypted document has to be opened twice.
            stream = rawDocument.getContentAsStream();
            byte[] content = readFully(stream);

            reader = openReader(content);
            setCleanedContent(extractText(reader));

            Map<String, String> info = reader.getInfo();
            String title = info.get("Title");
            if (title != null) {
                setTitle(title);
            }
            setCleanedMetaData(buildMetaData(reader.getNumberOfPages(), info));

            if (log.isDebugEnabled()) {
                log.debug("Extracted meta data ::{}:: from {}", getCleanedMetaData(), url);
            }
        } catch (IOException exc) {
            throw new RegainException("Error reading document: " + url, exc);
        } catch (Exception exc) {
            throw new RegainException("Unknown error parsing document: " + url, exc);
        } finally {
            closeQuietly(stream, url);
            closeQuietly(reader, url);
        }
    }

    /**
     * Reads the given stream to its end and returns everything that was read.
     * The stream is not closed here; the caller owns it.
     */
    private static byte[] readFully(InputStream stream) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        byte[] chunk = new byte[BUFFER_SIZE];
        int read;
        while ((read = stream.read(chunk)) != -1) {
            buffer.write(chunk, 0, read);
        }
        return buffer.toByteArray();
    }

    /**
     * Opens a reader on the buffered PDF. If the document is encrypted, the
     * first reader is closed and the document is opened again with
     * {@link #OWNER_PASSWORD}.
     */
    private static PdfReader openReader(byte[] content) throws IOException {
        PdfReader reader = new PdfReader(content);
        if (reader.isEncrypted()) {
            // Release the first reader before replacing it, otherwise it leaks.
            reader.close();
            reader = new PdfReader(content, OWNER_PASSWORD);
        }
        return reader;
    }

    /**
     * Extracts the plain text of all pages. A line break is appended after
     * each page so that the last word of one page and the first word of the
     * next are not merged into a single token.
     */
    private static String extractText(PdfReader reader) throws IOException {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        int pageCount = reader.getNumberOfPages();
        StringBuilder text = new StringBuilder();
        // PDF page numbers are 1-based.
        for (int page = 1; page <= pageCount; page++) {
            TextExtractionStrategy strategy =
                    parser.processContent(page, new SimpleTextExtractionStrategy());
            text.append(strategy.getResultantText());
            text.append('\n');
        }
        return text.toString();
    }

    /**
     * Builds the meta data string: "p.&lt;pages&gt; " followed by the author,
     * creator, subject and keywords entries that are present, each followed
     * by a single space.
     */
    private static String buildMetaData(int pageCount, Map<String, String> info) {
        StringBuilder metaData = new StringBuilder();
        metaData.append("p.");
        metaData.append(Integer.toString(pageCount));
        metaData.append(" ");
        appendIfPresent(metaData, info.get("Author"));
        appendIfPresent(metaData, info.get("Creator"));
        appendIfPresent(metaData, info.get("Subject"));
        appendIfPresent(metaData, info.get("Keywords"));
        return metaData.toString();
    }

    /** Appends the value and a trailing space unless the value is null. */
    private static void appendIfPresent(StringBuilder target, String value) {
        if (value != null) {
            target.append(value);
            target.append(" ");
        }
    }

    /** Closes the stream on a best-effort basis; a failure is only logged. */
    private static void closeQuietly(InputStream stream, String url) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (Exception exc) {
            log.debug("Could not close content stream of " + url, exc);
        }
    }

    /** Closes the reader on a best-effort basis; a failure is only logged. */
    private static void closeQuietly(PdfReader reader, String url) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (Exception exc) {
            log.debug("Could not close PDF reader of " + url, exc);
        }
    }
}