// Java tutorial
/* * regain - A file search engine providing plenty of formats * Copyright (C) 2004-2012 Thomas Tesche * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Contact: Thomas Tesche, info@clustersystems.de * * CVS information: * $RCSfile$ * $Source$ * $Date: 2008-03-16 20:50:37 +0100 (So, 16 Mr 2008) $ * $Author: thtesche $ * $Revision: 281 $ */ package net.sf.regain.crawler.preparator; import net.sf.regain.RegainException; import net.sf.regain.crawler.config.DummyCrawlerConfig; import net.sf.regain.crawler.document.AbstractPreparator; import net.sf.regain.crawler.document.Preparator; import net.sf.regain.crawler.document.RawDocument; import org.apache.commons.compress.archivers.ArchiveException; import org.apache.commons.compress.archivers.ArchiveInputStream; import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.io.IOUtils; import org.apache.commons.io.output.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.util.Date; import java.util.zip.ZipInputStream; /** * Prepares archive files (zipped content) for indexing * <p/> * The following information will be extracted: * filename (toLowerCase) * * @author Thomas Tesche, cluster:Systems CSG GmbH, 
http://www.clustersystems.info */ public class ZipPreparator extends AbstractPreparator { /** * Creates a new instance of ZipPreparator. * * @throws RegainException If creating the preparator failed. */ public ZipPreparator() throws RegainException { super(new String[] { "application/zip" }); } /** * Prepares the document for indexing * * @param rawDocument the document * @throws RegainException if preparation goes wrong */ @Override public void prepare(RawDocument rawDocument) throws RegainException { ArchiveInputStream ain = null; ZipInputStream zipInputStream = new ZipInputStream(rawDocument.getContentAsStream()); PreparatorFactory preparatorFactory = new PreparatorFactory(new DummyCrawlerConfig()); try { ain = new ArchiveStreamFactory().createArchiveInputStream("zip", rawDocument.getContentAsStream()); ZipArchiveEntry entry; while ((entry = (ZipArchiveEntry) ain.getNextEntry()) != null) { String s = String.format("Entry: %s len %d added %TD", entry.getName(), entry.getSize(), new Date(entry.getTime())); System.out.println(s); Preparator preparator = null; ByteArrayOutputStream byteArrayOutputStream = null; RawDocument rawZipDocument = new RawDocument(null, null, null, null); rawZipDocument.setUrl(new File(entry.getName()).toURI().toString()); try { byteArrayOutputStream = new ByteArrayOutputStream(); IOUtils.copy(zipInputStream, byteArrayOutputStream); rawZipDocument.setContent(byteArrayOutputStream.toByteArray()); preparator = preparatorFactory.get(rawZipDocument); } finally { IOUtils.closeQuietly(byteArrayOutputStream); } if (preparator != null) { preparator.prepare(rawZipDocument); // concatenates contents setCleanedContent(new StringBuilder().append(getCleanedContent()).append("\n") .append(preparator.getCleanedContent()).toString()); setTitle(getTitle() + " " + preparator.getTitle()); setSummary(getSummary() + " " + preparator.getSummary()); setCleanedMetaData(getCleanedMetaData() + " " + preparator.getCleanedMetaData()); setHeadlines(getHeadlines() + " " 
+ preparator.getHeadlines()); preparator.cleanUp(); } } } catch (IOException | ArchiveException e) { e.printStackTrace(); } finally { //IOUtils.closeQuietly(zipInputStream); IOUtils.closeQuietly(ain); } } }