net.sf.regain.crawler.preparator.ZipPreparator.java Source code

Introduction

Here is the source code for net.sf.regain.crawler.preparator.ZipPreparator.java
Source

/*
 * regain - A file search engine providing plenty of formats
 * Copyright (C) 2004-2012  Thomas Tesche
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * Contact: Thomas Tesche, info@clustersystems.de
 *
 * CVS information:
 *  $RCSfile$
 *   $Source$
 *     $Date: 2008-03-16 20:50:37 +0100 (So, 16 Mär 2008) $
 *   $Author: thtesche $
 * $Revision: 281 $
 */
package net.sf.regain.crawler.preparator;

import net.sf.regain.RegainException;
import net.sf.regain.crawler.config.DummyCrawlerConfig;
import net.sf.regain.crawler.document.AbstractPreparator;
import net.sf.regain.crawler.document.Preparator;
import net.sf.regain.crawler.document.RawDocument;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;

import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.zip.ZipInputStream;

/**
 * Prepares archive files (zipped content) for indexing.
 * <p>
 * Every file entry of the archive is wrapped in its own {@link RawDocument}
 * and handed to the preparator that {@link PreparatorFactory} returns for it.
 * The cleaned content, title, summary, meta data and headlines produced for
 * the entries are concatenated into the results of this preparator.
 *
 * @author Thomas Tesche, cluster:Systems CSG GmbH, http://www.clustersystems.info
 */
public class ZipPreparator extends AbstractPreparator {

    /**
     * Creates a new instance of ZipPreparator.
     *
     * @throws RegainException If creating the preparator failed.
     */
    public ZipPreparator() throws RegainException {
        super(new String[] { "application/zip" });
    }

    /**
     * Prepares the document for indexing by preparing every file entry of the
     * archive and merging the individual results.
     *
     * @param rawDocument the document
     * @throws RegainException if the archive can not be read or preparing one
     *                         of its entries goes wrong
     */
    @Override
    public void prepare(RawDocument rawDocument) throws RegainException {
        PreparatorFactory preparatorFactory = new PreparatorFactory(new DummyCrawlerConfig());

        ArchiveInputStream ain = null;
        try {
            ain = new ArchiveStreamFactory().createArchiveInputStream("zip", rawDocument.getContentAsStream());
            ZipArchiveEntry entry;
            while ((entry = (ZipArchiveEntry) ain.getNextEntry()) != null) {
                // Directory entries carry no data that could be prepared.
                if (entry.isDirectory()) {
                    continue;
                }
                prepareEntry(ain, entry, preparatorFactory);
            }
        } catch (IOException | ArchiveException e) {
            // Report the failure to the caller instead of silently dropping it.
            throw new RegainException("Reading the zip archive failed", e);
        } finally {
            IOUtils.closeQuietly(ain);
        }
    }

    /**
     * Prepares a single archive entry and merges its results into this preparator.
     *
     * @param archiveStream     the archive stream, positioned on {@code entry}
     * @param entry             the entry to prepare
     * @param preparatorFactory the factory used to look up the preparator for the entry
     * @throws IOException     if reading the entry data fails
     * @throws RegainException if preparing the entry goes wrong
     */
    private void prepareEntry(ArchiveInputStream archiveStream, ZipArchiveEntry entry,
            PreparatorFactory preparatorFactory) throws IOException, RegainException {
        RawDocument entryDocument = new RawDocument(null, null, null, null);
        entryDocument.setUrl(new File(entry.getName()).toURI().toString());

        // The entry data has to be read from the stream that getNextEntry()
        // positioned; reading up to its end-of-stream yields exactly this entry.
        ByteArrayOutputStream entryBytes = new ByteArrayOutputStream();
        IOUtils.copy(archiveStream, entryBytes);
        entryDocument.setContent(entryBytes.toByteArray());

        Preparator preparator = preparatorFactory.get(entryDocument);
        if (preparator == null) {
            return;
        }

        try {
            preparator.prepare(entryDocument);
            mergeResults(preparator);
        } finally {
            // Clean up the entry preparator even if preparing the entry failed.
            preparator.cleanUp();
        }
    }

    /**
     * Appends the results of an entry preparator to the results collected so far.
     *
     * @param preparator the preparator that has prepared an entry
     */
    private void mergeResults(Preparator preparator) {
        setCleanedContent(join(getCleanedContent(), preparator.getCleanedContent(), "\n"));
        setTitle(join(getTitle(), preparator.getTitle(), " "));
        setSummary(join(getSummary(), preparator.getSummary(), " "));
        setCleanedMetaData(join(getCleanedMetaData(), preparator.getCleanedMetaData(), " "));
        setHeadlines(join(getHeadlines(), preparator.getHeadlines(), " "));
    }

    /**
     * Joins two texts with a separator. A text that is {@code null} or empty is
     * treated as absent, so the literal string "null" never ends up in the result.
     *
     * @param current   the text collected so far, may be {@code null}
     * @param addition  the text to append, may be {@code null}
     * @param separator the separator placed between two present texts
     * @return the joined text; {@code current} unchanged if {@code addition} is absent
     */
    private static String join(String current, String addition, String separator) {
        if (addition == null || addition.isEmpty()) {
            return current;
        }
        if (current == null || current.isEmpty()) {
            return addition;
        }
        return current + separator + addition;
    }
}