com.seajas.search.contender.service.modifier.ArchiveModifierService.java Source code

Introduction

Here is the source code for com.seajas.search.contender.service.modifier.ArchiveModifierService.java

Source

/**
 * Copyright (C) 2013 Seajas, the Netherlands.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.seajas.search.contender.service.modifier;

import com.seajas.search.bridge.jms.model.Archive;
import com.seajas.search.contender.scripting.ResolvingTransformerCache;
import com.seajas.search.contender.service.cache.CacheService;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.net.ftp.FTPClient;
import org.apache.commons.net.ftp.FTPFile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.sax.SAXSource;
import javax.xml.transform.stream.StreamResult;

/**
 * Archive modifier service - retrieves archives over FTP(S), decompresses them,
 * transforms their XML entries into syndication feeds, and offers each entry to a handler.
 * 
 * @author Jasper van Veghel <jasper@seajas.com>
 */
@Service
public class ArchiveModifierService extends AbstractModifierService {
    /**
     * The logger.
     */
    private static final Logger logger = LoggerFactory.getLogger(ArchiveModifierService.class);

    /**
     * The transformer cache.
     */
    @Autowired
    private ResolvingTransformerCache transformerCache;

    /**
     * The auto-detect parser.
     */
    @Autowired
    private AutoDetectParser autoDetectParser;

    /**
     * The cache service.
     */
    @Autowired
    private CacheService cacheService;

    /**
     * The packages location (not including the specific folder, %1).
     */
    @Value("${contender.project.archives.location}${file.separator}%1${file.separator}packages")
    private String packagesLocation;

    /**
     * The results location (not including the specific folder, %1).
     */
    @Value("${contender.project.archives.location}${file.separator}%1${file.separator}results")
    private String resultsLocation;
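
    /*
     * Note: the '%1' placeholder in the two locations above is substituted with the
     * archive ID at runtime - see storeAndDecompressFiles(), which calls
     * replace("%1", String.valueOf(archive.getId())) on each location.
     */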

    /**
     * Default constructor.
     */
    public ArchiveModifierService() {
        super();
    }

    /**
     * Full constructor.
     * 
     * @param maximumContentLength
     * @param preferredEnclosures
     */
    @Autowired
    public ArchiveModifierService(
            @Value("${contender.project.http.maximum.result.size}") final Long maximumContentLength,
            @Value("${contender.project.rss.reader.preferred.enclosures}") final String preferredEnclosures) {
        super(maximumContentLength, preferredEnclosures);
    }

    /**
     * Test an archive connection by logging into the given URI and retrieving a directory listing.
     * 
     * @param uri
     * @return boolean
     */
    public boolean testConnection(final URI uri) {
        if (!uri.getScheme().equalsIgnoreCase("ftp") && !uri.getScheme().equalsIgnoreCase("ftps")) {
            logger.error("Archive URL " + uri + " using protocol '" + uri.getScheme() + "' is not supported");

            return false;
        }

        FTPClient ftpClient = retrieveFtpClient(uri);

        if (ftpClient != null) {
            try {
                logger.info("Retrieving testing content for archive with URI " + uri);

                // Retrieve a file listing

                ftpClient.listFiles(uri.getPath());

                // Disconnect the client

                ftpClient.disconnect();
            } catch (IOException e) {
                logger.error("Could not perform all FTP operations during archive testing", e);

                return false;
            }

            return true;
        } else
            return false;
    }
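
    /*
     * Illustrative usage sketch (not part of the original source) - the service is
     * normally injected by Spring, and the FTP URI shown here is hypothetical:
     *
     *     if (!archiveModifierService.testConnection(new URI("ftp://ftp.example.com/archives")))
     *         logger.warn("Archive location is unreachable");
     */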

    /**
     * Handle the given archive's content by sending each entry to the given handler.
     * 
     * @param archive
     * @param handler
     * @throws Exception
     */
    public void handleArchive(final Archive archive, final ArchiveResultHandler handler) throws Exception {
        // Create a validating SAX parser

        final SAXParserFactory parserFactory = SAXParserFactory.newInstance();

        parserFactory.setValidating(true);
        parserFactory.setNamespaceAware(true);

        // Create a common transformer per thread

        Transformer transformer = null;

        try {
            transformer = transformerCache.getTransformer(archive.getId(), "archive",
                    archive.getModificationDate());

            if (transformer == null)
                transformer = transformerCache.putContent(archive.getId(), "archive", archive.getModificationDate(),
                        archive.getTransformerContent());
        } catch (TransformerConfigurationException e) {
            logger.error("Unable to generate a (cached) transformer from the given content", e);

            return;
        } catch (TransformerFactoryConfigurationError e) {
            logger.error("Unable to generate a (cached) transformer from the given content", e);

            return;
        }

        // Store and process the files

        try {
            Map<File, String> storedResults = storeAndDecompressFiles(archive);

            List<String> deletedLinks = new ArrayList<String>();

            // Only process deletes when a deletion expression has been provided

            if (StringUtils.hasText(archive.getDeletionExpression())) {
                // Process all entries beforehand, so as to exclude deletes from the final result

                for (Map.Entry<File, String> storedResult : storedResults.entrySet()) {
                    File storedResultsFolder = storedResult.getKey();

                    for (File entryLocation : storedResultsFolder.listFiles())
                        if (entryLocation.getName().matches(archive.getDeletionExpression())) {
                            String deleteLink = entryLocation.getName().replaceAll(archive.getDeletionExpression(),
                                    archive.getInternalLink());

                            deletedLinks.add(deleteLink);

                            // Delete the actual link

                            cacheService
                                    .addDeleted(new CacheService.DeletedEntry(archive.getCollection(), deleteLink));
                        }
                }
            }

            // Now process the stored results themselves

            for (Map.Entry<File, String> storedResult : storedResults.entrySet()) {
                File storedResultsFolder = storedResult.getKey();

                // Create the descriptions folder if it doesn't already exist

                File descriptionsFolderLocation = new File(storedResultsFolder, "descriptions");

                if (!descriptionsFolderLocation.exists())
                    descriptionsFolderLocation.mkdirs();

                for (File entryLocation : storedResultsFolder.listFiles()) {
                    if (entryLocation.isDirectory()) {
                        if (!entryLocation.getName().equals("descriptions"))
                            logger.warn("Unknown folder '" + entryLocation.getName()
                                    + "'found in decompressed archive folder '"
                                    + storedResultsFolder.getAbsolutePath() + "'");

                        continue;
                    } else if (StringUtils.hasText(archive.getDeletionExpression())
                            && entryLocation.getName().matches(archive.getDeletionExpression()))
                        continue;

                    InputStream transformationInputStream = null;

                    try {
                        transformationInputStream = new BufferedInputStream(new FileInputStream(entryLocation));

                        // Now determine the content type and create a reader in case of structured content

                        MediaType entryMediaType = autoDetectParser.getDetector().detect(transformationInputStream,
                                new Metadata());

                        if (!(entryMediaType.getSubtype().equals("xml")
                                || entryMediaType.getSubtype().endsWith("+xml"))) {
                            logger.warn("Archive entry " + entryLocation.getAbsolutePath() + " contains "
                                    + entryMediaType + " data which is unstructured, ignoring");

                            continue;
                        }
                    } catch (IOException e) {
                        logger.error("Could not detect the content type of archive entry "
                                + entryLocation.getAbsolutePath(), e);

                        if (transformationInputStream != null)
                            transformationInputStream.close();

                        continue;
                    }

                    // Process it as (semi-)structured content

                    XmlReader transformationReader = new XmlReader(transformationInputStream, true);
                    StringWriter transformationWriter = new StringWriter();

                    try {
                        transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");

                        // Use a SAX reader for entity resolution

                        XMLReader xmlReader = parserFactory.newSAXParser().getXMLReader();
                        InputSource inputSource = new InputSource(transformationReader);

                        xmlReader.setEntityResolver(transformerCache.getEntityResolver());
                        inputSource.setSystemId("file://" + transformerCache.getDtdImportPath() + "/template.xsl");
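                        // The entity resolver plus the synthetic system ID above allow
                        // relative DTD/entity references in the content to resolve
                        // against the transformer cache's import path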

                        // Perform the actual transformation

                        transformer.setParameter("substituteUrl", archive.getInternalLink());
                        transformer.transform(new SAXSource(xmlReader, inputSource),
                                new StreamResult(transformationWriter));
                    } catch (TransformerException e) {
                        logger.error("Unable to perform content transformation for entry "
                                + entryLocation.getAbsolutePath(), e);

                        continue;
                    } catch (SAXException e) {
                        logger.error("Unable to perform content transformation for entry "
                                + entryLocation.getAbsolutePath(), e);

                        continue;
                    } catch (ParserConfigurationException e) {
                        logger.error("Unable to perform content transformation for entry "
                                + entryLocation.getAbsolutePath(), e);

                        continue;
                    } finally {
                        transformationInputStream.close();
                        transformationReader.close();
                    }

                    // Create a syndication feed from the given result

                    String resultContent = transformationWriter.toString();

                    SyndFeed resultFeed = null;

                    try {
                        SyndFeedInput feedInput = new SyndFeedInput();

                        resultFeed = feedInput.build(new StringReader(resultContent));
                    } catch (FeedException e) {
                        logger.error("Could not parse the feed resulting from the archive entry transformation", e);

                        continue;
                    } finally {
                        transformationWriter.close();
                    }

                    // Write the <description> content to a separate file and add it as an <enclosure>

                    if (resultFeed.getEntries().size() > 0) {
                        int entryNumber = 0;

                        for (SyndEntry feedEntry : (Collection<SyndEntry>) resultFeed.getEntries()) {
                            if (!deletedLinks.contains(feedEntry.getLink())) {
                                String description = feedEntry.getDescription().getValue().trim();

                                File descriptionLocation = new File(descriptionsFolderLocation,
                                        stripExtension(entryLocation.getName()) + "-" + entryNumber++
                                                + (feedEntry.getDescription().getType().equals("text/html")
                                                        ? ".html"
                                                        : ".xml"));

                                Writer descriptionWriter = new OutputStreamWriter(
                                        new FileOutputStream(descriptionLocation), "UTF-8");

                                if (!description.endsWith("</html>"))
                                    descriptionWriter.write("<html>\n<head>\n\t<title>" + feedEntry.getTitle()
                                            + "</title>\n</head>\n<body>\n");
                                descriptionWriter.write(description);
                                if (!description.endsWith("</html>"))
                                    descriptionWriter.write("\n</body>\n</html>");

                                descriptionWriter.flush();
                                descriptionWriter.close();

                                // Remove the link from the processed cache should it already be in there, taking care of updates

                                cacheService.deleteElement(feedEntry.getLink());

                                // Then offer it up to the handler

                                if (logger.isDebugEnabled())
                                    logger.debug("Adding result content (" + entryNumber
                                            + ") for archive entry with path " + entryLocation.getAbsolutePath());

                                try {
                                    // NOTE: The encoding of 'UTF-8' is implied for archive-related files

                                    handler.process(new URI(feedEntry.getLink()), archive.getHostname(), feedEntry,
                                            resultFeed);
                                } catch (FeedException e) {
                                    logger.error(String.format(
                                            "Could not offer feed entry with link '%s' - invalid entry",
                                            feedEntry.getLink()), e);
                                } catch (URISyntaxException e) {
                                    logger.error(String.format(
                                            "Could not offer feed entry with link '%s' - invalid link",
                                            feedEntry.getLink()), e);
                                }
                            } else
                                logger.info("Skipping over feed entry with link '" + feedEntry.getLink()
                                        + "' - marked for deletion");
                        }
                    } else if (logger.isDebugEnabled())
                        logger.debug("No entries were found in archive entry with path "
                                + entryLocation.getAbsolutePath());
                }

                logger.info("Finished processing archive with name " + storedResult.getValue());

                // Now archive the entry in the cache

                cacheService.addArchived(storedResult.getValue());
            }

            logger.info("Finishing archive populator thread");
        } catch (IOException e) {
            logger.error("I/O error during archive processing", e);
        }
    }

    /**
     * Store the not-yet-cached files from the given archive's URI and return their locations.
     * 
     * @param archive
     * @return Map<File, String>
     */
    private Map<File, String> storeAndDecompressFiles(final Archive archive) {
        Map<File, String> result = new HashMap<File, String>();

        // Create the FTP client

        FTPClient ftpClient = retrieveFtpClient(archive.getUri());

        try {
            // Retrieve the directory listing

            List<ArchiveFile> files = retrieveFiles(archive.getUri().getPath(), ftpClient,
                    archive.getExclusionExpression());

            int archiveNumber = -1, archiveTotal = files.size();

            logger.info("Archive with name '" + archive.getName() + "' produced " + archiveTotal + " files");

            // An empty archive typically indicates failure

            if (archiveTotal == 0)
                logger.warn("The given archive produced no entries - something probably went wrong");

            // Handle all archive files

            for (ArchiveFile archiveFile : files) {
                archiveNumber++;

                // Check whether the file already exists within the cache

                String baseUrl = (StringUtils.hasText(archive.getUri().getScheme())
                        ? archive.getUri().getScheme() + "://"
                        : "") + archive.getUri().getHost()
                        + (archive.getUri().getPort() != -1 ? ":" + archive.getUri().getPort() : "");

                if (!cacheService.isArchived(baseUrl + archiveFile.getFullPath())) {
                    logger.info("Started decompressing archive " + archiveNumber + "/" + archiveTotal
                            + " with name " + archiveFile.getFullPath());

                    // Write out the archive to disk so we can determine the MIME type

                    File archiveFileFolder = new File(archiveFile
                            .getTranslatedPath(packagesLocation.replace("%1", String.valueOf(archive.getId()))));

                    if (!archiveFileFolder.exists())
                        archiveFileFolder.mkdirs();

                    File archiveFileLocation = new File(archiveFileFolder, archiveFile.getFile().getName());

                    InputStream archiveInputStream = ftpClient.retrieveFileStream(archiveFile.getFullPath());
                    OutputStream archiveOutputStream = new FileOutputStream(archiveFileLocation);

                    IOUtils.copy(archiveInputStream, archiveOutputStream);

                    archiveInputStream.close();
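                    // commons-net requires completePendingCommand() after consuming the
                    // stream from retrieveFileStream(), to finalize the transfer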
                    ftpClient.completePendingCommand();

                    archiveOutputStream.flush();
                    archiveOutputStream.close();

                    // Now unpack the archive and transform each file

                    InputStream compressedArchiveInputStream = new FileInputStream(archiveFileLocation);

                    // Now determine the content type - only ZIP archives are currently supported

                    MediaType archiveMediaType = autoDetectParser.getDetector()
                            .detect(new BufferedInputStream(compressedArchiveInputStream), new Metadata());

                    if (!(archiveMediaType.getType().equals("application")
                            && archiveMediaType.getSubtype().equals("zip"))) {
                        logger.warn("Archive file " + archiveFile.getFullPath() + " contains " + archiveMediaType
                                + " data, which is not yet supported");

                        compressedArchiveInputStream.close();

                        continue;
                    } else
                        compressedArchiveInputStream.close();

                    // Create a new ZIP file from the given archive and decompress it

                    ZipFile zipFile = new ZipFile(archiveFileLocation);

                    File resultsLocationFolder = new File(archiveFile
                            .getTranslatedPath(resultsLocation.replace("%1", String.valueOf(archive.getId()))));

                    if (!resultsLocationFolder.exists())
                        resultsLocationFolder.mkdirs();

                    File archiveResultsFolder = new File(resultsLocationFolder,
                            stripExtension(archiveFile.getFile().getName()));

                    if (!archiveResultsFolder.exists())
                        archiveResultsFolder.mkdirs();

                    logger.info("Started processing archive with name " + archiveFile.getFullPath());

                    Enumeration<? extends ZipEntry> zipEnumerator = zipFile.entries();

                    while (zipEnumerator.hasMoreElements()) {
                        ZipEntry entry = zipEnumerator.nextElement();

                        // Store it locally first

                        File entryLocation = new File(archiveResultsFolder, entry.getName());

                        try {
                            InputStream entryInputStream = zipFile.getInputStream(entry);
                            OutputStream entryOutputStream = new FileOutputStream(entryLocation);

                            IOUtils.copy(entryInputStream, entryOutputStream);

                            entryInputStream.close();
                            entryOutputStream.close();
                        } catch (IOException e) {
                            logger.error("Could not store the compressed archive entry on disk", e);

                            continue;
                        }
                    }

                    zipFile.close();

                    // Add it to the results

                    result.put(archiveResultsFolder, baseUrl + archiveFile.getFullPath());

                    logger.info("Finished processing archive with name " + archiveFile.getFullPath());
                } else if (logger.isDebugEnabled())
                    logger.debug("Skipping previously processed archive with name " + archiveFile.getFullPath());
            }
        } catch (IOException e) {
            logger.error("Could not retrieve or decompress the archive files", e);
        } finally {
            try {
                if (ftpClient.isConnected())
                    ftpClient.disconnect();
            } catch (IOException e) {
                logger.error("Could not disconnect the FTP client", e);
            }
        }

        return result;
    }

    /**
     * Recursively retrieve a directory listing from the FTP server.
     * 
     * @param path
     * @param ftpClient
     * @param exclusionExpression
     * @return List<ArchiveFile>
     * @throws IOException
     */
    private static List<ArchiveFile> retrieveFiles(final String path, final FTPClient ftpClient,
            final String exclusionExpression) throws IOException {
        List<FTPFile> files = Arrays.asList(ftpClient.listFiles(path));
        List<ArchiveFile> result = new ArrayList<ArchiveFile>();

        for (FTPFile file : files)
            if (!file.getName().equals(".") && !file.getName().equals("..")) {
                if (file.isDirectory()) {
                    String folderPath = path + (path.endsWith("/") ? "" : "/") + file.getName();

                    if (!StringUtils.hasText(exclusionExpression) || !folderPath.matches(exclusionExpression))
                        result.addAll(retrieveFiles(folderPath, ftpClient, exclusionExpression));
                } else if (file.isFile() && (!StringUtils.hasText(exclusionExpression)
                        || !(path + "/" + file.getName()).matches(exclusionExpression)))
                    result.add(new ArchiveFile(path, file));
            }

        return result;
    }

    /**
     * Return the given name without the part after the last '.'.
     * 
     * @param name
     * @return String
     */
    private static String stripExtension(final String name) {
        if (name.lastIndexOf('.') > 0)
            return name.substring(0, name.lastIndexOf('.'));

        return name;
    }

    /**
     * Archive file class, which adds a parent folder indicator to an FTPFile.
     * 
     * @author Jasper van Veghel <jasper@seajas.com>
     */
    private static class ArchiveFile {
        /**
         * Parent folder.
         */
        private final String parent;

        /**
         * Actual file.
         */
        private final FTPFile file;

        /**
         * Default constructor.
         * 
         * @param parent
         * @param file
         */
        public ArchiveFile(final String parent, final FTPFile file) {
            this.parent = parent;
            this.file = file;
        }

        /**
         * Retrieve the parent.
         * 
         * @return String
         */
        public String getParent() {
            return parent;
        }

        /**
         * Retrieve the file.
         * 
         * @return FTPFile
         */
        public FTPFile getFile() {
            return file;
        }

        /**
         * Retrieve the full path.
         * 
         * @return String
         */
        public String getFullPath() {
            return getParent() + "/" + getFile().getName();
        }

        /**
         * Retrieve a translated path.
         * 
         * @param base
         * @return String
         */
        public String getTranslatedPath(final String base) {
            if (base.endsWith(File.separator))
                return base.substring(0, base.length() - 1) + parent.replace('/', File.separatorChar);
            else
                return base + parent.replace('/', File.separatorChar);
        }
    }

    /**
     * Archive result handler interface.
     * 
     * @author Jasper van Veghel <jasper@seajas.com>
     */
    public interface ArchiveResultHandler {
        /**
         * Process the given archive result.
         * 
         * @param uri
         * @param hostname
         * @param entry
         * @param feed
         * @throws FeedException
         */
        void process(final URI uri, final String hostname, final SyndEntry entry, final SyndFeed feed)
                throws FeedException;
    }
}
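
Example

The following is a minimal, hypothetical sketch (not part of the original source) of how handleArchive might be driven. The Spring context location and the retrieveArchiveFromQueue() helper are assumptions; in the real application the service is container-managed and the Archive arrives over JMS.

import com.seajas.search.bridge.jms.model.Archive;
import com.seajas.search.contender.service.modifier.ArchiveModifierService;
import com.seajas.search.contender.service.modifier.ArchiveModifierService.ArchiveResultHandler;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;

import java.net.URI;

import org.springframework.context.support.ClassPathXmlApplicationContext;

public class ArchiveProcessingExample {
    public static void main(String[] args) throws Exception {
        // Obtain the Spring-managed service (the context location is an assumption)
        ClassPathXmlApplicationContext context = new ClassPathXmlApplicationContext("applicationContext.xml");

        try {
            ArchiveModifierService service = context.getBean(ArchiveModifierService.class);

            // The handler receives one callback per transformed feed entry
            service.handleArchive(retrieveArchiveFromQueue(), new ArchiveResultHandler() {
                @Override
                public void process(final URI uri, final String hostname, final SyndEntry entry, final SyndFeed feed) {
                    System.out.println("Processed entry '" + entry.getTitle() + "' from " + hostname + " (" + uri + ")");
                }
            });
        } finally {
            context.close();
        }
    }

    // Hypothetical stand-in - in the real application the Archive is received over JMS
    private static Archive retrieveArchiveFromQueue() {
        throw new UnsupportedOperationException("Supply a real Archive instance here");
    }
}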