com.seajas.search.contender.service.modifier.FeedModifierService.java Source code

Introduction

Here is the source code for com.seajas.search.contender.service.modifier.FeedModifierService.java
Source

/**
 * Copyright (C) 2013 Seajas, the Netherlands.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.seajas.search.contender.service.modifier;

import com.seajas.search.bridge.profiler.model.modifier.Modifier;
import com.seajas.search.bridge.profiler.model.modifier.ModifierFilter;
import com.seajas.search.bridge.profiler.model.modifier.ModifierScript;
import com.seajas.search.contender.WebResolverSettings;
import com.seajas.search.contender.http.HttpClientFeedFetcher;
import com.seajas.search.contender.http.SizeRestrictedHttpResponse;
import com.seajas.search.contender.http.SizeRestrictedResponseHandler;
import com.seajas.search.contender.replication.ModifierCache;
import com.seajas.search.contender.scripting.XmlHtmlReader;
import com.seajas.search.contender.service.builder.RSSDirectoryBuilder;
import com.seajas.search.utilities.web.WebFeeds;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.apache.commons.io.IOUtils;
import org.apache.commons.net.ftp.FTPClient;
import org.apache.commons.net.ftp.FTPSClient;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.message.BasicHeader;
import org.apache.http.params.CoreProtocolPNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.rometools.fetcher.FeedFetcher;
import org.rometools.fetcher.FetcherException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;

import javax.script.ScriptException;

/**
 * Feed modifier service.
 * 
 * @author Jasper van Veghel <jasper@seajas.com>
 */
@Service
public class FeedModifierService extends AbstractModifierService {
    /**
     * The logger.
     */
    private static final Logger logger = LoggerFactory.getLogger(FeedModifierService.class);

    /**
     * The retrieval HTTP client, used by {@code getContent} to execute GET requests for HTTP(S) URIs.
     */
    @Autowired
    @Qualifier("retrievalHttpClient")
    private HttpClient httpClient;

    /**
     * The feed fetcher, used by {@code getFeed} for HTTP(S) URIs that no modifier applies to.
     */
    @Autowired
    @Qualifier("retrievalFeedFetcher")
    private FeedFetcher feedFetcher;

    /**
     * The modifier filter processor, applied to the content reader for every filter of a modifier.
     */
    @Autowired
    private ModifierFilterProcessor modifierFilterProcessor;

    /**
     * The modifier script processor, applied to the extracted content for every script of a modifier.
     */
    @Autowired
    private ModifierScriptProcessor modifierScriptProcessor;

    /**
     * The modifier cache, queried for feed modifiers either by ID or by URL match.
     */
    @Autowired
    private ModifierCache modifierCache;

    /**
     * The auto-detect parser, whose detector is used to determine the media type of retrieved content.
     */
    @Autowired
    private AutoDetectParser autoDetectParser;

    /**
     * Default constructor.
     */
    public FeedModifierService() {
        super();
    }

    /**
     * Property-driven constructor; both values are passed on unchanged to the superclass constructor.
     * 
     * @param maximumContentLength injected from the {@code contender.project.http.maximum.result.size} property
     * @param preferredEnclosures injected from the {@code contender.project.rss.reader.preferred.enclosures} property
     */
    @Autowired
    public FeedModifierService(
            @Value("${contender.project.http.maximum.result.size}") final Long maximumContentLength,
            @Value("${contender.project.rss.reader.preferred.enclosures}") final String preferredEnclosures) {
        super(maximumContentLength, preferredEnclosures);
    }

    /**
     * Check whether the content behind a feed URI can be retrieved; the content itself is thrown away.
     * 
     * @param uri the feed location to probe
     * @param encodingOverride character encoding to force when reading, or {@code null} to have it detected
     * @param userAgent the user agent to identify with, or {@code null}
     * @return boolean {@code true} when content was retrieved and its reader closed, {@code false} otherwise
     */
    public boolean testConnection(final URI uri, final String encodingOverride, final String userAgent) {
        logger.info("Retrieving testing content for feed with URI " + uri);

        try {
            final Reader content = getContent(uri, encodingOverride, userAgent, null);

            // Nothing came back - report the failure and bail out early

            if (content == null) {
                logger.error("Could not retrieve testing content for feed with URI " + uri);

                return false;
            }

            // We only wanted to know whether retrieval works, so discard the content right away

            content.close();

            return true;
        } catch (IOException e) {
            logger.error("Could not retrieve testing content for feed with URI " + uri, e);

            return false;
        }
    }

    /**
     * Test a given feed modifier chain by its (feed) modifier ID.
     * <p>
     * The content behind the URI is retrieved and passed through every filter and script of the modifier, in that
     * order. For each step the content before and after is compared, and the outcome is recorded under the key
     * {@code Filter_<id>} or {@code Script_<id>}.
     * 
     * @param id the ID of the feed modifier to test
     * @param uri the feed URI to test against; must match the modifier's URL expression
     * @param encodingOverride character encoding to force when reading, or {@code null} to have it detected
     * @param userAgent the user agent to identify with, or {@code null}
     * @throws Exception when the modifier is unknown, the URI is not covered by the modifier expression, no content
     *         could be retrieved, or a filter / script fails
     * @return Map<String, Boolean> per filter / script whether it changed the content
     */
    public Map<String, Boolean> testModifier(Integer id, URI uri, String encodingOverride, String userAgent)
            throws Exception {
        WebResolverSettings settings = new WebResolverSettings();
        settings.setMaximumContentLength(maximumContentLength);
        settings.setUserAgent(userAgent);

        Map<String, Boolean> result = new HashMap<String, Boolean>();

        logger.info("Testing feed modifier with ID " + id + " and URI " + uri);

        Reader reader = null;

        try {
            Modifier modifier = modifierCache.getFeedModifierById(id);

            // NOTE(review): assumes the cache may return null for an unknown ID - report it instead of failing with an NPE

            if (modifier == null)
                throw new Exception("No feed modifier could be found for ID " + id);

            if (!Pattern.matches(modifier.getUrlExpression(), uri.toString()))
                throw new Exception("The given testing feed URI is not covered by the modifier expression");

            reader = getContent(uri, encodingOverride, userAgent, null);

            if (reader == null)
                throw new Exception("Could not retrieve the result feed content");

            // Run it through the modifier. The reader is deliberately NOT closed between steps: the output of one
            // step is the input of the next, and a closed reader can no longer be read.

            for (ModifierFilter filter : modifier.getFilters()) {
                StringBuffer current = new StringBuffer(), updated = new StringBuffer();

                reader = readerToBuffer(current, reader, false);
                reader = modifierFilterProcessor.process(filter, reader);
                reader = readerToBuffer(updated, reader, false);

                result.put("Filter_" + filter.getId(), !current.toString().equals(updated.toString()));
            }

            for (ModifierScript script : modifier.getScripts()) {
                StringBuffer current = new StringBuffer(), updated = new StringBuffer();

                reader = readerToBuffer(current, reader, false);
                reader = modifierScriptProcessor.process(script, extractAndClose(reader), uri, settings, false);
                reader = readerToBuffer(updated, reader, false);

                result.put("Script_" + script.getId(), !current.toString().equals(updated.toString()));
            }
        } catch (ScriptException e) {
            throw new Exception("Could not test the given feed: " + e.getMessage(), e);
        } catch (IOException e) {
            throw new Exception("Could not test the given feed: " + e.getMessage(), e);
        } finally {
            // Release whichever reader is current, on success and on failure alike (null-safe)

            IOUtils.closeQuietly(reader);
        }

        return result;
    }

    /**
     * Retrieve a feed from the URL modified by the relevant modifiers.
     * <p>
     * HTTP(S) feeds without any matching modifier are retrieved through the feed fetcher; everything else is
     * retrieved as raw content, run through the matching modifiers and then parsed into a feed.
     * 
     * @param uri the feed location
     * @param encodingOverride character encoding to force when reading raw content, or {@code null}
     * @param userAgent the user agent to identify with, or {@code null}
     * @param resultParameters result parameters, passed on to the resolver settings handed to modifier scripts
     * @param resultHeaders headers to send along with the request, also passed on to the resolver settings
     * @param suppressErrors when {@code true}, retrieval failures are not logged; {@code null} is treated as
     *        {@code false}
     * @return SyndFeed the resulting feed, or {@code null} when it could not be retrieved or parsed
     */
    public SyndFeed getFeed(URI uri, String encodingOverride, String userAgent,
            Map<String, String> resultParameters, Map<String, String> resultHeaders, Boolean suppressErrors) {
        // Resolve the flag once up front; unboxing a null Boolean inside a catch block would raise an NPE from
        // within the error handler itself

        final boolean reportErrors = !Boolean.TRUE.equals(suppressErrors);

        WebResolverSettings settings = new WebResolverSettings();

        settings.setMaximumContentLength(maximumContentLength);
        settings.setUserAgent(userAgent);
        settings.setResultParameters(resultParameters);
        settings.setResultHeaders(resultHeaders);

        try {
            SyndFeed resultFeed = null;

            // We can only retrieve unmodified feeds using conditional gets

            List<Modifier> modifiers = modifierCache.getFeedModifiersByUrlMatch(uri.toString());

            // Constant-first comparison, as the scheme is null for relative URIs

            final String scheme = uri.getScheme();
            final boolean httpScheme = "http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme);

            if (modifiers.isEmpty() && httpScheme) {
                if (feedFetcher instanceof HttpClientFeedFetcher)
                    resultFeed = ((HttpClientFeedFetcher) feedFetcher).retrieveFeed(userAgent, uri.toURL(),
                            resultHeaders);
                else
                    resultFeed = feedFetcher.retrieveFeed(userAgent, uri.toURL());
                WebFeeds.validate(resultFeed, uri);
            } else {
                Reader reader = getContent(uri, encodingOverride, userAgent, resultHeaders);

                if (reader != null) {
                    try {
                        // Run it through the modifiers

                        reader = executeModifiers(modifiers, reader, uri, settings);

                        // Fill in the result feed

                        SyndFeedInput feedInput = new SyndFeedInput();

                        resultFeed = feedInput.build(reader);
                    } finally {
                        reader.close();
                    }
                } else {
                    logger.error("No content could be retrieved from the given URL. Skipping feed.");

                    return null;
                }
            }

            return resultFeed;
        } catch (FetcherException e) {
            // Fetcher failures are logged without a stack trace

            if (reportErrors)
                logger.error("Could not retrieve the given feed (" + uri + "): " + e.getMessage());
        } catch (IllegalArgumentException e) {
            if (reportErrors)
                logger.error("Could not retrieve the given feed (" + uri + "): " + e.getMessage(), e);
        } catch (FeedException e) {
            if (reportErrors)
                logger.error("Could not retrieve the given feed (" + uri + "): " + e.getMessage(), e);
        } catch (ScriptException e) {
            if (reportErrors)
                logger.error("Could not retrieve the given feed (" + uri + "): " + e.getMessage(), e);
        } catch (IOException e) {
            if (reportErrors)
                logger.error("Could not retrieve the given feed (" + uri + "): " + e.getMessage(), e);
        }

        return null;
    }

    /**
     * Retrieve the content of a result feed URL.
     * <p>
     * Supports the {@code ftp}, {@code ftps}, {@code file}, {@code http} and {@code https} schemes. The media type of
     * the retrieved content is detected first: GZIP content is decompressed and ZIP content (which must hold exactly
     * one file) is unpacked before the content is buffered into the resulting reader.
     * 
     * @param uri the location to retrieve the content from
     * @param encodingOverride character encoding to force when reading, or {@code null} to have it detected
     * @param userAgent the user agent to send along with HTTP(S) requests, or {@code null}
     * @param resultHeaders additional headers to send along with HTTP(S) requests, or {@code null}
     * @return Reader a reader over the buffered content, or {@code null} when it could not be retrieved
     */
    private Reader getContent(final URI uri, final String encodingOverride, final String userAgent,
            final Map<String, String> resultHeaders) {
        Reader result = null;
        String contentType = null;

        // The scheme is null for relative URIs; the constant-first comparisons below then end up in the
        // unknown-protocol branch instead of raising an NPE

        final String scheme = uri.getScheme();

        // Retrieve the feed

        try {
            InputStream inputStream = null;

            if ("ftp".equalsIgnoreCase(scheme) || "ftps".equalsIgnoreCase(scheme)) {
                inputStream = retrieveFtpContent(uri);

                // The failure has already been logged by the helper

                if (inputStream == null)
                    return null;
            } else if ("file".equalsIgnoreCase(scheme)) {
                File file = new File(uri);

                if (!file.isDirectory())
                    inputStream = new FileInputStream(file);
                else
                    inputStream = RSSDirectoryBuilder.build(file);
            } else if ("http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme)) {
                try {
                    HttpGet method = new HttpGet(uri.toString());

                    if (resultHeaders != null)
                        for (Entry<String, String> resultHeader : resultHeaders.entrySet())
                            method.setHeader(new BasicHeader(resultHeader.getKey(), resultHeader.getValue()));

                    // CoreProtocolPNames.USER_AGENT is the name of a client parameter ("http.useragent"), not of a
                    // header - the actual header has to be used for the user agent to reach the server

                    if (userAgent != null)
                        method.setHeader("User-Agent", userAgent);

                    SizeRestrictedHttpResponse response = httpClient.execute(method,
                            new SizeRestrictedResponseHandler(maximumContentLength, uri));

                    try {
                        if (response == null)
                            return null;

                        inputStream = new ByteArrayInputStream(response.getResponse());
                        contentType = response.getContentType() != null ? response.getContentType().getValue()
                                : null;
                    } catch (RuntimeException e) {
                        method.abort();

                        throw e;
                    }
                } catch (IllegalArgumentException e) {
                    logger.error("Invalid URL " + uri.toString() + " - not returning content", e);

                    return null;
                }
            } else {
                logger.error("Unknown protocol " + scheme + ". Skipping feed.");

                return null;
            }

            if (inputStream == null) {
                logger.error("No content stream could be opened for the given URI. Skipping feed.");

                return null;
            }

            // Detect the media type, guess the character encoding using ROME's reader, then buffer the content so
            // we can discard the input stream (and close the connection) - the stream is closed on every path

            try {
                InputStream readerInputStream = new BufferedInputStream(inputStream);
                MediaType mediaType = autoDetectParser.getDetector().detect(readerInputStream, new Metadata());

                try {
                    Reader reader = null;

                    if ("application".equals(mediaType.getType())) {
                        String subtype = mediaType.getSubtype();

                        // Depending on the Tika version, GZIP content is reported as either x-gzip or gzip

                        if ("x-gzip".equals(subtype) || "gzip".equals(subtype)) {
                            GZIPInputStream gzipInputStream = new GZIPInputStream(readerInputStream);

                            try {
                                if (encodingOverride != null)
                                    reader = readerToBuffer(new StringBuffer(),
                                            new InputStreamReader(gzipInputStream, encodingOverride), false);
                                else
                                    reader = readerToBuffer(new StringBuffer(),
                                            contentType != null
                                                    ? new XmlHtmlReader(gzipInputStream, contentType, true)
                                                    : new XmlReader(gzipInputStream, true),
                                            false);
                            } finally {
                                gzipInputStream.close();
                            }
                        } else if ("zip".equals(subtype)) {
                            result = readZipContent(readerInputStream, encodingOverride, contentType);

                            // The failure has already been logged by the helper

                            if (result == null)
                                return null;
                        }
                    }

                    if (result == null) {
                        if (encodingOverride != null)
                            result = readerToBuffer(new StringBuffer(), reader != null ? reader
                                    : new InputStreamReader(readerInputStream, encodingOverride), false);
                        else
                            result = readerToBuffer(new StringBuffer(),
                                    reader != null ? reader
                                            : contentType != null
                                                    ? new XmlHtmlReader(readerInputStream, contentType, true)
                                                    : new XmlReader(readerInputStream, true),
                                    false);
                    }
                } catch (Exception e) {
                    logger.error("An error occurred during stream processing", e);

                    return null;
                }
            } finally {
                inputStream.close();
            }
        } catch (IOException e) {
            logger.error("Could not retrieve the given feed: " + e.getMessage(), e);

            return null;
        }

        return result;
    }

    /**
     * Download the file behind an FTP(S) URI into memory, so the connection can be closed before it is processed.
     * <p>
     * Credentials are taken from the URI's user-info part ({@code user} or {@code user:password}); without any, an
     * anonymous login is attempted. The content is not size-restricted.
     * 
     * @param uri the FTP(S) location of the file
     * @return InputStream a stream over the downloaded bytes, or {@code null} (after logging) when the download failed
     * @throws IOException when communicating with the server fails
     */
    private InputStream retrieveFtpContent(final URI uri) throws IOException {
        FTPClient ftpClient = "ftps".equalsIgnoreCase(uri.getScheme()) ? new FTPSClient() : new FTPClient();

        try {
            ftpClient.connect(uri.getHost(), uri.getPort() != -1 ? uri.getPort() : 21);

            String username = "anonymous", password = "";

            if (StringUtils.hasText(uri.getUserInfo())) {
                String userInfo = uri.getUserInfo();
                int separator = userInfo.indexOf(":");

                username = separator != -1 ? userInfo.substring(0, separator) : userInfo;
                password = separator != -1 ? userInfo.substring(separator + 1) : "";
            }

            // Only the host is logged, as the full URI may carry a password

            if (!ftpClient.login(username, password)) {
                logger.error("Could not log in to FTP server " + uri.getHost() + " - not returning content");

                return null;
            }

            // The content may be compressed, so it must not be subjected to ASCII line-ending translation

            ftpClient.setFileType(FTPClient.BINARY_FILE_TYPE);

            InputStream ftpStream = ftpClient.retrieveFileStream(uri.getPath());

            if (ftpStream == null) {
                logger.error("Could not retrieve " + uri.getPath() + " from FTP server " + uri.getHost() + ": "
                        + ftpClient.getReplyString());

                return null;
            }

            // The stream is only usable while connected, so read it fully before the client is disconnected

            byte[] content;

            try {
                content = IOUtils.toByteArray(ftpStream);
            } finally {
                ftpStream.close();
            }

            if (!ftpClient.completePendingCommand()) {
                logger.error("The transfer of " + uri.getPath() + " from FTP server " + uri.getHost()
                        + " did not complete successfully");

                return null;
            }

            return new ByteArrayInputStream(content);
        } finally {
            if (ftpClient.isConnected())
                ftpClient.disconnect();
        }
    }

    /**
     * Unpack ZIP content holding exactly one file and buffer that file's content, stripping the prolog.
     * 
     * @param zipContent the raw ZIP content; closed once it has been copied to disk
     * @param encodingOverride character encoding to force when reading, or {@code null} to have it detected
     * @param contentType the content type reported for the content, or {@code null} when unknown
     * @return Reader a reader over the buffered entry content, or {@code null} (after logging) on failure
     * @throws IOException when the temporary file cannot be created or the ZIP file cannot be closed
     */
    private Reader readZipContent(final InputStream zipContent, final String encodingOverride,
            final String contentType) throws IOException {
        ZipFile zipFile = null;

        // ZipInputStream can't do read-aheads, so we have to use a temporary on-disk file instead

        File temporaryFile = File.createTempFile("profiler-", ".zip");

        try {
            FileOutputStream zipOutputStream = new FileOutputStream(temporaryFile);

            try {
                IOUtils.copy(zipContent, zipOutputStream);

                zipOutputStream.flush();
            } finally {
                zipOutputStream.close();
            }

            zipContent.close();

            // Create a new entry and process it

            zipFile = new ZipFile(temporaryFile);
            Enumeration<? extends ZipEntry> zipEnumeration = zipFile.entries();

            // An empty archive has no first element to take

            ZipEntry zipEntry = zipEnumeration.hasMoreElements() ? zipEnumeration.nextElement() : null;

            if (zipEntry == null || zipEntry.isDirectory() || zipEnumeration.hasMoreElements()) {
                logger.error(
                        "ZIP files are currently expected to contain one and only one entry, which is to be a file");

                return null;
            }

            // We currently only perform prolog stripping for ZIP files

            InputStream zipInputStream = new BufferedInputStream(zipFile.getInputStream(zipEntry));

            if (encodingOverride != null)
                return readerToBuffer(new StringBuffer(), new InputStreamReader(zipInputStream, encodingOverride),
                        true);

            return readerToBuffer(new StringBuffer(),
                    contentType != null ? new XmlHtmlReader(zipInputStream, contentType, true)
                            : new XmlReader(zipInputStream, true),
                    true);
        } catch (Exception e) {
            logger.error("An error occurred during ZIP file processing", e);

            return null;
        } finally {
            if (zipFile != null)
                zipFile.close();

            if (!temporaryFile.delete())
                logger.error("Unable to delete temporary file");
        }
    }

    /**
     * Run the content through every given modifier in order: per modifier first all of its filters, then all of
     * its scripts.
     * 
     * @param modifiers the modifiers to apply
     * @param reader the reader holding the content to modify
     * @param uri the URI the content was retrieved from, handed to the scripts
     * @param settings the resolver settings, handed to the scripts
     * @return Reader the reader holding the content after the last step (the given reader when nothing applied)
     * @throws IOException
     * @throws ScriptException
     */
    private Reader executeModifiers(List<Modifier> modifiers, Reader reader, URI uri, WebResolverSettings settings)
            throws IOException, ScriptException {
        Reader current = reader;

        for (Modifier modifier : modifiers) {
            for (ModifierFilter filter : modifier.getFilters()) {
                current = modifierFilterProcessor.process(filter, current);
            }

            for (ModifierScript script : modifier.getScripts()) {
                // Scripts work on the full content as a String, which consumes and closes the current reader

                final String content = extractAndClose(current);

                current = modifierScriptProcessor.process(script, content, uri, settings, true);
            }
        }

        return current;
    }

    /**
     * Drain the given reader into a String, closing it afterwards regardless of the outcome.
     *
     * @param reader the reader to consume
     * @return String everything that was read from the reader
     * @throws IOException when reading from the reader fails
     */
    private String extractAndClose(final Reader reader) throws IOException {
        final String content;

        try {
            content = IOUtils.toString(reader);
        } finally {
            // Failures while closing are deliberately ignored

            IOUtils.closeQuietly(reader);
        }

        return content;
    }
}