com.seajas.search.contender.jms.processor.FeedProcessor.java Source code

Java tutorial

Introduction

Here is the source code for com.seajas.search.contender.jms.processor.FeedProcessor.java

Source

/**
 * Copyright (C) 2013 Seajas, the Netherlands.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.seajas.search.contender.jms.processor;

import com.seajas.search.bridge.jms.model.CompositeEntry;
import com.seajas.search.bridge.jms.model.Feed;
import com.seajas.search.bridge.jms.model.SourceElement;
import com.seajas.search.bridge.jms.model.state.CompositeState;
import com.seajas.search.contender.jms.service.InjectionService;
import com.seajas.search.contender.service.cache.CacheService;
import com.seajas.search.contender.service.modifier.FeedModifierService;
import com.seajas.search.contender.service.storage.StorageService;
import com.seajas.search.utilities.web.WebFeeds;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndEntryImpl;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.FeedException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.bson.types.ObjectId;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;

/**
 * Feed processor.
 * <p>
 * Retrieves a feed through the {@link FeedModifierService}, turns every entry that is not yet known to the
 * {@link CacheService} into a {@link SourceElement}, and stores and injects a {@link CompositeEntry} for each of them.
 * 
 * @author Jasper van Veghel <jasper@seajas.com>
 * @author Pascal S. de Kloe <pascal@quies.net>
 */
@Service
public class FeedProcessor {

    /**
     * The logger.
     */
    private static final Logger logger = LoggerFactory.getLogger(FeedProcessor.class);

    /**
     * The random minimum (inclusive lower bound of a randomized element delay).
     */
    private static final int RANDOM_RANGE_MINIMUM = 0;

    /**
     * The cache service
     */
    @Autowired
    private CacheService cacheService;

    /**
     * The injection service.
     */
    @Autowired
    private InjectionService injectionService;

    /**
     * The feed modifier service.
     */
    @Autowired
    private FeedModifierService feedModifierService;

    /**
     * The storage service.
     */
    @Autowired
    private StorageService storageService;

    /**
     * The default user agent.
     */
    @Value("${contender.project.http.user.agent}")
    private String defaultUserAgent;

    /**
     * The random number generator for random delays and user agents.
     */
    private final Random randomGenerator = new Random();

    /**
     * Process the given feed.
     * <p>
     * The feed is retrieved, its entries are validated and checked against the cache, and every entry that is not
     * cached yet is saved to the storage service and injected. Entries that fail validation or carry an unusable link
     * are skipped with a warning; they do not abort the processing of the remaining entries.
     * 
     * @param feed the feed to retrieve and process
     */
    public void process(final Feed feed) {
        if (logger.isInfoEnabled()) {
            logger.info(String.format("Retrieving feed '%s' with URL '%s'", feed.getName(), feed.getUri()));
        }

        SyndFeed resultFeed = feedModifierService.getFeed(feed.getUri(), feed.getFeedEncodingOverride(),
                feed.getUserAgent(), feed.getResultParameters(), feed.getRetrievalRequestHeaders(), false);

        if (resultFeed == null) {
            logger.error(String.format("No feed created from feed '%s' with URL '%s' - discarding", feed.getName(),
                    feed.getUri()));

            return;
        }

        List<SyndEntry> entries = resultFeed.getEntries();

        if (logger.isInfoEnabled()) {
            logger.info(String.format("Feed with name '%s' produced %d entries", feed.getName(), entries.size()));
        }

        // Add, update or ignore should it already have been processed

        List<SourceElement> elements = collectNewElements(feed, resultFeed, entries);

        if (logger.isInfoEnabled()) {
            logger.info(String.format(
                    "Finished retrieving feed '%s' with URL '%s' (and hostname %s) - injecting the %d result(s)",
                    feed.getName(), feed.getUri(),
                    feed.getHostname() != null ? "'" + feed.getHostname() + "'" : "(null)", elements.size()));
        }

        // Don't consider injecting locally or remotely - everything goes over the queue - it's easier that way

        for (SourceElement element : elements) {
            // The delay is determined per element, so randomized feeds get a different delay for every element
            Long elementDelay = determineElementDelay(feed);

            if (logger.isInfoEnabled()) {
                logger.info(String.format("Injecting feed element with hostname %s%s - URI is %s",
                        element.getHostname() != null ? "'" + element.getHostname() + "'" : "(null)",
                        elementDelay != null ? " and delay " + elementDelay : "", element.getUri()));
            }

            storeAndInject(feed, element, elementDelay);
        }

        if (logger.isInfoEnabled()) {
            logger.info(String.format("Finished injecting results for feed '%s' with URL '%s'", feed.getName(),
                    feed.getUri()));
        }
    }

    /**
     * Build a source element for every entry that validates, has a usable link and is not in the cache yet.
     * 
     * @param feed the feed the entries belong to
     * @param resultFeed the retrieved feed, used for entry validation
     * @param entries the entries of the retrieved feed
     * @return the elements to inject, in feed order; never {@code null}
     */
    private List<SourceElement> collectNewElements(final Feed feed, final SyndFeed resultFeed,
            final List<SyndEntry> entries) {
        List<SourceElement> elements = new ArrayList<SourceElement>();

        for (SyndEntry resultEntry : entries) {
            try {
                WebFeeds.validateEntry(resultEntry, resultFeed);

                // Make sure to strip out any unserializable components

                resultEntry.setForeignMarkup(null);

                // Only reset the wire entry on the implementation that is cast to here; other SyndEntry
                // implementations are left untouched instead of failing with a ClassCastException
                if (resultEntry instanceof SyndEntryImpl) {
                    ((SyndEntryImpl) resultEntry).setWireEntry(null);
                }

                // Create a proper URI and use it for both the source element and the cache key generation to avoid any cache misses

                String link = feedModifierService.getEntryLink(resultEntry);

                if (!StringUtils.hasText(link)) {
                    logger.warn(String.format("Skipping entry in feed %s at %s - the entry has no link", feed.getId(),
                            feed.getUri()));

                    continue;
                }

                URI entryLink;

                try {
                    entryLink = URI.create(link);
                } catch (IllegalArgumentException e) {
                    // A single malformed link must not abort the processing of the whole feed
                    logger.warn(String.format("Skipping entry in feed %s at %s - invalid entry link '%s'",
                            feed.getId(), feed.getUri(), link), e);

                    continue;
                }

                String cacheKey = cacheService.createCompositeKey(entryLink.toString(), feed.getResultParameters());

                if (!cacheService.isCached(cacheKey)) {
                    SourceElement element = new SourceElement();

                    element.setUri(entryLink);
                    // NOTE(review): replace() strips every occurrence of "www.", not only a leading prefix - kept
                    // as-is because other components may derive hostnames the same way; confirm before changing
                    element.setHostname(
                            StringUtils.hasText(entryLink.getHost()) ? entryLink.getHost().replace("www.", "")
                                    : "localhost");
                    element.setEntry(resultEntry);
                    element.setUserAgent(determineUserAgent(feed));

                    elements.add(element);
                } else if (logger.isDebugEnabled()) {
                    logger.debug(String.format("Not injecting feed with entry link '%s' - already in cache",
                            resultEntry.getLink()));
                }
            } catch (FeedException e) {
                logger.warn(String.format("Skipping entry in feed %s at %s", feed.getId(), feed.getUri()), e);
            }
        }

        return elements;
    }

    /**
     * Determine the delay to use for a single element of the given feed.
     * <p>
     * When the feed asks for a randomized delay and has a positive delay configured, a value between
     * {@link #RANDOM_RANGE_MINIMUM} and the configured delay (both inclusive) is drawn. Otherwise the configured
     * delay is returned unchanged, which may be {@code null}.
     * 
     * @param feed the feed to take the delay settings from
     * @return the delay to inject the element with, or {@code null} if the feed has none configured
     */
    private Long determineElementDelay(final Feed feed) {
        Long elementDelay = feed.getElementDelay();

        // Check for null before comparing: unboxing a missing delay would throw a NullPointerException
        if (elementDelay == null || elementDelay.longValue() <= 0 || !feed.isElementDelayRandomized()) {
            return elementDelay;
        }

        // Random.nextInt() takes a positive int bound; clamp in long arithmetic so that very large delays can
        // neither be truncated by a narrowing cast nor overflow when the inclusive upper bound is added
        long upperBound = Math.min(elementDelay.longValue(), (long) Integer.MAX_VALUE - 1);
        int range = (int) (upperBound - RANDOM_RANGE_MINIMUM + 1);

        return Long.valueOf((long) randomGenerator.nextInt(range) + RANDOM_RANGE_MINIMUM);
    }

    /**
     * Create a new composite entry for the given element, save it and inject it.
     * <p>
     * The element might already exist in the storage back-end (cache-sync issues, etc.). A lookup of an existing
     * entry by its composite URL (via the storage service) used to be considered here but is currently disabled, so
     * a fresh entry with a newly generated ID is always created.
     * 
     * @param feed the feed the element originates from
     * @param element the element to store and inject
     * @param elementDelay the delay to pass on to the injection service, may be {@code null}
     */
    private void storeAndInject(final Feed feed, final SourceElement element, final Long elementDelay) {
        CompositeEntry entry = new CompositeEntry();

        entry.setId(ObjectId.get());
        entry.setSource(feed);
        entry.setElement(element);
        entry.setCurrentState(CompositeState.SourceElement);

        // Save first, then inject by ID
        storageService.saveEntry(entry);

        injectionService.injectElement(entry.getId(), element.getHostname(), elementDelay);
    }

    /**
     * Retrieve a user agent override from the given feed, or use the default.
     * <p>
     * Without overrides the default user agent is used. With overrides the first one is used, unless there is more
     * than one and the feed asks for randomization, in which case one of them is picked at random.
     * 
     * @param feed the feed to take the user agent overrides from
     * @return String
     */
    private String determineUserAgent(final Feed feed) {
        List<String> overrides = feed.getUserAgentsOverride();

        if (overrides == null || overrides.isEmpty()) {
            return defaultUserAgent;
        }

        if (overrides.size() > 1 && feed.isUserAgentsOverrideRandomized()) {
            return overrides.get(randomGenerator.nextInt(overrides.size()));
        }

        return overrides.get(0);
    }
}