// Java tutorial
/** * Copyright (C) 2013 Seajas, the Netherlands. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License version 3, as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package com.seajas.search.contender.jms.processor; import com.seajas.search.bridge.jms.model.CompositeEntry; import com.seajas.search.bridge.jms.model.Feed; import com.seajas.search.bridge.jms.model.SourceElement; import com.seajas.search.bridge.jms.model.state.CompositeState; import com.seajas.search.contender.jms.service.InjectionService; import com.seajas.search.contender.service.cache.CacheService; import com.seajas.search.contender.service.modifier.FeedModifierService; import com.seajas.search.contender.service.storage.StorageService; import com.seajas.search.utilities.web.WebFeeds; import com.sun.syndication.feed.synd.SyndEntry; import com.sun.syndication.feed.synd.SyndEntryImpl; import com.sun.syndication.feed.synd.SyndFeed; import com.sun.syndication.io.FeedException; import java.net.URI; import java.util.ArrayList; import java.util.List; import java.util.Random; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.bson.types.ObjectId; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service; import org.springframework.util.StringUtils; /** * Feed processor. * * @author Jasper van Veghel <jasper@seajas.com> * @author Pascal S. de Kloe <pascal@quies.net> */ @Service public class FeedProcessor { /** * The logger. 
*/ private static final Logger logger = LoggerFactory.getLogger(FeedProcessor.class); /** * The random minimum. */ private static final Integer RANDOM_RANGE_MINIMUM = 0; /** * The cache service */ @Autowired private CacheService cacheService; /** * The injection service. */ @Autowired private InjectionService injectionService; /** * The feed modifier service. */ @Autowired private FeedModifierService feedModifierService; /** * The storage service. */ @Autowired private StorageService storageService; /** * The default user agent. */ @Value("${contender.project.http.user.agent}") private String defaultUserAgent; /** * The random number generator for random delays and user agents. */ private final Random randomGenerator = new Random(); /** * Process the given feed. * * @param feed */ public void process(final Feed feed) { if (logger.isInfoEnabled()) logger.info(String.format("Retrieving feed '%s' with URL '%s'", feed.getName(), feed.getUri())); SyndFeed resultFeed = feedModifierService.getFeed(feed.getUri(), feed.getFeedEncodingOverride(), feed.getUserAgent(), feed.getResultParameters(), feed.getRetrievalRequestHeaders(), false); if (resultFeed == null) { logger.error(String.format("No feed created from feed '%s' with URL '%s' - discarding", feed.getName(), feed.getUri())); return; } List<SyndEntry> entries = resultFeed.getEntries(); if (logger.isInfoEnabled()) logger.info(String.format("Feed with name '%s' produced %d entries", feed.getName(), entries.size())); // Add, update or ignore should it already have been processed List<SourceElement> elements = new ArrayList<SourceElement>(); for (SyndEntry resultEntry : entries) { try { WebFeeds.validateEntry(resultEntry, resultFeed); // Make sure to strip out any unserializable components resultEntry.setForeignMarkup(null); ((SyndEntryImpl) resultEntry).setWireEntry(null); // Create a proper URI and use it for both the source element and the cache key generation to avoid any cache misses URI entryLink = 
URI.create(feedModifierService.getEntryLink(resultEntry)); String cacheKey = cacheService.createCompositeKey(entryLink.toString(), feed.getResultParameters()); if (!cacheService.isCached(cacheKey)) { SourceElement element = new SourceElement(); element.setUri(entryLink); element.setHostname( StringUtils.hasText(entryLink.getHost()) ? entryLink.getHost().replace("www.", "") : "localhost"); element.setEntry(resultEntry); element.setUserAgent(determineUserAgent(feed)); elements.add(element); } else { if (logger.isDebugEnabled()) logger.debug(String.format("Not injecting feed with entry link '%s' - already in cache", resultEntry.getLink())); } } catch (FeedException e) { logger.warn(String.format("Skipping entry in feed %s at %s", feed.getId(), feed.getUri()), e); } } if (logger.isInfoEnabled()) logger.info(String.format( "Finished retrieving feed '%s' with URL '%s' (and hostname %s) - injecting the %d result(s)", feed.getName(), feed.getUri(), feed.getHostname() != null ? "'" + feed.getHostname() + "'" : "(null)", elements.size())); // Don't consider injecting locally or remotely - everything goes over the queue - it's easier that way for (SourceElement element : elements) { Long elementDelay = feed.getElementDelay(); if (feed.isElementDelayRandomized() && elementDelay > 0) elementDelay = (long) randomGenerator.nextInt(elementDelay.intValue() - RANDOM_RANGE_MINIMUM + 1) + RANDOM_RANGE_MINIMUM; if (logger.isInfoEnabled()) logger.info(String.format("Injecting feed element with hostname %s%s - URI is %s", element.getHostname() != null ? "'" + element.getHostname() + "'" : "(null)", elementDelay != null ? " and delay " + elementDelay : "", element.getUri())); // This element might already exist in the storage back-end (cache-sync issues, etc.) 
// String compositeUrl = cacheService.createCompositeKey(element.getUri().toString(), feed.getResultParameters()); // CompositeEntry existingEntry = storageService.retrieveEntryByCompositeUrl(compositeUrl); // if (existingEntry != null) // logger.warn("The storage back-end already contains a fully processed element with composite (enricher) ID '" + compositeUrl + "' - probably due to cache sync - will reuse"); CompositeEntry existingEntry = null; // Create and inject a new CompositeEntry CompositeEntry entry = existingEntry != null ? existingEntry : new CompositeEntry(); if (existingEntry == null) entry.setId(ObjectId.get()); entry.setSource(feed); entry.setElement(element); entry.setCurrentState(CompositeState.SourceElement); storageService.saveEntry(entry); injectionService.injectElement(entry.getId(), element.getHostname(), elementDelay); } if (logger.isInfoEnabled()) logger.info(String.format("Finished injecting results for feed '%s' with URL '%s'", feed.getName(), feed.getUri())); } /** * Retrieve a user agent override from the given feed, or use the default. * * @param feed * @return String */ private String determineUserAgent(final Feed feed) { String userAgent = feed.getUserAgentsOverride() != null && feed.getUserAgentsOverride().size() > 0 ? feed.getUserAgentsOverride().get(0) : defaultUserAgent; if (feed.getUserAgentsOverride() != null && feed.getUserAgentsOverride().size() > 1 && feed.isUserAgentsOverrideRandomized()) userAgent = feed.getUserAgentsOverride().get( randomGenerator.nextInt((feed.getUserAgentsOverride().size() - 1) - RANDOM_RANGE_MINIMUM + 1) + RANDOM_RANGE_MINIMUM); return userAgent; } }