com.orange.clara.tool.crawlers.rss.RssCrawler.java Source code

Java tutorial

Introduction

Here is the source code for com.orange.clara.tool.crawlers.rss.RssCrawler.java

Source

package com.orange.clara.tool.crawlers.rss;

import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.orange.clara.tool.crawlers.Crawler;
import com.orange.clara.tool.exceptions.CrawlerGetContentException;
import com.orange.clara.tool.model.ContentResource;
import com.orange.clara.tool.model.ResourceType;
import com.orange.clara.tool.model.WatchedResource;
import com.orange.clara.tool.service.RssService;
import com.rometools.rome.feed.synd.SyndContent;
import com.rometools.rome.feed.synd.SyndEntry;
import com.rometools.rome.feed.synd.SyndFeed;
import com.rometools.rome.io.FeedException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.net.URI;
import java.util.List;

/**
 * {@link Crawler} implementation for {@link ResourceType#RSS} resources: it loads the
 * feed of a {@link WatchedResource} through {@link RssService} and converts each feed
 * entry into a {@link ContentResource}.
 * <p>
 * Copyright (C) 2016 Orange
 * <p>
 * This software is distributed under the terms and conditions of the 'Apache-2.0'
 * license which can be found in the file 'LICENSE' in this package distribution
 * or at 'https://opensource.org/licenses/Apache-2.0'.
 * <p>
 * Author: Arthur Halet
 * Date: 25/06/2016
 */
@Component
public class RssCrawler implements Crawler {

    @Autowired
    protected RssService rssService;

    /**
     * Returns the feed entries of the watched resource that are not older than its
     * last recorded update.
     * <p>
     * When {@code watchedResource.getUpdatedResourceAt()} is {@code null} every entry
     * is returned. Entries for which no date could be determined are kept as well,
     * since they cannot be proven to be older than the last update.
     *
     * @param watchedResource the resource whose feed is fetched
     * @return the converted entries, in feed order
     * @throws CrawlerGetContentException if fetching or converting the feed fails for any reason
     */
    @Override
    public List<ContentResource> getLastContent(WatchedResource watchedResource) throws CrawlerGetContentException {
        try {
            SyndFeed feed = this.rssService.getFeed(watchedResource);
            List<ContentResource> contentResources = Lists.newArrayList();
            for (SyndEntry entry : feed.getEntries()) {
                // Convert each entry exactly once and reuse the result for filtering and output.
                ContentResource contentResource = this.extractContent(entry, feed);
                if (this.isOlderThanLastUpdate(contentResource, watchedResource)) {
                    continue;
                }
                contentResources.add(contentResource);
            }
            return contentResources;
        } catch (Exception e) {
            // Deliberately broad: any failure while crawling is reported as a crawler exception.
            throw new CrawlerGetContentException(e.getMessage(), e);
        }
    }

    /**
     * Returns the URL of the feed-level image of the watched resource.
     *
     * @param watchedResource the resource whose feed is fetched
     * @return the image URL, or {@code null} when the feed declares no image
     * @throws CrawlerGetContentException if the feed cannot be read or parsed
     */
    @Override
    public String getImage(WatchedResource watchedResource) throws CrawlerGetContentException {
        try {
            SyndFeed feed = this.rssService.getFeed(watchedResource);
            if (feed.getImage() == null) {
                // A feed without an image is valid; avoid an undeclared NullPointerException.
                return null;
            }
            return feed.getImage().getUrl();
        } catch (IOException | FeedException e) {
            throw new CrawlerGetContentException(e.getMessage(), e);
        }
    }

    /**
     * Builds a default title of the form {@code "Rss feed from <host><path>"} from the
     * link of the watched resource.
     *
     * @param watchedResource the resource whose link is used
     * @return the generated title
     * @throws CrawlerGetContentException declared by the interface; not thrown by this implementation
     */
    @Override
    public String generateTitle(WatchedResource watchedResource) throws CrawlerGetContentException {
        URI uri = URI.create(watchedResource.getLink());
        return "Rss feed from " + uri.getHost() + uri.getPath();
    }

    /**
     * @return {@link ResourceType#RSS}, the resource type handled by this crawler
     */
    @Override
    public ResourceType forResourceType() {
        return ResourceType.RSS;
    }

    /**
     * Tells whether a converted entry is dated strictly before the last recorded
     * update of the watched resource. Returns {@code false} when either date is missing.
     */
    private boolean isOlderThanLastUpdate(ContentResource contentResource, WatchedResource watchedResource) {
        return watchedResource.getUpdatedResourceAt() != null
                && contentResource.getDate() != null
                && contentResource.getDate().before(watchedResource.getUpdatedResourceAt());
    }

    /**
     * Converts a single feed entry into a {@link ContentResource}.
     *
     * @param entry the feed entry to convert
     * @param feed  the feed the entry belongs to; its title is used as author fallback
     * @return the populated content resource
     */
    private ContentResource extractContent(SyndEntry entry, SyndFeed feed) {
        ContentResource contentResource = new ContentResource();
        contentResource.setTitle(entry.getTitle());
        if (!entry.getEnclosures().isEmpty()) {
            // The first enclosure is used as the image of the entry.
            contentResource.setImage(entry.getEnclosures().get(0).getUrl());
        }
        contentResource.setDescription(this.extractDescription(entry));
        if (entry.getAuthor() == null || entry.getAuthor().isEmpty()) {
            contentResource.setAuthor(feed.getTitle());
        } else {
            contentResource.setAuthor(entry.getAuthor());
        }
        if (entry.getPublishedDate() != null) {
            contentResource.setDate(entry.getPublishedDate());
        } else {
            // Some entries only carry an updated date; use it rather than leaving the date empty.
            contentResource.setDate(entry.getUpdatedDate());
        }
        contentResource.setLink(entry.getLink());
        return contentResource;
    }

    /**
     * Returns the textual description of an entry: the values of its content elements
     * joined with new lines when it has any, otherwise the value of its description
     * element, or {@code null} when the entry has neither.
     */
    private String extractDescription(SyndEntry entry) {
        if (entry.getContents().isEmpty()) {
            return entry.getDescription() == null ? null : entry.getDescription().getValue();
        }
        // Join the text values: joining the SyndContent objects themselves would emit
        // their toString() representation instead of the content text.
        List<String> values = Lists.newArrayList();
        for (SyndContent content : entry.getContents()) {
            values.add(content.getValue());
        }
        return Joiner.on("\n").skipNulls().join(values);
    }
}