feedzilla.Feed.java Source code

Java tutorial

Introduction

Here is the source code for feedzilla.Feed.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package feedzilla;

import Crawler.NewsCrawler;
import Log.Log;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.Random;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 *
 * @author felipelageduarte
 */
public class Feed implements Runnable {

    private final int category;
    private final int subcategory;
    private int id;
    private String title;
    private String summary;
    private String published;
    private String updated;
    private String author;
    private String link;
    private String copyright;
    private String source_title;
    private String source_link;
    private String news;
    private File newsXMLFile;

    public Feed(int category, int subcategory, Element entry) throws Exception {
        this.category = category;
        this.subcategory = subcategory;
        parser(entry);
        if (verifyIfNewsAlredyExists()) {
            throw new Exception("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName()
                    + " - Already Exists");
        }
    }

    private boolean verifyIfNewsAlredyExists() {
        newsXMLFile = new File("./data/" + category + "/" + subcategory + "/" + this.id + ".xml");

        if (newsXMLFile.exists()) {
            return true;
        } else {
            return false;
        }
    }

    private void parser(Element entry) {
        boolean source = false;
        for (Element element : entry.children()) {
            switch (element.nodeName()) {
            case "id":
                this.id = Integer.parseInt(element.text().split(":")[1]);
                break;
            case "title":
                if (source) {
                    this.source_title = element.text();
                } else {
                    this.title = element.text();
                }
                break;
            case "summary":
                this.summary = element.text().split("<br")[0];
                break;
            case "published":
                this.published = element.text();
                break;
            case "updated":
                this.updated = element.text();
                break;
            case "author":
                this.author = element.text();
                break;
            case "link":
                if (source) {
                    this.source_link = element.attr("href");
                } else {
                    this.link = element.attr("href");
                }
                break;
            case "rights":
                this.copyright = element.text();
                break;
            case "source":
                source = true;
                break;
            default:
                Log.debug("Unknow TAG: " + element.nodeName());
                break;
            }
        }
    }

    @Override
    public String toString() {
        return "Feed{" + "category=" + category + ", subcategory=" + subcategory + ", id=" + id + ", title=" + title
                + ", summary=" + summary + ", published=" + published + ", updated=" + updated + ", author="
                + author + ", link=" + link + ", copyright=" + copyright + ", source_title=" + source_title
                + ", source_link=" + source_link + ", news=" + news + '}';
    }

    public String toXML() {
        String xml = "<data>\n";
        xml += "\t<category>\n";
        xml += "\t\t<id>\n\t\t\t" + category + "\n\t\t</id>\n";
        xml += "\t\t<name>\n\t\t\t" + Category.map.get(category) + "\n\t\t</name>\n";
        xml += "\t</category>\n";
        xml += "\t<subcategory>\n";
        xml += "\t\t<id>\n\t\t\t" + subcategory + "\n\t\t</id>\n";
        xml += "\t\t<name>\n\t\t\t" + SubCategory.map.get(subcategory) + "\n\t\t</name>\n";
        xml += "\t</subcategory>\n";
        xml += "\t<id>\n\t\t" + id + "\n\t</id>\n";
        xml += "\t<title>\n\t\t" + title + "\n\t</title>\n";
        xml += "\t<summary>\n\t\t" + summary + "\n\t</summary>\n";
        xml += "\t<published>\n\t\t" + published + "\n\t</published>\n";
        xml += "\t<updated>\n\t\t" + updated + "\n\t</updated>\n";
        xml += "\t<author>\n\t\t" + author + "\n\t</author>\n";
        xml += "\t<link>\n\t\t" + link + "\n\t</link>\n";
        xml += "\t<copyright>\n\t\t" + copyright + "\n\t</copyright>\n";
        xml += "\t<source>\n";
        xml += "\t\t<title>\n\t\t\t" + source_title + "\n\t\t</title>\n";
        xml += "\t\t<link>\n\t\t\t" + source_link + "\n\t\t</link>\n";
        xml += "\t</source>\n";
        xml += "\t<news>\n\t\t" + news + "\n\t</news>\n";
        xml += "</data>\n";
        return xml;
    }

    public String getUrlInParams(String url) {
        String urlAnswer = url;
        String[] urlParts = url.split("\\?");
        if (urlParts.length > 1) {
            String[] params = urlParts[1].split("&");
            for (String param : params) {
                if (param.split("=")[0].equals("url")) {
                    String[] values = param.split("=");
                    for (int i = 1; i < values.length; ++i) {
                        urlAnswer += values[i];
                    }

                }
            }
        }
        return urlAnswer;
    }

    @Override
    public void run() {
        try {
            Thread.sleep((new Random()).nextInt(60 * 1000));
        } catch (InterruptedException ex) {
            Log.warn("Could not sleep Thread", ex);
        }

        Document doc = null;
        boolean get = true;
        int trysCount = 0;
        do {
            get = true;
            try {
                doc = Jsoup.connect(this.link).timeout(60 * 1000).userAgent(
                        "Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6")
                        .referrer("http://www.google.com").get();
            } catch (IOException ex) {
                Logger.getLogger(Feed.class.getName()).log(Level.SEVERE, null, ex);
                Log.warn("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName()
                        + " - Could not get Feed page from FeedZilla", ex);
                get = false;
                if (++trysCount > 5) {
                    Log.fatal("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName() + " - "
                            + "Five attempts and has not yet been possible to "
                            + "retrieve the page from filezilla. Ignoring this news.");
                    return;
                }
            }
        } while (!get);

        Elements elements = doc.body().select("iframe");
        for (Element element : elements) {
            try {
                this.link = URLDecoder.decode(element.attr("src"), "UTF-8");
            } catch (UnsupportedEncodingException ex) {
                Logger.getLogger(Feed.class.getName()).log(Level.SEVERE, null, ex);
                Log.fatal("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName() + " - "
                        + "Could not get the news link from FeedZilla pages");
                return;
            }
        }
        this.link = getUrlInParams(this.link);
        try {
            this.news = (new NewsCrawler(this.link)).getNews();
        } catch (Exception ex) {
            Log.fatal("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName() + " - "
                    + "Could not retrieve news from link " + this.link, ex);
            return;
        }

        newsXMLFile.getParentFile().mkdirs();
        try {
            FileUtils.writeStringToFile(newsXMLFile, this.toXML());
            Log.info("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName()
                    + " - Successfuly saved!");
            System.out.println("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName()
                    + " - Successfuly saved!");
        } catch (IOException ex) {
            Log.error("News " + this.category + "/" + this.subcategory + "/" + newsXMLFile.getName()
                    + " - Could not save news into file", ex);
        }
    }

}