mk.finki.ranggo.aggregator.ContentsAggregatorImpl.java Source code

Introduction

Here is the source code for mk.finki.ranggo.aggregator.ContentsAggregatorImpl.java
Source

package mk.finki.ranggo.aggregator;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
//import java.io.StringWriter;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.TimeZone;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
//import javax.xml.transform.Transformer;
//import javax.xml.transform.TransformerException;
//import javax.xml.transform.TransformerFactory;
//import javax.xml.transform.dom.DOMSource;
//import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.lang3.StringEscapeUtils;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.springframework.http.MediaType;
import org.springframework.http.converter.json.*;
import org.springframework.web.client.RestTemplate;
import org.w3c.dom.Document;
//import org.w3c.dom.Node;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import com.alchemyapi.api.AlchemyAPI;
import com.alchemyapi.api.AlchemyAPI_CombinedParams;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;

import mk.finki.ranggo.aggregator.ContentsAggregatorException.AggregatorMethod;
import mk.finki.ranggo.aggregator.crawlers.impl.DnevnikCrawler;
import mk.finki.ranggo.aggregator.crawlers.impl.FokusCrawler;
import mk.finki.ranggo.aggregator.crawlers.impl.KurirCrawler;
import mk.finki.ranggo.aggregator.crawlers.impl.LibertasCrawler;
import mk.finki.ranggo.aggregator.crawlers.impl.NovaTVCrawler;
import mk.finki.ranggo.aggregator.crawlers.impl.RepublikaCrawler;
import mk.finki.ranggo.aggregator.crawlers.impl.TelmaCrawler;
import mk.finki.ranggo.aggregator.crawlers.impl.UtrinskiVesnikCrawler;
import mk.finki.ranggo.aggregator.crawlers.impl.VecherCrawler;
import mk.finki.ranggo.aggregator.crawlers.impl.VestCrawler;
import mk.finki.ranggo.aggregator.crawlers.impl.Vesti24Crawler;
import mk.finki.ranggo.aggregator.helper.NewYorkTimes.NYTimesResult;
import mk.finki.ranggo.aggregator.helper.NewYorkTimes.NYTimesWebUrl;
import mk.finki.ranggo.aggregator.helper.TheGuardian.TheGuardianResult;
import mk.finki.ranggo.aggregator.helper.TheGuardian.TheGuardianWebUrl;
import mk.finki.ranggo.aggregator.model.Concept;
import mk.finki.ranggo.aggregator.model.Content;
import mk.finki.ranggo.aggregator.model.Keyword;
import mk.finki.ranggo.aggregator.model.Person;
import mk.finki.ranggo.aggregator.model.PersonEntity;
import mk.finki.ranggo.aggregator.model.Taxonomy;
import mk.finki.ranggo.aggregator.repository.ContentRepository;
import mk.finki.ranggo.aggregator.repository.PersonRepository;
import mk.finki.ranggo.aggregator.yandex.YandexTranslator;

public class ContentsAggregatorImpl implements ContentsAggregator {

    private PersonRepository personRepository;

    private ContentRepository contentRepository;

    private AlchemyAPI alchemyapi;
    private AlchemyAPI_CombinedParams alchemyapi_params;

    public ContentsAggregatorImpl(String alchemyapi_key, PersonRepository personRepository,
            ContentRepository contentRepository) {
        alchemyapi = AlchemyAPI.GetInstanceFromString(alchemyapi_key);

        alchemyapi_params = new AlchemyAPI_CombinedParams();
        alchemyapi_params.setLinkedData(true);
        alchemyapi_params.setSentiment(true);
        alchemyapi_params.setShowSourceText(true);
        alchemyapi_params.setExtract("entity");
        alchemyapi_params.setExtract("concept");
        alchemyapi_params.setExtract("keyword");
        alchemyapi_params.setExtract("taxonomy");

        this.personRepository = personRepository;
        this.contentRepository = contentRepository;
    }

    public void aggregateGoogleNewsRSSFeed(Date date) throws ContentsAggregatorException {
        //only the category 'top stories' is extracted (due to the daily limit of 1000 transactions of a free alchemyapi license) 
        //url: http://news.google.com/news?&hl=en&output=rss&scoring=n&as_drrb=b&as_minm=%minm&as_mind=%mind&as_maxm=%maxm&as_maxd=%maxd
        //%minm, %mind, %maxm, %maxd - minimum/maximum month/day

        Calendar calendar = Calendar.getInstance();
        calendar.setTimeZone(TimeZone.getTimeZone("UTC"));
        calendar.setTime(date);

        String month = String.valueOf(calendar.get(Calendar.MONTH) + 1);
        String day = String.valueOf(calendar.get(Calendar.DAY_OF_MONTH));

        //pagination parameter? 'start' does not work. max size per page is 30
        //currently fetching 10 articles to reduce the method runtime
        final String url = "http://news.google.com/news?&hl=en&output=rss&scoring=n&as_drrb=b&as_minm=" + month
                + "&as_mind=" + day + "&as_maxm=" + month + "&as_maxd=" + day + "&num=10";

        //parse the rss
        try {
            DocumentBuilderFactory documenetBuilderFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder documentBuilder = documenetBuilderFactory.newDocumentBuilder();
            Document document = documentBuilder.parse(url);

            document.getDocumentElement().normalize();

            NodeList items = document.getElementsByTagName("item");

            //extract data for each article
            for (int i = 0; i < items.getLength(); i++) {
                Element itemElement = (Element) items.item(i);

                String itemTitle = itemElement.getElementsByTagName("title").item(0).getTextContent()
                        .replaceAll("&apos;", "'").replaceAll("&quot;", "'").replaceAll("&lt;", "<")
                        .replaceAll("&gt;", ">").replaceAll("&amp;", "&");
                String itemLink = itemElement.getElementsByTagName("link").item(0).getTextContent();
                String itemPubDate = itemElement.getElementsByTagName("pubDate").item(0).getTextContent();

                itemLink = URLDecoder.decode(itemLink.substring(itemLink.indexOf("url=") + 4), "UTF-8"); //the feed nests the url and redirects to it, extract only the nested url

                try {
                    this.processGoogleNewsArticle(itemLink, itemTitle, itemPubDate);
                } catch (ParserConfigurationException exception) {
                } //these exceptions should be logged - only a single article fails  
                catch (SAXException exception) {
                } catch (XPathExpressionException exception) {
                } catch (IOException exception) {
                }
            }
        } catch (ParserConfigurationException exception) {
            throw new ContentsAggregatorException("parser configuration error",
                    AggregatorMethod.GOOGLE_NEWS_RSS_FEED);
        } catch (SAXException exception) {
            throw new ContentsAggregatorException("xml parse exception", AggregatorMethod.GOOGLE_NEWS_RSS_FEED);
        } catch (IOException exception) {
            throw new ContentsAggregatorException("can not fetch resource", AggregatorMethod.GOOGLE_NEWS_RSS_FEED);
        }
    }

    public void aggregateHuffingtonPost() throws ContentsAggregatorException {
        //fetches current index contents
        final String url = "http://www.huffingtonpost.com/feeds/index.xml";

        //parse the rss
        try {
            DocumentBuilderFactory documenetBuilderFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder documentBuilder = documenetBuilderFactory.newDocumentBuilder();
            Document document = documentBuilder.parse(url);

            document.getDocumentElement().normalize();

            NodeList items = document.getElementsByTagName("item");

            //extract data for each article
            for (int i = 0; i < items.getLength(); i++) {
                Element itemElement = (Element) items.item(i);

                String itemTitle = itemElement.getElementsByTagName("title").item(0).getTextContent()
                        .replace("<![CDATA[", "").replace("]]>", "").replaceAll("&apos;", "'")
                        .replaceAll("&quot;", "'").replaceAll("&lt;", "<").replaceAll("&gt;", ">")
                        .replaceAll("&amp;", "&").trim();
                String itemLink = itemElement.getElementsByTagName("link").item(0).getTextContent();
                String itemPubDate = itemElement.getElementsByTagName("pubDate").item(0).getTextContent();

                itemLink = URLDecoder.decode(itemLink, "UTF-8");

                try {
                    this.processHuffingtonPost(itemLink, itemTitle, itemPubDate);
                } catch (ParserConfigurationException exception) {
                } //these exceptions should be logged - only a single article fails  
                catch (SAXException exception) {
                } catch (XPathExpressionException exception) {
                } catch (IOException exception) {
                }
            }
        } catch (ParserConfigurationException exception) {
            throw new ContentsAggregatorException("parser configuration error", AggregatorMethod.HUFFINGTON_POST);
        } catch (SAXException exception) {
            throw new ContentsAggregatorException("xml parse exception", AggregatorMethod.HUFFINGTON_POST);
        } catch (IOException exception) {
            throw new ContentsAggregatorException("can not fetch resource", AggregatorMethod.HUFFINGTON_POST);
        }
    }

    public void aggregateDnevnik() throws ContentsAggregatorException {
        // TODO Auto-generated method stub
        DnevnikCrawler crawler = new DnevnikCrawler(contentRepository);
        List<AlchemyAPIAnalysisResult> results = crawler.crawl();

        for (AlchemyAPIAnalysisResult analysisResults : results) {
            ContentsAggregatorImpl.persistData(personRepository, contentRepository, analysisResults);
        }
    }

    public void aggregateFokus() throws ContentsAggregatorException {

        //      FokusCrawler crawler = new FokusCrawler(contentRepository);
        //      List<AlchemyAPIAnalysisResult> results = crawler.crawl();

        //      for(AlchemyAPIAnalysisResult analysisResults : results){
        //         ContentsAggregatorImpl.persistData(personRepository, contentRepository, analysisResults);
        //      }
    }

    public void aggregateKurir() throws ContentsAggregatorException {
        KurirCrawler crawler = new KurirCrawler(contentRepository);
        List<AlchemyAPIAnalysisResult> results = crawler.crawl();

        for (AlchemyAPIAnalysisResult analysisResults : results) {
            ContentsAggregatorImpl.persistData(personRepository, contentRepository, analysisResults);
        }
    }

    public void aggregateLibertas() throws ContentsAggregatorException {
        //      LibertasCrawler crawler = new LibertasCrawler(contentRepository);
        //      List<AlchemyAPIAnalysisResult> results = crawler.crawl();
        //      
        //      for(AlchemyAPIAnalysisResult analysisResults : results){
        //         ContentsAggregatorImpl.persistData(personRepository, contentRepository, analysisResults);
        //      }
    }

    public void aggregateNovaTV() throws ContentsAggregatorException {
        NovaTVCrawler crawler = new NovaTVCrawler(contentRepository);
        List<AlchemyAPIAnalysisResult> results = crawler.crawl();

        for (AlchemyAPIAnalysisResult analysisResults : results) {
            ContentsAggregatorImpl.persistData(personRepository, contentRepository, analysisResults);
        }
    }

    public void aggregateRepublika() throws ContentsAggregatorException {
        RepublikaCrawler crawler = new RepublikaCrawler(contentRepository);
        List<AlchemyAPIAnalysisResult> results = crawler.crawl();

        for (AlchemyAPIAnalysisResult analysisResults : results) {
            ContentsAggregatorImpl.persistData(personRepository, contentRepository, analysisResults);
        }
    }

    public void aggregateTelma() throws ContentsAggregatorException {
        TelmaCrawler crawler = new TelmaCrawler(contentRepository);
        List<AlchemyAPIAnalysisResult> results = crawler.crawl();

        for (AlchemyAPIAnalysisResult analysisResults : results) {
            ContentsAggregatorImpl.persistData(personRepository, contentRepository, analysisResults);
        }
    }

    public void aggregateUtrinskiVesnik() throws ContentsAggregatorException {
        UtrinskiVesnikCrawler crawler = new UtrinskiVesnikCrawler(contentRepository);
        List<AlchemyAPIAnalysisResult> results = crawler.crawl();

        for (AlchemyAPIAnalysisResult analysisResults : results) {
            ContentsAggregatorImpl.persistData(personRepository, contentRepository, analysisResults);
        }
    }

    public void aggregateVecher() throws ContentsAggregatorException {
        VecherCrawler crawler = new VecherCrawler(contentRepository);
        List<AlchemyAPIAnalysisResult> results = crawler.crawl();

        for (AlchemyAPIAnalysisResult analysisResults : results) {
            ContentsAggregatorImpl.persistData(personRepository, contentRepository, analysisResults);
        }
    }

    public void aggregateVest() throws ContentsAggregatorException {
        VestCrawler crawler = new VestCrawler(contentRepository);
        List<AlchemyAPIAnalysisResult> results = crawler.crawl();

        for (AlchemyAPIAnalysisResult analysisResults : results) {
            ContentsAggregatorImpl.persistData(personRepository, contentRepository, analysisResults);
        }
    }

    public void aggregateVesti24() throws ContentsAggregatorException {
        Vesti24Crawler crawler = new Vesti24Crawler(contentRepository);
        List<AlchemyAPIAnalysisResult> results = crawler.crawl();

        for (AlchemyAPIAnalysisResult analysisResults : results) {
            ContentsAggregatorImpl.persistData(personRepository, contentRepository, analysisResults);
        }
    }

    public void aggregateNYTimes(Date date) throws ContentsAggregatorException {

        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYYMMdd");
        String stringDate = simpleDateFormat.format(date);

        NYTimesResult response = pullArticlesNYTimes(stringDate, stringDate, null);

        int pages = response.getResponse().getMeta().getHits() / 10;

        if ((pages % 10) > 0) {
            pages += 1;
        }

        for (int i = 1; i < pages; ++i) {
            if (i == 100)
                break;
            pullArticlesNYTimes(stringDate, stringDate, Integer.toString(i));
        }

    }

    private NYTimesResult pullArticlesNYTimes(String beginDate, String endDate, String page) {

        final String api_key = "b4d2896590ef4a8aaf583d26cd97a6df";
        final String api_url = "http://api.nytimes.com/svc/search/v2/articlesearch.json?";

        String URL = api_url + "begin_date=" + beginDate + "&end_date=" + endDate + "&api-key=" + api_key
                + "&fl=web_url,lead_paragraph,pub_date";

        if (page != null)
            URL += "&page=" + page;

        try {
            String url = URL;
            URLConnection conn = new URL(url).openConnection();
            BufferedReader in = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8));

            String inputLine;
            StringBuilder sb = new StringBuilder();
            while ((inputLine = in.readLine()) != null) {
                sb.append(inputLine);
            }

            String json = sb.toString();

            GsonBuilder gsonBuilder = new GsonBuilder();
            Gson gson = gsonBuilder.create();

            NYTimesResult response = gson.fromJson(json, NYTimesResult.class);
            ArrayList<NYTimesWebUrl> list = response.getResponse().getDocs();

            for (NYTimesWebUrl webUrl : list) {
                this.processNYTimes(webUrl.getWeb_url(), webUrl.getTitle(), webUrl.getPub_date());
            }

            return response;

        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    public void aggregateTheGuardian(Date date) throws ContentsAggregatorException {

        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("YYYY-MM-dd");
        String stringDate = simpleDateFormat.format(date);

        TheGuardianResult response = pullArticles(stringDate, stringDate, null);

        if (response != null) {
            for (int i = 2; i < response.getResponse().getPages(); ++i) {
                pullArticles(stringDate, stringDate, Integer.toString(i));
            }
        }

    }

    private TheGuardianResult pullArticles(String beginDate, String endDate, String page) {

        final String api_key = "c5b09c10-090a-4f9e-b87f-37df6fb7a079";
        final String api_url = "http://content.guardianapis.com/search?";
        String URL = api_url + "from-date=" + beginDate + "&to-date=" + endDate + "&api-key=" + api_key;

        if (page != null) {
            URL += "&page=" + page;
        }

        try {

            String url = URL;
            URLConnection conn = new URL(url).openConnection();
            BufferedReader in = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8));

            String inputLine;
            StringBuilder sb = new StringBuilder();
            while ((inputLine = in.readLine()) != null) {
                sb.append(inputLine);
            }

            String json = sb.toString();

            GsonBuilder gsonBuilder = new GsonBuilder();
            Gson gson = gsonBuilder.create();

            TheGuardianResult response = gson.fromJson(json, TheGuardianResult.class);

            ArrayList<TheGuardianWebUrl> results = response.getResponse().getUrls();
            for (TheGuardianWebUrl webUrl : results) {
                this.processTheGuardian(webUrl.getWebUrl(), webUrl.getWebTitle(), webUrl.getWebPublicationDate());
            }

            return response;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    public void aggregateTest() throws ContentsAggregatorException {
        final int CONTENT_COUNT = 20;

        String[] articles = new String[CONTENT_COUNT];
        String[] titles = new String[CONTENT_COUNT];
        String[] timestamps = new String[CONTENT_COUNT];

        int counter = 0;

        //populating content information

        articles[counter] = "http://nymag.com/daily/intelligencer/2015/09/frank-rich-in-praise-of-donald-trump.html";
        titles[counter] = "The Importance of Donald Trump";
        timestamps[counter] = "Sun, 20 Sep 2015 21:00:00 GMT";
        counter++;

        articles[counter] = "https://www.washingtonpost.com/opinions/what-americans-think-should-be-done-on-illegal-immigration/2015/10/11/4f6f6df6-6ed7-11e5-9bfe-e59f5e244f92_story.html";
        titles[counter] = "What Americans think should be done on illegal immigration";
        timestamps[counter] = "Sun, 11 Oct 2015 00:00:00 GMT";
        counter++;

        articles[counter] = "http://thinkprogress.org/politics/2015/11/22/3724879/donald-trump-black-lives-matter-protester-beating/";
        titles[counter] = "Donald Trump: My Fans Were Right To Beat Up Black Protester";
        timestamps[counter] = "Sun, 22 Nov 2015 09:30:00 GMT";
        counter++;

        articles[counter] = "http://edition.cnn.com/2015/12/11/politics/donald-trump-ted-cruz-iowa-ethanol/";
        titles[counter] = "Donald Trump launches first attacks against Ted Cruz";
        timestamps[counter] = "Fri, 11 Dec 2015 00:00:00 GMT";
        counter++;

        articles[counter] = "http://edition.cnn.com/2015/12/18/politics/donald-trump-praises-defends-vladimir-putin/";
        titles[counter] = "Donald Trump lavishes praise on 'leader' Putin";
        timestamps[counter] = "Fri, 18 Dec 2015 15:19:00 GMT";
        counter++;

        articles[counter] = "http://www.cbsnews.com/news/donald-trump-compares-bill-clinton-to-bill-cosby/";
        titles[counter] = "Donald Trump compares Bill Clinton to Bill Cosby";
        timestamps[counter] = "Wed, 06 Jan 2016 08:56:00 GMT";
        counter++;

        articles[counter] = "http://www.sbnation.com/lookit/2016/1/28/10858584/australian-open-video-roger-federer-novak-djokovic-point";
        titles[counter] = "Roger Federer wins mind-boggling point against Novak Djokovic in ridiculous rally";
        timestamps[counter] = "Thu, 28 Jan 2016 09:47:00 GMT";
        counter++;

        articles[counter] = "http://www.theguardian.com/sport/2016/jan/31/novak-djokovic-andy-murray-australian-open-final";
        titles[counter] = "Novak Djokovic beats Andy Murray to win the 2016 Australian Open final";
        timestamps[counter] = "Sun, 31 Jan 2016 11:37:00 GMT";
        counter++;

        articles[counter] = "https://www.washingtonpost.com/politics/a-sigh-of-relief-in-swinging-northern-virginia-when-trump-fell-short-in-iowa/2016/02/02/1f48fb36-c9d3-11e5-a7b2-5a2f824b02c9_story.html";
        titles[counter] = "A sigh of relief in Northern Virginia as Trump falls short in Iowa";
        timestamps[counter] = "Tue, 2 Feb 2016 00:00:00 GMT";
        counter++;

        articles[counter] = "http://www.ibtimes.com/angelina-jolie-brad-pitt-divorce-angie-insanely-jealous-over-selena-gomez-cheating-2312126";
        titles[counter] = "Angelina Jolie And Brad Pitt To Divorce? Angie Is 'Insanely Jealous' Over Selena Gomez Cheating Rumors: Report";
        timestamps[counter] = "Wed, 17 Feb 2016 13:07:00 GMT";
        counter++;

        articles[counter] = "http://www.nytimes.com/2016/02/24/us/politics/nevada-caucus-gop.html";
        titles[counter] = "Donald Trump Wins Nevada Caucuses, Collecting Third Straight Victory";
        timestamps[counter] = "Tue, 23 Feb 2016 00:00:00 GMT";
        counter++;

        articles[counter] = "http://www.nytimes.com/politics/first-draft/2016/03/16/donald-trump-warns-of-riots-if-party-blocks-him-at-convention/";
        titles[counter] = "Donald Trump Warns of 'Riots' if Party Blocks Him at Convention";
        timestamps[counter] = "Wed, 16 Mar 2016 15:15:00 GMT";
        counter++;

        articles[counter] = "https://www.washingtonpost.com/blogs/post-partisan/wp/2016/03/23/ted-cruzs-harebrained-harmful-policy-towards-american-muslims/";
        titles[counter] = "Ted Cruz's harebrained, harmful policy toward American Muslims";
        timestamps[counter] = "Wed, 23 Mar 2016 00:00:00 GMT";
        counter++;

        articles[counter] = "http://espn.go.com/tennis/story/_/id/15079824/miami-open-rafael-nadal-roger-federer-host-questions-answer";
        titles[counter] = "Rafael Nadal, Roger Federer among top stars with questions to answer";
        timestamps[counter] = "Mon, 28 Mar 2016 00:00:00 GMT";
        counter++;

        articles[counter] = "http://www.houstonchronicle.com/sports/article/Feliciano-Lopez-glad-he-plays-in-tennis-era-of-7235565.php";
        titles[counter] = "Feliciano Lopez glad he plays in tennis era of Novak Djokovic & Co.";
        timestamps[counter] = "Thu, 07 Apr 2016 00:00:00 GMT";
        counter++;

        articles[counter] = "http://www.vanityfair.com/hollywood/2016/04/brad-pitt-and-angelina-jolie-london-mansion";
        titles[counter] = "Brad Pitt and Angelina Jolie's $21,000/Month London Mansion Is an Inspiration to All Business Travelers";
        timestamps[counter] = "Fri, 22 Apr 2016 10:38:00 GMT";
        counter++;

        articles[counter] = "https://www.washingtonpost.com/news/the-fix/wp/2016/04/26/how-donald-trump-is-dominating-the-i-95-primary/";
        titles[counter] = "How Donald Trump dominated Tuesday's primaries";
        timestamps[counter] = "Tue, 26 Apr 2016 00:00:00 GMT";
        counter++;

        articles[counter] = "http://www.inquisitr.com/3042986/johnny-depp-was-under-medical-care-after-filming/";
        titles[counter] = "Johnny Depp Was Under Medical Care After Filming";
        timestamps[counter] = "Thu, 28 Apr 2016 00:00:00 GMT";
        counter++;

        articles[counter] = "https://www.washingtonpost.com/news/the-fix/wp/2016/04/29/the-gop-is-running-on-empty-so-why-are-people-debating-whether-to-stop-for-gas/";
        titles[counter] = "Mitt Romney's chief strategist says Donald Trump is 'uniquely unqualified' to be president";
        timestamps[counter] = "Fri, 29 Apr 2016 09:53:00 GMT";
        counter++;

        articles[counter] = "http://www.mirror.co.uk/3am/celebrity-news/kirk-norcross-gets-arrested-dangerous-7864035";
        titles[counter] = "Kirk Norcross 'gets ARRESTED for dangerous driving' hours after Snapchatting behind the wheel";
        timestamps[counter] = "Fri, 29 Apr 2016 17:43:00 GMT";
        counter++; //= CONTENT_COUNT

        //end populating content information

        for (int i = 0; i < CONTENT_COUNT; i++) {
            try {
                System.out.println("Pred da procesira");
                this.processStatic(articles[i], titles[i], timestamps[i]);
            } catch (ParserConfigurationException exception) {
                exception.printStackTrace();
            } //these exceptions should be logged - only a single link fails  
            catch (SAXException exception) {
                exception.printStackTrace();
            } catch (XPathExpressionException exception) {
                exception.printStackTrace();
            } catch (IOException exception) {
                exception.printStackTrace();
            }
        }
    }

    private void processGoogleNewsArticle(String articleURL, String title, String timestamp)
            throws ParserConfigurationException, SAXException, XPathExpressionException, IOException {
        //do not analyze it if it exists in the data store (check by url)
        if (contentRepository.findBySourceUrl(articleURL) == null) {
            AlchemyAPIAnalysisResult analysisResults = ContentsAggregatorImpl.analyzeContent(alchemyapi,
                    alchemyapi_params, articleURL);

            analysisResults.setType("GOOGLE_NEWS");
            analysisResults.setUrl(articleURL);
            analysisResults.setTitle(title);
            analysisResults.setTimestamp(timestamp);

            ContentsAggregatorImpl.persistData(personRepository, contentRepository, analysisResults);
        }
    }

    private void processHuffingtonPost(String URL, String title, String timestamp)
            throws ParserConfigurationException, SAXException, XPathExpressionException, IOException {
        //do not analyze it if it exists in the data store (check by url)
        if (contentRepository.findBySourceUrl(URL) == null) {
            AlchemyAPIAnalysisResult analysisResults = ContentsAggregatorImpl.analyzeContent(alchemyapi,
                    alchemyapi_params, URL);

            analysisResults.setType("HUFFINGTON_POST");
            analysisResults.setUrl(URL);
            analysisResults.setTitle(title);
            analysisResults.setTimestamp(timestamp);

            ContentsAggregatorImpl.persistData(personRepository, contentRepository, analysisResults);
        }
    }

    private void processNYTimes(String URL, String title, String timestamp)
            throws XPathExpressionException, ParserConfigurationException, SAXException, IOException {
        if (contentRepository.findBySourceUrl(URL) == null) {

            AlchemyAPIAnalysisResult analysisResults = ContentsAggregatorImpl.analyzeContent(alchemyapi,
                    alchemyapi_params, URL);
            analysisResults.setType("NY Times");
            analysisResults.setUrl(URL);
            analysisResults.setTitle(title);
            analysisResults.setTimestamp(timestamp);

            ContentsAggregatorImpl.persistData(personRepository, contentRepository, analysisResults);
        }
    }

    private void processTheGuardian(String URL, String title, String timestamp)
            throws XPathExpressionException, ParserConfigurationException, SAXException, IOException {

        if (contentRepository.findBySourceUrl(URL) == null) {

            AlchemyAPIAnalysisResult analysisResults = ContentsAggregatorImpl.analyzeContent(alchemyapi,
                    alchemyapi_params, URL);
            analysisResults.setType("The Guardian");
            analysisResults.setUrl(URL);
            analysisResults.setTitle(title);
            analysisResults.setTimestamp(timestamp);

            ContentsAggregatorImpl.persistData(personRepository, contentRepository, analysisResults);
        }
    }

    private void processStatic(String URL, String title, String timestamp)
            throws ParserConfigurationException, SAXException, XPathExpressionException, IOException {
        //do not analyze it if it exists in the data store (check by url)

        if (contentRepository.findBySourceUrl(URL) == null) {
            AlchemyAPIAnalysisResult analysisResults = ContentsAggregatorImpl.analyzeContent(alchemyapi,
                    alchemyapi_params, URL);
            System.out.println(analysisResults);
            analysisResults.setType("STATIC");
            analysisResults.setUrl(URL);
            analysisResults.setTitle(title);
            analysisResults.setTimestamp(timestamp);
            System.out.println("Pred da napravi persist");
            ContentsAggregatorImpl.persistData(personRepository, contentRepository, analysisResults);
        }
    }

    private static AlchemyAPIAnalysisResult analyzeContent(AlchemyAPI alchemyapi,
            AlchemyAPI_CombinedParams alchemyapi_params, String contentURL)
            throws ParserConfigurationException, SAXException, XPathExpressionException, IOException {
        Document document = alchemyapi.URLGetCombined(contentURL, alchemyapi_params);
        //System.out.println(getStringFromDocument(document));

        String text = document.getElementsByTagName("text").item(0).getTextContent();
        text = text.replaceAll("", "'").replaceAll("", "\"").replaceAll("", "\"");

        //extracting entities

        Element entitiesElement = (Element) document.getElementsByTagName("entities").item(0);
        NodeList entities = entitiesElement.getElementsByTagName("entity");

        List<Person> persons = new ArrayList<Person>();
        List<PersonEntity> personEntities = new ArrayList<PersonEntity>();
        for (int i = 0; i < entities.getLength(); i++) {
            Element entity = (Element) entities.item(i);

            if (entity.getElementsByTagName("type").item(0).getTextContent().equals("Person")) {
                String name = entity.getElementsByTagName("text").item(0).getTextContent();

                String relevanceString = entity.getElementsByTagName("relevance").item(0).getTextContent();
                Double relevance = Double.parseDouble(relevanceString);

                Element sentiment = (Element) entity.getElementsByTagName("sentiment").item(0);
                Double score;
                Boolean mixed;

                String sentimentType = sentiment.getElementsByTagName("type").item(0).getTextContent();
                if (sentimentType.equals("neutral")) {
                    score = 0.0;
                } else {
                    String scoreText = sentiment.getElementsByTagName("score").item(0).getTextContent();
                    score = Double.parseDouble(scoreText);
                }

                NodeList mixedElements = sentiment.getElementsByTagName("mixed");
                if (mixedElements.getLength() > 0) {
                    String mixedText = mixedElements.item(0).getTextContent();
                    mixed = mixedText.equals("1");
                } else {
                    mixed = false;
                }

                NodeList disambiguatedElements = entity.getElementsByTagName("disambiguated");
                List<String> categories = new ArrayList<String>();
                String dbpediaUrl = null;
                if (disambiguatedElements.getLength() > 0) {
                    Element disambiguated = (Element) disambiguatedElements.item(0);

                    NodeList subtypes = disambiguated.getElementsByTagName("subType");
                    for (int z = 0; z < subtypes.getLength(); z++) {
                        categories.add(subtypes.item(z).getTextContent());
                    }

                    dbpediaUrl = disambiguated.getElementsByTagName("dbpedia").item(0).getTextContent();
                }

                Person person = new Person();

                person.setName(name);
                person.setDbpediaUrl(dbpediaUrl);
                person.setCategories(categories);

                //defer fetching from dbpedia until before adding to data store (a prior confirmation that the person isn't already stored reduces total traffic) 
                person.setShortBio(null);
                person.setPictureUrl(null);

                persons.add(person);

                PersonEntity personEntity = new PersonEntity();

                //set this after the person has been added to the data store
                personEntity.setPerson(null);

                personEntity.setRelevance(relevance);
                personEntity.setScore(score);
                if (mixed) {
                    personEntity.setMixed(mixed);
                }

                personEntities.add(personEntity);
            }
        }

        //extracting concepts

        LinkedList<Concept> concepts = new LinkedList<Concept>();

        Element conceptsElement = (Element) document.getElementsByTagName("concepts").item(0);
        NodeList conceptsList = conceptsElement.getElementsByTagName("concept");
        for (int i = 0; i < conceptsList.getLength(); i++) {
            Element conceptElement = (Element) conceptsList.item(i);

            String name = conceptElement.getElementsByTagName("text").item(0).getTextContent();

            String relevanceString = conceptElement.getElementsByTagName("relevance").item(0).getTextContent();
            Double relevance = Double.parseDouble(relevanceString);

            String dbpediaUrl = conceptElement.getElementsByTagName("dbpedia").item(0).getTextContent();

            Concept concept = new Concept();

            concept.setText(name);
            concept.setRelevance(relevance);
            concept.setDbpediaUrl(dbpediaUrl);

            concepts.add(concept);
        }

        //extracting keywords

        LinkedList<Keyword> keywords = new LinkedList<Keyword>();

        Element keywordsElement = (Element) document.getElementsByTagName("keywords").item(0);
        NodeList keywordsList = keywordsElement.getElementsByTagName("keyword");
        for (int i = 0; i < keywordsList.getLength(); i++) {
            Element keywordElement = (Element) keywordsList.item(i);

            String name = keywordElement.getElementsByTagName("text").item(0).getTextContent();

            String relevanceString = keywordElement.getElementsByTagName("relevance").item(0).getTextContent();
            Double relevance = Double.parseDouble(relevanceString);

            Element sentiment = (Element) keywordElement.getElementsByTagName("sentiment").item(0);
            Double score;
            Boolean mixed;

            String sentimentType = sentiment.getElementsByTagName("type").item(0).getTextContent();
            if (sentimentType.equals("neutral")) {
                score = 0.0;
            } else {
                String scoreText = sentiment.getElementsByTagName("score").item(0).getTextContent();
                score = Double.parseDouble(scoreText);
            }

            NodeList mixedElements = sentiment.getElementsByTagName("mixed");
            if (mixedElements.getLength() > 0) {
                String mixedText = mixedElements.item(0).getTextContent();
                mixed = mixedText.equals("1");
            } else {
                mixed = false;
            }

            Keyword keyword = new Keyword();

            keyword.setText(name);
            keyword.setRelevance(relevance);
            keyword.setScore(score);
            if (mixed) {
                keyword.setMixed(mixed);
            }

            keywords.add(keyword);
        }

        //extracting taxonomies

        LinkedList<Taxonomy> taxonomies = new LinkedList<Taxonomy>();

        Element taxonomiesElement = (Element) document.getElementsByTagName("taxonomy").item(0);
        NodeList taxonomiesList = taxonomiesElement.getElementsByTagName("element");
        for (int i = 0; i < taxonomiesList.getLength(); i++) {
            Element taxonomyElement = (Element) taxonomiesList.item(i);

            String labelString = taxonomyElement.getElementsByTagName("label").item(0).getTextContent();
            if (labelString.startsWith("/")) {
                labelString = labelString.substring(1);
            }
            List<String> labels = Arrays.asList(labelString.split("/"));

            String scoreString = taxonomyElement.getElementsByTagName("score").item(0).getTextContent();
            Double score = Double.parseDouble(scoreString);

            Taxonomy taxonomy = new Taxonomy();

            taxonomy.setLabels(labels);
            taxonomy.setScore(score);

            taxonomies.add(taxonomy);
        }

        return new AlchemyAPIAnalysisResult(null, null, text, null, null, persons, personEntities, concepts,
                keywords, taxonomies);
    }

    private static void persistData(PersonRepository personRepository, ContentRepository contentRepository,
            AlchemyAPIAnalysisResult data) {
        List<Person> persons = data.getPersons();
        List<PersonEntity> personEntities = data.getPersonEntities();

        if (persons.size() > 0) {
            for (int i = 0; i < persons.size(); i++) {
                Person person = persons.get(i);
                Person found = null;

                if (person.getDbpediaUrl() != null) {
                    found = personRepository.findByDbpediaUrl(person.getDbpediaUrl());
                } else {
                    found = personRepository.findByName(person.getName());
                }

                if (found == null) {
                    found = personRepository.save(person);
                }

                //insert the id in the rating
                personEntities.get(i).setPerson(found);
            }

            //save the article

            Content content = new Content();

            content.setType(data.getType());
            content.setSourceUrl(data.getUrl());
            content.setTitle(data.getTitle());
            content.setBody(data.getBody());
            content.setPersonEntities(personEntities);
            content.setConcepts(data.getConcepts());
            content.setKeywords(data.getKeywords());
            content.setTaxonomies(data.getTaxonomies());
            content.setTimestamp(data.getTimestamp());

            content = contentRepository.save(content);
            System.out.println("Go zachuvalo content");

            //fetch short biography and picture url for each person from dbpedia

            for (PersonEntity personEntity : personEntities) {
                Person person = personEntity.getPerson();
                if (person.getDbpediaUrl() != null
                        && (person.getPictureUrl() == null || person.getShortBio() == null)) {
                    //fetch the details
                    ContentsAggregatorImpl.fetchPersonDetailsFromDbpedia(person);

                    //save the updated person
                    person = personRepository.save(person);

                    //set the updated person
                    personEntity.setPerson(person);
                }
            }
        }
    }

    private static void fetchPersonDetailsFromDbpedia(Person person) {
        String url = person.getDbpediaUrl();
        url = url.replace("http://dbpedia.org/resource/", "http://dbpedia.org/data/")
                .replace("http://dbpedia.org/page/", "http://dbpedia.org/data/");
        url += ".xml";

        try {
            DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
            Document document = documentBuilder.parse(url);
            //System.out.println(getStringFromDocument(document));

            //extract abstract (short biography)
            NodeList abstracts = document.getElementsByTagName("dbo:abstract");
            for (int i = 0; i < abstracts.getLength(); i++) {
                Element entity = (Element) abstracts.item(i);
                if (entity.hasAttribute("xml:lang") && entity.getAttribute("xml:lang").equals("en")) {
                    String dbAbstract = entity.getTextContent().replaceAll("&#39;", "'").replaceAll("&lt;", "<")
                            .replaceAll("&gt;", ">").replaceAll("&amp;", "&").replaceAll("", "-");
                    person.setShortBio(dbAbstract);
                }
            }

            //extract thumbnail (picture url)
            NodeList thumbnails = document.getElementsByTagName("dbo:thumbnail");
            for (int i = 0; i < thumbnails.getLength(); i++) {
                Element entity = (Element) thumbnails.item(i);
                if (entity.hasAttribute("rdf:resource")) {
                    String dbThumbnail = entity.getAttribute("rdf:resource");
                    person.setPictureUrl(dbThumbnail);
                }
            }
        } catch (ParserConfigurationException exception) {
        } catch (SAXException exception) {
        } catch (IOException exception) {
        }
    }

    /*
    private static String getStringFromDocument(Document doc){
        try {
       DOMSource domSource = new DOMSource(doc);
       StringWriter writer = new StringWriter();
       StreamResult result = new StreamResult(writer);
        
       TransformerFactory tf = TransformerFactory.newInstance();
       Transformer transformer = tf.newTransformer();
       transformer.transform(domSource, result);
        
       return writer.toString();
        } catch (TransformerException ex) {
       ex.printStackTrace();
       return null;
        }
    }
    */

    //declared public because otherwise Spring failes to autowire it
    public static class AlchemyAPIAnalysisResult {
        private String type;
        private String title;
        private String body;
        private String url;
        private String timestamp;

        private List<Person> persons;
        private List<PersonEntity> personEntities;
        private List<Concept> concepts;
        private List<Keyword> keywords;
        private List<Taxonomy> taxonomies;

        public AlchemyAPIAnalysisResult() {
            //does nothing
        }

        public AlchemyAPIAnalysisResult(String type, String title, String body, String url, String timestamp,
                List<Person> persons, List<PersonEntity> personEntities, List<Concept> concepts,
                List<Keyword> keywords, List<Taxonomy> taxonomies) {
            this.type = type;
            this.title = title;
            this.body = body;
            this.url = url;
            this.timestamp = timestamp;

            this.persons = persons;
            this.personEntities = personEntities;
            this.concepts = concepts;
            this.keywords = keywords;
            this.taxonomies = taxonomies;
        }

        public String getType() {
            return type;
        }

        public void setType(String type) {
            this.type = type;
        }

        public String getTitle() {
            return title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public String getBody() {
            return body;
        }

        public void setBody(String body) {
            this.body = body;
        }

        public String getUrl() {
            return url;
        }

        public void setUrl(String url) {
            this.url = url;
        }

        public String getTimestamp() {
            return timestamp;
        }

        public void setTimestamp(String timestamp) {
            this.timestamp = timestamp;
        }

        public List<Person> getPersons() {
            return persons;
        }

        public void setPersons(List<Person> persons) {
            this.persons = persons;
        }

        public List<PersonEntity> getPersonEntities() {
            return personEntities;
        }

        public void setPersonEntities(List<PersonEntity> personEntities) {
            this.personEntities = personEntities;
        }

        public List<Concept> getConcepts() {
            return concepts;
        }

        public void setConcepts(List<Concept> concepts) {
            this.concepts = concepts;
        }

        public List<Keyword> getKeywords() {
            return keywords;
        }

        public void setKeywords(List<Keyword> keywords) {
            this.keywords = keywords;
        }

        public List<Taxonomy> getTaxonomies() {
            return taxonomies;
        }

        public void setTaxonomies(List<Taxonomy> taxonomies) {
            this.taxonomies = taxonomies;
        }
    }

}