uk.co.jassoft.markets.utils.article.ContentGrabber.java Source code

Introduction

Here is the source code for uk.co.jassoft.markets.utils.article.ContentGrabber.java
Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package uk.co.jassoft.markets.utils.article;

import com.joestelmach.natty.DateGroup;
import com.joestelmach.natty.Parser;
import de.l3s.boilerpipe.extractors.ArticleExtractor;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import uk.co.jassoft.markets.datamodel.story.date.MissingDateFormat;
import uk.co.jassoft.markets.exceptions.article.ArticleContentException;
import uk.co.jassoft.markets.repository.MissingDateFormatRepository;

import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts article text, the page title and a best-effort publication date
 * from the raw HTML of a web page.
 *
 * @author Jonny
 */
@Component
public class ContentGrabber {

    private static final Logger LOG = LoggerFactory.getLogger(ContentGrabber.class);

    /**
     * Matches the first {@code title} element and captures its text in group 1.
     * The opening tag may carry attributes, and the capture is reluctant so that
     * a later {@code title} element (for example inside an inline SVG) is not
     * swallowed into the result.
     */
    private static final Pattern TITLE_TAG = Pattern.compile("<title\\b[^>]*>(.*?)</title\\s*>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Runs of whitespace and angle brackets, collapsed to a single space in titles. */
    private static final Pattern TITLE_NOISE = Pattern.compile("[\\s<>]+");

    /** CSS selectors probed, in this order, for an element carrying the published date. */
    private static final String[] DATE_SELECTORS = {
            "date",
            "time",
            "meta[name*=date]",
            "meta[name*=time]",
            "meta[itemprop*=date]"
    };

    @Autowired
    private MissingDateFormatRepository missingDateFormatRepository;

    /**
     * Extracts the main article text from a page using boilerpipe's
     * {@link ArticleExtractor}.
     *
     * @param html the raw HTML of the page
     * @return the text returned by the extractor
     * @throws ArticleContentException if the extractor fails for any reason; the
     *                                 original exception is preserved as the cause
     */
    public String getContentFromWebsite(String html) throws ArticleContentException {
        try {
            return ArticleExtractor.INSTANCE.getText(html);
        } catch (Exception exception) {
            throw new ArticleContentException("Failed to get content from website", exception);
        }
    }

    /**
     * Returns the text of the first {@code title} element in the page, with runs
     * of whitespace and stray angle brackets collapsed to single spaces.
     *
     * <p>The method name is misspelled ("geTitle"); it is kept as-is so existing
     * callers continue to compile.
     *
     * @param html the raw HTML of the page; may be {@code null}
     * @return the cleaned title, or {@code null} if {@code html} is {@code null},
     *         no title element is found, or extraction fails
     */
    public String geTitleFromWebsite(String html) {
        if (html == null) {
            return null;
        }

        try {
            Matcher matcher = TITLE_TAG.matcher(html);
            if (!matcher.find()) {
                return null;
            }

            // Replace any whitespace (including line feeds) and HTML brackets with a space.
            return TITLE_NOISE.matcher(matcher.group(1)).replaceAll(" ").trim();
        } catch (Exception exception) {
            // Best effort: title extraction must never fail the caller.
            LOG.error("Failed to get title from website", exception);
            return null;
        }
    }

    /**
     * Attempts to find the publication date of a page.
     *
     * <p>Each selector in {@link #DATE_SELECTORS} is tried in order. Only the first
     * element matched by a selector is inspected; its outer HTML is handed to the
     * date parser. The first selector that yields a date wins. When a selector
     * matches an element but no date can be parsed from it, the element is logged
     * and stored as a {@link MissingDateFormat} before moving on to the next selector.
     *
     * @param html the raw HTML of the page
     * @return the first date recognised, or {@code null} if none is found or any
     *         error occurs while parsing
     */
    public Date getPublishedDate(String html) {
        try {
            Document doc = Jsoup.parse(html);

            for (String selector : DATE_SELECTORS) {
                Elements matches = doc.select(selector);

                if (matches.isEmpty()) {
                    continue;
                }

                String candidate = matches.get(0).toString();
                Date value = getDateValue(candidate);

                if (value != null) {
                    return value;
                }

                // Record the unparseable element for later inspection of unsupported formats.
                LOG.info("Date Format Not recognised for [{}]", candidate);
                missingDateFormatRepository.save(new MissingDateFormat(candidate, new Date()));
            }

            return null;
        } catch (Exception exception) {
            LOG.error("Failed to get Published Date", exception);
            return null;
        }
    }

    /**
     * Parses free text with Natty and merges the dates it finds into a single value.
     *
     * <p>Dates more than one day in the future are ignored. The first acceptable
     * date seeds the result; if its group reports the time as inferred, the time
     * of day is reset to midnight. Each later acceptable date then overrides the
     * time-of-day of the result when its group's time is not inferred, and the
     * calendar date of the result when its group's date is not inferred.
     *
     * @param contentsToCheck the text to scan for dates; must not be {@code null}
     * @return the merged date, or {@code null} if the text is empty or contains
     *         no acceptable date
     */
    private Date getDateValue(final String contentsToCheck) {
        if (contentsToCheck.isEmpty()) {
            return null;
        }

        Parser parser = new Parser();
        List<DateGroup> groups = parser.parse(contentsToCheck);

        // Anything later than this instant is treated as a mis-parse and skipped.
        final DateTime latestAcceptable = new DateTime(DateTimeZone.UTC).plusDays(1);

        Date possibleDate = null;

        for (DateGroup group : groups) {
            for (Date publishedDate : group.getDates()) {
                if (latestAcceptable.isBefore(publishedDate.getTime())) {
                    LOG.debug("Date is over 1 day in the future [{}]", publishedDate.toString());
                    continue;
                }

                if (possibleDate == null) {
                    // First acceptable date: use it as the base value.
                    possibleDate = group.isTimeInferred()
                            ? new DateTime(publishedDate).withTime(0, 0, 0, 0).toDate()
                            : publishedDate;
                    continue;
                }

                DateTime latestPublishedDate = new DateTime(publishedDate);

                if (!group.isTimeInferred()) {
                    // This group carries an explicit time: take its time-of-day.
                    possibleDate = new DateTime(possibleDate).withTime(latestPublishedDate.getHourOfDay(),
                            latestPublishedDate.getMinuteOfHour(), latestPublishedDate.getSecondOfMinute(),
                            latestPublishedDate.getMillisOfSecond()).toDate();
                }

                if (!group.isDateInferred()) {
                    // This group carries an explicit date: take its year/month/day.
                    possibleDate = new DateTime(possibleDate).withDate(latestPublishedDate.getYear(),
                            latestPublishedDate.getMonthOfYear(), latestPublishedDate.getDayOfMonth()).toDate();
                }
            }
        }

        return possibleDate;
    }
}