com.romeikat.datamessie.core.base.service.download.ContentDownloader.java Source code

Java tutorial

Introduction

Here is the source code for com.romeikat.datamessie.core.base.service.download.ContentDownloader.java

Source

package com.romeikat.datamessie.core.base.service.download;

/*-
 * ============================LICENSE_START============================
 * data.messie (core)
 * =====================================================================
 * Copyright (C) 2013 - 2018 Dr. Raphael Romeikat
 * =====================================================================
 * This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
    
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public
License along with this program.  If not, see
<http://www.gnu.org/licenses/gpl-3.0.html>.
 * =============================LICENSE_END=============================
 */

import java.io.InputStream;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.time.LocalDateTime;
import org.apache.commons.lang3.StringEscapeUtils;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import com.romeikat.datamessie.core.base.util.XmlUtil;

/**
 * Spring service that downloads the HTML content behind a URL.
 * <p>
 * Before the content is returned, at most one redirection is followed. Redirections are detected
 * in this order: the response URL reported for the connection (server-side redirection),
 * {@code <link rel="canonical">}, {@code <meta http-equiv="refresh">} and
 * {@code <meta property="og:url">}. If the redirected URL cannot be downloaded, the original URL
 * is used instead.
 */
@Service
public class ContentDownloader extends AbstractDownloader {

    private static final Logger LOG = LoggerFactory.getLogger(ContentDownloader.class);

    @Autowired
    private XmlUtil xmlUtil;

    /**
     * Downloads the content of the given URL, following a redirection if a valid one is found.
     * <p>
     * Failures are logged and not propagated by this method's own try blocks; in that case the
     * content handed to the post-processing steps is {@code null}.
     *
     * @param url the URL to download from
     * @return a {@link DownloadResult} built from: the original URL ({@code null} unless a
     *         redirection was followed and its target could be downloaded or was attempted last),
     *         the URL that was finally used, the post-processed content, the timestamp taken when
     *         this method started, and the HTTP status code if an {@link HttpStatusException} was
     *         caught ({@code null} otherwise)
     */
    public DownloadResult downloadContent(String url) {
        LOG.debug("Downloading content from {}", url);
        URLConnection urlConnection = null;
        String originalUrl = null;
        org.jsoup.nodes.Document jsoupDocument = null;
        Integer statusCode = null;
        final LocalDateTime downloaded = LocalDateTime.now();

        // Phase 1: determine whether the URL is redirected; in case of a new redirection for that
        // source, use the redirected URL
        try {
            urlConnection = getConnection(url);
            // Server-side redirection
            final String responseUrl = getResponseUrl(urlConnection);
            if (responseUrl != null) {
                final String redirectedUrl = getRedirectedUrl(url, responseUrl);
                if (isValidRedirection(url, redirectedUrl)) {
                    originalUrl = url;
                    url = redirectedUrl;
                    closeUrlConnection(urlConnection);
                    urlConnection = getConnection(url);
                    LOG.debug("Redirection (server): {} -> {}", originalUrl, url);
                }
            }
            // Download content in order to look for further (meta) redirections
            jsoupDocument = parseDocument(urlConnection, url);
            // Meta redirections are only considered if the server did not redirect already
            if (originalUrl == null) {
                final String metaRedirectedUrl = findMetaRedirection(jsoupDocument, url);
                if (metaRedirectedUrl != null) {
                    originalUrl = url;
                    url = metaRedirectedUrl;
                    // The document belongs to the old URL, so it must be downloaded again
                    jsoupDocument = null;
                }
            }
        } catch (final Exception e) {
            if (e instanceof HttpStatusException) {
                statusCode = ((HttpStatusException) e).getStatusCode();
            }
            LOG.warn("Could not determine redirected URL for " + url, e);
        } finally {
            closeUrlConnection(urlConnection);
        }
        // The connection has been closed above; drop the reference so it is not closed twice
        urlConnection = null;

        // Phase 2: download content (if not yet done)
        String content = null;
        try {
            if (jsoupDocument == null) {
                LOG.debug("Downloading content from {}", url);
                urlConnection = getConnection(url);
                jsoupDocument = parseDocument(urlConnection, url);
            }
        } catch (final Exception e) {
            if (e instanceof HttpStatusException) {
                statusCode = ((HttpStatusException) e).getStatusCode();
            }
            // Without a redirection there is no alternative URL to fall back to
            if (originalUrl == null) {
                LOG.warn("Could not download content from " + url, e);
            }
            // If the redirected URL does not exist and an original URL is available, use the
            // original URL instead
            else {
                // Release the failed connection before it is replaced, otherwise it would leak
                if (urlConnection != null) {
                    closeUrlConnection(urlConnection);
                    urlConnection = null;
                }
                try {
                    LOG.debug(
                            "Could not download content from redirected URL {}, downloading content from original URL {} instead",
                            url, originalUrl);
                    urlConnection = getConnection(originalUrl);
                    // The document comes from the original URL, so that is its base URI
                    jsoupDocument = parseDocument(urlConnection, originalUrl);
                    url = originalUrl;
                    originalUrl = null;
                    statusCode = null;
                } catch (final Exception e2) {
                    LOG.warn("Could not download content from original URL " + originalUrl, e2);
                }
            }
        } finally {
            if (urlConnection != null) {
                closeUrlConnection(urlConnection);
            }
        }
        if (jsoupDocument != null) {
            content = jsoupDocument.html();
        }
        // Strip non-valid characters as specified by the XML 1.0 standard
        // NOTE(review): content is null here if every download attempt failed - confirm that
        // XmlUtil.stripNonValidXMLCharacters accepts null
        final String validContent = xmlUtil.stripNonValidXMLCharacters(content);
        // Unescape HTML characters
        final String unescapedContent = StringEscapeUtils.unescapeHtml4(validContent);
        // Done
        return new DownloadResult(originalUrl, url, unescapedContent, downloaded, statusCode);
    }

    /**
     * Reads the response of the given connection and parses it as HTML.
     *
     * @param urlConnection the connection to read from
     * @param baseUri the base URI handed to jsoup for the parsed document
     * @return the parsed document
     * @throws Exception if reading or parsing fails; declared broadly because the signatures of
     *         the inherited helpers are not visible in this class
     */
    private org.jsoup.nodes.Document parseDocument(final URLConnection urlConnection, final String baseUri)
            throws Exception {
        final InputStream urlInputStream = asInputStream(urlConnection, true, false);
        final Charset charset = getCharset(urlConnection);
        return Jsoup.parse(urlInputStream, charset.name(), baseUri);
    }

    /**
     * Looks for a valid redirection declared in the head of the given document. The tags are
     * checked in this order: {@code <link rel="canonical">}, {@code <meta http-equiv="refresh">},
     * {@code <meta property="og:url">}; the first valid redirection wins.
     *
     * @param document the document downloaded from {@code url}
     * @param url the URL the document was downloaded from
     * @return the redirected URL, or {@code null} if no valid redirection was found
     * @throws Exception if validating a redirection fails; declared broadly because the
     *         signatures of the inherited helpers are not visible in this class
     */
    private String findMetaRedirection(final org.jsoup.nodes.Document document, final String url)
            throws Exception {
        // Meta redirection (<link rel="canonical" .../>)
        for (final Element linkTag : document.select("html head link")) {
            final Attributes attributes = linkTag.attributes();
            if (attributes.hasKey("rel") && attributes.get("rel").equalsIgnoreCase("canonical")
                    && attributes.hasKey("href")) {
                final String redirectedUrl = attributes.get("href").trim();
                if (isValidRedirection(url, redirectedUrl)) {
                    LOG.debug("Redirection (<link rel=\"canonical\" .../>): {} -> {}", url, redirectedUrl);
                    return redirectedUrl;
                }
            }
        }

        final Elements metaTags = document.select("html head meta");

        // Meta redirection (<meta http-equiv="refresh" .../>)
        for (final Element metaTag : metaTags) {
            final Attributes attributes = metaTag.attributes();
            if (attributes.hasKey("http-equiv") && attributes.get("http-equiv").equalsIgnoreCase("refresh")
                    && attributes.hasKey("content")) {
                // The target is whatever follows the first '=' (e.g. "0; url=http://...")
                final String[] parts = attributes.get("content").replace(" ", "").split("=", 2);
                if (parts.length > 1) {
                    final String redirectedUrl = parts[1];
                    if (isValidRedirection(url, redirectedUrl)) {
                        LOG.debug("Redirection (<meta http-equiv=\"refresh\" .../>): {} -> {}", url,
                                redirectedUrl);
                        return redirectedUrl;
                    }
                }
            }
        }

        // Meta redirection (<meta property="og:url" .../>)
        for (final Element metaTag : metaTags) {
            final Attributes attributes = metaTag.attributes();
            if (attributes.hasKey("property") && attributes.get("property").equalsIgnoreCase("og:url")
                    && attributes.hasKey("content")) {
                final String redirectedUrl = attributes.get("content").trim();
                if (isValidRedirection(url, redirectedUrl)) {
                    LOG.debug("Redirection (<meta property=\"og:url\" .../>): {} -> {}", url, redirectedUrl);
                    return redirectedUrl;
                }
            }
        }

        return null;
    }

}