org.apache.streams.urls.LinkResolver.java Source code

Introduction

Here is the source code for org.apache.streams.urls.LinkResolver.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.streams.urls;

import org.apache.commons.codec.net.URLCodec;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.validator.routines.UrlValidator;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.Serializable;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;

public class LinkResolver implements Serializable {

    /**
     * References:
     * Some helpful references to demonstrate the different types of browser re-directs that
     * can happen. If you notice a redirect that was not followed to the proper place please
     * submit a bug at :
     * https://issues.apache.org/jira/browse/STREAMS
     * <p/>
     * Purpose              URL
     * -------------        ----------------------------------------------------------------
     * [Status Codes]       http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
     * [Test Cases]         http://greenbytes.de/tech/tc/httpredirects/
     * [t.co behavior]      https://dev.twitter.com/docs/tco-redirection-behavior
     */

    /** Class-wide SLF4J logger. */
    private static final Logger LOGGER = LoggerFactory.getLogger(LinkResolver.class);

    /** Upper bound on redirect hops; once the recorded redirect count exceeds it the link is reported as a loop. */
    private static final int MAX_ALLOWED_REDIRECTS = 30;

    /** Connect and read timeout applied to every HTTP request, in milliseconds (10 seconds). */
    private static final int DEFAULT_HTTP_TIMEOUT = 10_000;

    /** Lower-cased name of the response header that carries a redirect target. */
    private static final String LOCATION_IDENTIFIER = "location";

    /** Lower-cased name of the response header through which a server hands out cookies. */
    private static final String SET_COOKIE_IDENTIFIER = "set-cookie";

    // if Bots are not 'ok' this is the spoof settings that we'll use
    private static final Map<String, String> SPOOF_HTTP_HEADERS = new HashMap<String, String>() {
        {
            put("Connection", "Keep-Alive");
            put("User-Agent",
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48 Safari/537.36");
            put("Accept-Language", "en-US,en;q=0.8,zh;q=0.6");
        }
    };

    // These are the known domains that are 'bot' friendly.
    private static final Collection<String> BOTS_ARE_OK = new ArrayList<String>() {
        {
            add("t.co");
        }
    };

    // To help canonicalize the URL, these parts are 'known' to be 'ok' to remove
    private static final Collection<String> URL_TRACKING_TO_REMOVE = new ArrayList<String>() {
        {
            /*
             * Google uses parameters in the URL string to track referrers
             * on their Google Analytics and promotions. These are the
             * identified URL patterns.
             *
             * URL:
             * https://support.google.com/analytics/answer/1033867?hl=en
             *****************************************************************/

            // Required. Use utm_source to identify a search engine, newsletter name, or other source.
            add("([\\?&])utm_source(=)[^&?]*");

            // Required. Use utm_medium to identify a medium such as email or cost-per- click.
            add("([\\?&])utm_medium(=)[^&?]*");

            // Used for paid search. Use utm_term to note the keywords for this ad.
            add("([\\?&])utm_term(=)[^&?]*");

            // Used for A/B testing and content-targeted ads. Use utm_content to differentiate ads or links that point to the same
            add("([\\?&])utm_content(=)[^&?]*");

            // Used for keyword analysis. Use utm_campaign to identify a specific product promotion or strategic campaign.
            add("([\\?&])utm_campaign(=)[^&?]*");
        }
    };

    /**
     * Running record of the resolution: the original URL, every redirect that was followed,
     * the response code and status of the last request, cookies, and the derived final,
     * normalized and tokenized URLs.
     */
    private final LinkDetails linkDetails;

    /** Lower-cased hosts for which this resolver has already applied the per-domain wait. */
    private final Collection<String> domainsSensitiveTo = new HashSet<>();

    /**
     * Exposes the record this resolver fills in while it works.
     *
     * @return the link details created in the constructor; the same object is returned on
     *         every call and is updated in place while the link is being resolved
     */
    public LinkDetails getLinkDetails() {
        return this.linkDetails;
    }

    /**
     * Raw string input of the URL. If the URL is invalid, the response code that is returned will indicate such.
     *
     * @param originalURL The URL you wish to unwind represented as a string.
     */
    public LinkResolver(String originalURL) {
        linkDetails = new LinkDetails();
        linkDetails.setOriginalURL(originalURL);
    }

    /**
     * Resolves the original URL and fills in the link details: start time, redirect
     * information, the cleaned final URL, its normalized form, its tokens and the elapsed
     * time.
     *
     * <p>NOTE(review): the retry loop re-enters {@link #unwindLink(String)} with the original
     * URL without resetting the redirect state recorded by the previous attempt. When an
     * earlier attempt followed at least one redirect and then failed without producing a
     * final URL, the retry is classified as a redirect loop. Confirm whether that is intended.
     *
     * @throws NullPointerException if no original URL has been set
     */
    public void run() {

        Objects.requireNonNull(linkDetails.getOriginalURL());

        linkDetails.setStartTime(DateTime.now());

        // Try up to three times in case we hit a slow server or one that needs to be
        // warmed up; this tends to happen with smaller private servers.
        for (int attempt = 0; attempt < 3 && linkDetails.getFinalURL() == null; attempt++) {
            if (linkDetails.getLinkStatus() != LinkDetails.LinkStatus.SUCCESS) {
                unwindLink(linkDetails.getOriginalURL());
            }
        }

        // Because this is a POJO we have to set the flag explicitly when no redirect happened.
        if (linkDetails.getRedirectCount() == 0 || linkDetails.getRedirected() == null) {
            linkDetails.setRedirected(false);
        }

        linkDetails.setFinalURL(cleanURL(linkDetails.getFinalURL()));
        if (StringUtils.isNotBlank(linkDetails.getFinalURL())) {
            linkDetails.setNormalizedURL(normalizeURL(linkDetails.getFinalURL()));

            // tokenizeURL normalizes its argument itself, so it is handed the final URL.
            // Passing the already-normalized value would URL-decode (and protocol-strip)
            // the same string a second time.
            if (StringUtils.isNotBlank(linkDetails.getNormalizedURL())) {
                linkDetails.setUrlParts(tokenizeURL(linkDetails.getFinalURL()));
            }
        }

        updateTookInMillis();
    }

    /**
     * Stores the time elapsed since the recorded start time, in milliseconds.
     *
     * @throws NullPointerException if no start time has been recorded yet
     */
    protected void updateTookInMillis() {
        Objects.requireNonNull(linkDetails.getStartTime());

        final DateTime now = DateTime.now();
        final long startMillis = linkDetails.getStartTime().getMillis();
        final long elapsedMillis = now.minus(startMillis).getMillis();

        linkDetails.setTookInMills(elapsedMillis);
    }

    /**
     * Requests {@code url} once with automatic redirect following disabled, records the
     * outcome in the link details, and recurses into the redirect target when the response
     * is a redirect that carries a {@code Location} header.
     *
     * <p>The call returns before any network traffic when the URL fails validation (status
     * {@code MALFORMED_URL}) or when it has already been visited or too many redirects have
     * been recorded (status {@code LOOP}).
     *
     * @param url the URL to request; must not be {@code null}
     * @throws NullPointerException if {@code url} is {@code null}
     */
    public void unwindLink(String url) {
        Objects.requireNonNull(linkDetails);
        Objects.requireNonNull(url);

        // Check url validity
        UrlValidator urlValidator = new UrlValidator();
        if (!urlValidator.isValid(url)) {
            linkDetails.setLinkStatus(LinkDetails.LinkStatus.MALFORMED_URL);
            return;
        }

        // Check to see if they wound up in a redirect loop,
        // IE: 'A' redirects to 'B', then 'B' redirects to 'A'
        if ((linkDetails.getRedirectCount() != null && linkDetails.getRedirectCount() > 0
                && (linkDetails.getOriginalURL().equals(url) || linkDetails.getRedirects().contains(url)))
                || (linkDetails.getRedirectCount() != null
                        && linkDetails.getRedirectCount() > MAX_ALLOWED_REDIRECTS)) {
            linkDetails.setLinkStatus(LinkDetails.LinkStatus.LOOP);
            return;
        }

        if (!linkDetails.getOriginalURL().equals(url)) {
            linkDetails.getRedirects().add(url);
        }

        HttpURLConnection connection = null;

        // Store where the redirected link will go (if there is one)
        String reDirectedLink = null;

        try {
            // Turn the string into a URL
            URL thisURL = new URL(url);

            // Be sensitive to overloading domains STREAMS-77
            try {
                String host = thisURL.getHost().toLowerCase(Locale.ROOT);
                // add() reports whether the host was new, so the wait is applied only the
                // first time this resolver contacts the host.
                if (domainsSensitiveTo.add(host)) {
                    long domainWait = LinkResolverHelperFunctions.waitTimeForDomain(thisURL.getHost());
                    if (domainWait > 0) {
                        LOGGER.debug("Waiting for domain: {}", domainWait);
                        Thread.sleep(domainWait);
                    }
                }
            } catch (InterruptedException interrupted) {
                // Keep the interruption visible to the caller instead of swallowing it.
                Thread.currentThread().interrupt();
            } catch (Exception ignored) {
                // Throttling is best effort; a failure here must not prevent the request.
                LOGGER.debug("Unable to apply the per-domain wait for: " + url, ignored);
            }

            connection = (HttpURLConnection) thisURL.openConnection();

            // now we are going to pretend that we are a browser...
            if (!BOTS_ARE_OK.contains(thisURL.getHost())) {
                connection.addRequestProperty("Host", thisURL.getHost());

                // Bots are not 'ok', so we need to spoof the headers
                for (Map.Entry<String, String> header : SPOOF_HTTP_HEADERS.entrySet()) {
                    connection.addRequestProperty(header.getKey(), header.getValue());
                }

                // the test to seattlemamadoc.com prompted this change.
                // they auto detect bots by checking the referrer chain and the 'user-agent'
                // this broke the t.co test. t.co URLs are EXPLICITLY ok with bots
                // there is a list for URLS that behave this way at the top in BOTS_ARE_OK
                //
                // NOTE(review): this condition can never be true here, because the enclosing
                // branch only runs when the host is NOT in BOTS_ARE_OK; the header is
                // therefore never sent. The standard header name is also spelled "Referer".
                // Left untouched because activating it would change what is sent to servers.
                if (linkDetails.getRedirectCount() > 0 && BOTS_ARE_OK.contains(thisURL.getHost())) {
                    connection.addRequestProperty("Referrer", linkDetails.getOriginalURL());
                }
            }

            connection.setReadTimeout(DEFAULT_HTTP_TIMEOUT);
            connection.setConnectTimeout(DEFAULT_HTTP_TIMEOUT);

            // We follow redirects ourselves so that every hop is recorded and we are sure
            // to end up in the proper place, especially with links produced by shorteners.
            connection.setInstanceFollowRedirects(false);

            // Send back only the name=value part of each stored Set-Cookie value, in a single
            // Cookie header. (A split limit of 1 never splits, which would send the cookie
            // attributes - Path, Expires, ... - back to the server as well.)
            if (linkDetails.getCookies() != null) {
                StringBuilder cookieHeader = new StringBuilder();
                for (String cookie : linkDetails.getCookies()) {
                    if (cookie == null) {
                        continue;
                    }
                    String nameValue = cookie.split(";", 2)[0].trim();
                    if (nameValue.isEmpty()) {
                        continue;
                    }
                    if (cookieHeader.length() > 0) {
                        cookieHeader.append("; ");
                    }
                    cookieHeader.append(nameValue);
                }
                if (cookieHeader.length() > 0) {
                    connection.addRequestProperty("Cookie", cookieHeader.toString());
                }
            }

            connection.connect();

            linkDetails.setFinalResponseCode((long) connection.getResponseCode());

            Map<String, List<String>> headers = createCaseInsensitiveMap(connection.getHeaderFields());

            // If they want us to set cookies, well, then we will set cookies.
            // Example URL: http://nyti.ms/1bCpesx
            // Every Set-Cookie value of the response is kept, not just the first one.
            List<String> setCookies = headers.get(SET_COOKIE_IDENTIFIER);
            if (setCookies != null && linkDetails.getCookies() != null) {
                for (String setCookie : setCookies) {
                    if (setCookie != null) {
                        linkDetails.getCookies().add(setCookie);
                    }
                }
            }

            switch (linkDetails.getFinalResponseCode().intValue()) {
            /*
             * W3C HTTP Response Codes:
             * http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
             */
            case 200: // HTTP OK
                linkDetails.setFinalURL(connection.getURL().toString());
                linkDetails.setDomain(new URL(linkDetails.getFinalURL()).getHost());
                linkDetails.setLinkStatus(LinkDetails.LinkStatus.SUCCESS);
                break;
            case 300: // Multiple choices
            case 301: // URI has been moved permanently
            case 302: // Found
            case 303: // Primarily for a HTTP Post
            case 304: // Not Modified
            case 306: // This status code is unused but in the redirect block.
            case 307: // Temporary re-direct
            case 308: // Permanent re-direct
                // It is possible that we have already found our final URL, so the URL of
                // this hop is remembered as the final URL as long as it is not the original
                // URL. We still go on to follow the redirect (Smashew, 2013-11-15).
                if (!linkDetails.getOriginalURL().toLowerCase()
                        .equals(connection.getURL().toString().toLowerCase())) {
                    linkDetails.setFinalURL(connection.getURL().toString());
                }

                String location = headers.containsKey(LOCATION_IDENTIFIER)
                        ? connection.getHeaderField(LOCATION_IDENTIFIER)
                        : null;
                if (location == null) {
                    LOGGER.info("Headers: {}", headers);
                    linkDetails.setLinkStatus(LinkDetails.LinkStatus.REDIRECT_ERROR);
                } else {
                    // A Location header may be relative (for example "/login"); resolve it
                    // against the URL that was just requested so it is not rejected as
                    // malformed on the next hop. An absolute Location ignores the context.
                    reDirectedLink = new URL(connection.getURL(), location).toString();
                    linkDetails.setRedirected(Boolean.TRUE);
                    linkDetails.setRedirectCount(linkDetails.getRedirectCount() + 1);
                }
                break;
            case 305: // User must use the specified proxy (deprecated by W3C)
                break;
            case 401: // Unauthorized (nothing we can do here)
                linkDetails.setLinkStatus(LinkDetails.LinkStatus.UNAUTHORIZED);
                break;
            case 403: // HTTP Forbidden (Nothing we can do here)
                linkDetails.setLinkStatus(LinkDetails.LinkStatus.FORBIDDEN);
                break;
            case 404: // Not Found (Page is not found, nothing we can do with a 404)
                linkDetails.setLinkStatus(LinkDetails.LinkStatus.NOT_FOUND);
                break;
            case 500: // Internal Server Error
            case 501: // Not Implemented
            case 502: // Bad Gateway
            case 503: // Service Unavailable
            case 504: // Gateway Timeout
            case 505: // Version not supported
                linkDetails.setLinkStatus(LinkDetails.LinkStatus.HTTP_ERROR_STATUS);
                break;
            default:
                LOGGER.info("Unrecognized HTTP Response Code: {}", linkDetails.getFinalResponseCode());
                linkDetails.setLinkStatus(LinkDetails.LinkStatus.NOT_FOUND);
                break;
            }
        } catch (MalformedURLException e) {
            // the URL (or the redirect target) is trash, so, it can't load it.
            LOGGER.debug("Malformed URL while resolving: " + url, e);
            linkDetails.setLinkStatus(LinkDetails.LinkStatus.MALFORMED_URL);
        } catch (IOException ex) {
            // there was an issue we are going to set to error.
            LOGGER.debug("I/O failure while resolving: " + url, ex);
            linkDetails.setLinkStatus(LinkDetails.LinkStatus.ERROR);
        } catch (Exception ex) {
            // there was an unknown issue we are going to set to exception.
            LOGGER.debug("Unexpected failure while resolving: " + url, ex);
            linkDetails.setLinkStatus(LinkDetails.LinkStatus.EXCEPTION);
        } finally {
            // if the connection is not null, then we need to disconnect to close any underlying resources
            if (connection != null) {
                connection.disconnect();
            }
        }

        // If there was a redirection, then we have to keep going. Recursing only after the
        // finally block guarantees that the connection of this hop is already closed.
        if (reDirectedLink != null) {
            unwindLink(reDirectedLink);
        }

    }

    /**
     * Copies a header map into a new map whose keys are lower-cased, so headers can be
     * looked up without caring how the server capitalized them. Entries with a
     * {@code null} key or a {@code null} value are left out.
     *
     * @param input header names mapped to their values
     * @return a new map with lower-cased header names
     */
    private Map<String, List<String>> createCaseInsensitiveMap(Map<String, List<String>> input) {
        final Map<String, List<String>> lowered = new HashMap<>();
        for (Map.Entry<String, List<String>> header : input.entrySet()) {
            final String name = header.getKey();
            final List<String> values = header.getValue();
            if (name == null || values == null) {
                continue;
            }
            lowered.put(name.toLowerCase(), values);
        }
        return lowered;
    }

    /**
     * Removes the known tracking parameters from a URL and flags the link as tracked when
     * anything was removed.
     *
     * <p>When the removed parameter was the first one of the query string, the remaining
     * parameters would start with {@code '&'} and no longer form a query (for example
     * {@code /page?utm_source=a&id=5} would become {@code /page&id=5}). In that case the
     * leading {@code '&'} is turned back into {@code '?'}.
     *
     * @param url the URL to clean; may be {@code null}
     * @return the URL without tracking parameters, or {@code null} when {@code url} is {@code null}
     */
    private String cleanURL(String url) {
        // If they pass us a null URL then we are going to pass that right back to them.
        if (url == null) {
            return null;
        }

        // Strip every known tracking parameter.
        String cleaned = url;
        for (String pattern : URL_TRACKING_TO_REMOVE) {
            cleaned = cleaned.replaceAll(pattern, "");
        }

        // If the URL is smaller than when it came in, then it had tracking information.
        if (cleaned.length() < url.length()) {
            linkDetails.setTracked(Boolean.TRUE);

            // Repair the start of the query string: the text in front of the original '?'
            // is unchanged, but the position of the '?' now holds the '&' of the next
            // parameter.
            int queryStart = url.indexOf('?');
            if (queryStart >= 0
                    && cleaned.length() > queryStart
                    && cleaned.charAt(queryStart) == '&'
                    && cleaned.regionMatches(0, url, 0, queryStart)) {
                cleaned = cleaned.substring(0, queryStart) + '?' + cleaned.substring(queryStart + 1);
            }
        }

        return cleaned;
    }

    /**
     * URL-decodes the given URL and removes a leading protocol ({@code http://},
     * {@code ftp://} or similar) from the front.
     * Extend this to do other url cleaning/pre-processing.
     *
     * <p>Only a scheme at the very start of the string is removed; a {@code "://"} that
     * appears later (for example inside a query parameter) is left alone and no longer
     * truncates the result. When decoding fails the raw value is used, and the protocol is
     * still removed from it.
     *
     * @param url - The String URL to normalize; may be {@code null}
     * @return normalizedUrl - The decoded URL without its protocol, or {@code null} when
     *         {@code url} is {@code null}
     */
    public static String normalizeURL(String url) {
        if (url == null) {
            return null;
        }

        // Decode URL to remove any %20 type stuff.
        // commons-codec is used instead of URLDecoder b/c of failing tests.
        String normalizedUrl = url;
        try {
            normalizedUrl = new URLCodec().decode(url);
        } catch (Exception e) {
            LOGGER.warn("Error decoding URL. Decoding skipped: " + url, e);
        }

        // Remove the protocol, http:// ftp:// or similar, from the front only.
        normalizedUrl = normalizedUrl.replaceFirst("^[A-Za-z][A-Za-z0-9+.\\-]*://", "");

        // Room here to do more pre-processing

        return normalizedUrl;
    }

    /**
     * Splits a URL into its lower-cased, slash-separated parts. The parts can be used, for
     * example, as features for a URL classifier.
     * <p/>
     * Reference:
     * http://stackoverflow.com/questions/10046178/pattern-matching-for-url-classification
     *
     * <p>The URL is normalized (decoded, protocol removed) before it is split, and the whole
     * URL, domain included, is tokenized. Lower-casing uses {@link Locale#ROOT} so the
     * tokens do not depend on the default locale of the JVM.
     *
     * @param url - Url to be tokenized; may be {@code null}
     * @return tokens - A list of all the tokens; empty when {@code url} is {@code null}
     */
    public static List<String> tokenizeURL(String url) {
        List<String> toReturn = new ArrayList<>();
        if (url == null) {
            return toReturn;
        }

        // The whole URL is used to find tokens. To look only at the GET parameters, to
        // ignore the domain, or to treat the domain as a token of its own, that would have
        // to be handled before the split below.
        String normalized = normalizeURL(url);
        if (normalized == null) {
            return toReturn;
        }

        // Split the URL by forward slashes. Most modern browsers will accept a URL
        // this malformed such as http://www.smashew.com/hello//how////are/you
        // hence the '+' in the regular expression.
        for (String part : normalized.split("/+")) {
            toReturn.add(part.toLowerCase(Locale.ROOT));
        }

        // A stricter split could additionally drop very short tokens, for example one on
        // runs of non-alphanumeric characters that also swallows 1-2 character words.
        return toReturn;
    }

}