com.snowplowanalytics.refererparser.Parser.java Source code

Java tutorial

Introduction

Here is the source code for com.snowplowanalytics.refererparser.Parser.java

Source

/**
 * Copyright 2012-2013 Snowplow Analytics Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.snowplowanalytics.refererparser;

// Java
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.Charset;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.Collections;

// SnakeYAML
import org.yaml.snakeyaml.Yaml;
import org.yaml.snakeyaml.constructor.SafeConstructor;

// Apache URLEncodedUtils
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URLEncodedUtils;

/**
 * Java implementation of <a href="https://github.com/snowplow/referer-parser">Referer Parser</a>
 *
 * @author Alex Dean (@alexatkeplar) <support at snowplowanalytics com>
 */
public class Parser {

    private static final String REFERERS_YAML_PATH = "/referers.yml";

    /**
     * Lookup table keyed by referer domain, optionally followed by a
     * path (e.g. "orange.fr/webmail"). Populated once at construction.
     */
    private final Map<String, RefererLookup> referers;

    /**
     * Holds the structure of each referer
     * in our lookup Map.
     */
    private static class RefererLookup {
        public final Medium medium;
        public final String source;
        public final List<String> parameters;

        public RefererLookup(Medium medium, String source, List<String> parameters) {
            this.medium = medium;
            this.source = source;
            this.parameters = parameters;
        }
    }

    /**
     * Construct our Parser object using the
     * bundled referers.yml
     *
     * @throws IOException if the bundled resource cannot be found or closed
     * @throws CorruptYamlException if the referers YAML is structurally invalid
     */
    public Parser() throws IOException, CorruptYamlException {
        this.referers = loadReferersFromResource(REFERERS_YAML_PATH);
    }

    /**
     * Construct our Parser object using an
     * InputStream (in YAML format).
     *
     * The stream is read but not closed; closing it remains the
     * caller's responsibility.
     *
     * @param referersStream The referers YAML
     *        to load into our Parser, in
     *        InputStream format
     *
     * @throws CorruptYamlException if the referers YAML is structurally invalid
     */
    public Parser(InputStream referersStream) throws CorruptYamlException {
        this.referers = loadReferers(referersStream);
    }

    /**
     * Construct our Parser object using a
     * custom resource String
     *
     * @param referersResource The resource pointing
     *        to the referers YAML file to load
     *
     * @throws IOException if the resource cannot be found or closed
     * @throws CorruptYamlException if the referers YAML is structurally invalid
     */
    public Parser(String referersResource) throws IOException, CorruptYamlException {
        this.referers = loadReferersFromResource(referersResource);
    }

    /**
     * Parses a referer URI, treating it as internal when its host equals
     * the host of the given page URI.
     *
     * @param refererUri The referer to classify (may be null)
     * @param pageUri The URI of the current page (may be null)
     *
     * @return the classified Referer, or null if the referer is null,
     *         is not http/https, or has no host
     */
    public Referer parse(URI refererUri, URI pageUri) {
        return parse(refererUri, hostOf(pageUri));
    }

    /**
     * As {@link #parse(URI, URI)}, but accepts the referer as a String.
     *
     * @param refererUri The referer to classify (may be null or empty)
     * @param pageUri The URI of the current page (may be null)
     *
     * @return the classified Referer, or null if it cannot be classified
     *
     * @throws URISyntaxException if refererUri is not a valid URI
     */
    public Referer parse(String refererUri, URI pageUri) throws URISyntaxException {
        return parse(refererUri, hostOf(pageUri));
    }

    /**
     * As {@link #parse(URI, String)}, but accepts the referer as a String.
     *
     * @param refererUri The referer to classify (may be null or empty)
     * @param pageHost The host of the current page (may be null)
     *
     * @return the classified Referer, or null if refererUri is null/empty
     *         or cannot be classified
     *
     * @throws URISyntaxException if refererUri is not a valid URI
     */
    public Referer parse(String refererUri, String pageHost) throws URISyntaxException {
        // Compare by content: == on Strings only tests object identity
        if (refererUri == null || refererUri.isEmpty()) {
            return null;
        }
        return parse(new URI(refererUri), pageHost);
    }

    /**
     * Parses a referer URI with no additional internal domains.
     *
     * @param refererUri The referer to classify (may be null)
     * @param pageHost The host of the current page (may be null)
     *
     * @return the classified Referer, or null if it cannot be classified
     */
    public Referer parse(URI refererUri, String pageHost) {
        return parse(refererUri, pageHost, Collections.<String>emptyList());
    }

    /**
     * Parses a referer URI.
     *
     * @param refererUri The referer to classify (may be null)
     * @param pageHost The host of the current page (may be null)
     * @param internalDomains Further hosts which count as internal;
     *        entries are trimmed before comparison (may be null)
     *
     * @return the classified Referer, or null if the referer is null,
     *         is not http/https, or has no host
     */
    public Referer parse(URI refererUri, String pageHost, List<String> internalDomains) {
        if (refererUri == null) {
            return null;
        }
        return parse(refererUri.getScheme(), refererUri.getHost(), refererUri.getPath(), refererUri.getRawQuery(),
                pageHost, internalDomains);
    }

    /**
     * Parses a referer URL with no additional internal domains.
     *
     * @param refererUrl The referer to classify (may be null)
     * @param pageHost The host of the current page (may be null)
     *
     * @return the classified Referer, or null if it cannot be classified
     */
    public Referer parse(URL refererUrl, String pageHost) {
        if (refererUrl == null) {
            return null;
        }
        return parse(refererUrl.getProtocol(), refererUrl.getHost(), refererUrl.getPath(), refererUrl.getQuery(),
                pageHost);
    }

    private Referer parse(String scheme, String host, String path, String query, String pageHost) {
        return parse(scheme, host, path, query, pageHost, Collections.<String>emptyList());
    }

    /**
     * Core classification routine which all public parse() overloads
     * delegate to.
     */
    private Referer parse(String scheme, String host, String path, String query, String pageHost,
            List<String> internalDomains) {

        // Only http and https referers are supported (a null scheme fails both checks)
        if (!"http".equals(scheme) && !"https".equals(scheme)) {
            return null;
        }

        if (host == null) {
            return null; // Not a valid URL
        }

        // Internal link if hosts match exactly
        // TODO: would also be nice to have an algo for stripping
        // subdomains before checking match
        if (host.equals(pageHost)) {
            return new Referer(Medium.INTERNAL, null, null);
        }
        if (internalDomains != null) {
            for (String internalDomain : internalDomains) {
                if (internalDomain != null && internalDomain.trim().equals(host)) {
                    return new Referer(Medium.INTERNAL, null, null);
                }
            }
        }

        // Try to lookup our referer. First check with paths, then without.
        // This is the safest way of handling lookups
        RefererLookup referer = lookupReferer(host, path, true);
        if (referer == null) {
            referer = lookupReferer(host, path, false);
        }

        if (referer == null) {
            return new Referer(Medium.UNKNOWN, null, null); // Unknown referer, nothing more to do
        }

        // Potentially add a search term
        final String term = (referer.medium == Medium.SEARCH) ? extractSearchTerm(query, referer.parameters)
                : null;
        return new Referer(referer.medium, referer.source, term);
    }

    /**
     * Returns the host of the given URI, or null if the URI itself is null.
     */
    private static String hostOf(URI uri) {
        return (uri == null) ? null : uri.getHost();
    }

    /**
     * Looks up a host (or partial host) in our referers map.
     *
     * When includePath is set, first check the host+full path, then
     * the host+one-level path. Otherwise check the host alone.
     *
     * If not found, remove one subdomain-level off the front
     * of the host and try again, until no "." is left.
     *
     * Implemented as a loop rather than recursion so that a referer
     * host with a very large number of labels cannot exhaust the stack.
     *
     * @param refererHost The host of the referer
     * @param refererPath The path of the referer (null is treated as empty)
     * @param includePath Whether to include the path in the lookup
     *
     * @return a RefererLookup object populated with the given
     *         referer, or null if not found
     */
    private RefererLookup lookupReferer(String refererHost, String refererPath, boolean includePath) {

        final String path = (refererPath == null) ? "" : refererPath;

        // The one-level path suffix does not depend on the host, so compute it once.
        // E.g. for orange.fr/webmail/fr_FR/read.html (in our YAML it's orange.fr/webmail)
        String oneLevelPath = null;
        if (includePath) {
            final String[] pathElements = path.split("/");
            if (pathElements.length > 1) {
                oneLevelPath = "/" + pathElements[1];
            }
        }

        String host = refererHost;
        while (true) {
            // Check if domain+full path matches, e.g. for apollo.lv/portal/search/
            RefererLookup referer = includePath ? referers.get(host + path) : referers.get(host);

            if (referer == null && oneLevelPath != null) {
                referer = referers.get(host + oneLevelPath);
            }
            if (referer != null) {
                return referer;
            }

            final int idx = host.indexOf('.');
            if (idx == -1) {
                return null; // No "."? Let's quit.
            }
            host = host.substring(idx + 1); // Drop the leading subdomain and retry
        }
    }

    /**
     * Extracts the search term from a referer's query string.
     *
     * @param query The raw query string of the referer (may be null)
     * @param possibleParameters The query parameter names which may
     *        carry the search term for this referer
     *
     * @return the value of the first query parameter whose name is in
     *         possibleParameters, or null if there is none or the
     *         query cannot be parsed
     */
    private String extractSearchTerm(String query, List<String> possibleParameters) {

        if (query == null) {
            return null; // Nothing to search in
        }

        final List<NameValuePair> params;
        try {
            // Parsing via a dummy URI rather than the raw query String because of
            // https://github.com/snowplow/referer-parser/issues/76
            params = URLEncodedUtils.parse(new URI("http://localhost?" + query), "UTF-8");
        } catch (IllegalArgumentException iae) {
            return null;
        } catch (URISyntaxException use) { // For new URI
            return null;
        }

        for (NameValuePair pair : params) {
            if (possibleParameters.contains(pair.getName())) {
                return pair.getValue();
            }
        }
        return null;
    }

    /**
     * Loads the referers map from a classpath resource, closing the
     * underlying stream once it has been read.
     *
     * @param resource The classpath resource holding the referers YAML
     *
     * @return the populated referers map
     *
     * @throws IOException if the resource cannot be found or closed
     * @throws CorruptYamlException if the referers YAML is structurally invalid
     */
    private static Map<String, RefererLookup> loadReferersFromResource(String resource)
            throws IOException, CorruptYamlException {

        final InputStream stream = Parser.class.getResourceAsStream(resource);
        if (stream == null) {
            throw new IOException("Could not find referers resource '" + resource + "'");
        }
        try {
            return loadReferers(stream);
        } finally {
            stream.close();
        }
    }

    /**
     * Builds the map of hosts to referers from the
     * input YAML file.
     *
     * @param referersYaml An InputStream containing the
     *                     referers database in YAML format.
     *
     * @return a Map where the key is the hostname of each
     *         referer and the value (RefererLookup)
     *         contains all known info about this referer
     *
     * @throws CorruptYamlException if the YAML is empty, a search referer
     *         has no parameters, a non-search referer has parameters, a
     *         referer has no domains, or a domain is listed more than once
     */
    @SuppressWarnings("unchecked") // SnakeYAML hands back untyped collections, so the cast cannot be checked
    private static Map<String, RefererLookup> loadReferers(InputStream referersYaml) throws CorruptYamlException {

        final Yaml yaml = new Yaml(new SafeConstructor());
        final Map<String, Map<String, Map<String, List<String>>>> rawReferers =
                (Map<String, Map<String, Map<String, List<String>>>>) yaml.load(referersYaml);
        if (rawReferers == null) {
            throw new CorruptYamlException("No referers found in YAML");
        }

        // This will store all of our referers
        final Map<String, RefererLookup> referers = new HashMap<String, RefererLookup>();

        // Outer loop is all referers under a given medium
        for (Map.Entry<String, Map<String, Map<String, List<String>>>> mediumReferers : rawReferers.entrySet()) {

            final Medium medium = Medium.fromString(mediumReferers.getKey());

            // Inner loop is individual referers
            for (Map.Entry<String, Map<String, List<String>>> referer : mediumReferers.getValue().entrySet()) {

                final String sourceName = referer.getKey();
                final Map<String, List<String>> refererMap = referer.getValue();

                // Validate. A referer with no body at all is reported through
                // the same checks as one with missing keys.
                final List<String> parameters = (refererMap == null) ? null : refererMap.get("parameters");
                if (medium == Medium.SEARCH) {
                    if (parameters == null) {
                        throw new CorruptYamlException(
                                "No parameters found for search referer '" + sourceName + "'");
                    }
                } else {
                    if (parameters != null) {
                        throw new CorruptYamlException(
                                "Parameters not supported for non-search referer '" + sourceName + "'");
                    }
                }
                final List<String> domains = (refererMap == null) ? null : refererMap.get("domains");
                if (domains == null) {
                    throw new CorruptYamlException("No domains found for referer '" + sourceName + "'");
                }

                // Our hash needs referer domain as the
                // key, so let's expand
                for (String domain : domains) {
                    // Domains are the map's keys, so duplicates must be detected among the keys
                    if (referers.containsKey(domain)) {
                        throw new CorruptYamlException("Duplicate of domain '" + domain + "' found");
                    }
                    referers.put(domain, new RefererLookup(medium, sourceName, parameters));
                }
            }
        }

        return referers;
    }
}