Java tutorial: a walk through Wikimedia's Webrequest class
// Copyright 2014 Wikimedia Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package org.wikimedia.analytics.refinery.core;

import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;

/**
 * Functions to work with Wikimedia webrequest data.
 */
public class Webrequest {

    /*
     * Meta-methods to enable eager instantiation in a singleton-based way.
     * In non-Java terms: you only get to create one class instance, and only
     * when you need it, instead of always having everything (static/eager
     * instantiation) or always generating everything anew (no singletons).
     * So we have:
     * (1) an instance;
     * (2) a private, empty constructor (to stop people from just calling the
     *     constructor);
     * (3) an actual getInstance method to allow for instantiation.
     */
    private static final Webrequest instance = new Webrequest();

    private Webrequest() {
    }

    public static Webrequest getInstance() {
        return instance;
    }

    /*
     * Regex to coarsely match email addresses in the user agent as part of
     * spider identification, since the User-Agent policy
     * (https://meta.wikimedia.org/wiki/User-Agent_policy) encourages bot
     * developers to leave contact information in the user agent string.
     */
    private static final String coarseEmailPattern = "\\S+@\\S+\\.[a-zA-Z]{2,3}";

    /*
     * Spider identification pattern (obviously not perfect...),
     * to be used in addition to the ua-parser device_family field
     * being identified as Spider.
     *
     * Also implements the Wikimedia User-Agent policy:
     * https://meta.wikimedia.org/wiki/User-Agent_policy
     */
    private static final Pattern spiderPattern = Pattern.compile("(?i)^("
            + ".*(bot|spider|WordPress|AppEngine|AppleDictionaryService|Python-urllib|python-requests|"
            + "Google-HTTP-Java-Client|[Ff]acebook|[Yy]ahoo|RockPeaks|http).*"
            + "|(goo wikipedia|MediaWikiCrawler-Google|wikiwix-bot|Java/|curl|PHP/|Faraday|HTTPC|Ruby|\\.NET|"
            + "Python|Apache|Scrapy|PycURL|libwww|Zend|wget|nodemw|WinHttpRaw|Twisted|com\\.eusoft|Lagotto|"
            + "Peggo|Recuweb|check_http|Magnus|MLD|Jakarta|find-link|J\\. River|projectplan9|ADmantX|"
            + "httpunit|LWP|iNaturalist|WikiDemo|FSResearchIt|livedoor|Microsoft Monitoring|MediaWiki|"
            + "User:|User_talk:|github|tools.wmflabs.org|" + coarseEmailPattern + ").*"
            + ")$");

    /*
     * The spider identification regex takes a lot of computation power, while
     * there are only a relatively small number of recurrent user_agent values
     * (less than a million). We use an LRU cache to avoid recomputing the
     * agent type for frequently seen user agents.
     */
    private Utilities.LRUCache<String, Boolean> agentTypeCache =
            new Utilities.LRUCache<>(10000);

    /**
     * Pattern for automatically-added subdomains that indicate zero,
     * or some similar portal-based interface to MW.
     */
    private static final Pattern uriHostPattern = Pattern.compile(
            "(^(m|zero|wap|mobile)\\.)|(\\.(m|zero|wap|mobile)\\.)");

    /**
     * Consistent fragment of the user agent used by the official Wikimedia
     * mobile apps: used to identify app requests in getAccessMethod.
     */
    private static final Pattern appAgentPattern = Pattern.compile("WikipediaApp");
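    /*
     * Tutorial aside (not part of the original source): a quick sense of what
     * spiderPattern accepts, using made-up user-agent strings. Note that
     * isSpider() below calls find() on a pattern anchored with "^(...)$", so
     * the whole string still has to match one of the two alternatives.
     *
     *   spiderPattern.matcher("ExampleBot/2.1 (http://example.org/bot)").find()
     *       // true: "bot" and "http" both hit the ".*(bot|...|http).*" alternative
     *   spiderPattern.matcher("curl/7.68.0").find()
     *       // true: "curl" matches the second alternative at the start of the string
     *   spiderPattern.matcher("Mozilla/5.0 (X11; Linux x86_64)").find()
     *       // false: no listed token, and no email-shaped substring at the start
     */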
    /**
     * Identifies a bunch of spiders; returns TRUE
     * if the user agent matches a known spider.
     *
     * @param userAgent the user agent associated with the request.
     * @return boolean
     */
    public boolean isSpider(String userAgent) {
        if ("-".equals(userAgent)) {
            return true;
        } else if (agentTypeCache.containsKey(userAgent)) {
            return agentTypeCache.get(userAgent);
        } else {
            boolean isSpider = spiderPattern.matcher(userAgent).find();
            agentTypeCache.put(userAgent, isSpider);
            return isSpider;
        }
    }

    /**
     * Kept for backward compatibility.
     */
    @Deprecated
    public boolean isCrawler(String userAgent) {
        return isSpider(userAgent);
    }

    /**
     * Given an x_analytics field and the name of a key, returns the
     * value associated with said key, or an empty string if the key
     * is not found.
     *
     * @param xAnalytics the x_analytics field entry.
     * @param key the key to search for the value of.
     * @return String
     */
    public String getXAnalyticsValue(String xAnalytics, String key) {
        String value = "";

        int keyIndex = xAnalytics.indexOf(key);
        if (keyIndex == -1) {
            return value;
        }

        // The value runs from just past "key=" to the next ";", or to the
        // end of the field if there is no further delimiter.
        int delimiterIndex = xAnalytics.indexOf(";", keyIndex);
        if (delimiterIndex == -1) {
            value = xAnalytics.substring(keyIndex + key.length() + 1);
        } else {
            value = xAnalytics.substring(keyIndex + key.length() + 1, delimiterIndex);
        }

        return value;
    }

    /**
     * Determines the method used for accessing the site: mobile web,
     * desktop, or mobile app. If the user agent is an app agent, it's
     * "mobile app"; if it is not, but the request went to an m. or
     * zero. domain, "mobile web"; otherwise, "desktop".
     *
     * @param uriHost the value in the uri_host field.
     * @param userAgent the user_agent.
     * @return String
     */
    public String getAccessMethod(String uriHost, String userAgent) {
        String accessMethod;
        if (appAgentPattern.matcher(userAgent).find()) {
            accessMethod = "mobile app";
        } else if (uriHostPattern.matcher(uriHost).find()) {
            accessMethod = "mobile web";
        } else {
            accessMethod = "desktop";
        }
        return accessMethod;
    }

    /**
     * Classifies a referer.
     *
     * @param url the referer url to classify
     * @return RefererClassification
     */
    public String classifyReferer(String url) {
        if (url == null || url.isEmpty() || url.equals("-")) {
            return Referer.NONE.getRefLabel();
        }

        String[] urlParts = StringUtils.splitPreserveAllTokens(url, '/');
        if (urlParts == null || urlParts.length < 3) {
            return Referer.UNKNOWN.getRefLabel();
        }

        if (!urlParts[0].equals("http:") && !urlParts[0].equals("https:")) {
            return Referer.UNKNOWN.getRefLabel();
        }

        if (!urlParts[1].isEmpty()) {
            return Referer.UNKNOWN.getRefLabel();
        }

        String[] domainParts = StringUtils.splitPreserveAllTokens(urlParts[2], '.');
        if (domainParts == null || domainParts.length < 2) {
            return Referer.UNKNOWN.getRefLabel();
        }

        if (domainParts[domainParts.length - 1].equals("org")) {
            switch (domainParts[domainParts.length - 2]) {
            case "":
                return Referer.UNKNOWN.getRefLabel();
            case "mediawiki":
            case "wikibooks":
            case "wikidata":
            case "wikinews":
            case "wikimedia":
            case "wikimediafoundation":
            case "wikipedia":
            case "wikiquote":
            case "wikisource":
            case "wikiversity":
            case "wikivoyage":
            case "wiktionary":
                return Referer.INTERNAL.getRefLabel();
            }
        }
        return Referer.EXTERNAL.getRefLabel();
    }

    /**
     * Extracts project information from a host by lowercasing it, splitting
     * it on dots, and assigning the projectClass, project, qualifiers and
     * tld parts based on the splits.
     *
     * Example: normalizeHost("en.m.zero.wikipedia.org")<br/>
     * Returns:<br/>
     * NormalizedHostInfo(
     *     "project_class": "wikipedia",
     *     "project": "en",
     *     "qualifiers": ["m", "zero"],
     *     "tld": "org",
     * )
     *
     * @param uriHost The url's host
     * @return A NormalizedHostInfo object with project_class, project,
     *         qualifiers and tld values set.
     */
    public NormalizedHostInfo normalizeHost(String uriHost) {
        NormalizedHostInfo result = new NormalizedHostInfo();
        if ((uriHost == null) || (uriHost.isEmpty())) return result;

        // Remove the port, if any
        int portIdx = uriHost.indexOf(":");
        uriHost = uriHost.substring(0, ((portIdx < 0) ? uriHost.length() : portIdx));

        // Collapse any run of dots into a single dot (the character class
        // also swallows slashes)
        uriHost = uriHost.replaceAll("[//.]+", ".");

        // Split by the dots
        String[] uriParts = uriHost.toLowerCase().split("\\.");

        // If there are no split parts, return the empty result
        if (uriParts.length == 0) return result;

        // Handle the special case where the TLD is numeric --> assume an IP
        // address, don't normalize. Length is > 0 because of the previous
        // check, so no error case.
        if (uriParts[uriParts.length - 1].matches("[0-9]+")) return result;

        if (uriParts.length > 1) {
            // project_class and TLD normalization
            result.setProjectClass(uriParts[uriParts.length - 2]);
            result.setTld(uriParts[uriParts.length - 1]);
        }

        // project normalization
        if ((uriParts.length > 2) && (!uriParts[0].equals("www")))
            result.setProject(uriParts[0]);

        // qualifiers normalization: xx.[q1.q2.q3].wikixxx.xx
        if (uriParts.length > 3) {
            for (int i = 1; i < uriParts.length - 2; i++) {
                result.addQualifier(uriParts[i]);
            }
        }
        return result;
    }
}