Java tutorial
/* PublicSuffixes.java * * $Id: BloomFilter32bitSplit.java 5197 2007-06-06 01:31:46Z gojomo $ * * Created on Jun 13, 2007 * * Copyright (C) 2007 Internet Archive * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package com.cyberway.issue.net; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.SortedSet; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.io.IOUtils; import com.cyberway.issue.util.TextUtils; /** * Utility class for making use of the information about 'public suffixes' at * http://publicsuffix.org. * * The public suffix list (once known as 'effective TLDs') was motivated by the * need to decide on which broader domains a subdomain was allowed to set * cookies. For example, a server at 'www.example.com' can set cookies for * 'www.example.com' or 'example.com' but not 'com'. 'www.example.co.uk' can set * cookies for 'www.example.co.uk' or 'example.co.uk' but not 'co.uk' or 'uk'. * The number of rules for all top-level-domains and 2nd- or 3rd- level domains * has become quite long; essentially the broadest domain a subdomain may assign * to is the one that was sold/registered to a specific name registrant. * * This concept should be useful in other contexts, too. Grouping URIs (or * queues of URIs to crawl) together with others sharing the same registered * suffix may be useful for applying the same rules to all, such as assigning * them to the same queue or crawler in a multi- machine setup. * * @author Gojomo */ public class PublicSuffixes { protected static Pattern topmostAssignedSurtPrefixPattern; protected static String topmostAssignedSurtPrefixRegex; /** * Utility method for dumping a regex String, based on a published public * suffix list, which matches any SURT-form hostname up through the broadest * 'private' (assigned/sold) domain-segment. That is, for any of the * SURT-form hostnames... * * com,example, com,example,www, com,example,california,www * * ...the regex will match 'com,example,'. * * @param args * @throws IOException */ public static void main(String args[]) throws IOException { String regex; if (args.length == 0 || "=".equals(args[0])) { // use bundled list regex = getTopmostAssignedSurtPrefixRegex(); } else { // use specified filename BufferedReader reader = new BufferedReader(new FileReader(args[0])); regex = getTopmostAssignedSurtPrefixRegex(reader); IOUtils.closeQuietly(reader); } boolean needsClose = false; BufferedWriter writer; if (args.length >= 2) { // writer to specified file writer = new BufferedWriter(new FileWriter(args[1])); needsClose = true; } else { // write to stdout writer = new BufferedWriter(new OutputStreamWriter(System.out)); } writer.append(regex); writer.flush(); if (needsClose) { writer.close(); } } /** * Reads a file of the format promulgated by publicsuffix.org, ignoring * comments and '!' exceptions/notations, converting domain segments to * SURT-ordering. Leaves glob-style '*' wildcarding in place. Returns sorted * list of unique SURT-ordered prefixes. * * @param reader * @return * @throws IOException */ public static List<String> readPublishedFileToSurtList(BufferedReader reader) throws IOException { String line; List<String> list = new ArrayList<String>(); while ((line = reader.readLine()) != null) { // discard whitespace, empty lines, comments, exceptions line = line.trim(); if (line.length() == 0 || line.startsWith("//")) { continue; } // discard utf8 notation after entry line = line.split("\\s+")[0]; line = line.toLowerCase(); // SURT-order domain segments String[] segs = line.split("\\."); StringBuilder surtregex = new StringBuilder(); for (int i = segs.length - 1; i >= 0; i--) { if (segs[i].length() > 0) { // current list has a stray '?' in a .no domain String fixed = segs[i].replaceAll("\\?", "_"); // replace '!' with '+' to indicate lookahead-for-exceptions // (gets those to sort before '*' at later build-step) fixed = fixed.replaceAll("!", "+"); surtregex.append(fixed + ","); } } list.add(surtregex.toString()); } Collections.sort(list); // uniq String last = ""; Iterator<String> iter = list.iterator(); while (iter.hasNext()) { String s = iter.next(); if (s.equals(last)) { iter.remove(); continue; } last = s; // System.out.println(s); } return list; } /** * Converts SURT-ordered list of public prefixes into a Java regex which * matches the public-portion "plus one" segment, giving the domain on which * cookies can be set or other policy grouping should occur. Also adds to * regex a fallback matcher that for any new/unknown TLDs assumes the * second-level domain is assignable. (Eg: 'zzz,example,'). * * @param list * @return */ private static String surtPrefixRegexFromSurtList(List<String> list) { StringBuilder regex = new StringBuilder(); regex.append("(?ix)^\n"); TreeSet<String> prefixes = new TreeSet<String>(Collections.reverseOrder()); prefixes.addAll(list); prefixes.add("*,"); // for new/unknown TLDs buildRegex("", regex, prefixes); regex.append("\n([\\-\\w]+,)"); String rstring = regex.toString(); // convert glob-stars to word-char-runs rstring = rstring.replaceAll("\\*", "[\\\\-\\\\w]+"); return rstring; } protected static void buildRegex(String stem, StringBuilder regex, SortedSet<String> prefixes) { if (prefixes.isEmpty()) { return; } if (prefixes.size() == 1 && prefixes.first().equals(stem)) { // avoid unnecessary "(?:)" return; } regex.append("(?:"); if (stem.length() == 0) { regex.append("\n "); // linebreak-space before first character } Iterator<String> iter = prefixes.iterator(); char c = 0; while (iter.hasNext()) { String s = iter.next(); if (s.length() > stem.length()) { char d = s.charAt(stem.length()); if (d == '+') { // convert exception to zero-width-positive-lookahead regex.append("(?=" + s.substring(stem.length() + 1) + ")"); } else { if (d == c) { continue; } c = d; regex.append(c); String newStem = s.substring(0, stem.length() + 1); SortedSet<String> tail = prefixes.tailSet(newStem); SortedSet<String> range = null; successor: for (String candidate : tail) { if (!candidate.equals(newStem)) { range = prefixes.subSet(s, candidate); break successor; } } if (range == null) { range = prefixes.tailSet(s); } buildRegex(newStem, regex, range); } regex.append('|'); } else { // empty suffix; insert dummy to be eaten when loop exits regex.append('@'); } } // eat the trailing '|' (if no empty '@') or dummy regex.deleteCharAt(regex.length() - 1); regex.append(')'); if (stem.length() == 1) { regex.append('\n'); // linebreak for TLDs } } public static synchronized Pattern getTopmostAssignedSurtPrefixPattern() { if (topmostAssignedSurtPrefixPattern == null) { topmostAssignedSurtPrefixPattern = Pattern.compile(getTopmostAssignedSurtPrefixRegex()); } return topmostAssignedSurtPrefixPattern; } public static synchronized String getTopmostAssignedSurtPrefixRegex() { if (topmostAssignedSurtPrefixRegex == null) { // use bundled list BufferedReader reader = new BufferedReader(new InputStreamReader( PublicSuffixes.class.getClassLoader().getResourceAsStream("effective_tld_names.dat"))); topmostAssignedSurtPrefixRegex = getTopmostAssignedSurtPrefixRegex(reader); IOUtils.closeQuietly(reader); } return topmostAssignedSurtPrefixRegex; } public static String getTopmostAssignedSurtPrefixRegex(BufferedReader reader) { List<String> list; try { list = readPublishedFileToSurtList(reader); } catch (IOException e) { throw new RuntimeException(e); } return surtPrefixRegexFromSurtList(list); } /** * Truncate SURT to its topmost assigned domain segment; that is, * the public suffix plus one segment, but as a SURT-ordered prefix. * * if the pattern doesn't match, the passed-in SURT is returned. * * @param surt SURT to truncate * @return truncated-to-topmost-assigned SURT prefix */ public static String reduceSurtToTopmostAssigned(String surt) { Matcher matcher = TextUtils.getMatcher(getTopmostAssignedSurtPrefixRegex(), surt); if (matcher.find()) { surt = matcher.group(); } TextUtils.recycleMatcher(matcher); return surt; } }