org.archive.util.SURT.java Source code

Introduction

Here is the source code for org.archive.util.SURT.java
Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.util;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.url.UsableURIFactory;

/**
 * Sort-friendly URI Reordering Transform.
 * 
 * Converts URIs of the form:
 * 
 *   scheme://userinfo@domain.tld:port/path?query#fragment
 * 
 * ...into...
 * 
 *   scheme://(tld,domain,:port@userinfo)/path?query#fragment
 * 
 * The '(' ')' characters serve as an unambiguous notice that the so-called 
 * 'authority' portion of the URI ([userinfo@]host[:port] in http URIs) has 
 * been transformed; the commas prevent confusion with regular hostnames.
 * 
 * This remedies the 'problem' with standard URIs that the host portion of a 
 * regular URI, with its dotted-domains, is actually in reverse order from 
 * the natural hierarchy that's usually helpful for grouping and sorting.
 * 
 * The value of respecting URI case variance is considered negligible: it
 * is vanishingly rare for case-variance to be meaningful, while URI case-
 * variance often arises from people's confusion or sloppiness, and they
 * only correct it insofar as necessary to avoid blatant problems. Thus 
 * the usual SURT form is considered to be flattened to all lowercase, and 
 * not completely reversible. 
 * 
 * @author gojomo
 */
public class SURT {
    // These four fields are used as constants. They are left non-final and
    // protected, exactly as before, so that existing subclasses or callers
    // that touch them keep compiling.

    /** Separator between the labels of a dotted hostname. */
    protected static char DOT = '.';
    /** Marker that opens the transformed authority section of a SURT. */
    protected static String BEGIN_TRANSFORMED_AUTHORITY = "(";
    /** Separator written after every reversed host label. */
    protected static String TRANSFORMED_HOST_DELIM = ",";
    /** Marker that closes the transformed authority section of a SURT. */
    protected static String END_TRANSFORMED_AUTHORITY = ")";

    // Capturing groups of URI_SPLITTER:
    // 1: scheme://
    // 2: userinfo (if present)
    // 3: @ (if present)
    // 4: dotted-quad host
    // 5: other host
    // 6: :port
    // 7: path
    protected static String URI_SPLITTER = "^(\\w+://)(?:([-\\w\\.!~\\*'\\(\\)%;:&=+$,]+?)(@))?" +
    //        1           2                                 3    
            "(?:((?:\\d{1,3}\\.){3}\\d{1,3})|(\\S+?))(:\\d+)?(/\\S*)?$";
    //           4                            5       6       7

    // The userinfo character class above follows RFC2396:
    //       reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
    //                     "$" | ","
    //       unreserved  = alphanum | mark
    //       mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
    //       userinfo    = *( unreserved | escaped |
    //                     ";" | ":" | "&" | "=" | "+" | "$" | "," )
    //       escaped     = "%" hex hex

    /**
     * Utility method for creating the SURT form of the URI in the
     * given String.
     * 
     * Equivalent to {@code fromURI(s, false)}: the result is flattened to
     * lowercase.
     * 
     * @param s String URI to be converted to SURT form
     * @return SURT form, or {@code s} unchanged if it does not match the
     * authority-based URI pattern
     */
    public static String fromURI(String s) {
        return fromURI(s, false);
    }

    /**
     * Utility method for creating the SURT form of the URI in the
     * given String.
     * 
     * If it appears a bit convoluted in its approach, note that it was
     * optimized to minimize object-creation after allocation-sites profiling 
     * indicated this method was a top source of garbage in long-running crawls.
     * That is why the pieces are copied straight out of {@code s} by index
     * instead of via {@code Matcher.group()}, and why the matcher is borrowed
     * from and handed back to {@code TextUtils}.
     * 
     * Assumes that the String URI has already been cleaned/fixed (eg
     * by UURI fixup) in ways that put it in its crawlable form for 
     * evaluation.
     * 
     * A dotted-quad host is copied as-is (no reversal, no trailing comma);
     * any other host has its dot-separated labels written in reverse order,
     * each followed by a comma.
     * 
     * @param s String URI to be converted to SURT form
     * @param preserveCase whether original case should be preserved; when
     * false every character of the result is lowercased
     * @return SURT form, or {@code s} itself (untouched, not lowercased) if
     * it does not match the authority-based URI pattern
     */
    public static String fromURI(String s, boolean preserveCase) {
        Matcher m = TextUtils.getMatcher(URI_SPLITTER, s);
        try {
            if (!m.matches()) {
                // not an authority-based URI scheme; return unchanged
                return s;
            }
            // preallocate enough space for SURT form, which includes
            // 3 extra characters ('(', ')', and one more ',' than '.'s
            // in original)
            StringBuilder builder = new StringBuilder(s.length() + 3);
            append(builder, s, m.start(1), m.end(1)); // scheme://
            builder.append(BEGIN_TRANSFORMED_AUTHORITY); // '('

            if (m.start(4) > -1) {
                // dotted-quad ip match: don't reverse
                append(builder, s, m.start(4), m.end(4));
            } else {
                // other hostname match: do reverse. Walk backwards over the
                // host; each time the start of a label is reached (the char
                // before it is a dot, or it is the start of the host), emit
                // that label followed by ','.
                int hostStart = m.start(5);
                int hostSegEnd = m.end(5);
                for (int i = hostSegEnd - 1; i >= hostStart; i--) {
                    // bounds guard first, so we never look in front of the
                    // host unless we are actually at its first character
                    if (i > hostStart && s.charAt(i - 1) != DOT) {
                        continue;
                    }
                    append(builder, s, i, hostSegEnd); // rev host segment
                    builder.append(TRANSFORMED_HOST_DELIM); // ','
                    hostSegEnd = i - 1; // skip the dot preceding this label
                }
            }

            // optional groups report start == -1 and are skipped by append()
            append(builder, s, m.start(6), m.end(6)); // :port
            append(builder, s, m.start(3), m.end(3)); // at
            append(builder, s, m.start(2), m.end(2)); // userinfo
            builder.append(END_TRANSFORMED_AUTHORITY); // ')'
            append(builder, s, m.start(7), m.end(7)); // path
            if (!preserveCase) {
                // lowercase in place to avoid allocating a second String
                for (int i = 0; i < builder.length(); i++) {
                    builder.setCharAt(i, Character.toLowerCase(builder.charAt(i)));
                }
            }
            return builder.toString();
        } finally {
            // always hand the borrowed matcher back, whatever the exit path
            TextUtils.recycleMatcher(m);
        }
    }

    /**
     * Appends {@code cs[start, end)} to {@code b}, doing nothing when
     * {@code start} is negative (which is how an unmatched regex group
     * reports itself).
     */
    private static void append(StringBuilder b, CharSequence cs, int start, int end) {
        if (start < 0) {
            return;
        }
        b.append(cs, start, end);
    }

    /**
     * Given a plain URI or hostname/hostname+path, deduce an implied SURT 
     * prefix from it. Results may be unpredictable on strings that cannot
     * be interpreted as URIs. 
     * 
     * UURI 'fixup' is applied to the URI that is built. 
     *
     * @param u URI or almost-URI to consider
     * @return implied SURT prefix form
     */
    public static String prefixFromPlain(String u) {
        u = fromPlain(u);
        // truncate to implied prefix
        u = SurtPrefixSet.asPrefix(u);
        return u;
    }

    /**
     * Given a plain URI or hostname/hostname+path, give its SURT form.
     * Results may be unpredictable on strings that cannot
     * be interpreted as URIs. 
     * 
     * UURI 'fixup' is applied to the URI before conversion to SURT 
     * form. If that fixup fails with a URIException, the stack trace is
     * printed and conversion continues with the un-fixed string.
     *
     * @param u URI or almost-URI to consider
     * @return full (lowercased) SURT form; see {@link #prefixFromPlain(String)}
     * for the truncated prefix form
     */
    public static String fromPlain(String u) {
        u = ArchiveUtils.addImpliedHttpIfNecessary(u);
        // remember this before fixup, which may add a slash of its own
        boolean trailingSlash = u.endsWith("/");
        // ensure all typical UURI cleanup (incl. IDN-punycoding) is done
        try {
            u = UsableURIFactory.getInstance(u).toString();
        } catch (URIException e) {
            // deliberate best-effort: report, then
            // allow to continue with original string uri
            e.printStackTrace();
        }
        // except: don't let UURI-fixup add a trailing slash
        // if it wasn't already there (presence or absence of
        // such slash has special meaning specifying implied
        // SURT prefixes)
        if (!trailingSlash && u.endsWith("/")) {
            u = u.substring(0, u.length() - 1);
        }
        // convert to full SURT
        u = SURT.fromURI(u);
        return u;
    }

    /**
     * Allow class to be used as a command-line tool for converting 
     * URL lists (or naked host or host/path fragments implied
     * to be HTTP URLs) to SURT form. Lines that cannot be converted
     * are returned unchanged. 
     * 
     * Everything from the first '#' on a line to its end is treated as a
     * comment and dropped (so URI fragments are dropped too); lines that are
     * blank after that are skipped.
     *
     * Read from stdin or first file argument. Writes to stdout or 
     * second argument filename. Both streams are closed before returning,
     * including when reading fails.
     * 
     * @param args cmd-line arguments: optional input file, optional output file
     * @throws IOException if a file cannot be opened or reading fails
     */
    public static void main(String[] args) throws IOException {
        InputStream in = args.length > 0
                ? new BufferedInputStream(new FileInputStream(args[0]))
                : System.in;
        try {
            PrintStream out = args.length > 1
                    ? new PrintStream(new BufferedOutputStream(new FileOutputStream(args[1])))
                    : System.out;
            try {
                BufferedReader br = new BufferedReader(new InputStreamReader(in));
                String line;
                while ((line = br.readLine()) != null) {
                    // strip comment; '>= 0' so a '#' in the first column
                    // marks the whole line as a comment
                    int commentStart = line.indexOf('#');
                    if (commentStart >= 0) {
                        line = line.substring(0, commentStart);
                    }
                    line = line.trim();
                    if (line.length() == 0) {
                        continue;
                    }
                    line = ArchiveUtils.addImpliedHttpIfNecessary(line);
                    out.println(SURT.fromURI(line));
                }
            } finally {
                // close() also flushes whatever was buffered so far
                out.close();
            }
        } finally {
            in.close();
        }
    }
}