org.archive.url.UsableURI.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.url.UsableURI.java

Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.url;

import gnu.inet.encoding.IDNA;
import java.io.File;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.commons.httpclient.URIException;
import org.archive.util.SURT;
import org.archive.util.TextUtils;

/**
 * Usable URI.
 * 
 * This class wraps {@link org.apache.commons.httpclient.URI} adding caching
 * and methods. It cannot be instantiated directly.  Go via UURIFactory.
 * 
 * <p>We used to use {@link java.net.URI} for parsing URIs but ran across
 * quirky behaviors and bugs.  {@link java.net.URI} is not subclassable --
 * it's final -- and it's unlikely that java.net.URI will change any time soon
 * (See Gordon's considered petition here:
 * <a href="http://developer.java.sun.com/developer/bugParade/bugs/4939847.html">java.net.URI
 * should have loose/tolerant/compatibility option (or allow reuse)</a>).
 *
 * <p>This class tries to cache calculated strings, such as the extracted host
 * and this class as a string, rather than have the parent class rerun its
 * calculation every time.
 *
 * @author gojomo
 * @author stack
 *
 * @see org.apache.commons.httpclient.URI
 */
public class UsableURI extends LaxURI implements CharSequence, Serializable {

    /** Serialization version identifier for this class. */
    private static final long serialVersionUID = -1277570889914647093L;

    //private static Logger LOGGER =
    //    Logger.getLogger(UURI.class.getName());

    /**
     * Consider URIs too long for IE as illegal.
     */
    public final static int MAX_URL_LENGTH = 2083;

    /**
     * Regular expression matching a leading <code>www</code>, optional digits
     * and a dot; used by {@link #getHostBasename()} to strip that prefix.
     */
    public static final String MASSAGEHOST_PATTERN = "^www\\d*\\.";

    /**
     * Cache of the host name.
     *
     * Super class calculates on every call.  Profiling shows us spend 30% of
     * total elapsed time in URI class.
     */
    private transient String cachedHost = null;

    /**
     * Cache of this uuri escaped as a string.
     *
     * Super class calculates on every call.  Profiling shows us spend 30% of
     * total elapsed time in URI class.
     */
    private transient String cachedEscapedURI = null;

    /**
     * Cache of this uuri's plain string form, i.e. the result of
     * <code>super.toString()</code> (filled in by {@link #toCustomString()}).
     *
     * Super class calculates on every call.  Profiling shows us spend 30% of
     * total elapsed time in URI class.
     */
    private transient String cachedString = null;

    /**
     * Cached authority minus userinfo.
     */
    private transient String cachedAuthorityMinusUserinfo = null;

    /**
     * Cache of this uuri in SURT format
     */
    private transient String surtForm = null;

    // Technically, underscores are disallowed in the domainlabel
    // portion of hostname according to rfc2396 but we'll be more
    // loose and allow them. See: [ 1072035 ] [uuri] Underscore in
    // host messes up port parsing.
    // NOTE(review): 'hostname' is an inherited static character set, so this
    // presumably changes it for every user of that set -- confirm.
    static {
        hostname.set('_');
    }

    /**
     * Shutdown access to default constructor.
     *
     * Declared <code>protected</code> so that it is not part of the public
     * API; it only invokes the superclass default constructor.
     */
    protected UsableURI() {
        super();
    }

    /**
     * Builds a UURI from a string using the given charset, then normalizes it.
     *
     * @param uri String representation of an absolute URI.
     * @param escaped If escaped.
     * @param charset Charset to use.
     * @throws org.apache.commons.httpclient.URIException
     */
    protected UsableURI(String uri, boolean escaped, String charset) throws URIException {
        super(uri, escaped, charset);
        // Normalize immediately after the superclass has parsed the URI.
        normalize();
    }

    /**
     * Builds a UURI from a base UURI and a second UURI to be derelativized
     * against it, then normalizes the result.
     *
     * @param base Parent UURI to use derelativizing.
     * @param relative UURI to resolve against <code>base</code>.
     * @throws org.apache.commons.httpclient.URIException
     */
    protected UsableURI(UsableURI base, UsableURI relative) throws URIException {
        super(base, relative);
        // Normalize immediately after the superclass has combined the two URIs.
        normalize();
    }

    /**
     * Builds a UURI from a string, then normalizes it.
     *
     * @param uri String representation of a URI.
     * @param escaped If escaped.
     * @throws NullPointerException
     * @throws URIException
     */
    protected UsableURI(String uri, boolean escaped) throws URIException, NullPointerException {
        super(uri, escaped);
        // Normalize immediately after the superclass has parsed the URI.
        normalize();
    }

    /**
     * Resolves a URI string against this UURI, treating the string as not
     * yet escaped and using this UURI's protocol charset.
     *
     * @param uri URI as string that is resolved relative to this UURI.
     * @return UURI that uses this UURI as base.
     * @throws URIException
     */
    public UsableURI resolve(String uri) throws URIException {
        final boolean alreadyEscaped = false; // assume not escaped
        final String protocolCharset = this.getProtocolCharset();
        return resolve(uri, alreadyEscaped, protocolCharset);
    }

    /**
     * Resolves a URI string against this UURI using this UURI's protocol
     * charset.
     *
     * @param uri URI as string that is resolved relative to this UURI.
     * @param e True if escaped.
     * @return UURI that uses this UURI as base.
     * @throws URIException
     */
    public UsableURI resolve(String uri, boolean e) throws URIException {
        final String protocolCharset = this.getProtocolCharset();
        return resolve(uri, e, protocolCharset);
    }

    /**
     * Resolves a URI string against this UURI. The string is first parsed
     * into its own UURI, which is then combined with this instance as base.
     *
     * @param uri URI as string that is resolved relative to this UURI.
     * @param e True if uri is escaped.
     * @param charset Charset to use.
     * @return UURI that uses this UURI as base.
     * @throws URIException
     */
    public UsableURI resolve(String uri, boolean e, String charset) throws URIException {
        final UsableURI reference = new UsableURI(uri, e, charset);
        return new UsableURI(this, reference);
    }

    /**
     * Test an object if this UURI is equal to another.
     *
     * Two UURIs are equal when their scheme, opaque part, authority, path and
     * query all compare equal. Fragments are not compared because UURIs do
     * not have fragments.
     *
     * @param obj an object to compare
     * @return true if two URI objects are equal
     */
    @Override
    public boolean equals(Object obj) {
        if (obj == this) {
            return true;
        }
        if (!(obj instanceof UsableURI)) {
            return false;
        }
        final UsableURI other = (UsableURI) obj;
        // Compare component by component, stopping at the first mismatch:
        // scheme, opaque part, authority, path, then query.
        return equals(this._scheme, other._scheme)
                && equals(this._opaque, other._opaque)
                && equals(this._authority, other._authority)
                && equals(this._path, other._path)
                && equals(this._query, other._query);
    }

    /**
     * Strips www variants from the host.
     *
     * Removes a leading match of {@link #MASSAGEHOST_PATTERN}
     * (<code>www[0-9]*\.</code>) from the referenced host. The result is not
     * cached: this method is rarely used (it only benefits the legacy
     * DomainScope, which should be retired), and skipping the cache saves an
     * object pointer in every UURI instance.
     *
     * @return Host's basename, or null if there is no referenced host.
     * @throws URIException
     */
    public String getHostBasename() throws URIException {
        final String host = this.getReferencedHost();
        if (host == null) {
            return null;
        }
        return TextUtils.replaceFirst(MASSAGEHOST_PATTERN, host, UsableURIFactory.EMPTY_STRING);
    }

    /**
     * Returns an alternate, functional String representation -- in this
     * case, a String of the URI represented by this UURI instance.
     *
     * The value is computed once via <code>super.toString()</code> and cached.
     *
     * @return the cached String form of this URI
     */
    public synchronized String toCustomString() {
        if (this.cachedString != null) {
            return this.cachedString;
        }
        this.cachedString = super.toString();
        // May swap cachedString for the cachedEscapedURI instance.
        coalesceUriStrings();
        return this.cachedString;
    }

    /**
     * Override to cache result; simply delegates to {@link #toCustomString()}.
     *
     * TODO: eliminate, moving most callers to toCustomString, to avoid
     * overloading/diluting toString()
     * (see http://webteam.archive.org/confluence/display/Heritrix/Preserve+toString%28%29 )
     * @return String representation of this URI
     */
    @Override
    public String toString() {
        final String cached = toCustomString();
        return cached;
    }

    /**
     * In the case of a puny encoded IDN, this method returns the decoded Unicode version.
     * <p>
     * Most of this implementation is copied from {@link org.apache.commons.httpclient.URI#setURI()}.
     * When the host is not flagged as a hostname, the regular
     * {@link #toString()} form is returned unchanged.
     *
     * @return decoded IDN version of URI
     */
    public String toUnicodeHostString() {
        if (!_is_hostname) {
            return toString();
        }

        try {
            final StringBuilder out = new StringBuilder();

            if (_scheme != null) {
                out.append(_scheme).append(':');
            }

            if (_is_net_path) {
                out.append("//");
                if (_authority != null) { // has_authority
                    if (_userinfo != null) {
                        out.append(_userinfo).append('@');
                    }
                    // The only part that differs from the plain form: the
                    // host is passed through IDNA decoding.
                    out.append(IDNA.toUnicode(getHost()));
                    if (_port >= 0) {
                        out.append(':').append(_port);
                    }
                }
            }

            if (_opaque != null && _is_opaque_part) {
                out.append(_opaque);
            } else if (_path != null && _path.length > 0) {
                // _is_hier_part or _is_relativeURI
                out.append(_path);
            }

            if (_query != null) { // has_query
                out.append('?').append(_query);
            }

            return out.toString();
        } catch (URIException ex) {
            throw new RuntimeException(ex);
        }
    }

    /**
     * Returns the escaped form of this URI, computing it through the
     * superclass on first use and caching it afterwards.
     *
     * @return the cached escaped URI string
     */
    public synchronized String getEscapedURI() {
        if (this.cachedEscapedURI != null) {
            return this.cachedEscapedURI;
        }
        this.cachedEscapedURI = super.getEscapedURI();
        // Lets cachedString share this instance when both have the same length.
        coalesceUriStrings();
        return this.cachedEscapedURI;
    }

    /**
     * The two String fields cachedString and cachedEscapedURI are
     * usually identical; if so, coalesce into a single instance.
     *
     * Does nothing until both caches have been populated.
     */
    protected void coalesceUriStrings() {
        if (this.cachedString == null || this.cachedEscapedURI == null) {
            return;
        }
        // lengths will only be identical if contents are identical
        // (deescaping will always shrink length), so coalesce to
        // use only single cached instance
        final boolean sameLength = this.cachedString.length() == this.cachedEscapedURI.length();
        if (sameLength) {
            this.cachedString = this.cachedEscapedURI;
        }
    }

    /**
     * Returns the host of this URI, computing it through the superclass on
     * first use and caching it afterwards.
     *
     * @return the cached host, or null when this URI has no host
     * @throws URIException
     */
    public synchronized String getHost() throws URIException {
        // If this._host is null, 3.0 httpclient throws
        // illegalargumentexception.  Don't go there.
        final boolean needsLookup = this.cachedHost == null && this._host != null;
        if (needsLookup) {
            this.cachedHost = super.getHost();
            coalesceHostAuthorityStrings();
        }
        return this.cachedHost;
    }

    /**
     * The two String fields cachedHost and cachedAuthorityMinusUserInfo are
     * usually identical; if so, coalesce into a single instance.
     *
     * Does nothing until both caches have been populated.
     */
    protected void coalesceHostAuthorityStrings() {
        if (this.cachedAuthorityMinusUserinfo == null || this.cachedHost == null) {
            return;
        }
        // lengths can only be identical if contents
        // are identical; use only one instance
        final boolean sameLength =
                this.cachedHost.length() == this.cachedAuthorityMinusUserinfo.length();
        if (sameLength) {
            this.cachedAuthorityMinusUserinfo = this.cachedHost;
        }
    }

    /**
     * Return the referenced host in the UURI, if any, also extracting the
     * host of a DNS-lookup URI where necessary.
     *
     * When the URI has no host and its scheme is <code>dns</code>, the current
     * hierarchical path is used as the host, provided it consists solely of
     * word characters, '-', '_', '.' and ':'.
     *
     * @return the target or topic host of the URI, or null if none can be
     * determined
     * @throws URIException
     */
    public String getReferencedHost() throws URIException {
        String referencedHost = this.getHost();
        // Constant-first comparison: a URI without a scheme (for example a
        // relative reference) yields a null scheme, which must not cause a
        // NullPointerException here.
        if (referencedHost == null && "dns".equals(this.getScheme())) {
            // extract target domain of DNS lookup
            String possibleHost = this.getCurrentHierPath();
            if (possibleHost != null && possibleHost.matches("[-_\\w\\.:]+")) {
                referencedHost = possibleHost;
            }
        }
        return referencedHost;
    }

    /**
     * Returns the SURT form of this UURI, derived from {@link #toString()}
     * on first use and cached afterwards.
     *
     * @return Return the 'SURT' format of this UURI
     */
    public String getSurtForm() {
        if (surtForm != null) {
            return surtForm;
        }
        surtForm = SURT.fromURI(this.toString());
        return surtForm;
    }

    /**
     * Return the authority minus userinfo (if any).
     *
     * If no userinfo present, just returns the authority. Everything up to
     * and including the first '@' is treated as userinfo. A non-null result
     * is cached.
     *
     * @return The authority stripped of any userinfo if present.
     * @throws URIException
     */
    public String getAuthorityMinusUserinfo() throws URIException {
        if (this.cachedAuthorityMinusUserinfo != null) {
            return this.cachedAuthorityMinusUserinfo;
        }
        String authority = getAuthority();
        if (authority != null && authority.length() > 0) {
            final int at = authority.indexOf('@');
            if (at >= 0) {
                authority = authority.substring(at + 1);
            }
        }
        this.cachedAuthorityMinusUserinfo = authority;
        // Lets this cache share the cachedHost instance when lengths match.
        coalesceHostAuthorityStrings();
        return this.cachedAuthorityMinusUserinfo;
    }

    /**
     * Length of this UURI viewed as a character sequence, i.e. the length of
     * its escaped URI string.
     *
     * @see java.lang.CharSequence#length()
     */
    public int length() {
        final String escaped = getEscapedURI();
        return escaped.length();
    }

    /**
     * Character at the given index of this UURI's escaped URI string.
     *
     * @see java.lang.CharSequence#charAt(int)
     */
    public char charAt(int index) {
        final String escaped = getEscapedURI();
        return escaped.charAt(index);
    }

    /**
     * Subsequence of this UURI's escaped URI string.
     *
     * @see java.lang.CharSequence#subSequence(int, int)
     */
    public CharSequence subSequence(int start, int end) {
        final String escaped = getEscapedURI();
        return escaped.subSequence(start, end);
    }

    /**
     * Compares this UURI's escaped URI string with the <code>toString()</code>
     * form of the argument, using String ordering.
     *
     * @see java.lang.Comparable#compareTo(java.lang.Object)
     */
    public int compareTo(Object arg0) {
        final String mine = getEscapedURI();
        final String theirs = arg0.toString();
        return mine.compareTo(theirs);
    }

    /**
     * Test if passed String has likely URI scheme prefix.
     *
     * Scans from the start of the string: the result is true only if a ':'
     * is reached at an index other than 0 and every character before it
     * belongs to the inherited scheme character set.
     *
     * @param possibleUrl URL string to examine.
     * @return True if passed string looks like it could be an URL.
     */
    public static boolean hasScheme(String possibleUrl) {
        final int len = possibleUrl.length();
        for (int pos = 0; pos < len; pos++) {
            final char ch = possibleUrl.charAt(pos);
            if (ch == ':') {
                // A colon at index 0 means the scheme would be empty.
                return pos != 0;
            }
            if (!scheme.get(ch)) {
                return false;
            }
        }
        return false;
    }

    /**
     * Extracts the last name element from a file path or a URI.
     *
     * If the argument looks like it has a URI scheme (see
     * {@link #hasScheme(String)}), it is parsed as a {@link java.net.URI} and
     * the URI's path is used. For opaque URIs, which have no path (for
     * example <code>dns:archive.org</code>), the scheme-specific part is used
     * instead. Otherwise the argument is treated as a plain file path.
     *
     * @param pathOrUri A file path or a URI.
     * @return Path parsed from passed <code>pathOrUri</code>.
     * @throws URISyntaxException if the argument has a scheme but is not a
     * valid URI.
     */
    public static String parseFilename(final String pathOrUri) throws URISyntaxException {
        String path = pathOrUri;
        if (UsableURI.hasScheme(pathOrUri)) {
            final URI parsed = new URI(pathOrUri);
            path = parsed.getPath();
            if (path == null) {
                // java.net.URI#getPath() returns null for opaque URIs;
                // passing null to new File(...) would throw
                // NullPointerException, so fall back to the part after the
                // scheme.
                path = parsed.getSchemeSpecificPart();
            }
        }
        return new File(path).getName();
    }

    /**
     * Custom serialization hook: writes this UURI's string form
     * ({@link #toCustomString()}) to the stream as a UTF string.
     *
     * @param stream stream to write to
     * @throws IOException if the write fails
     */
    private void writeObject(ObjectOutputStream stream) throws IOException {
        final String serializedForm = toCustomString();
        stream.writeUTF(serializedForm);
    }

}