Java tutorial
/* * $Header: /cvsrep/fd/WEB-INF/classes/org/eleusoft/uri/apache/URI.java,v 1.1 2007/07/10 18:19:30 mik Exp $ * $Revision: 1.1 $ * $Date: 2007/07/10 18:19:30 $ * * ==================================================================== * * Copyright 2002-2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. * */ //package org.apache.commons.httpclient; // http://svn.apache.org/viewvc/jakarta/commons/proper/httpclient/trunk/ // http://svn.apache.org/viewvc/jakarta/commons/proper/httpclient/trunk/src/java/org/apache/commons/httpclient/ package org.eleusoft.uri.apache; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.Serializable; import java.util.BitSet; import java.util.Hashtable; import java.util.Locale; //import org.apache.commons.codec.DecoderException; //import org.apache.commons.codec.net.URLCodec; //import org.apache.commons.httpclient.util.EncodingUtil; /** * The interface for the URI(Uniform Resource Identifiers) version of RFC 2396. * This class has the purpose of supportting of parsing a URI reference to * extend any specific protocols, the character encoding of the protocol to * be transported and the charset of the document. * <p> * A URI is always in an "escaped" form, since escaping or unescaping a * completed URI might change its semantics. * <p> * Implementers should be careful not to escape or unescape the same string * more than once, since unescaping an already unescaped string might lead to * misinterpreting a percent data character as another escaped character, * or vice versa in the case of escaping an already escaped string. * <p> * In order to avoid these problems, data types used as follows: * <p><blockquote><pre> * URI character sequence: char * octet sequence: byte * original character sequence: String * </pre></blockquote><p> * * So, a URI is a sequence of characters as an array of a char type, which * is not always represented as a sequence of octets as an array of byte. * <p> * * URI Syntactic Components * <p><blockquote><pre> * - In general, written as follows: * Absolute URI = <scheme>:<scheme-specific-part> * Generic URI = <scheme>://<authority><path>?<query> * * - Syntax * absoluteURI = scheme ":" ( hier_part | opaque_part ) * hier_part = ( net_path | abs_path ) [ "?" query ] * net_path = "//" authority [ abs_path ] * abs_path = "/" path_segments * </pre></blockquote><p> * * The following examples illustrate URI that are in common use. * <pre> * ftp://ftp.is.co.za/rfc/rfc1808.txt * -- ftp scheme for File Transfer Protocol services * gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles * -- gopher scheme for Gopher and Gopher+ Protocol services * http://www.math.uio.no/faq/compression-faq/part1.html * -- http scheme for Hypertext Transfer Protocol services * mailto:mduerst@ifi.unizh.ch * -- mailto scheme for electronic mail addresses * news:comp.infosystems.www.servers.unix * -- news scheme for USENET news groups and articles * telnet://melvyl.ucop.edu/ * -- telnet scheme for interactive services via the TELNET Protocol * </pre> * Please, notice that there are many modifications from URL(RFC 1738) and * relative URL(RFC 1808). * <p> * <b>The expressions for a URI</b> * <p><pre> * For escaped URI forms * - URI(char[]) // constructor * - char[] getRawXxx() // method * - String getEscapedXxx() // method * - String toString() // method * <p> * For unescaped URI forms * - URI(String) // constructor * - String getXXX() // method * </pre><p> * * @author <a href="mailto:jericho@apache.org">Sung-Gu</a> * @author <a href="mailto:mbowler@GargoyleSoftware.com">Mike Bowler</a> * @version $Revision: 1.1 $ $Date: 2002/03/14 15:14:01 */ class URI implements Comparable, Serializable { private static boolean MVSURE = true; private static boolean MV = true; // ----------------------------------------------------------- Constructors /** Create an instance as an internal use */ protected URI() { } /** * Construct a URI from the given string. * <p><blockquote><pre> * URI-reference = URI / relative-ref * </pre></blockquote><p> * An URI can be placed within double-quotes or angle brackets like * "http://test.com/" and <http://test.com/> * * @param original the URI string representation, * in percent escaped form. * @throws URIException If the URI cannot be created. */ public URI(String original) throws URIException { parseUriReference(original); } /** * Construct a general URI from the given components. * <p><blockquote><pre> * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] * absoluteURI = scheme ":" ( hier_part | opaque_part ) * opaque_part = uric_no_slash *uric * </pre></blockquote><p> * It's for absolute URI = <scheme>:<scheme-specific-part># * <fragment>. * * @param scheme the scheme string * @param schemeSpecificPart scheme_specific_part * @param fragment the fragment string * @throws URIException If the URI cannot be created. * @see #getDefaultProtocolCharset */ public URI(String scheme, String schemeSpecificPart, String fragment) throws URIException { // validate and contruct the URI character sequence if (scheme == null) { throw new URIException(URIException.PARSING, "scheme required"); } _scheme = validateScheme(scheme); _opaque = encode(schemeSpecificPart, allowed_opaque_part, getProtocolCharset()); // Set flag _is_opaque_part = true; _fragment = fragment == null ? null : validateFragment(fragment); setURI(); } /** * Construct a general URI from the given components. * <p><blockquote><pre> * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] * absoluteURI = scheme ":" ( hier_part | opaque_part ) * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ] * hier_part = ( net_path | abs_path ) [ "?" query ] * </pre></blockquote><p> * It's for absolute URI = <scheme>:<path>?<query>#< * fragment> and relative URI = <path>?<query>#<fragment * >. * * @param scheme the scheme string * @param authority the authority string * @param path the path string * @param query the query string * @param fragment the fragment string * @throws URIException If the new URI cannot be created. * @see #getDefaultProtocolCharset */ public URI(String scheme, String authority, String path, String query, String fragment) throws URIException { if (path == null) throw new IllegalArgumentException( "Null path passed. " + "\nRFC 3986 3.3 A path is always defined for a URI, " + "though the defined path may be empty (zero length)."); final char[] schemeChars; if (scheme != null) { schemeChars = validateScheme(scheme); } else schemeChars = null; // MV: here must first escape the components // and then pass to parseURIReference(String,true) // otherwise path with %23 [#] is lost.... // NEW: query and frag must be already escaped // validate and contruct the URI character sequence StringBuffer buff = new StringBuffer(); if (schemeChars != null) { buff.append(schemeChars); buff.append(':'); } if (authority != null) { buff.append("//"); buff.append(authority); } if (path.length() != 0) { // MV DO NOT check for empty path, was: (path!=null) //accept empty path if ((scheme != null || authority != null) && isPathARelativePath(path)) { throw new URIException(URIException.PARSING, "abs_path requested"); } else { final String escapedPath = escapePathAsString(path); checkNotEmptyEscapedPathAndAuthority(escapedPath, authority); buff.append(escapedPath); } } if (query != null) { buff.append('?'); buff.append(query); } if (fragment != null) { buff.append('#'); //buff.append(escapeFragmentAsString(fragment)); buff.append(fragment); //2007remove } parseUriReference(buff.toString()); } private char[] validateFragment(String fragment) throws URIException { char[] arr = fragment.toCharArray(); if (arr.length == 0) return EMPTYCHARS; else { if (!validate(arr, URI.ABNF_fragment)) { throw new URIException(URIException.PARSING, "incorrect fragment: [" + fragment + "]"); } else return arr; } } private char[] validateQuery(String query) throws URIException { char[] arr = query.toCharArray(); if (arr.length == 0) return EMPTYCHARS; else { // remove the fragment identifier // OLD not needed // escapedQuery = removeFragmentIdentifier(escapedQuery); if (!validate(arr, URI.ABNF_query)) { throw new URIException(URIException.PARSING, "incorrect query: [" + query + "]"); } else return arr; } } /** * Scheme validation */ /** * Scheme validation */ private char[] validateScheme(String scheme) throws URIException { final char[] schemeChars = scheme.toCharArray(); if (schemeChars.length == 0) { throw new URIException(URIException.PARSING, "scheme cannot be an empty string " + "(URI cannot start with colon ':')."); } if (!ALPHA.get(schemeChars[0])) { throw new URIException(URIException.PARSING, "scheme must start with letter (a-z A-Z) : [" + new String(schemeChars) + "]"); } else if (!validate(schemeChars, URI.ABNF_scheme)) { throw new URIException(URIException.PARSING, "incorrect scheme: [" + new String(schemeChars) + "]"); } else return schemeChars; } private void checkNotEmptyEscapedPathAndAuthority(String path, String auth) throws URIException { // RFC3986 3.3 // (..) If a URI does not contain an authority component, then the path cannot begin // with two slash characters ("//"). // NOTE: empty string is a present, empty authority component if (auth == null && (path.startsWith("//"))) { throw new URIException("RFC3986 3.3 if an URI " + "does not contain an authority component " + "then the path cannot begin with [//]. Escaped path: [" + path + "]"); } if (isPathARelativePath(path)) { // RFC 3986 3.3 // If the path is a relative path, // and so the uri is a relative path reference, // it cannot contain a colon in its first segment. // RFC 3986 4.2 // (..)A path segment that contains a colon character (e.g., "this:that") // cannot be used as the first segment of a relative-path reference, as // it would be mistaken for a scheme name. Such a segment must be // preceded by a dot-segment (e.g., "./this:that") to make a relative- // path reference. // ------------- // Yes but after normalization ? // Relative paths cannot be normalized !!! // Found a good reason for not normalizing. final int indxColon = path.indexOf(":"); final int indxSlash = path.indexOf("/"); if (indxColon >= 0 && (indxSlash == -1 || indxColon < indxSlash)) { // If there is a colon in the rel path and // (the first slash is not present or occurs after the colon), // then it MUST start with ./ if (!path.startsWith("./")) throw new URIException("RFC3986 3.3 a URI reference" + " may be a relative-path reference, in which case the " + " first path segment cannot contain a colon (\":\") character. Escaped path:" + path); } } } /** * @path NON NULL path * @returt true if relative path */ private static boolean isPathARelativePath(String path) { return !path.startsWith("/"); } private char[] escapePath(final String path) throws URIException { if (path == null) return null; else if (path.length() == 0) return EMPTYCHARS; else return _escapePathAsString(path).toCharArray(); } private String escapePathAsString(final String path) throws URIException { if (path == null || path.length() == 0) { return path; } else return _escapePathAsString(path); } /** * @param NON NULL and NON EMPTY path */ private String _escapePathAsString(final String path) throws URIException { // set the charset to do escape encoding final String charset = getProtocolCharset(); final boolean isRelativePath = isPathARelativePath(path); final String result; if (!isRelativePath) { result = new String(encode(path, allowed_abs_path, charset)); //System.out.println("MV: abs or net path escaped is " + result); } else { //////////if (_is_rel_path) { StringBuffer buff = new StringBuffer(path.length()); int at = path.indexOf('/'); if (at == 0) { // never 0 throw new URIException(URIException.PARSING, "incorrect relative path"); } if (at > 0) { buff.append(encode(path.substring(0, at), allowed_rel_path, charset)); buff.append(encode(path.substring(at), allowed_abs_path, charset)); } else { buff.append(encode(path, allowed_rel_path, charset)); } result = buff.toString(); //.toCharArray(); //System.out.println("MV: rel path escaped is " + result); } /** else if (_is_opaque_part) { StringBuffer buf = new StringBuffer(); buf.insert(0, encode(path.substring(0, 1), uric_no_slash, charset)); buf.insert(1, encode(path.substring(1), uric, charset)); _opaque = buf.toString().toCharArray(); } else { throw new URIException(URIException.PARSING, "incorrect path"); } **/ return result; } /** * Construct a general URI with the given relative URI string. * * @param base the base URI * @param relative the relative URI string, * in percent escaped form. * @throws URIException If the new URI cannot be created. * */ public URI(URI base, String relative) throws URIException { this(base, new URI(relative)); } /** * Construct a general URI with the given relative URI. * <p><blockquote><pre> * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ] * </pre></blockquote><p> * Resolving Relative References to Absolute Form. * * <strong>Examples of Resolving Relative URI References</strong> * * Within an object with a well-defined base URI of * <p><blockquote><pre> * http://a/b/c/d;p?q * </pre></blockquote><p> * the relative URI would be resolved as follows: * * Normal Examples * * <p><blockquote><pre> * g:h = g:h * g = http://a/b/c/g * ./g = http://a/b/c/g * g/ = http://a/b/c/g/ * /g = http://a/g * //g = http://g * ?y = http://a/b/c/?y * g?y = http://a/b/c/g?y * #s = (current document)#s * g#s = http://a/b/c/g#s * g?y#s = http://a/b/c/g?y#s * ;x = http://a/b/c/;x * g;x = http://a/b/c/g;x * g;x?y#s = http://a/b/c/g;x?y#s * . = http://a/b/c/ * ./ = http://a/b/c/ * .. = http://a/b/ * ../ = http://a/b/ * ../g = http://a/b/g * ../.. = http://a/ * ../../ = http://a/ * ../../g = http://a/g * </pre></blockquote><p> * * Some URI schemes do not allow a hierarchical syntax matching the * <hier_part> syntax, and thus cannot use relative references. * * @param base the base URI * @param relative the relative URI * @throws URIException If the new URI cannot be created. */ public URI(URI base, URI relative) throws URIException { // RFC 3986 5.2.1 says no scheme, no resolving // TODO add to "strict" option if (base._scheme == null) { throw new URIException(URIException.PARSING, "base URI required"); } else if (base._is_opaque_part || relative._is_opaque_part) { // Fix one is opaque case // use copyOpaqueAllButScheme base.copyContext(this); this._scheme = base._scheme; this._is_opaque_part = base._is_opaque_part || relative._is_opaque_part; this._opaque = relative._opaque; // ??? TODO this._path = this._opaque; // ??? TODO this._fragment = relative._fragment; // Always call setURI before exiting this.setURI(); return; } // Here base is absolute uri, // none of the two is opaque // Fix cases relative is absolute and relative // with authority if (relative._scheme != null) { // Case relative is absolute URI // clone the relative apart fragment this._scheme = relative._scheme; relative.copyNotOpaqueAllButScheme(this, false); } else if (relative._authority != null) { // Case network path //myhost[..] // Clone all but scheme and fragment of the base this._scheme = base._scheme; relative.copyNotOpaqueAllButScheme(this, false); } else { // Here relative has no scheme and no authority, // copy the scheme and authority part from the base to the // result uri (this for the moment) this._scheme = base._scheme; base.copyContext(this); if (base._authority != null) { base.copyAuthority(this, false); } // Here remains to solve path, query and fragment. // If the relative has no path bu query and/or fragment // the base path must be used. if (relative._path.length == 0) { // Remember: Resolved fragment, if present, is ALWAYS fragment of relative if (relative._query == null) { if (relative._fragment == null) { // Relative has no scheme, auth, path, query and fragm // so is the empty uri, clone the base uri query (path done later) this._query = base._query; } else { this._fragment = relative._fragment; this._query = base._query; } } else { // Rel (that is rel uri, empty auth/path) // has query: copy query and possible fragment relative.copyQueryFragment(this); } base.copyPath(this); } else { // Relative has path, resolve base path with rel // and use query and frag from rel this._path = resolvePath(base._path, relative._path); relative.copyQueryFragment(this); } } this.setURI(); } // --------------------------------------------------- Instance Variables /** Version ID for serialization */ static final long serialVersionUID = 604752400577948726L; /** * Cache the hash code for this URI. */ protected int hash = 0; /** * This Uniform Resource Identifier (URI). * The URI is always in an "escaped" form, since escaping or unescaping * a completed URI might change its semantics. */ protected char[] _uri = null; /** * The charset of the protocol used by this URI instance. */ protected String protocolCharset = null; /** * The default charset of the protocol. RFC 2277, 2396 */ protected static String defaultProtocolCharset = "UTF-8"; /** * The default charset of the document. RFC 2277, 2396 * The platform's charset is used for the document by default. */ protected static String defaultDocumentCharset; protected static final String defaultDocumentCharsetByLocale; protected static final String defaultDocumentCharsetByPlatform; // Static initializer for defaultDocumentCharset static { Locale locale = Locale.getDefault(); // in order to support backward compatiblity String locDefaultDocumentCharset; if (locale != null) { defaultDocumentCharsetByLocale = LocaleToCharsetMap.getCharset(locale); // set the default document charset locDefaultDocumentCharset = defaultDocumentCharsetByLocale; } else { locDefaultDocumentCharset = null; defaultDocumentCharsetByLocale = null; } // in order to support platform encoding String locDefaultDocumentCharsetByPlatform; try { locDefaultDocumentCharsetByPlatform = System.getProperty("file.encoding"); } catch (SecurityException ignore) { locDefaultDocumentCharsetByPlatform = null; } defaultDocumentCharsetByPlatform = locDefaultDocumentCharsetByPlatform; if (locDefaultDocumentCharset == null) { // set the default document charset defaultDocumentCharset = defaultDocumentCharsetByPlatform; } else { defaultDocumentCharset = locDefaultDocumentCharset; } } /** * The scheme. */ protected char[] _scheme = null; /** * The opaque. */ protected char[] _opaque = null; /** * The authority. */ protected char[] _authority = null; /** * The userinfo. */ protected char[] _userinfo = null; /** * The host. */ protected char[] _host = null; /** * The port. */ protected int _port = -1; /** * The path. */ protected char[] _path = null; /** * The query. */ protected char[] _query = null; /** * The fragment. */ protected char[] _fragment = null; /** * The root path. */ protected static char[] rootPath = { '/' }; // ---------------------- Bitsets for components validation /** * The percent "%" character always has the reserved purpose of being the * escape indicator, it must be escaped as "%25" in order to be used as * data within a URI. */ protected static final BitSet PERCENT = new BitSet(256); static { PERCENT.set('%'); } /** * BitSet for DIGIT, * specified in RFC 4234, * <a href='http://ietfreport.isoc.org/rfc/rfc4234.txt'> * <em>Augmented BNF for Syntax Specifications: ABNF</em></a>. * <p><blockquote><pre> * DIIGT = 0 / 9 * </pre></blockquote><p> */ protected static final BitSet DIGIT = new BitSet(256); static { for (int i = '0'; i <= '9'; i++) { DIGIT.set(i); } } /** * BitSet for ALPHA, specified in RFC 4234, * <a href='http://ietfreport.isoc.org/rfc/rfc4234.txt'> * <em>Augmented BNF for Syntax Specifications: ABNF</em></a>. * <p><blockquote><pre> * ALPHA = A-Z / a-z * </pre></blockquote><p> */ protected static final BitSet ALPHA = new BitSet(256); static { for (int i = 'a'; i <= 'z'; i++) { ALPHA.set(i); } for (int i = 'A'; i <= 'Z'; i++) { ALPHA.set(i); } } /** * BitSet for HEXDIG, * specified in RFC 4234, * <a href='http://ietfreport.isoc.org/rfc/rfc4234.txt'> * <em>Augmented BNF for Syntax Specifications: ABNF</em></a>. * <p><blockquote><pre> * HEXDIG= DIGIT / "A" / "B" / "C" / "D" / "E" / "F" * * Note: ABNF Strings are case unsensitive. * </pre></blockquote><p> */ protected static final BitSet HEXDIG = new BitSet(256); static { HEXDIG.or(DIGIT); for (int i = 'a'; i <= 'f'; i++) { HEXDIG.set(i); } for (int i = 'A'; i <= 'F'; i++) { HEXDIG.set(i); } } /** * BitSet for ALPHADIGIT * (join of {@link #ALPHA} & {@link #DIGIT}). * Implementation helper. * <p><blockquote><pre> * ALPHADIGIT = ALPHA / DIGIT * </pre></blockquote><p> */ protected static final BitSet ALPHADIGIT = new BitSet(256); static { ALPHADIGIT.or(ALPHA); ALPHADIGIT.or(DIGIT); } /** * BitSet for <code>pct-encoded</code>, * specified as an ABNF in * <a href='http://www.apps.ietf.org/rfc/rfc3986.html#sec-2.1'> * <em>RFC3986 Section 2.1</em></a>. * <p><blockquote> * <p>A percent-encoding mechanism is used to represent * a data octet in a component when that octet's * corresponding character is outside the allowed * set or is being used as a delimiter of, or within, * the component. A percent-encoded octet is encoded * as a character triplet, consisting of the percent * character "%" followed by the two hexadecimal digits * representing that octet's numeric value. * <p><pre> * pct_encoded = "%" HEXDIG HEXDIG * </pre></blockquote><p> */ protected static final BitSet ABNF_pct_encoded = new BitSet(256); static { ABNF_pct_encoded.or(PERCENT); ABNF_pct_encoded.or(HEXDIG); } /** * BitSet for <code>scheme</code>, * specified as an ABNF in * <a href='http://www.apps.ietf.org/rfc/rfc3986.html#sec-3.1'> * <em>RFC3986 Section 3.1</em></a>. * <p><blockquote> * * <p>Each URI begins with a scheme name that refers to * a specification for assigning identifiers within that * scheme. As such, the URI syntax is a federated and * extensible naming system wherein each scheme's specification * may further restrict the syntax and semantics of * identifiers using that scheme * * <p>Scheme names consist of a sequence of characters * beginning with a letter and followed by any combination * of letters, digits, plus ("+"), period ("."), or hyphen ("-"). * Although schemes are case- insensitive, the canonical form * is lowercase and documents that specify schemes must do so * with lowercase letters. An implementation should accept * uppercase letters as equivalent to lowercase in * scheme names (e.g., allow "HTTP" as well as "http") for * the sake of robustness but should only produce lowercase * scheme names for consistency * <pre> * scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) * </pre></blockquote><p> */ protected static final BitSet ABNF_scheme = new BitSet(256); // Static initializer for scheme static { ABNF_scheme.or(ALPHA); ABNF_scheme.or(DIGIT); ABNF_scheme.set('+'); ABNF_scheme.set('-'); ABNF_scheme.set('.'); } /** * BitSet for <code>gen-delims</code>, * specified as an ABNF in * <a href='http://www.apps.ietf.org/rfc/rfc3986.html#sec-2.2'> * <em>RFC3986 Section 2.2</em></a>. * <p><blockquote> * <p><pre> * gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" * </pre></blockquote><p> * @see #ABNF_sub_delims * @see #ABNF_reserved */ protected static final BitSet ABNF_gen_delims = new BitSet(256); static { ABNF_gen_delims.set(':'); ABNF_gen_delims.set('/'); ABNF_gen_delims.set('?'); ABNF_gen_delims.set('#'); ABNF_gen_delims.set('['); ABNF_gen_delims.set(']'); ABNF_gen_delims.set('@'); } /** * BitSet for <code>sub-delims</code>, * specified as an ABNF in * <a href='http://www.apps.ietf.org/rfc/rfc3986.html#sec-2.2'> * <em>RFC3986 Section 2.2</em></a>. * <p><blockquote> * <p><pre> * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" * / "*" / "+" / "," / ";" / "=" * * </pre></blockquote><p> * @see #ABNF_gen_delims * @see #ABNF_reserved */ protected static final BitSet ABNF_sub_delims = new BitSet(256); static { ABNF_sub_delims.set('!'); ABNF_sub_delims.set('$'); ABNF_sub_delims.set('&'); ABNF_sub_delims.set('\''); ABNF_sub_delims.set('('); ABNF_sub_delims.set(')'); ABNF_sub_delims.set('*'); ABNF_sub_delims.set('+'); ABNF_sub_delims.set(','); ABNF_sub_delims.set(';'); ABNF_sub_delims.set('='); } /** * BitSet for <code>reserved</code>, * specified as an ABNF in * <a href='http://www.apps.ietf.org/rfc/rfc3986.html#sec-2.2'> * <em>RFC3986 Section 2.2</em></a>. * <p><blockquote> * <p>URIs include components and subcomponents that * are delimited by characters in the "reserved" set. * These characters are called "reserved" because * they may (or may not) be defined as delimiters * by the generic syntax, by each scheme-specific * syntax, or by the implementation-specific * syntax of a URI's dereferencing algorithm. * If data for a URI component would conflict * with a reserved character's purpose as a delimiter, * then the conflicting data must be * percent-encoded before the URI is formed. * * <p><pre> * reserved = gen-delims / sub-delims * </pre></blockquote><p> * @see #ABNF_gen_delims * @see #ABNF_sub_delims */ protected static final BitSet ABNF_reserved = new BitSet(256); static { ABNF_reserved.or(ABNF_gen_delims); ABNF_reserved.or(ABNF_sub_delims); } /** * BitSet for <code>unreserved</code>, * specified as an ABNF in * <a href='http://www.apps.ietf.org/rfc/rfc3986.html#sec-2.3'> * <em>RFC3986 Section 2.3</em></a>. * <p><blockquote> * <p>Characters that are allowed in a URI but * do not have a reserved purpose are called unreserved. * These include uppercase and lowercase letters, decimal * digits, hyphen, period, underscore, and tilde. * <p><pre> * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" * </pre></blockquote><p> */ protected static final BitSet ABNF_unreserved = new BitSet(256); static { ABNF_unreserved.or(ALPHADIGIT); ABNF_unreserved.set('-'); ABNF_unreserved.set('.'); ABNF_unreserved.set('_'); ABNF_unreserved.set('~'); } /** * BitSet for pchar. * <p><blockquote><pre> * uric = unreserved / pct-encoded / sub-delims / ":" / "@" * </pre></blockquote><p> */ protected static final BitSet pchar = new BitSet(256); static { pchar.or(ABNF_unreserved); pchar.or(ABNF_pct_encoded); pchar.or(ABNF_sub_delims); pchar.set(':'); pchar.set('@'); } /** * Helper BitSet common to fragment and query. * <p><blockquote><pre> * query = *( pchar / "/" / "?" ) * </pre></blockquote><p> */ private static final BitSet ABNFHELPER_fragment_query = new BitSet(256); static { ABNFHELPER_fragment_query.or(pchar); ABNFHELPER_fragment_query.set('/'); ABNFHELPER_fragment_query.set('?'); } /** * BitSet for <code>query</code>, * specified as an ABNF in * <a href='http://www.apps.ietf.org/rfc/rfc3986.html#sec-3.4'> * <em>RFC3986 Section 3.4</em></a>. * <p><blockquote> * <p>The query component contains non-hierarchical * data that, along with data in the path component * (Section 3.3), serves to identify a resource * within the scope of the URI's scheme and naming * authority (if any). The query component is indicated * by the first question mark ("?") character and * terminated by a number sign ("#") character or * by the end of the URI. * <p><blockquote><pre> * query = *( pchar / "/" / "?" ) * </pre></blockquote><p> * @see ABNF_fragment They have the same bit set. */ protected static final BitSet ABNF_query = ABNFHELPER_fragment_query; /** * BitSet for fragment. * <p><blockquote><pre> * fragment = *( pchar / "/" / "?" ) * </pre></blockquote><p> */ protected static final BitSet ABNF_fragment = ABNFHELPER_fragment_query; //////////////////// MV up to here ok with RFC3986 /** * BitSet for uric. * <p><blockquote><pre> * uric = reserved | unreserved | pct_encoded * </pre></blockquote><p> protected static final BitSet uric = new BitSet(256); // Static initializer for uric static { uric.or(ABNF_reserved); uric.or(ABNF_unreserved); uric.or(ABNF_pct_encoded); } */ /** * BitSet for segment. * <p><blockquote><pre> * segment = *pchar * </pre></blockquote><p> */ protected static final BitSet segment = new BitSet(256); // Static initializer for segment static { segment.or(pchar); } /** * BitSet for path segments. * <p><blockquote><pre> * path_segments = segment *( "/" segment ) * </pre></blockquote><p> */ protected static final BitSet path_segments = new BitSet(256); // Static initializer for path_segments static { path_segments.set('/'); path_segments.or(segment); } /** * URI absolute path. * <p><blockquote><pre> * abs_path = "/" path_segments * </pre></blockquote><p> */ protected static final BitSet abs_path = new BitSet(256); // Static initializer for abs_path static { abs_path.set('/'); abs_path.or(path_segments); } /** * URI bitset for encoding typical non-slash characters. * <p><blockquote><pre> * uric_no_slash = unreserved | pct_encoded | ";" | "?" | ":" | "@" | * "&" | "=" | "+" | "$" | "," * </pre></blockquote><p> */ protected static final BitSet uric_no_slash = new BitSet(256); // Static initializer for uric_no_slash static { uric_no_slash.or(ABNF_unreserved); uric_no_slash.or(ABNF_pct_encoded); uric_no_slash.set(';'); uric_no_slash.set('?'); uric_no_slash.set(':'); //MV was duplicate ; ... is : uric_no_slash.set('@'); uric_no_slash.set('&'); uric_no_slash.set('='); uric_no_slash.set('+'); uric_no_slash.set('$'); uric_no_slash.set(','); } private static final BitSet uric = pchar;// TEMP UNTIL FIX PATH /** * URI bitset that combines uric_no_slash and uric. * <p><blockquote><pre> * opaque_part = uric_no_slash *uric * </pre></blockquote><p> */ protected static final BitSet opaque_part = new BitSet(256); // Static initializer for opaque_part static { // it's generous. because first character must not include a slash opaque_part.or(uric_no_slash); opaque_part.or(uric); // added MV TODO rationalize opaque_part.set('/'); } /** * URI bitset that combines absolute path and opaque part. * <p><blockquote><pre> * path = [ abs_path | opaque_part ] * </pre></blockquote><p> */ protected static final BitSet path = new BitSet(256); // Static initializer for path static { path.or(abs_path); path.or(opaque_part); } /** * Port, a logical alias for DIGIT. */ protected static final BitSet ABNF_port = DIGIT; /** * Bitset that combines DIGIT and dot fo IPv$address. * <p><blockquote><pre> * IPv4address = 1*DIGIT "." 1*DIGIT "." 1*DIGIT "." 1*DIGIT * </pre></blockquote><p> */ protected static final BitSet IPv4address = new BitSet(256); // Static initializer for IPv4address static { IPv4address.or(DIGIT); IPv4address.set('.'); } /** * RFC 2373. * <p><blockquote><pre> * IPv6address = hexpart [ ":" IPv4address ] * </pre></blockquote><p> */ protected static final BitSet IPv6address = new BitSet(256); // Static initializer for IPv6address reference static { IPv6address.or(HEXDIG); // hexpart IPv6address.set(':'); IPv6address.or(IPv4address); } /** * RFC 2732, 2373. * <p><blockquote><pre> * IPv6reference = "[" IPv6address "]" * </pre></blockquote><p> */ protected static final BitSet IPv6reference = new BitSet(256); // Static initializer for IPv6reference static { IPv6reference.set('['); IPv6reference.or(IPv6address); IPv6reference.set(']'); } /** * BitSet for toplabel. * <p><blockquote><pre> * toplabel = ALPHA | ALPHA *( ALPHADIGIT | "-" ) ALPHADIGIT * </pre></blockquote><p> */ protected static final BitSet toplabel = new BitSet(256); // Static initializer for toplabel static { toplabel.or(ALPHADIGIT); toplabel.set('-'); } /** * BitSet for domainlabel. * <p><blockquote><pre> * domainlabel = ALPHADIGIT | ALPHADIGIT *( ALPHADIGIT | "-" ) ALPHADIGIT * </pre></blockquote><p> */ protected static final BitSet domainlabel = toplabel; /** authority = [ userinfo "@" ] host [ ":" port ] userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) host = IP-literal / IPv4address / reg-name port = *DIGIT **/ /** * BitSet for hostname. * <p><blockquote><pre> * hostname = *( domainlabel "." ) toplabel [ "." ] * </pre></blockquote><p> */ protected static final BitSet hostname = new BitSet(256); // Static initializer for hostname static { hostname.or(toplabel); // hostname.or(domainlabel); hostname.set('.'); } /** * BitSet for host. * <p><blockquote><pre> * host = hostname | IPv4address | IPv6reference * </pre></blockquote><p> */ protected static final BitSet host = new BitSet(256); // Static initializer for host static { host.or(hostname); // host.or(IPv4address); host.or(IPv6reference); // IPv4address } /** * BitSet for hostport. * <p><blockquote><pre> * hostport = host [ ":" port ] * </pre></blockquote><p> */ protected static final BitSet hostport = new BitSet(256); // Static initializer for hostport static { hostport.or(host); hostport.set(':'); hostport.or(ABNF_port); } /** * Bitset for userinfo. * <p><blockquote><pre> * userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) * </pre></blockquote><p> */ protected static final BitSet userinfo = new BitSet(256); // Static initializer for userinfo static { userinfo.or(ABNF_unreserved); userinfo.or(ABNF_pct_encoded); userinfo.or(ABNF_sub_delims); userinfo.set(':'); } /** * BitSet for within the userinfo component like user and password. */ public static final BitSet within_userinfo = new BitSet(256); // Static initializer for within_userinfo static { within_userinfo.or(userinfo); within_userinfo.clear(';'); // reserved within authority within_userinfo.clear(':'); within_userinfo.clear('@'); within_userinfo.clear('?'); within_userinfo.clear('/'); } /** * Bitset for server. * <p><blockquote><pre> * server = [ [ userinfo "@" ] hostport ] * </pre></blockquote><p> */ protected static final BitSet ABNF_authority = new BitSet(256); // Static initializer for ABNF_authority static { ABNF_authority.or(userinfo); ABNF_authority.set('@'); ABNF_authority.or(hostport); } /** * BitSet for reg_name. * <p><blockquote><pre> * reg_name = 1*( unreserved | pct_encoded | "$" | "," | * ";" | ":" | "@" | "&" | "=" | "+" ) * </pre></blockquote><p> */ protected static final BitSet reg_name = new BitSet(256); // Static initializer for reg_name static { reg_name.or(ABNF_unreserved); reg_name.or(ABNF_pct_encoded); reg_name.set('$'); reg_name.set(','); reg_name.set(';'); reg_name.set(':'); reg_name.set('@'); reg_name.set('&'); reg_name.set('='); reg_name.set('+'); } /** * BitSet for authority. * <p><blockquote><pre> * authority = ABNF_authority | reg_name * </pre></blockquote><p> */ protected static final BitSet authority = new BitSet(256); // Static initializer for authority static { authority.or(ABNF_authority); authority.or(reg_name); } /** * BitSet for rel_segment. * <p><blockquote><pre> * rel_segment = 1*( unreserved | pct_encoded | * ";" | "@" | "&" | "=" | "+" | "$" | "," ) * </pre></blockquote><p> */ protected static final BitSet rel_segment = new BitSet(256); // Static initializer for rel_segment static { rel_segment.or(ABNF_unreserved); rel_segment.or(ABNF_pct_encoded); rel_segment.set(';'); rel_segment.set('@'); rel_segment.set('&'); rel_segment.set('='); rel_segment.set('+'); rel_segment.set('$'); // MOVED TO RESERVED if (MV) rel_segment.set('#'); // MV ADDED 200601 if (MV) rel_segment.clear('#'); // MV ADDED 200601 if (MV) rel_segment.clear('?'); // MV ADDED 200601 rel_segment.set(','); } /** * BitSet for rel_path. * <p><blockquote><pre> * rel_path = rel_segment [ abs_path ] * </pre></blockquote><p> */ protected static final BitSet rel_path = new BitSet(256); // Static initializer for rel_path static { rel_path.or(rel_segment); rel_path.or(abs_path); } /** * BitSet for net_path. * <p><blockquote><pre> * net_path = "//" authority [ abs_path ] * </pre></blockquote><p> */ protected static final BitSet net_path = new BitSet(256); // Static initializer for net_path static { net_path.set('/'); net_path.or(authority); net_path.or(abs_path); } /** * BitSet for hier_part. * <p><blockquote><pre> * hier_part = ( net_path | abs_path ) [ "?" query ] * </pre></blockquote><p> */ protected static final BitSet hier_part = new BitSet(256); // Static initializer for hier_part static { hier_part.or(net_path); hier_part.or(abs_path); // hier_part.set('?'); aleady included hier_part.or(ABNF_query); } /** * BitSet for relativeURI. * <p><blockquote><pre> * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ] * </pre></blockquote><p> */ protected static final BitSet relativeURI = new BitSet(256); // Static initializer for relativeURI static { relativeURI.or(net_path); relativeURI.or(abs_path); relativeURI.or(rel_path); // relativeURI.set('?'); aleady included relativeURI.or(ABNF_query); } /** * BitSet for absoluteURI. * <p><blockquote><pre> * absoluteURI = scheme ":" ( hier_part | opaque_part ) * </pre></blockquote><p> */ protected static final BitSet absoluteURI = new BitSet(256); // Static initializer for absoluteURI static { absoluteURI.or(ABNF_scheme); absoluteURI.set(':'); absoluteURI.or(hier_part); absoluteURI.or(opaque_part); } /** * BitSet for URI-reference. * <p><blockquote><pre> * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] * </pre></blockquote><p> */ protected static final BitSet URI_reference = new BitSet(256); // Static initializer for URI_reference static { URI_reference.or(absoluteURI); URI_reference.or(relativeURI); URI_reference.set('#'); URI_reference.or(ABNF_fragment); } // ---------------------------- Characters disallowed within the URI syntax // Excluded US-ASCII Characters are like control, space, delims and unwise /** * BitSet for control. */ public static final BitSet control = new BitSet(256); // Static initializer for control static { for (int i = 0; i <= 0x1F; i++) { control.set(i); } control.set(0x7F); } /** * BitSet for space. */ public static final BitSet space = new BitSet(256); // Static initializer for space static { space.set(0x20); } /** * BitSet for delims. */ public static final BitSet context_delims = new BitSet(256); // Static initializer for delims static { context_delims.set('<'); context_delims.set('>'); context_delims.set('"'); } /** * BitSet for delims. */ public static final BitSet delims = new BitSet(256); // Static initializer for delims static { delims.set('<'); delims.set('>'); delims.set('#'); delims.set('%'); delims.set('"'); } /** * BitSet for unwise. */ public static final BitSet unwise = new BitSet(256); // Static initializer for unwise static { unwise.set('{'); unwise.set('}'); unwise.set('|'); unwise.set('\\'); unwise.set('^'); unwise.set('['); unwise.set(']'); unwise.set('`'); } // ----------------------- Characters allowed within and for each component /** * Those characters that are allowed for the authority component. */ public static final BitSet allowed_authority = new BitSet(256); // Static initializer for allowed_authority static { allowed_authority.or(ABNF_authority); allowed_authority.clear('%'); } /** * Those characters that are allowed for the opaque_part. */ public static final BitSet allowed_opaque_part = new BitSet(256); // Static initializer for allowed_opaque_part static { allowed_opaque_part.or(opaque_part); allowed_opaque_part.clear('%'); } /** * Those characters that are allowed for the reg_name. */ public static final BitSet allowed_reg_name = new BitSet(256); // Static initializer for allowed_reg_name static { allowed_reg_name.or(reg_name); // allowed_reg_name.andNot(PERCENT); allowed_reg_name.clear('%'); } /** * Those characters that are allowed for the userinfo component. */ public static final BitSet allowed_userinfo = new BitSet(256); // Static initializer for allowed_userinfo static { allowed_userinfo.or(userinfo); // allowed_userinfo.andNot(PERCENT); allowed_userinfo.clear('%'); } /** * Those characters that are allowed for within the userinfo component. */ public static final BitSet allowed_within_userinfo = new BitSet(256); // Static initializer for allowed_within_userinfo static { allowed_within_userinfo.or(within_userinfo); allowed_within_userinfo.clear('%'); } /** * Those characters that are allowed for the IPv6reference component. * The characters '[', ']' in IPv6reference should be excluded. */ public static final BitSet allowed_IPv6reference = new BitSet(256); // Static initializer for allowed_IPv6reference static { allowed_IPv6reference.or(IPv6reference); // allowed_IPv6reference.andNot(unwise); allowed_IPv6reference.clear('['); allowed_IPv6reference.clear(']'); } /** * Those characters that are allowed for the host component. * The characters '[', ']' in IPv6reference should be excluded. */ public static final BitSet allowed_host = new BitSet(256); // Static initializer for allowed_host static { allowed_host.or(hostname); allowed_host.or(allowed_IPv6reference); } /** * Those characters that are allowed for the authority component. */ public static final BitSet allowed_within_authority = new BitSet(256); // Static initializer for allowed_within_authority static { allowed_within_authority.or(ABNF_authority); allowed_within_authority.or(reg_name); allowed_within_authority.clear(';'); allowed_within_authority.clear(':'); allowed_within_authority.clear('@'); allowed_within_authority.clear('?'); allowed_within_authority.clear('/'); } /** * Those characters that are allowed for the abs_path. */ public static final BitSet allowed_abs_path = new BitSet(256); // Static initializer for allowed_abs_path static { allowed_abs_path.or(abs_path); // allowed_abs_path.set('/'); // aleady included allowed_abs_path.andNot(PERCENT); // 200705 added to update with // Fix for HTTPCLIENT-578: literal plus (+) // character in path components of HttpURL is not preserved // http://svn.apache.org/viewvc/jakarta/commons/proper/httpclient/trunk/src/java/org/apache/commons/httpclient/URI.java?r1=372560&r2=410367&diff_format=h // there were no tests for this // but../a+v/ ? what does it mean ? // clearing the bit the behavior becomes // different from java URI and after // hmm..when cleared the lookup works with /a+b/ // and also java uri..should be right.. // only that now the problem is that when creating // from unescaped components the uri becomes /a%20b/ ??!? // The right behavior should be to see /a+b/ as an URI // then the receiver that uses form url encoded // may replace furtherly the + with a space when // this is allowed... // In any case still don't know what to do of this bit allowed_abs_path.clear('+'); } /** * Those characters that are allowed for the rel_path. */ public static final BitSet allowed_rel_path = new BitSet(256); // Static initializer for allowed_rel_path static { allowed_rel_path.or(rel_path); allowed_rel_path.clear('%'); // 200705 same as for rel path allowed_rel_path.clear('+'); } /** * Those characters that are allowed within the path. */ public static final BitSet allowed_within_path = new BitSet(256); // Static initializer for allowed_within_path static { allowed_within_path.or(abs_path); allowed_within_path.clear('/'); allowed_within_path.clear(';'); allowed_within_path.clear('='); allowed_within_path.clear('?'); //if (MV) allowed_within_path.clear('#'); } /** * Those characters that are allowed for the query component. */ public static final BitSet allowed_query = new BitSet(256); // Static initializer for allowed_query static { allowed_query.or(uric); allowed_query.clear('%'); } /** * Those characters that are allowed within the query component. */ public static final BitSet allowed_within_query = new BitSet(256); // Static initializer for allowed_within_query static { allowed_within_query.or(allowed_query); allowed_within_query.andNot(ABNF_reserved); // excluded 'reserved' } /** * Those characters that are allowed for the fragment component. */ public static final BitSet allowed_fragment = new BitSet(256); // Static initializer for allowed_fragment static { allowed_fragment.or(uric); allowed_fragment.clear('%'); } // ------------------------------------------- Flags for this URI-reference // TODO: Figure out what all these variables are for and provide javadoc // URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] // absoluteURI = scheme ":" ( hier_part | opaque_part ) protected boolean _is_hier_part; protected boolean _is_opaque_part; // relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ] // hier_part = ( net_path | abs_path ) [ "?" query ] protected boolean _is_net_path; protected boolean _is_abs_path; protected boolean _is_rel_path; // net_path = "//" authority [ abs_path ] // authority = server | reg_name protected boolean _is_reg_name; protected boolean _is_server; // = _has_server // server = [ [ userinfo "@" ] hostport ] // host = hostname | IPv4address | IPv6reference protected boolean _is_hostname; protected boolean _is_IPv4address; protected boolean _is_IPv6reference; // ------------------------------------------ Character and escape encoding /** * Encodes URI string. * * This is a two mapping, one from original characters to octets, and * subsequently a second from octets to URI characters: * <p><blockquote><pre> * original character sequence->octet sequence->URI character sequence * </pre></blockquote><p> * * An escaped octet is encoded as a character triplet, consisting of the * percent character "%" followed by the two hexadecimal digits * representing the octet code. For example, "%20" is the escaped * encoding for the US-ASCII space character. * <p> * Conversion from the local filesystem character set to UTF-8 will * normally involve a two step process. First convert the local character * set to the UCS; then convert the UCS to UTF-8. * The first step in the process can be performed by maintaining a mapping * table that includes the local character set code and the corresponding * UCS code. * The next step is to convert the UCS character code to the UTF-8 encoding. * <p> * Mapping between vendor codepages can be done in a very similar manner * as described above. * <p> * The only time escape encodings can allowedly be made is when a URI is * being created from its component parts. The escape and validate methods * are internally performed within this method. * * @param original the original character sequence * @param allowed those characters that are allowed within a component * @param charset the protocol charset * @return URI character sequence * @throws URIException null component or unsupported character encoding */ protected static char[] encode(String original, BitSet allowed, String charset) throws URIException { if (original == null) { throw new IllegalArgumentException("Original string may not be null"); } if (allowed == null) { throw new IllegalArgumentException("Allowed bitset may not be null"); } byte[] rawdata = URLCodec.encodeUrl(allowed, EncodingUtil.getBytes(original, charset)); //if (MV || true) System.out.println("Encode: [" + original + "] result:[" + EncodingUtil.getAsciiString(rawdata) + "]"); return EncodingUtil.getAsciiString(rawdata).toCharArray(); } /** * Decodes URI encoded string. * * This is a two mapping, one from URI characters to octets, and * subsequently a second from octets to original characters: * <p><blockquote><pre> * URI character sequence->octet sequence->original character sequence * </pre></blockquote><p> * * A URI must be separated into its components before the escaped * characters within those components can be allowedly decoded. * <p> * Notice that there is a chance that URI characters that are non UTF-8 * may be parsed as valid UTF-8. A recent non-scientific analysis found * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0% * false reading. * <p> * The percent "%" character always has the reserved purpose of being * the escape indicator, it must be escaped as "%25" in order to be used * as data within a URI. * <p> * The unescape method is internally performed within this method. * * @param component the URI character sequence * @param charset the protocol charset * @return original character sequence * @throws URIException incomplete trailing escape pattern or unsupported * character encoding */ protected static String decode(char[] component, String charset) throws URIException { if (component == null) { throw new IllegalArgumentException("Component array of chars may not be null"); } return decode(new String(component), charset); } /** * Decodes URI encoded string. * * This is a two mapping, one from URI characters to octets, and * subsequently a second from octets to original characters: * <p><blockquote><pre> * URI character sequence->octet sequence->original character sequence * </pre></blockquote><p> * * A URI must be separated into its components before the escaped * characters within those components can be allowedly decoded. * <p> * Notice that there is a chance that URI characters that are non UTF-8 * may be parsed as valid UTF-8. A recent non-scientific analysis found * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0% * false reading. * <p> * The percent "%" character always has the reserved purpose of being * the escape indicator, it must be escaped as "%25" in order to be used * as data within a URI. * <p> * The unescape method is internally performed within this method. * * @param component the URI character sequence * @param charset the protocol charset * @return original character sequence * @throws URIException incomplete trailing escape pattern or unsupported * character encoding * * @since 3.0 */ protected static String decode(String component, String charset) throws URIException { if (component == null) { throw new IllegalArgumentException("Component array of chars may not be null"); } byte[] rawdata = null; try { rawdata = URLCodec.decodeUrl(EncodingUtil.getAsciiBytes(component), true); } catch (CodecException e) { throw new URIException(e.getMessage()); } return EncodingUtil.getString(rawdata, charset); } /** * Pre-validate the unescaped URI string within a specific component. * * @param component the component string within the component * @param disallowed those characters disallowed within the component * @return if true, it doesn't have the disallowed characters * if false, the component is undefined or an incorrect one */ protected boolean prevalidate(String component, BitSet disallowed) { // prevalidate the given component by disallowed characters if (component == null) { return false; // undefined } char[] target = component.toCharArray(); for (int i = 0; i < target.length; i++) { if (disallowed.get(target[i])) { return false; } } return true; } /** * Validate the URI characters within a specific component. * The component must be performed after escape encoding. Or it doesn't * include escaped characters. * * @param component the characters sequence within the component * @param generous those characters that are allowed within a component * @return if true, it's the correct URI character sequence */ protected boolean validate(char[] component, BitSet generous) { // validate each component by generous characters return validate(component, 0, -1, generous); } /** * Validate the URI characters within a specific component. * The component must be performed after escape encoding. Or it doesn't * include escaped characters. * <p> * It's not that much strict, generous. The strict validation might be * performed before being called this method. * * @param component the characters sequence within the component * @param soffset the starting offset of the given component * @param eoffset the ending offset of the given component * if -1, it means the length of the component * @param generous those characters that are allowed within a component * @return if true, it's the correct URI character sequence */ protected boolean validate(char[] component, int soffset, int eoffset, BitSet generous) { // validate each component by generous characters if (eoffset == -1) { eoffset = component.length - 1; } for (int i = soffset; i <= eoffset; i++) { if (!generous.get(component[i])) { return false; } } return true; } private static final boolean isOpenContextDelimiter(char c) { return c == '"' || c == '<'; } private static final boolean isCloseContextDelimiter(char c, char o) { if (o == '"') return c == '"'; else return c == '>'; } /** * In order to avoid any possilbity of conflict with non-ASCII characters, * Parse a URI reference as a <code>String</code> with the character * encoding of the local system or the document. * <p> * The following line is the regular expression for breaking-down a URI * reference into its components. * <p><blockquote><pre> * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * 12 3 4 5 6 7 8 9 * </pre></blockquote><p> * For example, matching the above expression to * http://jakarta.apache.org/ietf/uri/#Related * results in the following subexpression matches: * <p><blockquote><pre> * $1 = http: * scheme = $2 = http * $3 = //jakarta.apache.org * authority = $4 = jakarta.apache.org * path = $5 = /ietf/uri/ * $6 = <undefined> * query = $7 = <undefined> * $8 = #Related * fragment = $9 = Related * </pre></blockquote><p> * * @param original the original character sequence * @param escaped <code>true</code> if <code>original</code> is escaped * @throws URIException If an error occurs. */ protected void parseUriReference(final String original) throws URIException { // Note: always called by contstructor on empty instance // validate and contruct the URI character sequence if (original == null) { throw new URIException("Null URI String passed"); } // System.out.println("BEGIN PARSE:" + original); /* @ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? */ String tmp = original.trim(); /* * The length of the string sequence of characters. * It may not be equal to the length of the byte array. */ int length = tmp.length(); /* * Remove the delimiters like angle brackets around an URI. */ if (length > 0) { final char first = tmp.charAt(0); if (isOpenContextDelimiter(first)) { if (length == 1) throw new URIException( "First and only character is a context delimiter [" + first + "] :" + original); final int ilast = length - 1; char clast = tmp.charAt(ilast); if (!isCloseContextDelimiter(clast, first)) throw new URIException( "First character is a context delimiter [" + first + "] but last not: [" + clast + "]"); tmp = tmp.substring(1, ilast); length -= 2; } } // Note: length might change in delimiters removal // so order is important // Handle case "empty string" and ":" after delimiters removal if (length == 0) { _path = EMPTYCHARS; _is_rel_path = true; setURI(); return; } else if (tmp.charAt(0) == ':') { throw new URIException("Invalid uri [:] if it is a rel path use [./:]"); } // A URI-reference is either a URI or a relative reference. // If the URI-reference's prefix does not match the syntax of // a scheme followed by its colon separator, then the // URI-reference is a relative reference. /* * The starting index */ int from = 0; //System.out.println("BEGINSKEME"); // Here lenght is >0 and [:] only is already gone _scheme = null; if (tmp.charAt(0) != '/' && !tmp.startsWith(".")) for (int i = 0; i < length; i++) { char c = tmp.charAt(i); if (c == ':') { //System.out.println("SKEME:" + tmp.substring(0, i)); _scheme = validateScheme(tmp.substring(0, i)); from = ++i; break; } else if (!URI.ABNF_scheme.get(c)) { //System.out.println("FOUND NON SKEME CHAR at " + i + ": " + c); // 200507 changed Was: //if (c=='/') break; //int at = indexFirstOf(tmp, ":/?#", i + 1); //tmp.indexOf(':'); // 200507 now: // We check also for the char itself, not only for the next int at = indexFirstOf(tmp, ":/?#", i); if (at != -1 && tmp.charAt(at) == ':') { //System.out.println("SCHEME ERROR:" + at + " for " + tmp); throw new URIException("The first part of the uri contains a colon [:] " + "It is not a scheme name because contains char [" + c + "] so it" + " is a relative reference relative path that contains colon [:] in the" + " first segment, this is not allowed, See RFC3986 Section 3.3. " + "\nFull URI:" + tmp); } else { //System.out.println("HAS NO SCHEME:" + tmp); break; } } else { // Case good char for scheme //System.out.println("GOOD SKEME CHAR:" + c); } } //System.out.println("FINALSCHEME:" + (_scheme==null ? " [null]" : new String(_scheme))); // Note must be careful because [scheme] is the BitSet while // _scheme is the char array....EASY BUG // 200705 changed, BitSet is now (URI.)ABNF_scheme final boolean isOpaque; if (_scheme != null) { if (from == length) { // case <http:> is synctactically correct isOpaque = true; } else if (from < length && tmp.charAt(from) == '/') { isOpaque = false; } else { isOpaque = true; } } else { isOpaque = false; } _is_opaque_part = isOpaque; //System.out.println("Is opaque:" + isOpaque + "," + tmp); if (isOpaque) { if (from == length) { // Case http: _is_abs_path = true; _path = EMPTYCHARS; // to see here..setting opaque then relativ? } else { int at = tmp.indexOf('#', from); if (at != -1) { if (at == length) _fragment = EMPTYCHARS; else _fragment = validateFragment(tmp.substring(at + 1)); _path = tmp.substring(from, at).toCharArray(); } else _path = tmp.substring(from).toCharArray(); } if (!validate(_path, opaque_part)) { throw new URIException("Invalid scheme specific part: " + new String(_path)); } _opaque = _path; //System.out.println("Is opaque:" + isOpaque + "," + tmp + " SSP:" + new String(_opaque)); setURI(); //System.out.println("toString:" + toString()); return; } /* * <p><blockquote><pre> * @@@@@@@@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ int at = indexFirstOf(tmp, "/?#", from); if (at == -1) { at = 0; } //System.out.println("HERE:" + tmp.toString()); //System.out.println("HERE:" + tmp.substring(from, length)); /* * Parse the authority component. * <p><blockquote><pre> * authority = $4 = jakarta.apache.org * @@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ // Reset flags _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false; if (0 <= at && at < length && tmp.charAt(at) == '/') { // Set flag _is_hier_part = true; if (at + 2 <= length && tmp.charAt(at + 1) == '/') { // the temporary index to start the search from int next = indexFirstOf(tmp, "/?#", at + 2); if (next == -1) { next = (tmp.substring(at + 2).length() == 0) ? at + 2 : tmp.length(); } parseAuthority(tmp.substring(at + 2, next)); from = at = next; // Set flag _is_net_path = true; } if (from == at) { // Set flag _is_abs_path = true; } } /* * Parse the path component. * <p><blockquote><pre> * path = $5 = /ietf/uri/ * @@@@@@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ if (from < length) { // rel_path = rel_segment [ abs_path ] int next = indexFirstOf(tmp, "?#", from); if (next == -1) { next = tmp.length(); } if (!_is_abs_path) { if (validate(tmp.substring(from, next).toCharArray(), rel_path)) { // Set flag _is_rel_path = true; } else if (validate(tmp.substring(from, next).toCharArray(), opaque_part)) { // Set flag _is_opaque_part = true; } else { // the path component may be empty // MV 200601 but not null.. _path = EMPTYCHARS; // ya TODO bu then setPath is called.. } } setRawPath(tmp.substring(from, next).toCharArray()); at = next; } else setRawPath(EMPTYCHARS); /* * Parse the query component. * <p><blockquote><pre> * query = $7 = <undefined> * @@@@@@@@@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ if (0 <= at && at < length && tmp.charAt(at) == '?') { if (at == length) { _query = EMPTYCHARS; // MV Changed mantain empty string (only ? present) at = length; } else { int next = tmp.indexOf('#', at + 1); if (next == -1) { next = tmp.length(); } _query = validateQuery(tmp.substring(at + 1, next)); at = next; } } /* * Parse the fragment component. * <p><blockquote><pre> * fragment = $9 = Related * @@@@@@@@ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? * </pre></blockquote><p> */ if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') { if (at + 1 == length) { // empty fragment _fragment = EMPTYCHARS; } else { _fragment = validateFragment(tmp.substring(at + 1)); } } // set this URI. setURI(); } private static final char[] EMPTYCHARS = new char[] {}; /** * Get the earlier index that to be searched for the first occurrance in * one of any of the given string. * * @param s the string to be indexed * @param delims the delimiters used to index * @return the earlier index if there are delimiters */ protected int indexFirstOf(String s, String delims) { return indexFirstOf(s, delims, -1); } /** * Get the earlier index that to be searched for the first occurrance in * one of any of the given string. * * @param s the string to be indexed * @param delims the delimiters used to index * @param offset the from index * @return the earlier index if there are delimiters */ protected int indexFirstOf(String s, String delims, int offset) { if (s == null || s.length() == 0) { return -1; } if (delims == null || delims.length() == 0) { return -1; } // check boundaries if (offset < 0) { offset = 0; } else if (offset > s.length()) { return -1; } // s is never null int min = s.length(); char[] delim = delims.toCharArray(); for (int i = 0; i < delim.length; i++) { int at = s.indexOf(delim[i], offset); if (at >= 0 && at < min) { min = at; } } return (min == s.length()) ? -1 : min; } /** * Get the earlier index that to be searched for the first occurrance in * one of any of the given array. * * @param s the character array to be indexed * @param delim the delimiter used to index * @return the ealier index if there are a delimiter */ protected int indexFirstOf(char[] s, char delim) { return indexFirstOf(s, delim, 0); } /** * Get the earlier index that to be searched for the first occurrance in * one of any of the given array. * * @param s the character array to be indexed * @param delim the delimiter used to index * @param offset The offset. * @return the ealier index if there is a delimiter */ protected int indexFirstOf(char[] s, char delim, int offset) { if (s == null || s.length == 0) { return -1; } // check boundaries if (offset < 0) { offset = 0; } else if (offset > s.length) { return -1; } for (int i = offset; i < s.length; i++) { if (s[i] == delim) { return i; } } return -1; } /** * Parse the (percent encoded) authority component. * * @param original the percent encoded character * sequence of authority component * @throws URIException If an error occurs. */ protected void parseAuthority(String original) throws URIException { // Reset flags _is_reg_name = _is_server = _is_hostname = _is_IPv4address = _is_IPv6reference = false; boolean hasPort = true; int from = 0; int next = original.indexOf('@'); if (next != -1) { // neither -1 and 0 // each protocol extented from URI supports the specific userinfo _userinfo = original.substring(0, next).toCharArray(); from = next + 1; } next = original.indexOf('[', from); if (next >= from) { next = original.indexOf(']', from); if (next == -1) { throw new URIException(URIException.PARSING, "IPv6reference"); } else { next++; } // In IPv6reference, '[', ']' should be excluded _host = original.substring(from, next).toCharArray(); // Set flag _is_IPv6reference = true; } else { // only for !_is_IPv6reference next = original.indexOf(':', from); if (next == -1) { next = original.length(); hasPort = false; } // REMINDME: it doesn't need the pre-validation _host = original.substring(from, next).toCharArray(); if (validate(_host, IPv4address)) { // Set flag _is_IPv4address = true; } else if (validate(_host, hostname)) { // Set flag _is_hostname = true; } else { // Set flag _is_reg_name = true; } } if (_is_reg_name) { // Reset flags for a server-based naming authority _is_server = _is_hostname = _is_IPv4address = _is_IPv6reference = false; // set a registry-based naming authority _authority = original.toCharArray(); } else { if (original.length() - 1 > next && hasPort && original.charAt(next) == ':') { // not empty from = next + 1; try { //System.out.println("PARSEPORT [" + original.substring(from) + "]"); _port = Integer.parseInt(original.substring(from)); } catch (NumberFormatException error) { throw new URIException(URIException.PARSING, "invalid port number"); } } // set a server-based naming authority StringBuffer buf = new StringBuffer(); if (_userinfo != null) { // has_userinfo buf.append(_userinfo); buf.append('@'); } if (_host != null) { buf.append(_host); if (_port != -1) { buf.append(':'); buf.append(_port); } } _authority = buf.toString().toCharArray(); // Set flag _is_server = true; } } /** * Once it's parsed successfully, set this URI. * * See 5.3 Component Recomposition * http://www.apps.ietf.org/rfc/rfc3986.html#sec-5.3 * * @see #getRawURI */ protected void setURI() { StringBuffer buf = new StringBuffer(); if (_scheme != null) { buf.append(_scheme); buf.append(':'); } if (_is_net_path) { buf.append("//"); if (_authority != null) { // has_authority buf.append(_authority); } } if (_opaque != null && _is_opaque_part) { buf.append(_opaque); } else if (_path != null) { // _is_hier_part or _is_relativeURI if (_path.length != 0) { buf.append(_path); } } if (_query != null) { // has_query buf.append('?'); buf.append(_query); } // ignore the fragment identifier _uri = buf.toString().toCharArray(); hash = 0; } // ----------------------------------------------------------- Test methods /** * Tell whether or not this URI is absolute. * * @return true iif this URI is absoluteURI */ public boolean isAbsoluteURI() { // in rfc3986 absolute uri has also no fragment.. return (_scheme != null); } /** * Teldl whether or not this URI is relative. * * @return true iif this URI is relativeURI */ public boolean isRelativeURI() { return (_scheme == null); } /** * Tell whether or not the absoluteURI of this URI is hier_part. * * @return true iif the absoluteURI is hier_part */ public boolean isHierPart() { return _is_hier_part; } /** * Tell whether or not the absoluteURI of this URI is opaque_part. * * @return true iif the absoluteURI is opaque_part */ public boolean isOpaquePart() { return _is_opaque_part; } /** * Tell whether or not the relativeURI or heir_part of this URI is net_path. * It's the same function as the has_authority() method. * * @return true iif the relativeURI or heir_part is net_path * @see #hasAuthority */ public boolean isNetPath() { return _is_net_path || (_authority != null); } /** * Tell whether or not the relativeURI or hier_part of this URI is abs_path. * * @return true iif the relativeURI or hier_part is abs_path */ public boolean isAbsPath() { return _is_abs_path; } /** * Tell whether or not the relativeURI of this URI is rel_path. * * @return true iif the relativeURI is rel_path */ public boolean isRelPath() { return _is_rel_path; } /** * Tell whether or not this URI has authority. * It's the same function as the is_net_path() method. * * @return true iif this URI has authority * @see #isNetPath */ public boolean hasAuthority() { return (_authority != null) || _is_net_path; } /** * Tell whether or not the authority component of this URI is reg_name. * * @return true iif the authority component is reg_name */ public boolean isRegName() { return _is_reg_name; } /** * Tell whether or not the authority component of this URI is server. * * @return true iif the authority component is server */ public boolean isServer() { return _is_server; } /** * Tell whether or not this URI has userinfo. * * @return true iif this URI has userinfo */ public boolean hasUserinfo() { return (_userinfo != null); } /** * Tell whether or not the host part of this URI is hostname. * * @return true iif the host part is hostname */ public boolean isHostname() { return _is_hostname; } /** * Tell whether or not the host part of this URI is IPv4address. * * @return true iif the host part is IPv4address */ public boolean isIPv4address() { return _is_IPv4address; } /** * Tell whether or not the host part of this URI is IPv6reference. * * @return true iif the host part is IPv6reference */ public boolean isIPv6reference() { return _is_IPv6reference; } /** * Tell whether or not this URI has query. * * @return true iif this URI has query */ public boolean hasQuery() { return (_query != null); } /** * Tell whether or not this URI has fragment. * * @return true iif this URI has fragment */ public boolean hasFragment() { return (_fragment != null); } // ---------------------------------------------------------------- Charset /** * Set the default charset of the protocol. * <p> * The character set used to store files SHALL remain a local decision and * MAY depend on the capability of local operating systems. Prior to the * exchange of URIs they SHOULD be converted into a ISO/IEC 10646 format * and UTF-8 encoded. This approach, while allowing international exchange * of URIs, will still allow backward compatibility with older systems * because the code set positions for ASCII characters are identical to the * one byte sequence in UTF-8. * <p> * An individual URI scheme may require a single charset, define a default * charset, or provide a way to indicate the charset used. * * <p> * Always all the time, the setter method is always succeeded and throws * <code>DefaultCharsetChanged</code> exception. * * So API programmer must follow the following way: * <code><pre> * import org.apache.util.URI$DefaultCharsetChanged; * . * . * . * try { * URI.setDefaultProtocolCharset("UTF-8"); * } catch (DefaultCharsetChanged cc) { * // CASE 1: the exception could be ignored, when it is set by user * if (cc.getReasonCode() == DefaultCharsetChanged.PROTOCOL_CHARSET) { * // CASE 2: let user know the default protocol charset changed * } else { * // CASE 2: let user know the default document charset changed * } * } * </pre></code> * * The API programmer is responsible to set the correct charset. * And each application should remember its own charset to support. * * @param charset the default charset for each protocol * @throws DefaultCharsetChanged default charset changed */ public static void setDefaultProtocolCharset(String charset) throws DefaultCharsetChanged { defaultProtocolCharset = charset; throw new DefaultCharsetChanged(DefaultCharsetChanged.PROTOCOL_CHARSET, "the default protocol charset changed"); } /** * Get the default charset of the protocol. * <p> * An individual URI scheme may require a single charset, define a default * charset, or provide a way to indicate the charset used. * <p> * To work globally either requires support of a number of character sets * and to be able to convert between them, or the use of a single preferred * character set. * For support of global compatibility it is STRONGLY RECOMMENDED that * clients and servers use UTF-8 encoding when exchanging URIs. * * @return the default charset string */ public static String getDefaultProtocolCharset() { return defaultProtocolCharset; } /** * Get the protocol charset used by this current URI instance. * It was set by the constructor for this instance. If it was not set by * contructor, it will return the default protocol charset. * * @return the protocol charset string * @see #getDefaultProtocolCharset */ public String getProtocolCharset() { return (protocolCharset != null) ? protocolCharset : defaultProtocolCharset; } /** * Set the default charset of the document. * @param charset the default charset for the document */ public static void setDefaultDocumentCharset(String charset) { defaultDocumentCharset = charset; } /** * Get the recommended default charset of the document. * * @return the default charset string */ public static String getDefaultDocumentCharset() { return defaultDocumentCharset; } /** * Get the default charset of the document by locale. * * @return the default charset string by locale */ public static String getDefaultDocumentCharsetByLocale() { return defaultDocumentCharsetByLocale; } /** * Get the default charset of the document by platform. * * @return the default charset string by platform */ public static String getDefaultDocumentCharsetByPlatform() { return defaultDocumentCharsetByPlatform; } // ------------------------------------------------------------- The scheme /** * Get the scheme. * * @return the scheme */ protected char[] getRawScheme() { return _scheme; } /** * Get the scheme. * * @return the scheme * null if undefined scheme */ public String getScheme() { return (_scheme == null) ? null : new String(_scheme); } // ---------------------------------------------------------- The authority /** * Get the raw-escaped authority. * * @return the raw-escaped authority */ protected char[] getRawAuthority() { return _authority; } /** * Get the escaped authority. * * @return the escaped authority */ public String getEscapedAuthority() { return (_authority == null) ? null : new String(_authority); } /** * Get the authority. * * @return the authority * @throws URIException If {@link #decode} fails */ public String getAuthority() throws URIException { return (_authority == null) ? null : decode(_authority, getProtocolCharset()); } // ----------------------------------------------------------- The userinfo /** * Get the raw-escaped userinfo. * * @return the raw-escaped userinfo * @see #getAuthority */ protected char[] getRawUserinfo() { return _userinfo; } /** * Get the escaped userinfo. * * @return the escaped userinfo * @see #getAuthority */ public String getEscapedUserinfo() { return (_userinfo == null) ? null : new String(_userinfo); } /** * Get the userinfo. * * @return the userinfo * @throws URIException If {@link #decode} fails * @see #getAuthority */ public String getUserinfo() throws URIException { return (_userinfo == null) ? null : decode(_userinfo, getProtocolCharset()); } // --------------------------------------------------------------- The host /** * Get the host. * <p><blockquote><pre> * host = hostname | IPv4address | IPv6reference * </pre></blockquote><p> * * @return the host * @see #getAuthority */ protected char[] getRawHost() { return _host; } /** * Get the host. * <p><blockquote><pre> * host = hostname | IPv4address | IPv6reference * </pre></blockquote><p> * * @return the host * @throws URIException If {@link #decode} fails * @see #getAuthority */ public String getHost() throws URIException { if (_host != null) { return decode(_host, getProtocolCharset()); } else { return null; } } // --------------------------------------------------------------- The port /** * Get the port. In order to get the specfic default port, the specific * protocol-supported class extended from the URI class should be used. * It has the server-based naming authority. * * @return the port * if -1, it has the default port for the scheme or the server-based * naming authority is not supported in the specific URI. */ public int getPort() { return _port; } // --------------------------------------------------------------- The path /** * Set the raw-escaped path. * * @param escapedPath the path character sequence * @throws URIException encoding error or not proper for initial instance * @see #encode */ private void setRawPath(char[] escapedPath) throws URIException { if (escapedPath == null) throw new IllegalArgumentException( "Null path passed. " + "\nRFC 3986 3.3 A path is always defined for a URI, " + "though the defined path may be empty (zero length)."); if (escapedPath.length == 0) { _path = _opaque = escapedPath; setURI(); return; } // remove the fragment identifier escapedPath = removeFragmentIdentifier(escapedPath); if (_is_net_path || _is_abs_path) { if (escapedPath[0] != '/') { throw new URIException(URIException.PARSING, "not absolute path"); } if (!validate(escapedPath, abs_path)) { throw new URIException(URIException.ESCAPING, "escaped absolute path not valid"); } _opaque = EMPTYCHARS; // MV added _path = escapedPath; } else if (_is_rel_path) { int at = indexFirstOf(escapedPath, '/'); if (at == 0) { throw new URIException(URIException.PARSING, "incorrect path"); } if (at > 0 && !validate(escapedPath, 0, at - 1, rel_segment) && !validate(escapedPath, at, -1, abs_path) || at < 0 && !validate(escapedPath, 0, -1, rel_segment)) { throw new URIException(URIException.ESCAPING, "escaped relative path not valid"); } _opaque = EMPTYCHARS; // MV added _path = escapedPath; } else if (_is_opaque_part) { if (!uric_no_slash.get(escapedPath[0]) && !validate(escapedPath, 1, -1, uric)) { throw new URIException(URIException.ESCAPING, "escaped opaque part not valid"); } _opaque = escapedPath; // here TODO ?? put _path empty ?? exist in bot setPath and setRawPath // MV to add _path = EMPTYCHARS; _path = _opaque; } else { throw new URIException(URIException.PARSING, "incorrect path: " + new String(escapedPath)); } setURI(); } /** * Resolve the base and relative path. * * @param basePath a character array of the basePath * @param relPath a character array of the relPath * @return the resolved path * @throws URIException no more higher path level to be resolved */ protected static char[] resolvePath(char[] basePath, char[] relPath) throws URIException { // REMINDME: paths are never null String base = (basePath == null) ? "" : new String(basePath); //System.out.println("Resolve path:" + base + " with " + new String(relPath)); int at = base.lastIndexOf('/'); if (at != -1) { basePath = base.substring(0, at + 1).toCharArray(); } // _path could be empty if (relPath == null || relPath.length == 0) { return normalize(basePath); } else if (relPath[0] == '/') { return normalize(relPath); } else { StringBuffer buff = new StringBuffer(base.length() + relPath.length); buff.append((at != -1) ? base.substring(0, at + 1) : "/"); buff.append(relPath); return normalize(buff.toString().toCharArray()); } } /** * Get the raw-escaped current hierarchy level in the given path. * If the last namespace is a collection, the slash mark ('/') should be * ended with at the last character of the path string. * * @param path the path * @return the current hierarchy level * @throws URIException no hierarchy level */ protected char[] getRawCurrentHierPath(char[] path) throws URIException { if (_is_opaque_part) { throw new URIException(URIException.PARSING, "no hierarchy level"); } if (path == null) { throw new URIException(URIException.PARSING, "empty path"); } String buff = new String(path); int first = buff.indexOf('/'); int last = buff.lastIndexOf('/'); if (last == 0) { return rootPath; } else if (first != last && last != -1) { return buff.substring(0, last).toCharArray(); } // FIXME: it could be a document on the server side return path; } /** * Get the raw-escaped current hierarchy level. * * @return the raw-escaped current hierarchy level * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. */ protected char[] getRawCurrentHierPath() throws URIException { return (_path.length == 0) ? EMPTYCHARS : getRawCurrentHierPath(_path); } /** * Get the escaped current hierarchy level. * * @return the escaped current hierarchy level * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. */ private String getEscapedCurrentHierPath() throws URIException { char[] path = getRawCurrentHierPath(); return (path == null) ? null : new String(path); } /** * Get the current hierarchy level. * * @return the current hierarchy level * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. * @see #decode */ public String getCurrentHierPath() throws URIException { char[] path = getRawCurrentHierPath(); return (path == null) ? null : decode(path, getProtocolCharset()); } /** * Get the level above the this hierarchy level. * * @return the raw above hierarchy level * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. */ private char[] getRawAboveHierPath() throws URIException { char[] path = getRawCurrentHierPath(); return (path == null) ? null : getRawCurrentHierPath(path); } /** * Get the level above the this hierarchy level. * * @return the raw above hierarchy level * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails. */ public String getEscapedAboveHierPath() throws URIException { char[] path = getRawAboveHierPath(); return (path == null) ? null : new String(path); } /** * Get the escaped path. * @return the escaped path string */ private char[] getRawPath() { return _is_opaque_part ? _opaque : _path; } /** * Get the escaped path. * @return the escaped path string */ public String getEscapedPath() { char[] path = getRawPath(); return (path == null) ? null : new String(path); } /** * Get the path. * <p><blockquote><pre> * path = [ abs_path | opaque_part ] * </pre></blockquote><p> * @return the path string * @throws URIException If {@link #decode} fails. * @see #decode */ public String getPath() throws URIException { char[] path = getRawPath(); return (path == null) ? null : decode(path, getProtocolCharset()); } /** * Get the raw-escaped basename of the path. * * @return the raw-escaped basename */ private char[] getRawName() { if (_path.length == 0) { return EMPTYCHARS; } int at = 0; // TODO Should be == to // for (int i = _path.length; --i >= 0; ) { for (int i = _path.length - 1; i >= 0; i--) { if (_path[i] == '/') { at = i + 1; break; } } int len = _path.length - at; char[] basename = new char[len]; System.arraycopy(_path, at, basename, 0, len); return basename; } /** * Get the escaped basename of the path. * * @return the escaped basename string */ public String getEscapedName() { char[] basename = getRawName(); return (basename == null) ? null : new String(basename); } /** * Get the basename of the path. * * @return the basename string * @throws URIException incomplete trailing escape pattern or unsupported * character encoding * @see #decode */ public String getName() throws URIException { char[] basename = getRawName(); return (basename == null) ? null : decode(getRawName(), getProtocolCharset()); } // ----------------------------------------------------- The path and query /** * Get the raw-escaped path and query. * * @return the raw-escaped path and query */ private char[] getRawPathQuery() { if (_path.length > 0 && _query == null) { return null; } StringBuffer buff = new StringBuffer(); buff.append(_path); if (_query != null) { buff.append('?'); buff.append(_query); } return buff.toString().toCharArray(); } /** * Get the escaped query. * * @return the escaped path and query string */ public String getEscapedPathQuery() { char[] rawPathQuery = getRawPathQuery(); return (rawPathQuery == null) ? null : new String(rawPathQuery); } // -------------------------------------------------------------- The query /** * Get the escaped query. * * @return the escaped query string */ public String getEscapedQuery() { return (_query == null) ? null : new String(_query); } /** * Decodes the query for validation purposes * Package-friend for Apache URI Provider */ String getQuery() throws URIException { return (_query == null) ? null : decode(_query, getProtocolCharset()); } // ----------------------------------------------------------- The fragment /** * @param non null fragment private char[] escapeFragment(String fragment) throws URIException { return encode(fragment, allowed_fragment, getProtocolCharset()); } */ /** * @param non null fragment private String escapeFragmentAsString(String fragment) throws URIException { return new String(escapeFragment(fragment)); } */ /** * Get the escaped fragment. * <p> * The optional fragment identifier is not part of a URI, but is often used * in conjunction with a URI. * <p> * The format and interpretation of fragment identifiers is dependent on * the media type [RFC2046] of the retrieval result. * <p> * A fragment identifier is only meaningful when a URI reference is * intended for retrieval and the result of that retrieval is a document * for which the identified fragment is consistently defined. * * @return the escaped fragment string */ public String getEscapedFragment() { return (_fragment == null) ? null : new String(_fragment); } /** * Decodes the fragment for validation purposes * Package-friend for Apache URI Provider */ String getFragment() throws URIException { // return (_fragment == null) ? null : decode(_fragment, // getProtocolCharset()); try { return (_fragment == null) ? null : EncodingUtil.getString( URLCodec.decodeUrl(EncodingUtil.getAsciiBytes(new String(_fragment)), false), getProtocolCharset()); } catch (CodecException e) { throw new URIException("Could not decode fragment: " + e.getMessage()); } } // ------------------------------------------------------------- Utilities /** * Remove the fragment identifier of the given component. * * @param component the component that a fragment may be included * @return the component that the fragment identifier is removed */ protected char[] removeFragmentIdentifier(char[] component) { if (component == null) { return null; } int lastIndex = new String(component).indexOf('#'); if (lastIndex != -1) { // TODO optimize component = new String(component).substring(0, lastIndex).toCharArray(); } return component; } /** * Normalize the given hier path part. * * <p>Algorithm taken from URI reference parser at * http://www.apache.org/~fielding/uri/rev-2002/issues.html. * * @param path the path to normalize * @return the normalized path * @throws URIException no more higher path level to be normalized */ protected static char[] normalize(char[] path) throws URIException { //System.out.println("Normalize [" + new String(path) + "]"); if (path.length == 0) { return path; } String normalized = new String(path); // If the buffer begins with "./" or "../", the "." or ".." is removed. if (normalized.startsWith("./")) { normalized = normalized.substring(1); } else if (normalized.startsWith("../")) { normalized = normalized.substring(2); } else if (normalized.startsWith("..")) { normalized = normalized.substring(2); } // All occurrences of "/./" in the buffer are replaced with "/" int index = -1; while ((index = normalized.indexOf("/./")) != -1) { normalized = normalized.substring(0, index) + normalized.substring(index + 2); } // If the buffer ends with "/.", the "." is removed. if (normalized.endsWith("/.")) { normalized = normalized.substring(0, normalized.length() - 1); } int startIndex = 0; // All occurrences of "/<segment>/../" in the buffer, where ".." // and <segment> are complete path segments, are iteratively replaced // with "/" in order from left to right until no matching pattern remains. // If the buffer ends with "/<segment>/..", that is also replaced // with "/". Note that <segment> may be empty. while ((index = normalized.indexOf("/../", startIndex)) != -1) { int slashIndex = normalized.lastIndexOf('/', index - 1); if (slashIndex >= 0) { normalized = normalized.substring(0, slashIndex) + normalized.substring(index + 3); } else { startIndex = index + 3; } } if (normalized.endsWith("/..")) { int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4); if (slashIndex >= 0) { normalized = normalized.substring(0, slashIndex + 1); } } // All prefixes of "<segment>/../" in the buffer, where ".." // and <segment> are complete path segments, are iteratively replaced // with "/" in order from left to right until no matching pattern remains. // If the buffer ends with "<segment>/..", that is also replaced // with "/". Note that <segment> may be empty. while ((index = normalized.indexOf("/../")) != -1) { int slashIndex = normalized.lastIndexOf('/', index - 1); if (slashIndex >= 0) { break; } else { normalized = normalized.substring(index + 3); } } if (normalized.endsWith("/..")) { int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4); if (slashIndex < 0) { normalized = ""; } } //System.out.println("Normalize result [" + normalized + "]"); return normalized.toCharArray(); } /** * Creates a new URI obtained * normalizing the path part of this URI. * <p>Normalization is only meant to be performed on * URIs with an absolute path. * Calling this method on a relative path URI will have no * effect. * @return a normalized uri or the same uri if is not an absolute path. * @throws URIException no more higher path level to be normalized * * @see #isAbsPath() */ public URI normalize() throws URIException { if (isAbsPath()) { char[] npath = normalize(_path); URI uri = new URI(); copy(uri, true); uri._path = npath; uri.setURI(); return uri; } else { return this; } } /** * Test if the first array is equal to the second array. * * @param first the first character array * @param second the second character array * @return true if they're equal */ protected boolean equals(char[] first, char[] second) { if (first == null && second == null) { return true; } if (first == null || second == null) { return false; } if (first.length != second.length) { return false; } for (int i = 0; i < first.length; i++) { if (first[i] != second[i]) { return false; } } return true; } /** * Test an object if this URI is equal to another. * * @param obj an object to compare * @return true if two URI objects are equal */ public boolean equals(Object obj) { // normalize and test each components if (obj == this) { return true; } if (!(obj instanceof URI)) { return false; } URI another = (URI) obj; // scheme if (!equals(_scheme, another._scheme)) { //System.out.println("DIFF SKEME"); return false; } // is_opaque_part or is_hier_part? and opaque if (!equals(_opaque, another._opaque)) { //System.out.println("DIFF OPAQUE"); return false; } // is_hier_part // has_authority if (!equals(_authority, another._authority)) { //System.out.println("DIFF AUTH"); return false; } // path if (!equals(_path, another._path)) { //System.out.println("DIFF PATH"); return false; } // has_query if (!equals(_query, another._query)) { //System.out.println("DIFF QUERY"); return false; } // has_fragment? should be careful of the only fragment case. if (!equals(_fragment, another._fragment)) { //System.out.println("DIFF FRAG"); return false; } return true; } // ---------------------------------------------------------- Serialization /** * Write the content of this URI. * * @param oos the object-output stream * @throws IOException If an IO problem occurs. */ private void writeObject(ObjectOutputStream oos) throws IOException { // TODO here shoul write only the string... oos.defaultWriteObject(); } /** * Read a URI. * * @param ois the object-input stream * @throws ClassNotFoundException If one of the classes specified in the * input stream cannot be found. * @throws IOException If an IO problem occurs. */ private void readObject(ObjectInputStream ois) throws ClassNotFoundException, IOException { ois.defaultReadObject(); } // -------------------------------------------------------------- Hash code /** * Return a hash code for this URI. * * @return a has code value for this URI */ public int hashCode() { // here is implemented like in the String class, // http://www.cs.umd.edu/~pugh/java/memoryModel/archive/2178.html int t = hash; if (t == 0) { char[] c = _uri; if (c != null) { for (int i = 0, len = c.length; i < len; i++) { t = 31 * t + c[i]; } } c = _fragment; if (c != null) { for (int i = 0, len = c.length; i < len; i++) { t = 31 * t + c[i]; } } this.hash = t; } return hash; } // ------------------------------------------------------------- Comparison /** * Compare this URI to another object. * * @param obj the object to be compared. * @return 0, if it's same, * -1, if failed, first being compared with in the authority component * @throws ClassCastException not URI argument */ public int compareTo(Object obj) throws ClassCastException { URI another = (URI) obj; if (!equals(_authority, another.getRawAuthority())) { return -1; } return toString().compareTo(another.toString()); } // ------------------------------------------------------------------ Copy Implementation // This is set calling setURI // _uri; // Copy context // protocolCharset; // Copy // _scheme; // ..then.. // --- if opaque copyNotOpaqueAllButScheme // ----else copyOpaqueAllButScheme = CopyAuthority, CopyPath, CopyQueryFragment // copyNotOpaqueAllButScheme // _is_opaque_part; // _opaque = _path; // ?? impl // _fragment // Copy Authority (not opaque) // _authority; // _userinfo; // _is_reg_name; // _is_server; // _host; // _port; // _is_hostname; // _is_IPv4address; // _is_IPv6reference; // _is_hier_part; // _is_net_path; // Copy Path // _path; // _is_abs_path; // _is_rel_path; // Copy QueryFragment // _query; // _fragment; // NOTE: all implementations of copy accept a NEW uri, // not an already filled, existing URI. private void copy(final URI out, boolean normalize) { //normalize = false; out._scheme = normalize ? toLowerCase(this._scheme) : this._scheme; if (this._is_opaque_part) { copyOpaqueAllButScheme(out); } else { copyNotOpaqueAllButScheme(out, normalize); } out._uri = normalize ? this.toString().toCharArray() : this._uri; } private char[] toLowerCase(char[] a) { return a == null ? null : new String(a).toLowerCase().toCharArray(); } private void copyNotOpaqueAllButScheme(final URI out, boolean normalize) { if (_is_opaque_part) throw new IllegalStateException(); copyContext(out); copyAuthority(out, normalize); copyPath(out); copyQueryFragment(out); } private void copyOpaqueAllButScheme(final URI out) { if (!_is_opaque_part) throw new IllegalStateException(); copyContext(out); out._is_opaque_part = true; out._opaque = this._opaque; out._path = this._path; copyFragment(out); } private final void copyContext(final URI out) { out.protocolCharset = this.protocolCharset; } private void copyAuthority(final URI out, boolean normalize) { out._is_hostname = this._is_hostname; out._is_IPv4address = this._is_IPv4address; out._is_IPv6reference = this._is_IPv6reference; out._is_net_path = this._is_net_path; out._authority = normalize ? toLowerCase(this._authority) : this._authority; if (this._is_server) { out._is_server = this._is_server; out._userinfo = this._userinfo; out._host = normalize ? toLowerCase(this._host) : this._host; out._port = this._port; } else if (this._is_reg_name) { out._is_reg_name = this._is_reg_name; } out._opaque = this._opaque; } private final void copyPath(final URI out) { out._is_abs_path = this._is_abs_path; out._is_rel_path = this._is_rel_path; out._path = this._path; } private final void copyQueryFragment(final URI out) { out._query = this._query; copyFragment(out); } private final void copyFragment(final URI out) { out._fragment = this._fragment; } // ------------------------------------------------------------------ Clone /*** JUST FOR REF, no sense in cloning an immutable URI instance = new URI(); //this.copy(instance); instance._uri = _uri; instance._scheme = _scheme; instance._opaque = _opaque; instance._authority = _authority; instance._userinfo = _userinfo; instance._host = _host; instance._port = _port; instance._path = _path; instance._query = _query; instance._fragment = _fragment; // the charset to do escape encoding for this instance instance.protocolCharset = protocolCharset; // flags instance._is_hier_part = _is_hier_part; instance._is_opaque_part = _is_opaque_part; instance._is_net_path = _is_net_path; instance._is_abs_path = _is_abs_path; instance._is_rel_path = _is_rel_path; instance._is_reg_name = _is_reg_name; instance._is_server = _is_server; instance._is_hostname = _is_hostname; instance._is_IPv4address = _is_IPv4address; instance._is_IPv6reference = _is_IPv6reference; // mv test URI zz = new URI(); copy(zz); if (!zz.equals(instance)) throw new Error("different, copy is:" + zz + "\nold is:" + instance); return zz; } ******/ /** * Retrieves the percent encoded URI string. * @return the percent encoded URI string */ public String getEscapedURI() { return (_uri == null) ? null : new String(_uri); } /** * Get the escaped URI reference string. * * @return the escaped URI reference string */ public String getEscapedURIReference() { if (_fragment == null) { return getEscapedURI(); } if (_uri == null) { return getEscapedFragment(); } // if _uri != null && _fragment != null StringBuffer uriReference = new StringBuffer().append(_uri).append('#').append(_fragment); return uriReference.toString(); } /** * Get the escaped URI string. * @return the escaped URI string */ public String toString() { return getEscapedURI(); } // ------------------------------------------------------------ Inner class /** * The charset-changed normal operation to represent to be required to * alert to user the fact the default charset is changed. */ public static class DefaultCharsetChanged extends RuntimeException { // ------------------------------------------------------- constructors /** * The constructor with a reason string and its code arguments. * * @param reasonCode the reason code * @param reason the reason */ public DefaultCharsetChanged(int reasonCode, String reason) { super(reason); this.reason = reason; this.reasonCode = reasonCode; } // ---------------------------------------------------------- constants /** No specified reason code. */ public static final int UNKNOWN = 0; /** Protocol charset changed. */ public static final int PROTOCOL_CHARSET = 1; /** Document charset changed. */ public static final int DOCUMENT_CHARSET = 2; // ------------------------------------------------- instance variables /** The reason code. */ private int reasonCode; /** The reason message. */ private String reason; // ------------------------------------------------------------ methods /** * Get the reason code. * * @return the reason code */ public int getReasonCode() { return reasonCode; } /** * Get the reason message. * * @return the reason message */ public String getReason() { return reason; } } /** * A mapping to determine the (somewhat arbitrarily) preferred charset for a * given locale. Supports all locales recognized in JDK 1.1. * <p> * The distribution of this class is Servlets.com. It was originally * written by Jason Hunter [jhunter at acm.org] and used by with permission. */ public static class LocaleToCharsetMap { /** A mapping of language code to charset */ private static final Hashtable LOCALE_TO_CHARSET_MAP; static { LOCALE_TO_CHARSET_MAP = new Hashtable(); LOCALE_TO_CHARSET_MAP.put("ar", "ISO-8859-6"); LOCALE_TO_CHARSET_MAP.put("be", "ISO-8859-5"); LOCALE_TO_CHARSET_MAP.put("bg", "ISO-8859-5"); LOCALE_TO_CHARSET_MAP.put("ca", "ISO-8859-1"); LOCALE_TO_CHARSET_MAP.put("cs", "ISO-8859-2"); LOCALE_TO_CHARSET_MAP.put("da", "ISO-8859-1"); LOCALE_TO_CHARSET_MAP.put("de", "ISO-8859-1"); LOCALE_TO_CHARSET_MAP.put("el", "ISO-8859-7"); LOCALE_TO_CHARSET_MAP.put("en", "ISO-8859-1"); LOCALE_TO_CHARSET_MAP.put("es", "ISO-8859-1"); LOCALE_TO_CHARSET_MAP.put("et", "ISO-8859-1"); LOCALE_TO_CHARSET_MAP.put("fi", "ISO-8859-1"); LOCALE_TO_CHARSET_MAP.put("fr", "ISO-8859-1"); LOCALE_TO_CHARSET_MAP.put("hr", "ISO-8859-2"); LOCALE_TO_CHARSET_MAP.put("hu", "ISO-8859-2"); LOCALE_TO_CHARSET_MAP.put("is", "ISO-8859-1"); LOCALE_TO_CHARSET_MAP.put("it", "ISO-8859-1"); LOCALE_TO_CHARSET_MAP.put("iw", "ISO-8859-8"); LOCALE_TO_CHARSET_MAP.put("ja", "Shift_JIS"); LOCALE_TO_CHARSET_MAP.put("ko", "EUC-KR"); LOCALE_TO_CHARSET_MAP.put("lt", "ISO-8859-2"); LOCALE_TO_CHARSET_MAP.put("lv", "ISO-8859-2"); LOCALE_TO_CHARSET_MAP.put("mk", "ISO-8859-5"); LOCALE_TO_CHARSET_MAP.put("nl", "ISO-8859-1"); LOCALE_TO_CHARSET_MAP.put("no", "ISO-8859-1"); LOCALE_TO_CHARSET_MAP.put("pl", "ISO-8859-2"); LOCALE_TO_CHARSET_MAP.put("pt", "ISO-8859-1"); LOCALE_TO_CHARSET_MAP.put("ro", "ISO-8859-2"); LOCALE_TO_CHARSET_MAP.put("ru", "ISO-8859-5"); LOCALE_TO_CHARSET_MAP.put("sh", "ISO-8859-5"); LOCALE_TO_CHARSET_MAP.put("sk", "ISO-8859-2"); LOCALE_TO_CHARSET_MAP.put("sl", "ISO-8859-2"); LOCALE_TO_CHARSET_MAP.put("sq", "ISO-8859-2"); LOCALE_TO_CHARSET_MAP.put("sr", "ISO-8859-5"); LOCALE_TO_CHARSET_MAP.put("sv", "ISO-8859-1"); LOCALE_TO_CHARSET_MAP.put("tr", "ISO-8859-9"); LOCALE_TO_CHARSET_MAP.put("uk", "ISO-8859-5"); LOCALE_TO_CHARSET_MAP.put("zh", "GB2312"); LOCALE_TO_CHARSET_MAP.put("zh_TW", "Big5"); } /** * Get the preferred charset for the given locale. * * @param locale the locale * @return the preferred charset or null if the locale is not * recognized. */ public static String getCharset(Locale locale) { // try for an full name match (may include country) String charset = (String) LOCALE_TO_CHARSET_MAP.get(locale.toString()); if (charset != null) { return charset; } // if a full name didn't match, try just the language charset = (String) LOCALE_TO_CHARSET_MAP.get(locale.getLanguage()); return charset; // may be null } } }