org.apache.wicket.util.encoding.UrlEncoder.java Source code

Introduction

Here is the source code for org.apache.wicket.util.encoding.UrlEncoder.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.wicket.util.encoding;

import java.io.CharArrayWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.BitSet;

import org.apache.wicket.util.lang.Args;

/**
 * Adapted from java.net.URLEncoder, but defines instances for query string encoding versus URL path
 * component encoding.
 * <p/>
 * The difference is important because a space is encoded as a + in a query string, but a + is a
 * valid value in a path component (and is therefore not decoded back to a space).
 * 
 * @author Doug Donohoe
 * @see java.net.URLEncoder
 * @see <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC-2396</a>
 */
public class UrlEncoder {
    /**
     * encoder types
     */
    public enum Type {
        QUERY, PATH, HEADER
    }

    /** Upper-case hexadecimal digits used when writing {@code %XX} escapes. */
    private static final String HEX_DIGITS = "0123456789ABCDEF";

    /**
     * List of what not to encode, i.e. characters (e.g. A-Z) and other allowed signs (e.g. !)
     * that are allowed but don't have a special meaning.
     */
    protected BitSet dontNeedEncoding;

    /**
     * Distance between a lower-case ASCII letter and its upper-case counterpart. It is no longer
     * used by {@link #encode(String, String)}; it is kept because it is part of the protected
     * API of this class.
     */
    protected static final int caseDiff = ('a' - 'A');

    /**
     * Encoder used to encode name or value components of a query string.<br/>
     * <br/>
     * 
     * For example: http://org.acme/notthis/northis/oreventhis?buthis=isokay&asis=thispart
     */
    public static final UrlEncoder QUERY_INSTANCE = new UrlEncoder(Type.QUERY);

    /**
     * Encoder used to encode segments of a path.<br/>
     * <br/>
     * 
     * For example: http://org.acme/foo/thispart/orthispart?butnot=thispart
     */
    public static final UrlEncoder PATH_INSTANCE = new UrlEncoder(Type.PATH);

    /**
     * Encoder used to encode a header.
     */
    public static final UrlEncoder HEADER_INSTANCE = new UrlEncoder(Type.HEADER);

    /**
     * Builds the set of characters that are passed through unencoded for the given encoder type.
     * Protected so that subclasses can call it.
     * 
     * @param type
     *            encoder type
     */
    protected UrlEncoder(final Type type) {
        /*
         * This note from java.net.URLEncoder ==================================
         * 
         * The list of characters that are not encoded has been determined as follows:
         * 
         * RFC 2396 states: ----- Data characters that are allowed in a URI but do not have a
         * reserved purpose are called unreserved. These include upper and lower case letters,
         * decimal digits, and a limited set of punctuation marks and symbols.
         * 
         * unreserved = alphanum | mark
         * 
         * mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
         * 
         * Unreserved characters can be escaped without changing the semantics of the URI, but this
         * should not be done unless the URI is being used in a context that does not allow the
         * unescaped character to appear. -----
         * 
         * It appears that both Netscape and Internet Explorer escape all special characters from
         * this list with the exception of "-", "_", ".", "*". While it is not clear why they are
         * escaping the other characters, perhaps it is safest to assume that there might be
         * contexts in which the others are unsafe if not escaped. Therefore, we will use the same
         * list. It is also noteworthy that this is consistent with O'Reilly's
         * "HTML: The Definitive Guide" (page 164).
         * 
         * As a last note, Internet Explorer does not encode the "@" character which is clearly not
         * unreserved according to the RFC. We are being consistent with the RFC in this matter, as
         * is Netscape.
         * 
         * This bit added by Doug Donohoe ==================================
         * 
         * RFC 3986 (2005) updates this (http://tools.ietf.org/html/rfc3986):
         * 
         * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
         * 
         * pct-encoded = "%" HEXDIG HEXDIG
         * 
         * reserved = gen-delims / sub-delims
         * 
         * gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
         * 
         * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
         * 
         * -- PATH COMPONENT --
         * 
         * path = (see RFC for all variations)
         * 
         * path-abempty = *( "/" segment )
         * 
         * segment = *pchar
         * 
         * pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
         * 
         * -- QUERY COMPONENT --
         * 
         * query = *( pchar / "/" / "?" )
         */

        // unreserved (BitSet.set(from, to) excludes the upper bound, hence the + 1)
        dontNeedEncoding = new BitSet(256);
        dontNeedEncoding.set('a', 'z' + 1);
        dontNeedEncoding.set('A', 'Z' + 1);
        dontNeedEncoding.set('0', '9' + 1);
        dontNeedEncoding.set('-');
        dontNeedEncoding.set('.');
        dontNeedEncoding.set('_');
        // tilde encoded by java.net.URLEncoder version, but RFC is clear on this
        dontNeedEncoding.set('~');

        // sub-delims
        dontNeedEncoding.set('!');
        dontNeedEncoding.set('$');

        // encoding type-specific
        switch (type) {
        case QUERY:
            // this code is consistent with the java.net.URLEncoder version

            // a space is marked as "safe" here only so that encode() can turn it into a '+'
            dontNeedEncoding.set(' ');

            // sub-delims continued
            dontNeedEncoding.set('*');
            dontNeedEncoding.set('/'); // to allow direct passing of URL in query
            dontNeedEncoding.set(',');
            // "'" doesn't need encoding, but encoding it makes it easier to use in JavaScript
            // "(" and ")" don't need encoding, but we'll be conservative

            dontNeedEncoding.set(':'); // allowed and used in wicket interface
            dontNeedEncoding.set('@');

            /*
             * the below encoding of a ? is disabled because it interferes in portlet
             * environments. as far as i can tell it will not interfere with the ability to pass
             * around urls in the query string. however, should it cause problems we can
             * re-enable it as portlet environments are not high priority. we can also add a
             * switch somewhere to enable/disable this on application level. (WICKET-4019)
             */
            // dontNeedEncoding.set('?'); // to allow direct passing of URL in query
            break;

        case PATH:
            // this added to deal with encoding a PATH segment

            // sub-delims continued
            dontNeedEncoding.set('*');
            dontNeedEncoding.set('&');
            dontNeedEncoding.set('+');
            // "'" doesn't need encoding, but encoding it makes it easier to use in JavaScript
            // "(" and ")" don't need encoding, but we'll be conservative
            dontNeedEncoding.set(',');
            dontNeedEncoding.set(';'); // semicolon is used in ;jsessionid=
            dontNeedEncoding.set('=');

            dontNeedEncoding.set(':'); // allowed and used in wicket interface
            dontNeedEncoding.set('@');

            break;

        case HEADER:
            // this added to deal with encoding of a header

            // ' ' is encoded

            // sub-delims continued
            dontNeedEncoding.set('#');
            dontNeedEncoding.set('&');
            dontNeedEncoding.set('+');

            dontNeedEncoding.set('^');
            dontNeedEncoding.set('`');
            dontNeedEncoding.set('|');
            break;
        }
    }

    /**
     * Encodes the given string, delegating to {@link #encode(String, String)} with the name of
     * the given charset.
     * 
     * @param s
     *            string to encode
     * @param charset
     *            charset to use for encoding
     * @return encoded string
     * @see java.net.URLEncoder#encode(String, String)
     */
    public String encode(final String s, final Charset charset) {
        return encode(s, charset.name());
    }

    /**
     * Encodes the given string. Characters in {@link #dontNeedEncoding} are copied as-is (except
     * that a space, where it is in that set, is written as {@code +}); every other character is
     * converted to bytes in the requested charset and written as upper-case {@code %XX} escapes.
     * Any NUL character in the input is replaced by the literal text {@code NULL} before
     * encoding.
     * 
     * @param unsafeInput
     *            string to encode
     * @param charsetName
     *            encoding to use
     * @return encoded string
     * @throws IllegalArgumentException
     *             if {@code charsetName} is {@code null}
     * @throws RuntimeException
     *             wrapping an {@link UnsupportedEncodingException} if the charset name is illegal
     *             or not supported; the original charset exception is attached as its cause
     * @see java.net.URLEncoder#encode(String, String)
     */
    public String encode(final String unsafeInput, final String charsetName) {
        final String s = unsafeInput.replace("\0", "NULL");

        Args.notNull(charsetName, "charsetName");

        final Charset charset;
        try {
            charset = Charset.forName(charsetName);
        } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
            // UnsupportedEncodingException has no cause-taking constructor, so attach the
            // original failure explicitly instead of dropping it.
            final UnsupportedEncodingException unsupported = new UnsupportedEncodingException(
                    charsetName);
            unsupported.initCause(e);
            throw new RuntimeException(unsupported);
        }

        final int length = s.length();
        final StringBuilder out = new StringBuilder(length);
        // collects a run of consecutive characters that must be escaped
        final CharArrayWriter unsafeRun = new CharArrayWriter();

        int i = 0;
        while (i < length) {
            char c = s.charAt(i);

            if (dontNeedEncoding.get(c)) {
                out.append(c == ' ' ? '+' : c);
                i++;
                continue;
            }

            // Gather the whole run first so that multi-char sequences are converted to the
            // external encoding together before hex conversion.
            do {
                unsafeRun.write(c);
                /*
                 * If this character is the start of a Unicode surrogate pair, pass both
                 * characters on together. It's not clear what should be done if a value from the
                 * surrogate range occurs outside of a legal pair. For now, just treat it as if
                 * it were any other character.
                 */
                if (Character.isHighSurrogate(c) && (i + 1) < length) {
                    final char next = s.charAt(i + 1);
                    if (Character.isLowSurrogate(next)) {
                        unsafeRun.write(next);
                        i++;
                    }
                }
                i++;
            } while ((i < length) && !dontNeedEncoding.get((c = s.charAt(i))));

            appendPercentEncoded(out, unsafeRun.toString().getBytes(charset));
            unsafeRun.reset();
        }

        return out.toString();
    }

    /**
     * Appends every byte as a {@code %} sign followed by two upper-case hexadecimal digits.
     */
    private static void appendPercentEncoded(final StringBuilder out, final byte[] bytes) {
        for (final byte b : bytes) {
            out.append('%');
            out.append(HEX_DIGITS.charAt((b >> 4) & 0xF));
            out.append(HEX_DIGITS.charAt(b & 0xF));
        }
    }
}