Java HTML Unescape unescapeHTML(String source)

Description

Returns the provided string where all HTML special characters (e.g. `&nbsp;`) have been replaced with their UTF-8 equivalents.

License

Open Source License

Parameter

Parameter	Description
source	a String possibly containing escaped HTML characters

Declaration

public static final String unescapeHTML(String source)

Method Source Code

//package com.java2s;
/*
 * Copyright 2009 David Jurgens
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

import java.util.HashMap;

import java.util.Map;

/**
 * Static helpers that replace escaped HTML character entities (such as
 * {@code &amp;} or {@code &#233;}) with the characters they stand for.
 *
 * <p>Only the entities registered in the lookup tables below are decoded:
 * the five markup-significant named entities, the named entities for the
 * Latin-1 range (code points 160 to 255), and decimal numeric references for
 * the printable Latin-1 characters.  Anything else is left untouched.
 */
public class Main {
    /**
     * A mapping from HTML codes for the markup-significant escaped characters
     * to their unicode character equivalents.
     */
    private static final Map<String, String> HTML_CODES = new HashMap<String, String>();

    /**
     * A mapping from Latin-1 codes (named entities and decimal numeric
     * references) to their unicode character equivalents.
     */
    private static final Map<String, String> LATIN1_CODES = new HashMap<String, String>();

    /**
     * Names of the Latin-1 entities in code point order; the entry at index
     * {@code i} stands for code point {@code 160 + i}.  Each row below holds
     * eight consecutive code points.
     */
    private static final String[] LATIN1_NAMES = {
        "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
        "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
        "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
        "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
        "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
        "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
        "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
        "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
        "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
        "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
        "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
        "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
    };

    /** Code point represented by the first entry of {@link #LATIN1_NAMES}. */
    private static final int FIRST_NAMED_LATIN1 = 160;

    /**
     * Length of the longest registered code, including the leading
     * {@code &} and trailing {@code ;}.  A longer candidate cannot be in
     * either table, so it is rejected without building a substring.
     */
    private static final int LONGEST_CODE;

    static {
        HTML_CODES.put("&amp;", "&");
        HTML_CODES.put("&lt;", "<");
        HTML_CODES.put("&gt;", ">");
        HTML_CODES.put("&quot;", "\"");
        HTML_CODES.put("&apos;", "'");

        for (int i = 0; i < LATIN1_NAMES.length; i++) {
            String decoded = String.valueOf((char) (FIRST_NAMED_LATIN1 + i));
            LATIN1_CODES.put("&" + LATIN1_NAMES[i] + ";", decoded);
        }

        // Decimal numeric references for the printable Latin-1 characters.
        // DEL (127) and the C1 control range (128-159) are deliberately left
        // out, so references to them are returned undecoded.
        for (int code = 32; code <= 255; code++) {
            if (code >= 127 && code < 160) {
                continue;
            }
            String decoded = String.valueOf((char) code);
            LATIN1_CODES.put("&#" + code + ";", decoded);
            if (code < 100) {
                // also accept the zero-padded three digit form, e.g. &#039;
                LATIN1_CODES.put("&#0" + code + ";", decoded);
            }
        }

        int longest = 0;
        for (String code : HTML_CODES.keySet()) {
            longest = Math.max(longest, code.length());
        }
        for (String code : LATIN1_CODES.keySet()) {
            longest = Math.max(longest, code.length());
        }
        LONGEST_CODE = longest;
    }

    /**
     * Returns the provided string where all recognized HTML special
     * characters (e.g. {@code &nbsp;}) have been replaced with their unicode
     * equivalents.  Decoding is a single pass: text produced by a
     * substitution is never decoded again, so {@code &amp;lt;} becomes
     * {@code &lt;}.
     *
     * @param source a String possibly containing escaped HTML characters
     * @return the decoded text, or {@code source} itself when nothing was
     *         replaced
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public static final String unescapeHTML(String source) {
        // Allocated lazily; staying null is the "nothing replaced" signal.
        StringBuilder sb = null;

        // the end position of the last escaped HTML character
        int last = 0;

        int start = source.indexOf('&');
        while (start > -1) {
            int end = source.indexOf(';', start);
            if (end < 0) {
                // no terminator left, so no further entity can occur
                break;
            }

            String decoded = decodeEntity(source, start, end);
            if (decoded != null) {
                if (sb == null) {
                    sb = new StringBuilder(source.length());
                }
                // copy the text between the previous entity and this one,
                // followed by the decoded character
                sb.append(source, last, start).append(decoded);
                last = end + 1;
                start = source.indexOf('&', last);
            } else {
                // This '&' does not begin a known entity.  Resume right
                // after it rather than after the ';', so that an entity
                // lying between the two (as in "a & b &lt; c;") is found.
                start = source.indexOf('&', start + 1);
            }
        }

        // if there weren't any substitutions, don't bother to create a new
        // String
        if (sb == null) {
            return source;
        }

        // otherwise finish the substitution by appending all the text from
        // the last substitution until the end of the string
        sb.append(source, last, source.length());
        return sb.toString();
    }

    /**
     * Modifies the provided {@link StringBuilder} in place by replacing all
     * recognized HTML special characters (e.g. {@code &nbsp;}) with their
     * unicode equivalents.  Decoding is a single pass, exactly as in
     * {@link #unescapeHTML(String)}.
     *
     * @param source a buffer possibly containing escaped HTML characters
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public static final void unescapeHTML(StringBuilder source) {
        int start = source.indexOf("&");
        while (start > -1) {
            int end = source.indexOf(";", start);
            if (end < 0) {
                break;
            }

            String decoded = decodeEntity(source, start, end);
            if (decoded != null) {
                source.replace(start, end + 1, decoded);
                // Skip over the text just inserted: it may itself be a '&'
                // (from &amp;) and must not be decoded a second time.
                start = source.indexOf("&", start + decoded.length());
            } else {
                start = source.indexOf("&", start + 1);
            }
        }
    }

    /**
     * Looks up the candidate entity occupying {@code text[start..end]}, where
     * {@code start} is the index of a {@code &} and {@code end} the index of
     * the first {@code ;} after it.
     *
     * @return the replacement text, or {@code null} if the span is not a
     *         registered code
     */
    private static String decodeEntity(CharSequence text, int start, int end) {
        if (end - start + 1 > LONGEST_CODE) {
            return null;
        }
        String encoded = text.subSequence(start, end + 1).toString();
        String decoded = HTML_CODES.get(encoded);

        // if encoded form wasn't in the HTML codes, try checking to see if
        // it was a Latin-1 code
        if (decoded == null) {
            decoded = LATIN1_CODES.get(encoded);
        }
        return decoded;
    }
}

Java HTML Unescape unescapeHTML(String source)

Description

License

Parameter

Declaration

Method Source Code

Related