Java HTML Unescape unescapeHTML(String source)

Here you can find the source of unescapeHTML(String source)

Description

Returns the provided string where all HTML special characters (e.g. `&nbsp;`) have been replaced with their UTF-8 equivalents.

License

Open Source License

Parameter

Parameter Description
source a String possibly containing escaped HTML characters

Declaration

public static final String unescapeHTML(String source) 

Method Source Code

//package com.java2s;
/*
 * Copyright 2009 David Jurgens
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

import java.util.HashMap;

import java.util.Map;

public class Main {
    /**
     * A mapping from named HTML character references (e.g. {@code &amp;amp;}
     * or {@code &amp;eacute;}) to the characters they represent.  Holds the
     * five markup-significant entities plus the HTML 4 Latin-1 entity set.
     */
    private static final Map<String, String> HTML_CODES = new HashMap<String, String>();

    /**
     * A mapping from decimal numeric character references (e.g.
     * {@code &amp;#233;} or the zero-padded {@code &amp;#039;}) to the
     * characters they represent.  Covers printable ASCII (32-126) and the
     * printable Latin-1 range (160-255).
     */
    private static final Map<String, String> LATIN1_CODES = new HashMap<String, String>();

    /** Code point of the first named Latin-1 entity ({@code &amp;nbsp;}). */
    private static final int FIRST_LATIN1_ENTITY = 160;

    static {
        // Characters that are significant to HTML/XML markup itself.
        HTML_CODES.put("&amp;", "&");
        HTML_CODES.put("&lt;", "<");
        HTML_CODES.put("&gt;", ">");
        HTML_CODES.put("&quot;", "\"");
        HTML_CODES.put("&apos;", "'");

        // HTML 4 Latin-1 entity names, listed in code point order starting at
        // U+00A0 so that the array index determines the decoded character.
        String[] latin1Names = {
            "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
            "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
            "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
            "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
            "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
            "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
            "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
            "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
            "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
            "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
            "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
            "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
        };
        for (int i = 0; i < latin1Names.length; i++) {
            HTML_CODES.put("&" + latin1Names[i] + ";",
                           String.valueOf((char) (FIRST_LATIN1_ENTITY + i)));
        }

        // Decimal numeric references.  127-159 are control characters and are
        // deliberately left undecoded.
        for (int code = 32; code <= 255; code++) {
            if (code >= 127 && code < FIRST_LATIN1_ENTITY) {
                continue;
            }
            String decoded = String.valueOf((char) code);
            LATIN1_CODES.put("&#" + code + ";", decoded);
            if (code < 100) {
                // two-digit codes are frequently written zero-padded: &#039;
                LATIN1_CODES.put("&#0" + code + ";", decoded);
            }
        }
    }

    /**
     * Returns the provided string where all recognized HTML character
     * references (e.g. {@code &amp;nbsp;}) have been replaced with the
     * characters they represent.  Unrecognized references and stray
     * {@code &} or {@code ;} characters are copied through unchanged.  Each
     * reference is decoded once; the decoded text is never re-scanned, so
     * {@code &amp;amp;lt;} becomes {@code &amp;lt;}.
     *
     * @param source a String possibly containing escaped HTML characters
     * @return the unescaped text, or {@code source} itself when nothing was
     *         replaced
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public static final String unescapeHTML(String source) {
        // created lazily so that input without any entity allocates nothing
        StringBuilder sb = null;

        // the position just past the last decoded reference
        int last = 0;

        // the position of the most recent unmatched '&', or -1 if none
        int amp = -1;

        for (int i = 0, n = source.length(); i < n; i++) {
            char c = source.charAt(i);
            if (c == '&') {
                // Always keep the nearest '&': a reference never contains
                // one, so an earlier unmatched '&' must have been stray text.
                amp = i;
            } else if (c == ';' && amp > -1) {
                String decoded = decode(source.substring(amp, i + 1));
                if (decoded != null) {
                    if (sb == null) {
                        sb = new StringBuilder(n);
                    }
                    // copy the untouched text since the previous reference,
                    // then the decoded character(s)
                    sb.append(source, last, amp).append(decoded);
                    last = i + 1;
                }
                amp = -1;
            }
        }

        // if there weren't any substitutions, don't bother to create a new
        // String
        if (sb == null) {
            return source;
        }

        // otherwise finish by appending all the text that follows the last
        // substitution
        return sb.append(source, last, source.length()).toString();
    }

    /**
     * Modifies the provided {@link StringBuilder} in place by replacing all
     * recognized HTML character references (e.g. {@code &amp;nbsp;}) with the
     * characters they represent.  Applies the same matching rules as
     * {@link #unescapeHTML(String)}.
     *
     * @param source a buffer possibly containing escaped HTML characters
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public static final void unescapeHTML(StringBuilder source) {
        // the position of the most recent unmatched '&', or -1 if none
        int amp = -1;

        // source.length() is re-read on every pass because replacements
        // change the length of the buffer
        for (int i = 0; i < source.length(); i++) {
            char c = source.charAt(i);
            if (c == '&') {
                amp = i;
            } else if (c == ';' && amp > -1) {
                String decoded = decode(source.substring(amp, i + 1));
                if (decoded != null) {
                    source.replace(amp, i + 1, decoded);
                    // Resume immediately after the inserted text so that a
                    // decoded '&' is not mistaken for the start of another
                    // reference.
                    i = amp + decoded.length() - 1;
                }
                amp = -1;
            }
        }
    }

    /**
     * Looks up a complete character reference, including its leading
     * {@code &} and trailing {@code ;}, first among the named entities and
     * then among the numeric codes.
     *
     * @param encoded the candidate reference
     * @return the decoded text, or {@code null} if the reference is unknown
     */
    private static String decode(String encoded) {
        String decoded = HTML_CODES.get(encoded);
        if (decoded == null) {
            decoded = LATIN1_CODES.get(encoded);
        }
        return decoded;
    }
}

Related

  1. unescapeHtml(String s)
  2. unescapeHTML(String s)
  3. unescapeHtml(String s)
  4. unescapeHTML(String s)
  5. unescapeHTML(String s)
  6. unescapeHTML(String source)
  7. unescapeHTML(String source, int start)
  8. unescapeHTML(String str)
  9. unEscapeHtml(String text)