Replace all the occurrences of HTML escape strings with the respective characters. : Document HTML « Development Class « Java






Replace all the occurrences of HTML escape strings with the respective characters.

   
import java.util.HashMap;
import java.util.Map;

/* Copyright (c) 2008 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
//package com.google.gdata.util.common.base;


/**
 * Some common string manipulation utilities.
 *
 * <p>Currently this holds a lookup table of the named HTML 4 character
 * entities and {@link #unescapeHTML(String)}, which turns named and numeric
 * character references back into the characters they stand for.
 */
public class Util{

    /**
     * Maps a complete named entity, including the leading ampersand and the
     * trailing semicolon, to the single character it represents.
     * Populated once by the static initializer below; the code in this class
     * only ever reads it afterwards.
     */
    static Map<String, Character> escapeStrings;

    static {
      // HTML character entity references as defined in HTML 4
      // see http://www.w3.org/TR/REC-html40/sgml/entities.html
      //
      // 252 entries are stored. The capacity is 252 / 0.75 rounded up so the
      // table never has to be rehashed under HashMap's default load factor.
      escapeStrings = new HashMap<String, Character>(337);

      escapeStrings.put("&nbsp;", '\u00A0');
      escapeStrings.put("&iexcl;", '\u00A1');
      escapeStrings.put("&cent;", '\u00A2');
      escapeStrings.put("&pound;", '\u00A3');
      escapeStrings.put("&curren;", '\u00A4');
      escapeStrings.put("&yen;", '\u00A5');
      escapeStrings.put("&brvbar;", '\u00A6');
      escapeStrings.put("&sect;", '\u00A7');
      escapeStrings.put("&uml;", '\u00A8');
      escapeStrings.put("&copy;", '\u00A9');
      escapeStrings.put("&ordf;", '\u00AA');
      escapeStrings.put("&laquo;", '\u00AB');
      escapeStrings.put("&not;", '\u00AC');
      escapeStrings.put("&shy;", '\u00AD');
      escapeStrings.put("&reg;", '\u00AE');
      escapeStrings.put("&macr;", '\u00AF');
      escapeStrings.put("&deg;", '\u00B0');
      escapeStrings.put("&plusmn;", '\u00B1');
      escapeStrings.put("&sup2;", '\u00B2');
      escapeStrings.put("&sup3;", '\u00B3');
      escapeStrings.put("&acute;", '\u00B4');
      escapeStrings.put("&micro;", '\u00B5');
      escapeStrings.put("&para;", '\u00B6');
      escapeStrings.put("&middot;", '\u00B7');
      escapeStrings.put("&cedil;", '\u00B8');
      escapeStrings.put("&sup1;", '\u00B9');
      escapeStrings.put("&ordm;", '\u00BA');
      escapeStrings.put("&raquo;", '\u00BB');
      escapeStrings.put("&frac14;", '\u00BC');
      escapeStrings.put("&frac12;", '\u00BD');
      escapeStrings.put("&frac34;", '\u00BE');
      escapeStrings.put("&iquest;", '\u00BF');
      escapeStrings.put("&Agrave;", '\u00C0');
      escapeStrings.put("&Aacute;", '\u00C1');
      escapeStrings.put("&Acirc;", '\u00C2');
      escapeStrings.put("&Atilde;", '\u00C3');
      escapeStrings.put("&Auml;", '\u00C4');
      escapeStrings.put("&Aring;", '\u00C5');
      escapeStrings.put("&AElig;", '\u00C6');
      escapeStrings.put("&Ccedil;", '\u00C7');
      escapeStrings.put("&Egrave;", '\u00C8');
      escapeStrings.put("&Eacute;", '\u00C9');
      escapeStrings.put("&Ecirc;", '\u00CA');
      escapeStrings.put("&Euml;", '\u00CB');
      escapeStrings.put("&Igrave;", '\u00CC');
      escapeStrings.put("&Iacute;", '\u00CD');
      escapeStrings.put("&Icirc;", '\u00CE');
      escapeStrings.put("&Iuml;", '\u00CF');
      escapeStrings.put("&ETH;", '\u00D0');
      escapeStrings.put("&Ntilde;", '\u00D1');
      escapeStrings.put("&Ograve;", '\u00D2');
      escapeStrings.put("&Oacute;", '\u00D3');
      escapeStrings.put("&Ocirc;", '\u00D4');
      escapeStrings.put("&Otilde;", '\u00D5');
      escapeStrings.put("&Ouml;", '\u00D6');
      escapeStrings.put("&times;", '\u00D7');
      escapeStrings.put("&Oslash;", '\u00D8');
      escapeStrings.put("&Ugrave;", '\u00D9');
      escapeStrings.put("&Uacute;", '\u00DA');
      escapeStrings.put("&Ucirc;", '\u00DB');
      escapeStrings.put("&Uuml;", '\u00DC');
      escapeStrings.put("&Yacute;", '\u00DD');
      escapeStrings.put("&THORN;", '\u00DE');
      escapeStrings.put("&szlig;", '\u00DF');
      escapeStrings.put("&agrave;", '\u00E0');
      escapeStrings.put("&aacute;", '\u00E1');
      escapeStrings.put("&acirc;", '\u00E2');
      escapeStrings.put("&atilde;", '\u00E3');
      escapeStrings.put("&auml;", '\u00E4');
      escapeStrings.put("&aring;", '\u00E5');
      escapeStrings.put("&aelig;", '\u00E6');
      escapeStrings.put("&ccedil;", '\u00E7');
      escapeStrings.put("&egrave;", '\u00E8');
      escapeStrings.put("&eacute;", '\u00E9');
      escapeStrings.put("&ecirc;", '\u00EA');
      escapeStrings.put("&euml;", '\u00EB');
      escapeStrings.put("&igrave;", '\u00EC');
      escapeStrings.put("&iacute;", '\u00ED');
      escapeStrings.put("&icirc;", '\u00EE');
      escapeStrings.put("&iuml;", '\u00EF');
      escapeStrings.put("&eth;", '\u00F0');
      escapeStrings.put("&ntilde;", '\u00F1');
      escapeStrings.put("&ograve;", '\u00F2');
      escapeStrings.put("&oacute;", '\u00F3');
      escapeStrings.put("&ocirc;", '\u00F4');
      escapeStrings.put("&otilde;", '\u00F5');
      escapeStrings.put("&ouml;", '\u00F6');
      escapeStrings.put("&divide;", '\u00F7');
      escapeStrings.put("&oslash;", '\u00F8');
      escapeStrings.put("&ugrave;", '\u00F9');
      escapeStrings.put("&uacute;", '\u00FA');
      escapeStrings.put("&ucirc;", '\u00FB');
      escapeStrings.put("&uuml;", '\u00FC');
      escapeStrings.put("&yacute;", '\u00FD');
      escapeStrings.put("&thorn;", '\u00FE');
      escapeStrings.put("&yuml;", '\u00FF');
      escapeStrings.put("&fnof;", '\u0192');
      escapeStrings.put("&Alpha;", '\u0391');
      escapeStrings.put("&Beta;", '\u0392');
      escapeStrings.put("&Gamma;", '\u0393');
      escapeStrings.put("&Delta;", '\u0394');
      escapeStrings.put("&Epsilon;", '\u0395');
      escapeStrings.put("&Zeta;", '\u0396');
      escapeStrings.put("&Eta;", '\u0397');
      escapeStrings.put("&Theta;", '\u0398');
      escapeStrings.put("&Iota;", '\u0399');
      escapeStrings.put("&Kappa;", '\u039A');
      escapeStrings.put("&Lambda;", '\u039B');
      escapeStrings.put("&Mu;", '\u039C');
      escapeStrings.put("&Nu;", '\u039D');
      escapeStrings.put("&Xi;", '\u039E');
      escapeStrings.put("&Omicron;", '\u039F');
      escapeStrings.put("&Pi;", '\u03A0');
      escapeStrings.put("&Rho;", '\u03A1');
      escapeStrings.put("&Sigma;", '\u03A3');
      escapeStrings.put("&Tau;", '\u03A4');
      escapeStrings.put("&Upsilon;", '\u03A5');
      escapeStrings.put("&Phi;", '\u03A6');
      escapeStrings.put("&Chi;", '\u03A7');
      escapeStrings.put("&Psi;", '\u03A8');
      escapeStrings.put("&Omega;", '\u03A9');
      escapeStrings.put("&alpha;", '\u03B1');
      escapeStrings.put("&beta;", '\u03B2');
      escapeStrings.put("&gamma;", '\u03B3');
      escapeStrings.put("&delta;", '\u03B4');
      escapeStrings.put("&epsilon;", '\u03B5');
      escapeStrings.put("&zeta;", '\u03B6');
      escapeStrings.put("&eta;", '\u03B7');
      escapeStrings.put("&theta;", '\u03B8');
      escapeStrings.put("&iota;", '\u03B9');
      escapeStrings.put("&kappa;", '\u03BA');
      escapeStrings.put("&lambda;", '\u03BB');
      escapeStrings.put("&mu;", '\u03BC');
      escapeStrings.put("&nu;", '\u03BD');
      escapeStrings.put("&xi;", '\u03BE');
      escapeStrings.put("&omicron;", '\u03BF');
      escapeStrings.put("&pi;", '\u03C0');
      escapeStrings.put("&rho;", '\u03C1');
      escapeStrings.put("&sigmaf;", '\u03C2');
      escapeStrings.put("&sigma;", '\u03C3');
      escapeStrings.put("&tau;", '\u03C4');
      escapeStrings.put("&upsilon;", '\u03C5');
      escapeStrings.put("&phi;", '\u03C6');
      escapeStrings.put("&chi;", '\u03C7');
      escapeStrings.put("&psi;", '\u03C8');
      escapeStrings.put("&omega;", '\u03C9');
      escapeStrings.put("&thetasym;", '\u03D1');
      escapeStrings.put("&upsih;", '\u03D2');
      escapeStrings.put("&piv;", '\u03D6');
      escapeStrings.put("&bull;", '\u2022');
      escapeStrings.put("&hellip;", '\u2026');
      escapeStrings.put("&prime;", '\u2032');
      escapeStrings.put("&Prime;", '\u2033');
      escapeStrings.put("&oline;", '\u203E');
      escapeStrings.put("&frasl;", '\u2044');
      escapeStrings.put("&weierp;", '\u2118');
      escapeStrings.put("&image;", '\u2111');
      escapeStrings.put("&real;", '\u211C');
      escapeStrings.put("&trade;", '\u2122');
      escapeStrings.put("&alefsym;", '\u2135');
      escapeStrings.put("&larr;", '\u2190');
      escapeStrings.put("&uarr;", '\u2191');
      escapeStrings.put("&rarr;", '\u2192');
      escapeStrings.put("&darr;", '\u2193');
      escapeStrings.put("&harr;", '\u2194');
      escapeStrings.put("&crarr;", '\u21B5');
      escapeStrings.put("&lArr;", '\u21D0');
      escapeStrings.put("&uArr;", '\u21D1');
      escapeStrings.put("&rArr;", '\u21D2');
      escapeStrings.put("&dArr;", '\u21D3');
      escapeStrings.put("&hArr;", '\u21D4');
      escapeStrings.put("&forall;", '\u2200');
      escapeStrings.put("&part;", '\u2202');
      escapeStrings.put("&exist;", '\u2203');
      escapeStrings.put("&empty;", '\u2205');
      escapeStrings.put("&nabla;", '\u2207');
      escapeStrings.put("&isin;", '\u2208');
      escapeStrings.put("&notin;", '\u2209');
      escapeStrings.put("&ni;", '\u220B');
      escapeStrings.put("&prod;", '\u220F');
      escapeStrings.put("&sum;", '\u2211');
      escapeStrings.put("&minus;", '\u2212');
      escapeStrings.put("&lowast;", '\u2217');
      escapeStrings.put("&radic;", '\u221A');
      escapeStrings.put("&prop;", '\u221D');
      escapeStrings.put("&infin;", '\u221E');
      escapeStrings.put("&ang;", '\u2220');
      escapeStrings.put("&and;", '\u2227');
      escapeStrings.put("&or;", '\u2228');
      escapeStrings.put("&cap;", '\u2229');
      escapeStrings.put("&cup;", '\u222A');
      escapeStrings.put("&int;", '\u222B');
      escapeStrings.put("&there4;", '\u2234');
      escapeStrings.put("&sim;", '\u223C');
      escapeStrings.put("&cong;", '\u2245');
      escapeStrings.put("&asymp;", '\u2248');
      escapeStrings.put("&ne;", '\u2260');
      escapeStrings.put("&equiv;", '\u2261');
      escapeStrings.put("&le;", '\u2264');
      escapeStrings.put("&ge;", '\u2265');
      escapeStrings.put("&sub;", '\u2282');
      escapeStrings.put("&sup;", '\u2283');
      escapeStrings.put("&nsub;", '\u2284');
      escapeStrings.put("&sube;", '\u2286');
      escapeStrings.put("&supe;", '\u2287');
      escapeStrings.put("&oplus;", '\u2295');
      escapeStrings.put("&otimes;", '\u2297');
      escapeStrings.put("&perp;", '\u22A5');
      escapeStrings.put("&sdot;", '\u22C5');
      escapeStrings.put("&lceil;", '\u2308');
      escapeStrings.put("&rceil;", '\u2309');
      escapeStrings.put("&lfloor;", '\u230A');
      escapeStrings.put("&rfloor;", '\u230B');
      escapeStrings.put("&lang;", '\u2329');
      escapeStrings.put("&rang;", '\u232A');
      escapeStrings.put("&loz;", '\u25CA');
      escapeStrings.put("&spades;", '\u2660');
      escapeStrings.put("&clubs;", '\u2663');
      escapeStrings.put("&hearts;", '\u2665');
      escapeStrings.put("&diams;", '\u2666');
      escapeStrings.put("&quot;", '\u0022');
      escapeStrings.put("&amp;", '\u0026');
      escapeStrings.put("&lt;", '\u003C');
      escapeStrings.put("&gt;", '\u003E');
      escapeStrings.put("&OElig;", '\u0152');
      escapeStrings.put("&oelig;", '\u0153');
      escapeStrings.put("&Scaron;", '\u0160');
      escapeStrings.put("&scaron;", '\u0161');
      escapeStrings.put("&Yuml;", '\u0178');
      escapeStrings.put("&circ;", '\u02C6');
      escapeStrings.put("&tilde;", '\u02DC');
      escapeStrings.put("&ensp;", '\u2002');
      escapeStrings.put("&emsp;", '\u2003');
      escapeStrings.put("&thinsp;", '\u2009');
      escapeStrings.put("&zwnj;", '\u200C');
      escapeStrings.put("&zwj;", '\u200D');
      escapeStrings.put("&lrm;", '\u200E');
      escapeStrings.put("&rlm;", '\u200F');
      escapeStrings.put("&ndash;", '\u2013');
      escapeStrings.put("&mdash;", '\u2014');
      escapeStrings.put("&lsquo;", '\u2018');
      escapeStrings.put("&rsquo;", '\u2019');
      escapeStrings.put("&sbquo;", '\u201A');
      escapeStrings.put("&ldquo;", '\u201C');
      escapeStrings.put("&rdquo;", '\u201D');
      escapeStrings.put("&bdquo;", '\u201E');
      escapeStrings.put("&dagger;", '\u2020');
      escapeStrings.put("&Dagger;", '\u2021');
      escapeStrings.put("&permil;", '\u2030');
      escapeStrings.put("&lsaquo;", '\u2039');
      escapeStrings.put("&rsaquo;", '\u203A');
      escapeStrings.put("&euro;", '\u20AC');
    }

    /**
     * Replace all the occurrences of HTML escape strings with the
     * respective characters.
     *
     * <p>Three forms are recognized, each of which must end in a semicolon:
     * named entities present in the entity table (for example
     * {@code &amp;}), decimal references ({@code &#65;}) and hexadecimal
     * references ({@code &#x41;} or {@code &#X41;}). Numeric references may
     * name any code point from 1 up to U+10FFFF; code points above U+FFFF are
     * written as a surrogate pair. Only ASCII digits are accepted in numeric
     * references. Anything that is not a recognized escape is copied to the
     * result unchanged.
     *
     * @param s the text to unescape; must not be <code>null</code>
     * @return <code>s</code> with every recognized escape replaced by the
     *         character it denotes
     * @throws NullPointerException if <code>s</code> is <code>null</code>
     */
    public static final String unescapeHTML(String s) {
      char[] chars = s.toCharArray();
      // The shortest escape ("&lt;", "&#1;") is four chars long and yields at
      // most two chars, so the output can never outgrow the input.
      char[] unescaped = new char[chars.length];

      // Note: unescaped[pos] = end of the unescaped char array.
      int pos = 0;

      for (int i = 0; i < chars.length;) {
        if (chars[i] != '&') {
          unescaped[pos++] = chars[i++];
          continue;
        }

        // Allow e.g. &#123;
        int j = i + 1;
        boolean numeric = j < chars.length && chars[j] == '#';
        if (numeric) {
          j++;
        }

        // Scan until we find a char that is not letter or digit.
        while (j < chars.length && Character.isLetterOrDigit(chars[j])) {
          j++;
        }

        boolean replaced = false;
        if (j < chars.length && chars[j] == ';') {
          if (numeric) { // Check for &#D; and &#xD; pattern
            int codePoint = parseCodePoint(chars, i + 2, j);
            if (codePoint > 0) {
              // Writes one char, or a surrogate pair for supplementary
              // code points, and reports how many chars were written.
              pos += Character.toChars(codePoint, unescaped, pos);
              replaced = true;
            }
          } else {
            // The lookup key includes both the '&' and the ';'.
            Character repl = escapeStrings.get(new String(chars, i, j - i + 1));
            if (repl != null) {
              unescaped[pos++] = repl.charValue();
              replaced = true;
            }
          }
          j++;                            // Skip over ';'
        }

        if (!replaced) {
          // Not a recognized escape sequence, leave as-is
          System.arraycopy(chars, i, unescaped, pos, j - i);
          pos += j - i;
        }
        i = j;
      }
      return new String(unescaped, 0, pos);
    }

    /**
     * Parses the digits of a numeric character reference found in
     * {@code chars[start..end)}, i.e. the text between "&#" and ";".
     * A leading 'x' or 'X' selects hexadecimal, otherwise decimal is used.
     *
     * @return the code point, or -1 if there are no digits, a character is
     *         not an ASCII digit of the selected radix, or the value exceeds
     *         the largest Unicode code point
     */
    private static int parseCodePoint(char[] chars, int start, int end) {
      int radix = 10;
      if (start < end && (chars[start] == 'x' || chars[start] == 'X')) {
        radix = 16;
        start++;
      }
      if (start >= end) {
        return -1;
      }

      int value = 0;
      for (int k = start; k < end; k++) {
        int digit = asciiDigit(chars[k], radix);
        if (digit < 0) {
          return -1;
        }
        // value is at most 0x10FFFF here, so this cannot overflow an int.
        value = value * radix + digit;
        if (value > Character.MAX_CODE_POINT) {
          return -1;
        }
      }
      return value;
    }

    /**
     * Returns the numeric value of an ASCII digit in the given radix
     * (10 or 16), or -1 if {@code c} is not such a digit. Unlike
     * {@link Character#digit(char, int)} this rejects non-ASCII digits.
     */
    private static int asciiDigit(char c, int radix) {
      int value;
      if (c >= '0' && c <= '9') {
        value = c - '0';
      } else if (c >= 'a' && c <= 'f') {
        value = c - 'a' + 10;
      } else if (c >= 'A' && c <= 'F') {
        value = c - 'A' + 10;
      } else {
        return -1;
      }
      return value < radix ? value : -1;
    }
}

   
    
    
  








Related examples in the same category

1.HTMLDocument: Element Iterator Example
2.HTMLEditorKit Demo
3.SimpleAttributeSet Example
4.Text Tab Sample
5.Styled Document
6.Html utils for working with tag's names and attributes.
7.Escape HTML
8.Escape HTML
9.Encode HTML
10.HTML Rewriter
11.HTML Encode
12.XMLWriter is a generic class that provides common behavior to writers of a tagged language such as XML, WordML and HTML.
13.Escape html entities.
14.Html Encoder
15.Remove Comment