// Replace all the occurrences of HTML escape strings with the respective characters.
import java.util.HashMap;
import java.util.Map;
/* Copyright (c) 2008 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//package com.google.gdata.util.common.base;
/**
* Some common string manipulation utilities.
*/
public class Util {
  /**
   * Maps each HTML 4 named character entity reference, written in full
   * (leading {@code '&'} through trailing {@code ';'}), to the character it denotes.
   */
  static Map<String, Character> escapeStrings;

  static {
    // HTML character entity references as defined in HTML 4
    // see http://www.w3.org/TR/REC-html40/sgml/entities.html
    // Capacity is chosen so that the 252 entries fit without a rehash
    // at the default load factor of 0.75.
    escapeStrings = new HashMap<String, Character>(337);

    // ISO 8859-1 (Latin-1) characters.
    escapeStrings.put("&nbsp;", '\u00A0');
    escapeStrings.put("&iexcl;", '\u00A1');
    escapeStrings.put("&cent;", '\u00A2');
    escapeStrings.put("&pound;", '\u00A3');
    escapeStrings.put("&curren;", '\u00A4');
    escapeStrings.put("&yen;", '\u00A5');
    escapeStrings.put("&brvbar;", '\u00A6');
    escapeStrings.put("&sect;", '\u00A7');
    escapeStrings.put("&uml;", '\u00A8');
    escapeStrings.put("&copy;", '\u00A9');
    escapeStrings.put("&ordf;", '\u00AA');
    escapeStrings.put("&laquo;", '\u00AB');
    escapeStrings.put("&not;", '\u00AC');
    escapeStrings.put("&shy;", '\u00AD');
    escapeStrings.put("&reg;", '\u00AE');
    escapeStrings.put("&macr;", '\u00AF');
    escapeStrings.put("&deg;", '\u00B0');
    escapeStrings.put("&plusmn;", '\u00B1');
    escapeStrings.put("&sup2;", '\u00B2');
    escapeStrings.put("&sup3;", '\u00B3');
    escapeStrings.put("&acute;", '\u00B4');
    escapeStrings.put("&micro;", '\u00B5');
    escapeStrings.put("&para;", '\u00B6');
    escapeStrings.put("&middot;", '\u00B7');
    escapeStrings.put("&cedil;", '\u00B8');
    escapeStrings.put("&sup1;", '\u00B9');
    escapeStrings.put("&ordm;", '\u00BA');
    escapeStrings.put("&raquo;", '\u00BB');
    escapeStrings.put("&frac14;", '\u00BC');
    escapeStrings.put("&frac12;", '\u00BD');
    escapeStrings.put("&frac34;", '\u00BE');
    escapeStrings.put("&iquest;", '\u00BF');
    escapeStrings.put("&Agrave;", '\u00C0');
    escapeStrings.put("&Aacute;", '\u00C1');
    escapeStrings.put("&Acirc;", '\u00C2');
    escapeStrings.put("&Atilde;", '\u00C3');
    escapeStrings.put("&Auml;", '\u00C4');
    escapeStrings.put("&Aring;", '\u00C5');
    escapeStrings.put("&AElig;", '\u00C6');
    escapeStrings.put("&Ccedil;", '\u00C7');
    escapeStrings.put("&Egrave;", '\u00C8');
    escapeStrings.put("&Eacute;", '\u00C9');
    escapeStrings.put("&Ecirc;", '\u00CA');
    escapeStrings.put("&Euml;", '\u00CB');
    escapeStrings.put("&Igrave;", '\u00CC');
    escapeStrings.put("&Iacute;", '\u00CD');
    escapeStrings.put("&Icirc;", '\u00CE');
    escapeStrings.put("&Iuml;", '\u00CF');
    escapeStrings.put("&ETH;", '\u00D0');
    escapeStrings.put("&Ntilde;", '\u00D1');
    escapeStrings.put("&Ograve;", '\u00D2');
    escapeStrings.put("&Oacute;", '\u00D3');
    escapeStrings.put("&Ocirc;", '\u00D4');
    escapeStrings.put("&Otilde;", '\u00D5');
    escapeStrings.put("&Ouml;", '\u00D6');
    escapeStrings.put("&times;", '\u00D7');
    escapeStrings.put("&Oslash;", '\u00D8');
    escapeStrings.put("&Ugrave;", '\u00D9');
    escapeStrings.put("&Uacute;", '\u00DA');
    escapeStrings.put("&Ucirc;", '\u00DB');
    escapeStrings.put("&Uuml;", '\u00DC');
    escapeStrings.put("&Yacute;", '\u00DD');
    escapeStrings.put("&THORN;", '\u00DE');
    escapeStrings.put("&szlig;", '\u00DF');
    escapeStrings.put("&agrave;", '\u00E0');
    escapeStrings.put("&aacute;", '\u00E1');
    escapeStrings.put("&acirc;", '\u00E2');
    escapeStrings.put("&atilde;", '\u00E3');
    escapeStrings.put("&auml;", '\u00E4');
    escapeStrings.put("&aring;", '\u00E5');
    escapeStrings.put("&aelig;", '\u00E6');
    escapeStrings.put("&ccedil;", '\u00E7');
    escapeStrings.put("&egrave;", '\u00E8');
    escapeStrings.put("&eacute;", '\u00E9');
    escapeStrings.put("&ecirc;", '\u00EA');
    escapeStrings.put("&euml;", '\u00EB');
    escapeStrings.put("&igrave;", '\u00EC');
    escapeStrings.put("&iacute;", '\u00ED');
    escapeStrings.put("&icirc;", '\u00EE');
    escapeStrings.put("&iuml;", '\u00EF');
    escapeStrings.put("&eth;", '\u00F0');
    escapeStrings.put("&ntilde;", '\u00F1');
    escapeStrings.put("&ograve;", '\u00F2');
    escapeStrings.put("&oacute;", '\u00F3');
    escapeStrings.put("&ocirc;", '\u00F4');
    escapeStrings.put("&otilde;", '\u00F5');
    escapeStrings.put("&ouml;", '\u00F6');
    escapeStrings.put("&divide;", '\u00F7');
    escapeStrings.put("&oslash;", '\u00F8');
    escapeStrings.put("&ugrave;", '\u00F9');
    escapeStrings.put("&uacute;", '\u00FA');
    escapeStrings.put("&ucirc;", '\u00FB');
    escapeStrings.put("&uuml;", '\u00FC');
    escapeStrings.put("&yacute;", '\u00FD');
    escapeStrings.put("&thorn;", '\u00FE');
    escapeStrings.put("&yuml;", '\u00FF');

    // Symbols, mathematical symbols, and Greek letters.
    escapeStrings.put("&fnof;", '\u0192');
    escapeStrings.put("&Alpha;", '\u0391');
    escapeStrings.put("&Beta;", '\u0392');
    escapeStrings.put("&Gamma;", '\u0393');
    escapeStrings.put("&Delta;", '\u0394');
    escapeStrings.put("&Epsilon;", '\u0395');
    escapeStrings.put("&Zeta;", '\u0396');
    escapeStrings.put("&Eta;", '\u0397');
    escapeStrings.put("&Theta;", '\u0398');
    escapeStrings.put("&Iota;", '\u0399');
    escapeStrings.put("&Kappa;", '\u039A');
    escapeStrings.put("&Lambda;", '\u039B');
    escapeStrings.put("&Mu;", '\u039C');
    escapeStrings.put("&Nu;", '\u039D');
    escapeStrings.put("&Xi;", '\u039E');
    escapeStrings.put("&Omicron;", '\u039F');
    escapeStrings.put("&Pi;", '\u03A0');
    escapeStrings.put("&Rho;", '\u03A1');
    escapeStrings.put("&Sigma;", '\u03A3');
    escapeStrings.put("&Tau;", '\u03A4');
    escapeStrings.put("&Upsilon;", '\u03A5');
    escapeStrings.put("&Phi;", '\u03A6');
    escapeStrings.put("&Chi;", '\u03A7');
    escapeStrings.put("&Psi;", '\u03A8');
    escapeStrings.put("&Omega;", '\u03A9');
    escapeStrings.put("&alpha;", '\u03B1');
    escapeStrings.put("&beta;", '\u03B2');
    escapeStrings.put("&gamma;", '\u03B3');
    escapeStrings.put("&delta;", '\u03B4');
    escapeStrings.put("&epsilon;", '\u03B5');
    escapeStrings.put("&zeta;", '\u03B6');
    escapeStrings.put("&eta;", '\u03B7');
    escapeStrings.put("&theta;", '\u03B8');
    escapeStrings.put("&iota;", '\u03B9');
    escapeStrings.put("&kappa;", '\u03BA');
    escapeStrings.put("&lambda;", '\u03BB');
    escapeStrings.put("&mu;", '\u03BC');
    escapeStrings.put("&nu;", '\u03BD');
    escapeStrings.put("&xi;", '\u03BE');
    escapeStrings.put("&omicron;", '\u03BF');
    escapeStrings.put("&pi;", '\u03C0');
    escapeStrings.put("&rho;", '\u03C1');
    escapeStrings.put("&sigmaf;", '\u03C2');
    escapeStrings.put("&sigma;", '\u03C3');
    escapeStrings.put("&tau;", '\u03C4');
    escapeStrings.put("&upsilon;", '\u03C5');
    escapeStrings.put("&phi;", '\u03C6');
    escapeStrings.put("&chi;", '\u03C7');
    escapeStrings.put("&psi;", '\u03C8');
    escapeStrings.put("&omega;", '\u03C9');
    escapeStrings.put("&thetasym;", '\u03D1');
    escapeStrings.put("&upsih;", '\u03D2');
    escapeStrings.put("&piv;", '\u03D6');
    escapeStrings.put("&bull;", '\u2022');
    escapeStrings.put("&hellip;", '\u2026');
    escapeStrings.put("&prime;", '\u2032');
    escapeStrings.put("&Prime;", '\u2033');
    escapeStrings.put("&oline;", '\u203E');
    escapeStrings.put("&frasl;", '\u2044');
    escapeStrings.put("&weierp;", '\u2118');
    escapeStrings.put("&image;", '\u2111');
    escapeStrings.put("&real;", '\u211C');
    escapeStrings.put("&trade;", '\u2122');
    escapeStrings.put("&alefsym;", '\u2135');
    escapeStrings.put("&larr;", '\u2190');
    escapeStrings.put("&uarr;", '\u2191');
    escapeStrings.put("&rarr;", '\u2192');
    escapeStrings.put("&darr;", '\u2193');
    escapeStrings.put("&harr;", '\u2194');
    escapeStrings.put("&crarr;", '\u21B5');
    escapeStrings.put("&lArr;", '\u21D0');
    escapeStrings.put("&uArr;", '\u21D1');
    escapeStrings.put("&rArr;", '\u21D2');
    escapeStrings.put("&dArr;", '\u21D3');
    escapeStrings.put("&hArr;", '\u21D4');
    escapeStrings.put("&forall;", '\u2200');
    escapeStrings.put("&part;", '\u2202');
    escapeStrings.put("&exist;", '\u2203');
    escapeStrings.put("&empty;", '\u2205');
    escapeStrings.put("&nabla;", '\u2207');
    escapeStrings.put("&isin;", '\u2208');
    escapeStrings.put("&notin;", '\u2209');
    escapeStrings.put("&ni;", '\u220B');
    escapeStrings.put("&prod;", '\u220F');
    escapeStrings.put("&sum;", '\u2211');
    escapeStrings.put("&minus;", '\u2212');
    escapeStrings.put("&lowast;", '\u2217');
    escapeStrings.put("&radic;", '\u221A');
    escapeStrings.put("&prop;", '\u221D');
    escapeStrings.put("&infin;", '\u221E');
    escapeStrings.put("&ang;", '\u2220');
    escapeStrings.put("&and;", '\u2227');
    escapeStrings.put("&or;", '\u2228');
    escapeStrings.put("&cap;", '\u2229');
    escapeStrings.put("&cup;", '\u222A');
    escapeStrings.put("&int;", '\u222B');
    escapeStrings.put("&there4;", '\u2234');
    escapeStrings.put("&sim;", '\u223C');
    escapeStrings.put("&cong;", '\u2245');
    escapeStrings.put("&asymp;", '\u2248');
    escapeStrings.put("&ne;", '\u2260');
    escapeStrings.put("&equiv;", '\u2261');
    escapeStrings.put("&le;", '\u2264');
    escapeStrings.put("&ge;", '\u2265');
    escapeStrings.put("&sub;", '\u2282');
    escapeStrings.put("&sup;", '\u2283');
    escapeStrings.put("&nsub;", '\u2284');
    escapeStrings.put("&sube;", '\u2286');
    escapeStrings.put("&supe;", '\u2287');
    escapeStrings.put("&oplus;", '\u2295');
    escapeStrings.put("&otimes;", '\u2297');
    escapeStrings.put("&perp;", '\u22A5');
    escapeStrings.put("&sdot;", '\u22C5');
    escapeStrings.put("&lceil;", '\u2308');
    escapeStrings.put("&rceil;", '\u2309');
    escapeStrings.put("&lfloor;", '\u230A');
    escapeStrings.put("&rfloor;", '\u230B');
    escapeStrings.put("&lang;", '\u2329');
    escapeStrings.put("&rang;", '\u232A');
    escapeStrings.put("&loz;", '\u25CA');
    escapeStrings.put("&spades;", '\u2660');
    escapeStrings.put("&clubs;", '\u2663');
    escapeStrings.put("&hearts;", '\u2665');
    escapeStrings.put("&diams;", '\u2666');

    // Markup-significant and internationalization characters.
    escapeStrings.put("&quot;", '\u0022');
    escapeStrings.put("&amp;", '\u0026');
    escapeStrings.put("&lt;", '\u003C');
    escapeStrings.put("&gt;", '\u003E');
    escapeStrings.put("&OElig;", '\u0152');
    escapeStrings.put("&oelig;", '\u0153');
    escapeStrings.put("&Scaron;", '\u0160');
    escapeStrings.put("&scaron;", '\u0161');
    escapeStrings.put("&Yuml;", '\u0178');
    escapeStrings.put("&circ;", '\u02C6');
    escapeStrings.put("&tilde;", '\u02DC');
    escapeStrings.put("&ensp;", '\u2002');
    escapeStrings.put("&emsp;", '\u2003');
    escapeStrings.put("&thinsp;", '\u2009');
    escapeStrings.put("&zwnj;", '\u200C');
    escapeStrings.put("&zwj;", '\u200D');
    escapeStrings.put("&lrm;", '\u200E');
    escapeStrings.put("&rlm;", '\u200F');
    escapeStrings.put("&ndash;", '\u2013');
    escapeStrings.put("&mdash;", '\u2014');
    escapeStrings.put("&lsquo;", '\u2018');
    escapeStrings.put("&rsquo;", '\u2019');
    escapeStrings.put("&sbquo;", '\u201A');
    escapeStrings.put("&ldquo;", '\u201C');
    escapeStrings.put("&rdquo;", '\u201D');
    escapeStrings.put("&bdquo;", '\u201E');
    escapeStrings.put("&dagger;", '\u2020');
    escapeStrings.put("&Dagger;", '\u2021');
    escapeStrings.put("&permil;", '\u2030');
    escapeStrings.put("&lsaquo;", '\u2039');
    escapeStrings.put("&rsaquo;", '\u203A');
    escapeStrings.put("&euro;", '\u20AC');
  }

  /**
   * Replace all the occurrences of HTML escape strings with the
   * respective characters.
   *
   * <p>Three forms are recognized, each of which must end with {@code ';'}:
   * named entities listed in {@link #escapeStrings} (e.g. {@code &amp;}),
   * decimal references (e.g. {@code &#123;}) and hexadecimal references
   * (e.g. {@code &#x7B;} or {@code &#X7B;}). Numeric references are only
   * replaced when their value lies in the range 1..65535. Anything that is
   * not recognized is copied to the result unchanged.
   *
   * @param s the text to unescape; may be {@code null}
   * @return the unescaped text, or {@code null} if {@code s} is {@code null}
   */
  public static final String unescapeHTML(String s) {
    if (s == null) {
      return null;
    }
    if (s.indexOf('&') < 0) {
      // Nothing can be an escape sequence; avoid copying.
      return s;
    }

    char[] chars = s.toCharArray();
    // Every replacement is shorter than the sequence it replaces, so the
    // result never needs more room than the input.
    char[] unescaped = new char[chars.length];
    // Note: unescaped[pos] = end of the unescaped char array.
    int pos = 0;

    for (int i = 0; i < chars.length;) {
      if (chars[i] != '&') {
        unescaped[pos++] = chars[i++];
        continue;
      }

      // Allow e.g. &#123;
      int j = i + 1;
      boolean numeric = j < chars.length && chars[j] == '#';
      if (numeric) {
        j++;
      }
      // Scan until we find a char that is not letter or digit.
      while (j < chars.length && Character.isLetterOrDigit(chars[j])) {
        j++;
      }

      boolean replaced = false;
      if (j < chars.length && chars[j] == ';') {
        if (numeric) { // Check for &#D; and &#xD; pattern
          int code = parseNumericReference(chars, i + 2, j);
          if (code > 0) {
            unescaped[pos++] = (char) code;
            replaced = true;
          }
        } else {
          // The lookup key spans '&' through ';' inclusive.
          Character repl = escapeStrings.get(new String(chars, i, j - i + 1));
          if (repl != null) {
            unescaped[pos++] = repl.charValue();
            replaced = true;
          }
        }
        j++; // Skip over ';'
      }

      if (!replaced) {
        // Not a recognized escape sequence, leave as-is
        System.arraycopy(chars, i, unescaped, pos, j - i);
        pos += j - i;
      }
      i = j;
    }
    return new String(unescaped, 0, pos);
  }

  /**
   * Parses the digits of a numeric character reference.
   *
   * @param chars the text being scanned
   * @param start index of the first character after {@code "&#"}
   * @param end index of the terminating {@code ';'} (exclusive bound of the digits)
   * @return the code in the range 1..65535, or -1 if the digits are missing,
   *     malformed, or out of that range
   */
  private static int parseNumericReference(char[] chars, int start, int end) {
    if (start >= end) {
      return -1; // "&#;" has no digits at all.
    }
    try {
      char first = chars[start];
      long code;
      if (first == 'x' || first == 'X') {
        code = Long.parseLong(new String(chars, start + 1, end - start - 1), 16);
      } else if (Character.isDigit(first)) {
        code = Long.parseLong(new String(chars, start, end - start));
      } else {
        return -1;
      }
      return (code > 0 && code < 65536) ? (int) code : -1;
    } catch (NumberFormatException ignored) {
      // Malformed or oversized number: the caller leaves the text untouched.
      return -1;
    }
  }
}
// Related examples in the same category