// A collection of all character entities defined in the HTML4 standard.
/**
*
* LibXML : a free Java layouting library
*
*
* Project Info: http://reporting.pentaho.org/libxml/
*
* (C) Copyright 2006-2008, by Object Refinery Ltd, Pentaho Corporation and Contributors.
*
* This library is free software; you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Foundation;
* either version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License along with this
* library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* [Java is a trademark or registered trademark of Sun Microsystems, Inc.
* in the United States and other countries.]
*
*
* ------------
* HtmlCharacterEntities.java
* ------------
*/
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
/**
 * A collection of all character entities defined in the HTML4 standard. The key
 * is the entity name, the property value is the decoded string.
 * <p>
 * Every value is a string of exactly one character; the CharacterEntityParser
 * constructor rejects any entity whose value has a different length.
 *
 * @author Thomas Morgner
 */
public class HtmlCharacterEntities extends Properties
{
  /**
   * The singleton instance for this entity-parser implementation. It is
   * created lazily on the first call to {@link #getEntityParser()}.
   */
  private static CharacterEntityParser entityParser;
  private static final long serialVersionUID = 5118172339379209383L;

  /**
   * Gets the character entity parser for HTML content. The CharacterEntity
   * parser translates known characters into predefined entities.
   *
   * @return the character entity parser instance.
   */
  public static CharacterEntityParser getEntityParser()
  {
    if (entityParser == null)
    {
      entityParser = new CharacterEntityParser(new HtmlCharacterEntities());
    }
    return entityParser;
  }

  /**
   * Creates an instance that is pre-filled with the HTML4 entity table.
   */
  public HtmlCharacterEntities()
  {
    setProperty("ang", "\u2220");
    setProperty("spades", "\u2660");
    setProperty("frasl", "\u2044");
    setProperty("copy", "\u00a9");
    setProperty("Upsilon", "\u03a5");
    setProperty("rsquo", "\u2019");
    setProperty("sdot", "\u22c5");
    setProperty("beta", "\u03b2");
    setProperty("egrave", "\u00e8");
    setProperty("Pi", "\u03a0");
    setProperty("micro", "\u00b5");
    setProperty("lArr", "\u21d0");
    setProperty("Beta", "\u0392");
    setProperty("eacute", "\u00e9");
    setProperty("agrave", "\u00e0");
    setProperty("sbquo", "\u201a");
    setProperty("ucirc", "\u00fb");
    setProperty("mdash", "\u2014");
    setProperty("rho", "\u03c1");
    setProperty("Nu", "\u039d");
    setProperty("ne", "\u2260");
    setProperty("nsub", "\u2284");
    setProperty("AElig", "\u00c6");
    setProperty("raquo", "\u00bb");
    setProperty("aacute", "\u00e1");
    setProperty("le", "\u2264");
    setProperty("harr", "\u2194");
    setProperty("frac34", "\u00be");
    setProperty("bdquo", "\u201e");
    setProperty("cup", "\u222a");
    setProperty("frac14", "\u00bc");
    setProperty("exist", "\u2203");
    setProperty("Ccedil", "\u00c7");
    setProperty("phi", "\u03c6");
    setProperty("Lambda", "\u039b");
    setProperty("alpha", "\u03b1");
    setProperty("sigma", "\u03c3");
    setProperty("thetasym", "\u03d1");
    setProperty("Rho", "\u03a1");
    setProperty("hArr", "\u21d4");
    setProperty("Dagger", "\u2021");
    setProperty("otilde", "\u00f5");
    setProperty("Epsilon", "\u0395");
    setProperty("iuml", "\u00ef");
    setProperty("Phi", "\u03a6");
    setProperty("prod", "\u220f");
    setProperty("Aring", "\u00c5");
    setProperty("rlm", "\u200f");
    setProperty("yen", "\u00a5");
    setProperty("emsp", "\u2003");
    setProperty("rang", "\u232a");
    setProperty("Atilde", "\u00c3");
    setProperty("Iuml", "\u00cf");
    setProperty("iota", "\u03b9");
    setProperty("deg", "\u00b0");
    setProperty("prop", "\u221d");
    setProperty("and", "\u2227");
    setProperty("para", "\u00b6");
    setProperty("darr", "\u2193");
    setProperty("curren", "\u00a4");
    setProperty("crarr", "\u21b5");
    setProperty("not", "\u00ac");
    setProperty("Iota", "\u0399");
    setProperty("aelig", "\u00e6");
    setProperty("rdquo", "\u201d");
    setProperty("Ocirc", "\u00d4");
    setProperty("ntilde", "\u00f1");
    setProperty("reg", "\u00ae");
    setProperty("zeta", "\u03b6");
    setProperty("middot", "\u00b7");
    setProperty("cent", "\u00a2");
    setProperty("quot", "\"");
    setProperty("hellip", "\u2026");
    setProperty("Zeta", "\u0396");
    setProperty("rceil", "\u2309");
    setProperty("eta", "\u03b7");
    setProperty("nbsp", "\u00a0");
    setProperty("rarr", "\u2192");
    setProperty("frac12", "\u00bd");
    setProperty("real", "\u211c");
    setProperty("mu", "\u03bc");
    setProperty("dArr", "\u21d3");
    setProperty("divide", "\u00f7");
    setProperty("cap", "\u2229");
    setProperty("chi", "\u03c7");
    setProperty("times", "\u00d7");
    setProperty("euml", "\u00eb");
    setProperty("Gamma", "\u0393");
    setProperty("loz", "\u25ca");
    setProperty("acute", "\u00b4");
    setProperty("Omega", "\u03a9");
    setProperty("ndash", "\u2013");
    setProperty("clubs", "\u2663");
    setProperty("macr", "\u00af");
    setProperty("Yacute", "\u00dd");
    setProperty("Ugrave", "\u00d9");
    setProperty("Euml", "\u00cb");
    setProperty("Eta", "\u0397");
    setProperty("sect", "\u00a7");
    setProperty("asymp", "\u2248");
    setProperty("ordm", "\u00ba");
    setProperty("rArr", "\u21d2");
    setProperty("radic", "\u221a");
    setProperty("Uacute", "\u00da");
    setProperty("omicron", "\u03bf");
    setProperty("Chi", "\u03a7");
    setProperty("aring", "\u00e5");
    setProperty("Theta", "\u0398");
    setProperty("supe", "\u2287");
    setProperty("ensp", "\u2002");
    setProperty("uml", "\u00a8");
    setProperty("ccedil", "\u00e7");
    setProperty("lambda", "\u03bb");
    setProperty("gt", "\u003e");
    setProperty("uarr", "\u2191");
    setProperty("alefsym", "\u2135");
    setProperty("auml", "\u00e4");
    setProperty("sup3", "\u00b3");
    setProperty("circ", "\u02c6");
    setProperty("lsquo", "\u2018");
    setProperty("Auml", "\u00c4");
    setProperty("dagger", "\u2020");
    setProperty("Kappa", "\u039a");
    setProperty("cong", "\u2245");
    setProperty("zwnj", "\u200c");
    setProperty("shy", "\u00ad");
    setProperty("ouml", "\u00f6");
    setProperty("diams", "\u2666");
    setProperty("uArr", "\u21d1");
    setProperty("atilde", "\u00e3");
    setProperty("THORN", "\u00de");
    setProperty("or", "\u2228");
    setProperty("Ograve", "\u00d2");
    setProperty("ocirc", "\u00f4");
    // The HTML4 name of the plus-minus sign is "plusmn"; the former key
    // "plusm" produced an entity that no HTML consumer understands.
    setProperty("plusmn", "\u00b1");
    setProperty("Ouml", "\u00d6");
    setProperty("nabla", "\u2207");
    setProperty("psi", "\u03c8");
    setProperty("sigmaf", "\u03c2");
    setProperty("euro", "\u20ac");
    setProperty("sube", "\u2286");
    setProperty("sup2", "\u00b2");
    setProperty("laquo", "\u00ab");
    setProperty("forall", "\u2200");
    setProperty("Oacute", "\u00d3");
    setProperty("iexcl", "\u00a1");
    fillMoreEntities();
  }

  /**
   * Externalized initialization method to make CheckStyle happy. It registers
   * the second half of the entity table and is called from the constructor.
   */
  private void fillMoreEntities()
  {
    setProperty("piv", "\u03d6");
    setProperty("minus", "\u2212");
    setProperty("zwj", "\u200d");
    setProperty("tau", "\u03c4");
    setProperty("Mu", "\u039c");
    setProperty("gamma", "\u03b3");
    setProperty("sup", "\u2283");
    setProperty("Psi", "\u03a8");
    setProperty("omega", "\u03c9");
    setProperty("Oslash", "\u00d8");
    setProperty("weierp", "\u2118");
    setProperty("Igrave", "\u00cc");
    setProperty("OElig", "\u0152");
    setProperty("sup1", "\u00b9");
    setProperty("cedil", "\u00b8");
    setProperty("upsilon", "\u03c5");
    setProperty("equiv", "\u2261");
    setProperty("isin", "\u2208");
    setProperty("Delta", "\u0394");
    setProperty("yacute", "\u00fd");
    setProperty("ugrave", "\u00f9");
    setProperty("ge", "\u2265");
    setProperty("Iacute", "\u00cd");
    setProperty("brvbar", "\u00a6");
    setProperty("Tau", "\u03a4");
    setProperty("Prime", "\u2033");
    // HTML4 defines rfloor as U+230B (right floor), not U+22A7.
    setProperty("rfloor", "\u230b");
    setProperty("Ecirc", "\u00ca");
    setProperty("ETH", "\u00d0");
    setProperty("int", "\u222b");
    setProperty("xi", "\u03be");
    setProperty("uacute", "\u00fa");
    setProperty("bull", "\u2022");
    setProperty("Scaron", "\u0160");
    setProperty("theta", "\u03b8");
    setProperty("yuml", "\u00ff");
    setProperty("oplus", "\u2295");
    setProperty("part", "\u2202");
    setProperty("ldquo", "\u201c");
    setProperty("Icirc", "\u00ce");
    setProperty("Yuml", "\u0178");
    setProperty("eth", "\u00f0");
    setProperty("Acirc", "\u00c2");
    setProperty("sub", "\u2282");
    setProperty("lceil", "\u2308");
    setProperty("Egrave", "\u00c8");
    setProperty("tilde", "\u02dc");
    setProperty("pi", "\u03c0");
    setProperty("rsaquo", "\u203a");
    setProperty("kappa", "\u03ba");
    setProperty("upsih", "\u03d2");
    setProperty("Omicron", "\u039f");
    setProperty("otimes", "\u2297");
    setProperty("ni", "\u220b");
    setProperty("amp", "\u0026");
    setProperty("Eacute", "\u00c9");
    setProperty("nu", "\u03bd");
    setProperty("Ucirc", "\u00db");
    setProperty("uuml", "\u00fc");
    setProperty("oslash", "\u00f8");
    setProperty("thorn", "\u00fe");
    setProperty("trade", "\u2122");
    setProperty("epsilon", "\u03b5");
    setProperty("ograve", "\u00f2");
    setProperty("hearts", "\u2665");
    setProperty("iquest", "\u00bf");
    setProperty("Uuml", "\u00dc");
    setProperty("empty", "\u2205");
    setProperty("lowast", "\u2217");
    setProperty("sum", "\u2211");
    // HTML4 defines lfloor as U+230A (left floor), not U+22A6.
    setProperty("lfloor", "\u230a");
    setProperty("lrm", "\u200e");
    setProperty("oacute", "\u00f3");
    setProperty("image", "\u2111");
    setProperty("Agrave", "\u00c0");
    setProperty("oline", "\u203e");
    setProperty("oelig", "\u0153");
    setProperty("Sigma", "\u03a3");
    setProperty("permil", "\u2030");
    setProperty("perp", "\u22a5");
    setProperty("lt", "\u003c");
    setProperty("Aacute", "\u00c1");
    setProperty("acirc", "\u00e2");
    setProperty("lang", "\u2329");
    setProperty("delta", "\u03b4");
    setProperty("infin", "\u221e");
    setProperty("igrave", "\u00ec");
    setProperty("ordf", "\u00aa");
    setProperty("lsaquo", "\u2039");
    setProperty("prime", "\u2032");
    setProperty("ecirc", "\u00ea");
    setProperty("there4", "\u2234");
    setProperty("iacute", "\u00ed");
    setProperty("sim", "\u223c");
    setProperty("Alpha", "\u0391");
    setProperty("pound", "\u00a3");
    setProperty("notin", "\u2209");
    setProperty("Ntilde", "\u00d1");
    setProperty("Xi", "\u039e");
    setProperty("thinsp", "\u2009");
    setProperty("Otilde", "\u00d5");
    setProperty("icirc", "\u00ee");
    setProperty("scaron", "\u0161");
    setProperty("szlig", "\u00df");
    setProperty("larr", "\u2190");
    // Latin small f with hook (U+0192); part of the HTML4 symbol set but
    // previously absent from this table.
    setProperty("fnof", "\u0192");
  }
}
/**
*
* LibXML : a free Java layouting library
*
*
* Project Info: http://reporting.pentaho.org/libxml/
*
* (C) Copyright 2006-2008, by Object Refinery Ltd, Pentaho Corporation and Contributors.
*
* This library is free software; you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Foundation;
* either version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License along with this
* library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* [Java is a trademark or registered trademark of Sun Microsystems, Inc.
* in the United States and other countries.]
*
*
* ------------
* CharacterEntityParser.java
* ------------
*/
/**
 * The character entity parser replaces all known occurrences of an entity in
 * the format &entityname;. It also resolves numeric character references in
 * decimal (&#65;) and hexadecimal (&#x41;) notation when decoding.
 * <p>
 * Each entity must map to a string of exactly one character, because the
 * encoder uses a table indexed by character value.
 *
 * @author Thomas Morgner
 */
class CharacterEntityParser
{
  /**
   * Size of the reverse lookup table: one slot for every possible char value.
   */
  private static final int CHAR_TABLE_SIZE = 65536;
  /**
   * The largest numeric reference that still fits into a single char.
   */
  private static final int MAX_CHAR_CODE = 0xFFFF;

  /**
   * Reverse lookup table: the index is the character value, the element is
   * the entity name for that character or null if there is none.
   */
  private final String[] charMap;
  /**
   * the entities, keyed by entity name.
   */
  private final HashMap entities;

  /**
   * Creates a new CharacterEntityParser and initializes the parser with the
   * given set of entities.
   *
   * @param characterEntities the entities used for the parser
   * @throws NullPointerException  if the given properties are null
   * @throws IllegalStateException if an entity value is not exactly one
   *                               character long
   */
  public CharacterEntityParser(final Properties characterEntities)
  {
    if (characterEntities == null)
    {
      throw new NullPointerException("CharacterEntities must not be null");
    }
    entities = new HashMap(characterEntities);
    charMap = buildCharMap(entities);
  }

  /**
   * Creates a new CharacterEntityParser and initializes the parser with the
   * given set of entities. The map is cloned, later changes to the given map
   * do not affect this parser.
   *
   * @param characterEntities the entities used for the parser
   * @throws NullPointerException  if the given map is null
   * @throws IllegalStateException if an entity value is not exactly one
   *                               character long
   */
  public CharacterEntityParser(final HashMap characterEntities)
  {
    if (characterEntities == null)
    {
      throw new NullPointerException("CharacterEntities must not be null");
    }
    entities = (HashMap) characterEntities.clone();
    charMap = buildCharMap(entities);
  }

  /**
   * Builds the character-to-entity-name table that is used for encoding.
   *
   * @param entityMap the entities, keyed by entity name
   * @return the lookup table, indexed by character value
   */
  private static String[] buildCharMap(final HashMap entityMap)
  {
    final String[] table = new String[CHAR_TABLE_SIZE];
    final Iterator entries = entityMap.entrySet().iterator();
    while (entries.hasNext())
    {
      final Map.Entry entry = (Map.Entry) entries.next();
      final String value = (String) entry.getValue();
      final String entityName = (String) entry.getKey();
      if (value.length() != 1)
      {
        throw new IllegalStateException("Entity '" + entityName
            + "' must map to exactly one character, but maps to '" + value + "'");
      }
      table[value.charAt(0)] = entityName;
    }
    return table;
  }

  /**
   * create a new Character entity parser and initializes the parser with the
   * entities defined in the XML standard.
   *
   * @return the CharacterEntityParser initialized with XML entities.
   */
  public static CharacterEntityParser createXMLEntityParser()
  {
    final HashMap entities = new HashMap();
    entities.put("amp", "&");
    entities.put("quot", "\"");
    entities.put("lt", "<");
    entities.put("gt", ">");
    entities.put("apos", "\u0027");
    return new CharacterEntityParser(entities);
  }

  /**
   * returns the entities used in the parser.
   *
   * @return the properties for this parser.
   */
  private HashMap getEntities()
  {
    return entities;
  }

  /**
   * Looks up the character for the entity name specified in <code>key</code>.
   *
   * @param key the entity name
   * @return the character as string with a length of 1, or null if the
   *         entity name is unknown
   */
  private String lookupCharacter(final String key)
  {
    return (String) getEntities().get(key);
  }

  /**
   * Encode the given String, so that all known entities are encoded. All
   * characters represented by these entities are now removed from the string.
   *
   * @param value the original string
   * @return the encoded string.
   * @throws NullPointerException if the value is null
   */
  public String encodeEntities(final String value)
  {
    if (value == null)
    {
      throw new NullPointerException();
    }
    final int length = value.length();
    final StringBuffer writer = new StringBuffer(length);
    for (int i = 0; i < length; i++)
    {
      final char character = value.charAt(i);
      final String lookup = charMap[character];
      if (lookup == null)
      {
        writer.append(character);
      }
      else
      {
        writer.append('&');
        writer.append(lookup);
        writer.append(';');
      }
    }
    return writer.toString();
  }

  /**
   * Decode the string, all known entities are replaced by their resolved
   * characters. Numeric references (decimal or hexadecimal) in the range
   * 1 to 65535 are replaced by the corresponding character. Unknown entities
   * and invalid numeric references are copied to the result unchanged.
   *
   * @param value the string that should be decoded.
   * @return the decoded string.
   * @throws NullPointerException if the value is null
   */
  public String decodeEntities(final String value)
  {
    if (value == null)
    {
      throw new NullPointerException();
    }
    int ampPos = value.indexOf('&');
    if (ampPos == -1 || value.indexOf(';', ampPos) == -1)
    {
      // nothing that could be a reference, return the input untouched
      return value;
    }

    final StringBuffer result = new StringBuffer(value.length());
    int copyFrom = 0;
    while (ampPos != -1)
    {
      final int semiPos = value.indexOf(';', ampPos + 1);
      if (semiPos == -1)
      {
        break;
      }
      // A stray ampersand may precede the real reference ("a & b &lt; c"),
      // so the reference starts at the last '&' in front of the ';'.
      final int refStart = value.lastIndexOf('&', semiPos);
      result.append(value.substring(copyFrom, refStart));

      final String reference = value.substring(refStart + 1, semiPos);
      final String decoded = decodeReference(reference);
      if (decoded != null)
      {
        result.append(decoded);
      }
      else
      {
        // unknown or invalid, keep the text exactly as it was written
        result.append('&');
        result.append(reference);
        result.append(';');
      }
      copyFrom = semiPos + 1;
      ampPos = value.indexOf('&', copyFrom);
    }
    result.append(value.substring(copyFrom));
    return result.toString();
  }

  /**
   * Resolves the text between '&' and ';' of a single reference.
   *
   * @param reference the entity name, or '#' followed by a character code
   * @return the replacement text or null, if the reference cannot be resolved
   */
  private String decodeReference(final String reference)
  {
    if (reference.length() > 0 && reference.charAt(0) == '#')
    {
      final int code = parseCharacterCode(reference.substring(1));
      if (code >= 1 && code <= MAX_CHAR_CODE)
      {
        return String.valueOf((char) code);
      }
      return null;
    }
    // Entity values are single characters (enforced by the constructors), so
    // the replacement never contains a further reference to resolve.
    return lookupCharacter(reference);
  }

  /**
   * Parses the digits of a numeric character reference. A leading 'x' or 'X'
   * selects hexadecimal notation, everything else is parsed as decimal.
   *
   * @param digits the text following the '#'
   * @return the parsed code or 0, if the text is not a valid number
   */
  private static int parseCharacterCode(final String digits)
  {
    try
    {
      if (digits.length() > 1 && (digits.charAt(0) == 'x' || digits.charAt(0) == 'X'))
      {
        return Integer.parseInt(digits.substring(1), 16);
      }
      return Integer.parseInt(digits);
    }
    catch (NumberFormatException ignored)
    {
      // 0 lies outside the accepted range and marks the reference as invalid
      return 0;
    }
  }
}
Related examples in the same category