Here you can find the source of htmlUnescape(String s)
public static String htmlUnescape(String s)
//package com.java2s;
// from www.java2s.com
/*
 * Copyright 2002-2004 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.HashMap;
import java.util.Map;

/**
 * Holder for the static {@link #htmlUnescape(String)} helper.
 */
public class Main {

    /**
     * Maximum number of characters inspected after an {@code '&'} while
     * looking for the terminating {@code ';'} (the {@code ';'} included).
     * Bounding the look-ahead keeps a stray {@code '&'} from triggering a
     * scan of the rest of the input.
     */
    private static final int MAX_REFERENCE_LENGTH = 12;

    /**
     * Named entities recognised by {@link #htmlUnescape(String)}, keyed by the
     * case-sensitive entity name (without {@code '&'} and {@code ';'}) and
     * mapped to the code point they stand for. Only the entities registered
     * below are decoded; extend this table to support more names.
     */
    private static final Map<String, Integer> ENTITIES = new HashMap<String, Integer>();

    static {
        ENTITIES.put("quot", Integer.valueOf(34));
        ENTITIES.put("amp", Integer.valueOf(38));
        ENTITIES.put("lt", Integer.valueOf(60));
        ENTITIES.put("gt", Integer.valueOf(62));
        ENTITIES.put("nbsp", Integer.valueOf(160));
    }

    /**
     * Turns HTML character references into the characters they denote.
     *
     * <p>The following forms are decoded:
     * <ul>
     * <li>decimal: {@code &#68;} becomes {@code D}</li>
     * <li>hexadecimal: {@code &#xE5;} or {@code &#XE5;} (the {@code x} marker
     *     and the hex digits are case insensitive)</li>
     * <li>named: {@code &amp;} becomes {@code &}; names are case sensitive and
     *     limited to the entries of {@link #ENTITIES}</li>
     * </ul>
     *
     * <p>Numeric references may denote any valid Unicode code point, including
     * supplementary ones, which are appended as a surrogate pair. Anything
     * that is not a recognised reference (no {@code ';'} within
     * {@value #MAX_REFERENCE_LENGTH} characters, unknown name, missing or
     * non-numeric digits, an explicit sign, or a value outside the Unicode
     * range) is copied to the output unchanged. Decoded text is never
     * re-scanned, so {@code &amp;lt;} yields {@code &lt;}.
     *
     * <p>Reference:
     * <a href="http://www.w3.org/TR/html4/sgml/entities.html">
     * http://www.w3.org/TR/html4/sgml/entities.html</a>
     *
     * @param s the text to unescape, may be {@code null}
     * @return the unescaped text, or {@code null} if {@code s} is {@code null}
     */
    public static String htmlUnescape(String s) {
        if (s == null) {
            return null;
        }

        final int length = s.length();
        StringBuilder unescaped = new StringBuilder(length);

        for (int i = 0; i < length; i++) {
            char c = s.charAt(i);
            if (c == '&') {
                int start = i + 1;
                int end = Math.min(length, start + MAX_REFERENCE_LENGTH);
                // Search a bounded window only, never the whole remainder.
                int semi = s.substring(start, end).indexOf(';');
                if (semi >= 0) {
                    int codePoint = decodeReference(s.substring(start, start + semi));
                    if (codePoint >= 0) {
                        unescaped.appendCodePoint(codePoint);
                        // Resume after the ';' (the loop's i++ steps past it).
                        i = start + semi;
                        continue;
                    }
                }
                // Not a reference: fall through and emit the '&' itself. The
                // following characters are scanned normally, so a valid
                // reference that starts inside this span is still decoded.
            }
            unescaped.append(c);
        }
        return unescaped.toString();
    }

    /**
     * Resolves the text between {@code '&'} and {@code ';'} to a code point.
     *
     * @param reference the reference body, e.g. {@code "amp"}, {@code "#68"}
     *        or {@code "#xE5"}
     * @return the code point, or {@code -1} if the reference is not recognised
     */
    private static int decodeReference(String reference) {
        Integer entity = ENTITIES.get(reference);
        if (entity != null) {
            return entity.intValue();
        }
        if (reference.length() < 2 || reference.charAt(0) != '#') {
            return -1;
        }

        int radix = 10;
        int digitsStart = 1;
        char marker = reference.charAt(1);
        if (marker == 'x' || marker == 'X') {
            radix = 16;
            digitsStart = 2;
        }
        if (digitsStart >= reference.length()) {
            // "#x" with no digits
            return -1;
        }

        // Integer.parseInt accepts a leading sign; a character reference does not.
        char first = reference.charAt(digitsStart);
        if (first == '+' || first == '-') {
            return -1;
        }

        try {
            int codePoint = Integer.parseInt(reference.substring(digitsStart), radix);
            return Character.isValidCodePoint(codePoint) ? codePoint : -1;
        } catch (NumberFormatException ignored) {
            // Not a decimal/hex number: the caller copies the original text.
            return -1;
        }
    }
}