// Java tutorial
/* infoScoop OpenSource * Copyright (C) 2010 Beacon IT Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License version 3 * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program. If not, see * <http://www.gnu.org/licenses/lgpl-3.0-standalone.html>. */ /* * $Id: HtmlUtil.java,v 1.4 2009/02/17 08:44:08 hr-endoh Exp $ * * Beacon-IT inicio Project * Copyright (c) 2003 by Beacon Information Technology, Inc. * 163-1507 Tokyo-to, Shinjuku-ku, Nishi-Shinjuku 1-6-1 Shinjuku L-Tower * All rights reserved. * ==================================================================== */ package org.infoscoop.util; import java.io.IOException; import java.io.StringReader; import java.io.StringWriter; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.HashMap; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.xerces.xni.parser.XMLDocumentFilter; import org.apache.xerces.xni.parser.XMLInputSource; import org.apache.xerces.xni.parser.XMLParserConfiguration; import org.cyberneko.html.HTMLConfiguration; /** * A utility class about the HTML. 
* * @author Atsuhiko Kimura */ public class HtmlUtil { private static Log log = LogFactory.getLog(HtmlUtil.class); //The dictionary of the entity of HTML4.0the reference of a numerical value letter private static final Map _numericRefDict = new HashMap(); //The dictionary of the entity of HTML4.0substance private static final Map _entityDict = new HashMap(); static { _numericRefDict.put(" ", " "); _numericRefDict.put("¡", "¡"); _numericRefDict.put("¢", "¢"); _numericRefDict.put("£", "£"); _numericRefDict.put("¤", "¤"); _numericRefDict.put("¥", "¥"); _numericRefDict.put("¦", "¦"); _numericRefDict.put("§", "§"); _numericRefDict.put("¨", "¨"); _numericRefDict.put("©", "©"); _numericRefDict.put("ª", "ª"); _numericRefDict.put("«", "«"); _numericRefDict.put("¬", "¬"); _numericRefDict.put("­", "­"); _numericRefDict.put("®", "®"); _numericRefDict.put("¯", "¯"); _numericRefDict.put("°", "°"); _numericRefDict.put("±", "±"); _numericRefDict.put("²", "²"); _numericRefDict.put("³", "³"); _numericRefDict.put("´", "´"); _numericRefDict.put("µ", "µ"); _numericRefDict.put("¶", "¶"); _numericRefDict.put("·", "·"); _numericRefDict.put("¸", "¸"); _numericRefDict.put("¹", "¹"); _numericRefDict.put("º", "º"); _numericRefDict.put("»", "»"); _numericRefDict.put("¼", "¼"); _numericRefDict.put("½", "½"); _numericRefDict.put("¾", "¾"); _numericRefDict.put("¿", "¿"); _numericRefDict.put("À", "À"); _numericRefDict.put("Á", "Á"); _numericRefDict.put("Â", "Â"); _numericRefDict.put("Ã", "Ã"); _numericRefDict.put("Ä", "Ä"); _numericRefDict.put("Å", "Å"); _numericRefDict.put("Æ", "Æ"); _numericRefDict.put("Ç", "Ç"); _numericRefDict.put("È", "È"); _numericRefDict.put("É", "É"); _numericRefDict.put("Ê", "Ê"); _numericRefDict.put("Ë", "Ë"); _numericRefDict.put("Ì", "Ì"); _numericRefDict.put("Í", "Í"); _numericRefDict.put("Î", "Î"); _numericRefDict.put("Ï", "Ï"); _numericRefDict.put("Ð", "Ð"); _numericRefDict.put("Ñ", "Ñ"); _numericRefDict.put("Ò", "Ò"); _numericRefDict.put("Ó", "Ó"); 
_numericRefDict.put("Ô", "Ô"); _numericRefDict.put("Õ", "Õ"); _numericRefDict.put("Ö", "Ö"); _numericRefDict.put("×", "×"); _numericRefDict.put("Ø", "Ø"); _numericRefDict.put("Ù", "Ù"); _numericRefDict.put("Ú", "Ú"); _numericRefDict.put("Û", "Û"); _numericRefDict.put("Ü", "Ü"); _numericRefDict.put("Ý", "Ý"); _numericRefDict.put("Þ", "Þ"); _numericRefDict.put("ß", "ß"); _numericRefDict.put("à", "à"); _numericRefDict.put("á", "á"); _numericRefDict.put("â", "â"); _numericRefDict.put("ã", "ã"); _numericRefDict.put("ä", "ä"); _numericRefDict.put("å", "å"); _numericRefDict.put("æ", "æ"); _numericRefDict.put("ç", "ç"); _numericRefDict.put("è", "è"); _numericRefDict.put("é", "é"); _numericRefDict.put("ê", "ê"); _numericRefDict.put("ë", "ë"); _numericRefDict.put("ì", "ì"); _numericRefDict.put("í", "í"); _numericRefDict.put("î", "î"); _numericRefDict.put("ï", "ï"); _numericRefDict.put("ð", "ð"); _numericRefDict.put("ñ", "ñ"); _numericRefDict.put("ò", "ò"); _numericRefDict.put("ó", "ó"); _numericRefDict.put("ô", "ô"); _numericRefDict.put("õ", "õ"); _numericRefDict.put("ö", "ö"); _numericRefDict.put("÷", "÷"); _numericRefDict.put("ø", "ø"); _numericRefDict.put("ù", "ù"); _numericRefDict.put("ú", "ú"); _numericRefDict.put("û", "û"); _numericRefDict.put("ü", "ü"); _numericRefDict.put("ý", "ý"); _numericRefDict.put("þ", "þ"); _numericRefDict.put("ÿ", "ÿ"); _numericRefDict.put("ƒ", "ƒ"); _numericRefDict.put("Α", "Α"); _numericRefDict.put("Β", "Β"); _numericRefDict.put("Γ", "Γ"); _numericRefDict.put("Δ", "Δ"); _numericRefDict.put("Ε", "Ε"); _numericRefDict.put("Ζ", "Ζ"); _numericRefDict.put("Η", "Η"); _numericRefDict.put("Θ", "Θ"); _numericRefDict.put("Ι", "Ι"); _numericRefDict.put("Κ", "Κ"); _numericRefDict.put("Λ", "Λ"); _numericRefDict.put("Μ", "Μ"); _numericRefDict.put("Ν", "Ν"); _numericRefDict.put("Ξ", "Ξ"); _numericRefDict.put("Ο", "Ο"); _numericRefDict.put("Π", "Π"); _numericRefDict.put("Ρ", "Ρ"); _numericRefDict.put("Σ", "Σ"); _numericRefDict.put("Τ", "Τ"); 
_numericRefDict.put("Υ", "Υ"); _numericRefDict.put("Φ", "Φ"); _numericRefDict.put("Χ", "Χ"); _numericRefDict.put("Ψ", "Ψ"); _numericRefDict.put("Ω", "Ω"); _numericRefDict.put("α", "α"); _numericRefDict.put("β", "β"); _numericRefDict.put("γ", "γ"); _numericRefDict.put("δ", "δ"); _numericRefDict.put("ε", "ε"); _numericRefDict.put("ζ", "ζ"); _numericRefDict.put("η", "η"); _numericRefDict.put("θ", "θ"); _numericRefDict.put("ι", "ι"); _numericRefDict.put("κ", "κ"); _numericRefDict.put("λ", "λ"); _numericRefDict.put("μ", "μ"); _numericRefDict.put("ν", "ν"); _numericRefDict.put("ξ", "ξ"); _numericRefDict.put("ο", "ο"); _numericRefDict.put("π", "π"); _numericRefDict.put("ρ", "ρ"); _numericRefDict.put("ς", "ς"); _numericRefDict.put("σ", "σ"); _numericRefDict.put("τ", "τ"); _numericRefDict.put("υ", "υ"); _numericRefDict.put("φ", "φ"); _numericRefDict.put("χ", "χ"); _numericRefDict.put("ψ", "ψ"); _numericRefDict.put("ω", "ω"); _numericRefDict.put("ϑ", "ϑ"); _numericRefDict.put("ϒ", "ϒ"); _numericRefDict.put("ϖ", "ϖ"); _numericRefDict.put("•", "•"); _numericRefDict.put("…", "…"); _numericRefDict.put("′", "′"); _numericRefDict.put("″", "″"); _numericRefDict.put("‾", "‾"); _numericRefDict.put("⁄", "⁄"); _numericRefDict.put("℘", "℘"); _numericRefDict.put("ℑ", "ℑ"); _numericRefDict.put("ℜ", "ℜ"); _numericRefDict.put("™", "™"); _numericRefDict.put("ℵ", "ℵ"); _numericRefDict.put("←", "←"); _numericRefDict.put("↑", "↑"); _numericRefDict.put("→", "→"); _numericRefDict.put("↓", "↓"); _numericRefDict.put("↔", "↔"); _numericRefDict.put("↵", "↵"); _numericRefDict.put("⇐", "⇐"); _numericRefDict.put("⇑", "⇑"); _numericRefDict.put("⇒", "⇒"); _numericRefDict.put("⇓", "⇓"); _numericRefDict.put("⇔", "⇔"); _numericRefDict.put("∀", "∀"); _numericRefDict.put("∂", "∂"); _numericRefDict.put("∃", "∃"); _numericRefDict.put("∅", "∅"); _numericRefDict.put("∇", "∇"); _numericRefDict.put("∈", "∈"); _numericRefDict.put("∉", "∉"); _numericRefDict.put("∋", "∋"); _numericRefDict.put("∏", "∏"); 
_numericRefDict.put("∑", "∑"); _numericRefDict.put("−", "−"); _numericRefDict.put("∗", "∗"); _numericRefDict.put("√", "√"); _numericRefDict.put("∝", "∝"); _numericRefDict.put("∞", "∞"); _numericRefDict.put("∠", "∠"); _numericRefDict.put("∧", "∧"); _numericRefDict.put("∨", "∨"); _numericRefDict.put("∩", "∩"); _numericRefDict.put("∪", "∪"); _numericRefDict.put("∫", "∫"); _numericRefDict.put("∴", "∴"); _numericRefDict.put("∼", "∼"); _numericRefDict.put("≅", "≅"); _numericRefDict.put("≈", "≈"); _numericRefDict.put("≠", "≠"); _numericRefDict.put("≡", "≡"); _numericRefDict.put("≤", "≤"); _numericRefDict.put("≥", "≥"); _numericRefDict.put("⊂", "⊂"); _numericRefDict.put("⊃", "⊃"); _numericRefDict.put("⊄", "⊄"); _numericRefDict.put("⊆", "⊆"); _numericRefDict.put("⊇", "⊇"); _numericRefDict.put("⊕", "⊕"); _numericRefDict.put("⊗", "⊗"); _numericRefDict.put("⊥", "⊥"); _numericRefDict.put("⋅", "⋅"); _numericRefDict.put("⌈", "⌈"); _numericRefDict.put("⌉", "⌉"); _numericRefDict.put("⌊", "⌊"); _numericRefDict.put("⌋", "⌋"); _numericRefDict.put("⟨", "〈"); _numericRefDict.put("⟩", "〉"); _numericRefDict.put("◊", "◊"); _numericRefDict.put("♠", "♠"); _numericRefDict.put("♣", "♣"); _numericRefDict.put("♥", "♥"); _numericRefDict.put("♦", "♦"); _numericRefDict.put(""", """); _numericRefDict.put("&", "&"); _numericRefDict.put("<", "<"); _numericRefDict.put(">", ">"); _numericRefDict.put("Œ", "Œ"); _numericRefDict.put("œ", "œ"); _numericRefDict.put("Š", "Š"); _numericRefDict.put("š", "š"); _numericRefDict.put("Ÿ", "Ÿ"); _numericRefDict.put("ˆ", "ˆ"); _numericRefDict.put("˜", "˜"); _numericRefDict.put(" ", " "); _numericRefDict.put(" ", " "); _numericRefDict.put(" ", " "); _numericRefDict.put("‌", "‌"); _numericRefDict.put("‍", "‍"); _numericRefDict.put("‎", "‎"); _numericRefDict.put("‏", "‏"); _numericRefDict.put("–", "–"); _numericRefDict.put("—", "—"); _numericRefDict.put("‘", "‘"); _numericRefDict.put("’", "’"); _numericRefDict.put("‚", "‚"); _numericRefDict.put("“", "“"); 
_numericRefDict.put("”", "”"); _numericRefDict.put("„", "„"); _numericRefDict.put("†", "†"); _numericRefDict.put("‡", "‡"); _numericRefDict.put("‰", "‰"); _numericRefDict.put("‹", "‹"); _numericRefDict.put("›", "›"); _numericRefDict.put("€", "€"); } static { _entityDict.put(" ", "\u00a0"); _entityDict.put("¡", "\u00a1"); _entityDict.put("¢", "\u00a2"); _entityDict.put("£", "\u00a3"); _entityDict.put("¤", "\u00a4"); _entityDict.put("¥", "\u00a5"); _entityDict.put("¦", "\u00a6"); _entityDict.put("§", "\u00a7"); _entityDict.put("¨", "\u00a8"); _entityDict.put("©", "\u00a9"); _entityDict.put("ª", "\u00aa"); _entityDict.put("«", "\u00ab"); _entityDict.put("¬", "\u00ac"); _entityDict.put("­", "\u00ad"); _entityDict.put("®", "\u00ae"); _entityDict.put("¯", "\u00af"); _entityDict.put("°", "\u00b0"); _entityDict.put("±", "\u00b1"); _entityDict.put("²", "\u00b2"); _entityDict.put("³", "\u00b3"); _entityDict.put("´", "\u00b4"); _entityDict.put("µ", "\u00b5"); _entityDict.put("¶", "\u00b6"); _entityDict.put("·", "\u00b7"); _entityDict.put("¸", "\u00b8"); _entityDict.put("¹", "\u00b9"); _entityDict.put("º", "\u00ba"); _entityDict.put("»", "\u00bb"); _entityDict.put("¼", "\u00bc"); _entityDict.put("½", "\u00bd"); _entityDict.put("¾", "\u00be"); _entityDict.put("¿", "\u00bf"); _entityDict.put("À", "\u00c0"); _entityDict.put("Á", "\u00c1"); _entityDict.put("Â", "\u00c2"); _entityDict.put("Ã", "\u00c3"); _entityDict.put("Ä", "\u00c4"); _entityDict.put("Å", "\u00c5"); _entityDict.put("Æ", "\u00c6"); _entityDict.put("Ç", "\u00c7"); _entityDict.put("È", "\u00c8"); _entityDict.put("É", "\u00c9"); _entityDict.put("Ê", "\u00ca"); _entityDict.put("Ë", "\u00cb"); _entityDict.put("Ì", "\u00cc"); _entityDict.put("Í", "\u00cd"); _entityDict.put("Î", "\u00ce"); _entityDict.put("Ï", "\u00cf"); _entityDict.put("Ð", "\u00d0"); _entityDict.put("Ñ", "\u00d1"); _entityDict.put("Ò", "\u00d2"); _entityDict.put("Ó", "\u00d3"); _entityDict.put("Ô", "\u00d4"); _entityDict.put("Õ", "\u00d5"); 
_entityDict.put("Ö", "\u00d6"); _entityDict.put("×", "\u00d7"); _entityDict.put("Ø", "\u00d8"); _entityDict.put("Ù", "\u00d9"); _entityDict.put("Ú", "\u00da"); _entityDict.put("Û", "\u00db"); _entityDict.put("Ü", "\u00dc"); _entityDict.put("Ý", "\u00dd"); _entityDict.put("Þ", "\u00de"); _entityDict.put("ß", "\u00df"); _entityDict.put("à", "\u00e0"); _entityDict.put("á", "\u00e1"); _entityDict.put("â", "\u00e2"); _entityDict.put("ã", "\u00e3"); _entityDict.put("ä", "\u00e4"); _entityDict.put("å", "\u00e5"); _entityDict.put("æ", "\u00e6"); _entityDict.put("ç", "\u00e7"); _entityDict.put("è", "\u00e8"); _entityDict.put("é", "\u00e9"); _entityDict.put("ê", "\u00ea"); _entityDict.put("ë", "\u00eb"); _entityDict.put("ì", "\u00ec"); _entityDict.put("í", "\u00ed"); _entityDict.put("î", "\u00ee"); _entityDict.put("ï", "\u00ef"); _entityDict.put("ð", "\u00f0"); _entityDict.put("ñ", "\u00f1"); _entityDict.put("ò", "\u00f2"); _entityDict.put("ó", "\u00f3"); _entityDict.put("ô", "\u00f4"); _entityDict.put("õ", "\u00f5"); _entityDict.put("ö", "\u00f6"); _entityDict.put("÷", "\u00f7"); _entityDict.put("ø", "\u00f8"); _entityDict.put("ù", "\u00f9"); _entityDict.put("ú", "\u00fa"); _entityDict.put("û", "\u00fb"); _entityDict.put("ü", "\u00fc"); _entityDict.put("ý", "\u00fd"); _entityDict.put("þ", "\u00fe"); _entityDict.put("ÿ", "\u00ff"); _entityDict.put("ƒ", "\u0192"); _entityDict.put("Α", "\u0391"); _entityDict.put("Β", "\u0392"); _entityDict.put("Γ", "\u0393"); _entityDict.put("Δ", "\u0394"); _entityDict.put("Ε", "\u0395"); _entityDict.put("Ζ", "\u0396"); _entityDict.put("Η", "\u0397"); _entityDict.put("Θ", "\u0398"); _entityDict.put("Ι", "\u0399"); _entityDict.put("Κ", "\u039a"); _entityDict.put("Λ", "\u039b"); _entityDict.put("Μ", "\u039c"); _entityDict.put("Ν", "\u039d"); _entityDict.put("Ξ", "\u039e"); _entityDict.put("Ο", "\u039f"); _entityDict.put("Π", "\u03a0"); _entityDict.put("Ρ", "\u03a1"); _entityDict.put("Σ", "\u03a3"); _entityDict.put("Τ", "\u03a4"); 
_entityDict.put("Υ", "\u03a5"); _entityDict.put("Φ", "\u03a6"); _entityDict.put("Χ", "\u03a7"); _entityDict.put("Ψ", "\u03a8"); _entityDict.put("Ω", "\u03a9"); _entityDict.put("α", "\u03b1"); _entityDict.put("β", "\u03b2"); _entityDict.put("γ", "\u03b3"); _entityDict.put("δ", "\u03b4"); _entityDict.put("ε", "\u03b5"); _entityDict.put("ζ", "\u03b6"); _entityDict.put("η", "\u03b7"); _entityDict.put("θ", "\u03b8"); _entityDict.put("ι", "\u03b9"); _entityDict.put("κ", "\u03ba"); _entityDict.put("λ", "\u03bb"); _entityDict.put("μ", "\u03bc"); _entityDict.put("ν", "\u03bd"); _entityDict.put("ξ", "\u03be"); _entityDict.put("ο", "\u03bf"); _entityDict.put("π", "\u03c0"); _entityDict.put("ρ", "\u03c1"); _entityDict.put("ς", "\u03c2"); _entityDict.put("σ", "\u03c3"); _entityDict.put("τ", "\u03c4"); _entityDict.put("υ", "\u03c5"); _entityDict.put("φ", "\u03c6"); _entityDict.put("χ", "\u03c7"); _entityDict.put("ψ", "\u03c8"); _entityDict.put("ω", "\u03c9"); _entityDict.put("ϑ", "\u03d1"); _entityDict.put("ϒ", "\u03d2"); _entityDict.put("ϖ", "\u03d6"); _entityDict.put("•", "\u2022"); _entityDict.put("…", "\u2026"); _entityDict.put("′", "\u2032"); _entityDict.put("″", "\u2033"); _entityDict.put("‾", "\u203e"); _entityDict.put("⁄", "\u2044"); _entityDict.put("℘", "\u2118"); _entityDict.put("ℑ", "\u2111"); _entityDict.put("ℜ", "\u211c"); _entityDict.put("™", "\u2122"); _entityDict.put("ℵ", "\u2135"); _entityDict.put("←", "\u2190"); _entityDict.put("↑", "\u2191"); _entityDict.put("→", "\u2192"); _entityDict.put("↓", "\u2193"); _entityDict.put("↔", "\u2194"); _entityDict.put("↵", "\u21b5"); _entityDict.put("⇐", "\u21d0"); _entityDict.put("⇑", "\u21d1"); _entityDict.put("⇒", "\u21d2"); _entityDict.put("⇓", "\u21d3"); _entityDict.put("⇔", "\u21d4"); _entityDict.put("∀", "\u2200"); _entityDict.put("∂", "\u2202"); _entityDict.put("∃", "\u2203"); _entityDict.put("∅", "\u2205"); _entityDict.put("∇", "\u2207"); _entityDict.put("∈", "\u2208"); _entityDict.put("∉", "\u2209"); 
_entityDict.put("∋", "\u220b"); _entityDict.put("∏", "\u220f"); _entityDict.put("∑", "\u2211"); _entityDict.put("−", "\u2212"); _entityDict.put("∗", "\u2217"); _entityDict.put("√", "\u221a"); _entityDict.put("∝", "\u221d"); _entityDict.put("∞", "\u221e"); _entityDict.put("∠", "\u2220"); _entityDict.put("∧", "\u2227"); _entityDict.put("∨", "\u2228"); _entityDict.put("∩", "\u2229"); _entityDict.put("∪", "\u222a"); _entityDict.put("∫", "\u222b"); _entityDict.put("∴", "\u2234"); _entityDict.put("∼", "\u223c"); _entityDict.put("≅", "\u2245"); _entityDict.put("≈", "\u2248"); _entityDict.put("≠", "\u2260"); _entityDict.put("≡", "\u2261"); _entityDict.put("≤", "\u2264"); _entityDict.put("≥", "\u2265"); _entityDict.put("⊂", "\u2282"); _entityDict.put("⊃", "\u2283"); _entityDict.put("⊄", "\u2284"); _entityDict.put("⊆", "\u2286"); _entityDict.put("⊇", "\u2287"); _entityDict.put("⊕", "\u2295"); _entityDict.put("⊗", "\u2297"); _entityDict.put("⊥", "\u22a5"); _entityDict.put("⋅", "\u22c5"); _entityDict.put("⌈", "\u2308"); _entityDict.put("⌉", "\u2309"); _entityDict.put("⌊", "\u230a"); _entityDict.put("⌋", "\u230b"); _entityDict.put("⟨", "\u2329"); _entityDict.put("⟩", "\u232a"); _entityDict.put("◊", "\u25ca"); _entityDict.put("♠", "\u2660"); _entityDict.put("♣", "\u2663"); _entityDict.put("♥", "\u2665"); _entityDict.put("♦", "\u2666"); _entityDict.put(""", "\""); _entityDict.put("&", "\u0026"); _entityDict.put("<", "\u003c"); _entityDict.put(">", "\u003e"); _entityDict.put("Œ", "\u0152"); _entityDict.put("œ", "\u0153"); _entityDict.put("Š", "\u0160"); _entityDict.put("š", "\u0161"); _entityDict.put("Ÿ", "\u0178"); _entityDict.put("ˆ", "\u02c6"); _entityDict.put("˜", "\u02dc"); _entityDict.put(" ", "\u2002"); _entityDict.put(" ", "\u2003"); _entityDict.put(" ", "\u2009"); _entityDict.put("‌", "\u200c"); _entityDict.put("‍", "\u200d"); _entityDict.put("‎", "\u200e"); _entityDict.put("‏", "\u200f"); _entityDict.put("–", "\u2013"); _entityDict.put("—", "\u2014"); 
_entityDict.put("‘", "\u2018"); _entityDict.put("’", "\u2019"); _entityDict.put("‚", "\u201a"); _entityDict.put("“", "\u201c"); _entityDict.put("”", "\u201d"); _entityDict.put("„", "\u201e"); _entityDict.put("†", "\u2020"); _entityDict.put("‡", "\u2021"); _entityDict.put("‰", "\u2030"); _entityDict.put("‹", "\u2039"); _entityDict.put("›", "\u203a"); _entityDict.put("€", "\u20ac"); } /** * substitute "<>"&" for "&lt;&gt;&quot;&amp;". * * @param str * @return */ public static String escapeHtmlEntities(String str) { StringBuffer result = new StringBuffer(); for (int i = 0; i < str.length(); i++) { result.append(escapeHtmlEntities(str.charAt(i))); } return result.toString(); } public static String escapeHtmlURL(String str) { StringBuffer result = new StringBuffer(); for (int i = 0; i < str.length(); i++) { char ch = str.charAt(i); if (ch == '&') { result.append("&"); } else { result.append(ch); } } return result.toString(); } /** * substitute "<>"&" for "&lt;&gt;&quot;&amp;". * * @param ch * @return */ public static String escapeHtmlEntities(char ch) { if (ch == '&') { return "&"; } else if (ch == '<') { return "<"; } else if (ch == '>') { return ">"; } else if (ch == '"') { return """; } else { return Character.toString(ch); } } /** * encode the value of the attribute. * * @param str * @return */ public static String escapeAttributeValue(String str) { StringBuffer result = new StringBuffer(); for (int i = 0; i < str.length(); i++) { char ch = str.charAt(i); switch (ch) { case '\"': result.append("""); break; case '\'': case '\\': result.append("\\"); // fall through default: result.append(ch); } } return result.toString(); } /** * analyze an entity defined in HTML4.0. * * @param str * @return */ public static String resolveHtmlEntities(String str) { return StringUtil.replaceMap(str, _entityDict); } /** * convert a entity into a reference of a numerical value letter. 
* * @param str * @return */ public static String translateNumEntities(String str) { return StringUtil.replaceMap(str, _numericRefDict); } /** * @param id * @return */ public static String encodeUrl(String id) { try { return URLEncoder.encode(id, "UTF-8"); } catch (UnsupportedEncodingException e) { return id; } } /** * convert HTML into XML. * * @param str * @return */ public static String html2Xml(String str) { StringWriter result = new StringWriter(); XMLParserConfiguration parser = new HTMLConfiguration(); XMLDocumentFilter[] filters = { new XHtmlWriter(result) }; parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true); parser.setProperty("http://cyberneko.org/html/properties/filters", filters); parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower"); parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower"); try { parser.parse(new XMLInputSource(null, null, null, new StringReader(str), null)); } catch (IOException e) { log.error("", e); return "Error in parse html."; } return result.toString(); } /** * convert HTML into XML. * * @param str * @return */ public static String html2text(String str) { StringWriter result = new StringWriter(); XMLParserConfiguration parser = new HTMLConfiguration(); XMLDocumentFilter[] filters = { new HtmlTextWriter(result) }; parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true); parser.setProperty("http://cyberneko.org/html/properties/filters", filters); parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower"); parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower"); try { parser.parse(new XMLInputSource(null, null, null, new StringReader(str), null)); } catch (IOException e) { log.error("", e); return "Error in parse html."; } return result.toString(); } /** * decode the text of the appointed tag. 
* * @param html * @return */ public static String decode(String html, String tagName) { String str = html; int start = 0; while (start >= 0) { start = str.indexOf("<" + tagName, start); if (start >= 0) { start = str.indexOf(">", start); if (start > 0) { int end = str.indexOf("</" + tagName + ">", start); if (end > start) { String script = str.substring(start + 1, end); script = XmlUtil.resolveNumEntities(script); str = str.substring(0, start + 1) + script + str.substring(end); start = end; } } } } return str; } }