// Java tutorial
/* * Copyright (c) 2004-2013 YAMJ Members * https://github.com/organizations/YAMJ/teams * * This file is part of the Yet Another Media Jukebox (YAMJ). * * YAMJ is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * any later version. * * YAMJ is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with YAMJ. If not, see <http://www.gnu.org/licenses/>. * * Web: https://github.com/YAMJ/yamj-v3 * */ package org.yamj.core.tools.web; import org.apache.commons.lang3.StringUtils; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.net.URLEncoder; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public final class HTMLTools { private static final Map<Character, String> AGGRESSIVE_HTML_ENCODE_MAP = new HashMap<Character, String>(); private static final Map<Character, String> DEFENSIVE_HTML_ENCODE_MAP = new HashMap<Character, String>(); private static final Map<String, Character> HTML_DECODE_MAP = new HashMap<String, Character>(); private static final Logger LOG = LoggerFactory.getLogger(HTMLTools.class); static { /* * Html encoding mapping according to the HTML 4.0 spec * http://www.w3.org/TR/REC-html40/sgml/entities.html */ // Special characters for HTML AGGRESSIVE_HTML_ENCODE_MAP.put('\u0026', "&"); AGGRESSIVE_HTML_ENCODE_MAP.put('\u003C', "<"); AGGRESSIVE_HTML_ENCODE_MAP.put('\u003E', ">"); AGGRESSIVE_HTML_ENCODE_MAP.put('\u0022', """); DEFENSIVE_HTML_ENCODE_MAP.put('\u0152', "Œ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u0153', "œ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u0160', 
"Š"); DEFENSIVE_HTML_ENCODE_MAP.put('\u0161', "š"); DEFENSIVE_HTML_ENCODE_MAP.put('\u0178', "Ÿ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u02C6', "ˆ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u02DC', "˜"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2002', " "); DEFENSIVE_HTML_ENCODE_MAP.put('\u2003', " "); DEFENSIVE_HTML_ENCODE_MAP.put('\u2009', " "); DEFENSIVE_HTML_ENCODE_MAP.put('\u200C', "‌"); DEFENSIVE_HTML_ENCODE_MAP.put('\u200D', "‍"); DEFENSIVE_HTML_ENCODE_MAP.put('\u200E', "‎"); DEFENSIVE_HTML_ENCODE_MAP.put('\u200F', "‏"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2013', "–"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2014', "—"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2018', "‘"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2019', "’"); DEFENSIVE_HTML_ENCODE_MAP.put('\u201A', "‚"); DEFENSIVE_HTML_ENCODE_MAP.put('\u201C', "“"); DEFENSIVE_HTML_ENCODE_MAP.put('\u201D', "”"); DEFENSIVE_HTML_ENCODE_MAP.put('\u201E', "„"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2020', "†"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2021', "‡"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2030', "‰"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2039', "‹"); DEFENSIVE_HTML_ENCODE_MAP.put('\u203A', "›"); DEFENSIVE_HTML_ENCODE_MAP.put('\u20AC', "€"); // Character entity references for ISO 8859-1 characters DEFENSIVE_HTML_ENCODE_MAP.put('\u00A0', " "); DEFENSIVE_HTML_ENCODE_MAP.put('\u00A1', "¡"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00A2', "¢"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00A3', "£"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00A4', "¤"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00A5', "¥"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00A6', "¦"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00A7', "§"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00A8', "¨"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00A9', "©"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00AA', "ª"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00AB', "«"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00AC', "¬"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00AD', "­"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00AE', "®"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00AF', "¯"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00B0', "°"); 
DEFENSIVE_HTML_ENCODE_MAP.put('\u00B1', "±"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00B2', "²"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00B3', "³"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00B4', "´"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00B5', "µ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00B6', "¶"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00B7', "·"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00B8', "¸"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00B9', "¹"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00BA', "º"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00BB', "»"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00BC', "¼"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00BD', "½"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00BE', "¾"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00BF', "¿"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00C0', "À"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00C1', "Á"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00C2', "Â"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00C3', "Ã"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00C4', "Ä"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00C5', "Å"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00C6', "Æ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00C7', "Ç"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00C8', "È"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00C9', "É"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00CA', "Ê"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00CB', "Ë"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00CC', "Ì"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00CD', "Í"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00CE', "Î"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00CF', "Ï"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00D0', "Ð"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00D1', "Ñ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00D2', "Ò"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00D3', "Ó"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00D4', "Ô"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00D5', "Õ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00D6', "Ö"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00D7', "×"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00D8', "Ø"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00D9', "Ù"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00DA', "Ú"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00DB', "Û"); 
DEFENSIVE_HTML_ENCODE_MAP.put('\u00DC', "Ü"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00DD', "Ý"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00DE', "Þ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00DF', "ß"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00E0', "à"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00E1', "á"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00E2', "â"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00E3', "ã"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00E4', "ä"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00E5', "å"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00E6', "æ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00E7', "ç"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00E8', "è"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00E9', "é"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00EA', "ê"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00EB', "ë"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00EC', "ì"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00ED', "í"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00EE', "î"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00EF', "ï"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00F0', "ð"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00F1', "ñ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00F2', "ò"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00F3', "ó"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00F4', "ô"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00F5', "õ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00F6', "ö"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00F7', "÷"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00F8', "ø"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00F9', "ù"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00FA', "ú"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00FB', "û"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00FC', "ü"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00FD', "ý"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00FE', "þ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u00FF', "ÿ"); // Mathematical, Greek and Symbolic characters for HTML DEFENSIVE_HTML_ENCODE_MAP.put('\u0192', "ƒ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u0391', "Α"); DEFENSIVE_HTML_ENCODE_MAP.put('\u0392', "Β"); DEFENSIVE_HTML_ENCODE_MAP.put('\u0393', "Γ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u0394', "Δ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u0395', "Ε"); 
DEFENSIVE_HTML_ENCODE_MAP.put('\u0396', "Ζ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u0397', "Η"); DEFENSIVE_HTML_ENCODE_MAP.put('\u0398', "Θ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u0399', "Ι"); DEFENSIVE_HTML_ENCODE_MAP.put('\u039A', "Κ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u039B', "Λ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u039C', "Μ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u039D', "Ν"); DEFENSIVE_HTML_ENCODE_MAP.put('\u039E', "Ξ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u039F', "Ο"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03A0', "Π"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03A1', "Ρ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03A3', "Σ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03A4', "Τ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03A5', "Υ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03A6', "Φ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03A7', "Χ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03A8', "Ψ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03A9', "Ω"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03B1', "α"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03B2', "β"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03B3', "γ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03B4', "δ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03B5', "ε"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03B6', "ζ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03B7', "η"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03B8', "θ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03B9', "ι"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03BA', "κ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03BB', "λ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03BC', "μ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03BD', "ν"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03BE', "ξ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03BF', "ο"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03C0', "π"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03C1', "ρ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03C2', "ς"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03C3', "σ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03C4', "τ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03C5', "υ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03C6', "φ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03C7', "χ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03C8', "ψ"); 
DEFENSIVE_HTML_ENCODE_MAP.put('\u03C9', "ω"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03D1', "ϑ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03D2', "ϒ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u03D6', "ϖ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2022', "•"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2026', "…"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2032', "′"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2033', "″"); DEFENSIVE_HTML_ENCODE_MAP.put('\u203E', "‾"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2044', "⁄"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2118', "℘"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2111', "ℑ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u211C', "ℜ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2122', "™"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2135', "ℵ"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2190', "←"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2191', "↑"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2192', "→"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2193', "↓"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2194', "↔"); DEFENSIVE_HTML_ENCODE_MAP.put('\u21B5', "↵"); DEFENSIVE_HTML_ENCODE_MAP.put('\u21D0', "⇐"); DEFENSIVE_HTML_ENCODE_MAP.put('\u21D1', "⇑"); DEFENSIVE_HTML_ENCODE_MAP.put('\u21D2', "⇒"); DEFENSIVE_HTML_ENCODE_MAP.put('\u21D3', "⇓"); DEFENSIVE_HTML_ENCODE_MAP.put('\u21D4', "⇔"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2200', "∀"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2202', "∂"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2203', "∃"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2205', "∅"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2207', "∇"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2208', "∈"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2209', "∉"); DEFENSIVE_HTML_ENCODE_MAP.put('\u220B', "∋"); DEFENSIVE_HTML_ENCODE_MAP.put('\u220F', "∏"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2211', "∑"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2212', "−"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2217', "∗"); DEFENSIVE_HTML_ENCODE_MAP.put('\u221A', "√"); DEFENSIVE_HTML_ENCODE_MAP.put('\u221D', "∝"); DEFENSIVE_HTML_ENCODE_MAP.put('\u221E', "∞"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2220', "∠"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2227', "∧"); 
DEFENSIVE_HTML_ENCODE_MAP.put('\u2228', "∨"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2229', "∩"); DEFENSIVE_HTML_ENCODE_MAP.put('\u222A', "∪"); DEFENSIVE_HTML_ENCODE_MAP.put('\u222B', "∫"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2234', "∴"); DEFENSIVE_HTML_ENCODE_MAP.put('\u223C', "∼"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2245', "≅"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2248', "≈"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2260', "≠"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2261', "≡"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2264', "≤"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2265', "≥"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2282', "⊂"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2283', "⊃"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2284', "⊄"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2286', "⊆"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2287', "⊇"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2295', "⊕"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2297', "⊗"); DEFENSIVE_HTML_ENCODE_MAP.put('\u22A5', "⊥"); DEFENSIVE_HTML_ENCODE_MAP.put('\u22C5', "⋅"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2308', "⌈"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2309', "⌉"); DEFENSIVE_HTML_ENCODE_MAP.put('\u230A', "⌊"); DEFENSIVE_HTML_ENCODE_MAP.put('\u230B', "⌋"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2329', "⟨"); DEFENSIVE_HTML_ENCODE_MAP.put('\u232A', "⟩"); DEFENSIVE_HTML_ENCODE_MAP.put('\u25CA', "◊"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2660', "♠"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2663', "♣"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2665', "♥"); DEFENSIVE_HTML_ENCODE_MAP.put('\u2666', "♦"); Set<Map.Entry<Character, String>> aggresiveEntries = AGGRESSIVE_HTML_ENCODE_MAP.entrySet(); for (Map.Entry<Character, String> entry : aggresiveEntries) { HTML_DECODE_MAP.put(entry.getValue(), entry.getKey()); } Set<Map.Entry<Character, String>> defensiveEntries = DEFENSIVE_HTML_ENCODE_MAP.entrySet(); for (Map.Entry<Character, String> entry : defensiveEntries) { HTML_DECODE_MAP.put(entry.getValue(), entry.getKey()); } } private HTMLTools() { throw new UnsupportedOperationException("Utility class"); } public static String 
decodeHtml(String source) { if (null == source || 0 == source.length()) { return source; } int currentIndex = 0; int delimiterStartIndex; int delimiterEndIndex; StringBuilder result = null; while (currentIndex <= source.length()) { delimiterStartIndex = source.indexOf('&', currentIndex); if (delimiterStartIndex != -1) { delimiterEndIndex = source.indexOf(';', delimiterStartIndex + 1); if (delimiterEndIndex != -1) { // ensure that the string builder is setup correctly if (null == result) { result = new StringBuilder(); } // add the text that leads up to this match if (delimiterStartIndex > currentIndex) { result.append(source.substring(currentIndex, delimiterStartIndex)); } // add the decoded entity String entity = source.substring(delimiterStartIndex, delimiterEndIndex + 1); currentIndex = delimiterEndIndex + 1; // try to decoded numeric entities if (entity.charAt(1) == '#') { int start = 2; int radix = 10; // check if the number is hexadecimal if (entity.charAt(2) == 'X' || entity.charAt(2) == 'x') { start++; radix = 16; } try { Character c = Character.valueOf( (char) Integer.parseInt(entity.substring(start, entity.length() - 1), radix)); result.append(c); } // when the number of the entity can't be parsed, add the entity as-is catch (NumberFormatException error) { result.append(entity); } } else { // try to decode the entity as a literal Character decoded = HTML_DECODE_MAP.get(entity); if (decoded != null) { result.append(decoded); } // if there was no match, add the entity as-is else { result.append(entity); } } } else { break; } } else { break; } } if (null == result) { return source; } else if (currentIndex < source.length()) { result.append(source.substring(currentIndex)); } return result.toString(); } public static String decodeUrl(String url) { if (url != null && url.length() != 0) { try { return URLDecoder.decode(url, "UTF-8"); } catch (UnsupportedEncodingException ignored) { LOG.info("Could not decode URL string '{}', will proceed with undecoded string.", 
url); } } return url; } public static String encodeUrl(String url) { String returnUrl = url; if (url != null && url.length() != 0) { try { returnUrl = URLEncoder.encode(url, "UTF-8"); returnUrl = returnUrl.replace((CharSequence) "+", (CharSequence) "%20"); // why does URLEncoder do that??!! } catch (UnsupportedEncodingException ignored) { LOG.info("Could not decode URL string '{}', will proceed with undecoded string.", returnUrl); } } return returnUrl; } public static String encodeUrlPath(String url) { if (url != null && url.length() != 0) { int slash = url.lastIndexOf('/'); String parentPart = ""; if (slash != -1) { parentPart = encodeUrlPath(url.substring(0, slash)) + '/'; } return parentPart + encodeUrl(url.substring(slash + 1)); } return url; } public static List<String> extractHtmlTags(String src, String sectionStart, String sectionEnd, String startTag, String endTag) { ArrayList<String> tags = new ArrayList<String>(); int index = src.indexOf(sectionStart); if (index == -1) { return tags; } index += sectionStart.length(); int endIndex = src.indexOf(sectionEnd, index); if (endIndex == -1) { return tags; } String sectionText = src.substring(index, endIndex); int lastIndex = sectionText.length(); index = 0; int endLen = endTag.length(); if (startTag != null) { index = sectionText.indexOf(startTag); } while (index != -1) { endIndex = sectionText.indexOf(endTag, index); if (endIndex == -1) { endIndex = lastIndex; } endIndex += endLen; String text = sectionText.substring(index, endIndex); tags.add(text); if (endIndex > lastIndex) { break; } if (startTag != null) { index = sectionText.indexOf(startTag, endIndex); } else { index = endIndex; } } return tags; } public static String extractTag(String src, String findStr) { return extractTag(src, findStr, 0); } public static String extractTag(String src, String findStr, int skip) { return extractTag(src, findStr, skip, "><"); } public static String extractTag(String src, String findStr, int skip, String separator) { 
return extractTag(src, findStr, skip, separator, true); } public static String extractTag(String src, String findStr, int skip, String separator, boolean checkDirty) { int beginIndex = src.indexOf(findStr); String value = StringUtils.EMPTY; if (beginIndex >= 0) { StringTokenizer st = new StringTokenizer(src.substring(beginIndex + findStr.length()), separator); for (int i = 0; i < skip; i++) { st.nextToken(); } value = HTMLTools.decodeHtml(st.nextToken().trim()); if (checkDirty && value.indexOf("uiv=\"content-ty") != -1 || value.indexOf("cast") != -1 || value.indexOf("title") != -1 || value.indexOf('<') != -1) { value = StringUtils.EMPTY; } } return value; } public static String extractTag(String src, String startStr, String endStr) { int beginIndex = src.indexOf(startStr); if (beginIndex < 0) { return StringUtils.EMPTY; } try { String subString = src.substring(beginIndex + startStr.length()); int endIndex = subString.indexOf(endStr); if (endIndex < 0) { return StringUtils.EMPTY; } subString = subString.substring(0, endIndex); return HTMLTools.decodeHtml(subString.trim()); } catch (Exception error) { return StringUtils.EMPTY; } } public static List<String> extractTags(String src, String sectionStart) { return extractTags(src, sectionStart, "</div>"); } public static List<String> extractTags(String src, String sectionStart, String sectionEnd) { return extractTags(src, sectionStart, sectionEnd, null, "|"); } public static List<String> extractTags(String src, String sectionStart, String sectionEnd, String startTag, String endTag) { return extractTags(src, sectionStart, sectionEnd, startTag, endTag, true); } public static List<String> extractTags(String src, String sectionStart, String sectionEnd, String startTag, String endTag, boolean forceCloseTag) { ArrayList<String> tags = new ArrayList<String>(); int startIndex = src.indexOf(sectionStart); if (startIndex == -1) { return tags; } startIndex += sectionStart.length(); int endIndex = src.indexOf(sectionEnd, 
startIndex); if (endIndex == -1) { return tags; } String sectionText = src.substring(startIndex, endIndex); int lastIndex = sectionText.length(); startIndex = 0; int startLen = 0; int endLen = endTag.length(); if (startTag != null) { startIndex = sectionText.indexOf(startTag); startLen = startTag.length(); } while (startIndex != -1) { startIndex += startLen; if (forceCloseTag) { int close = sectionText.indexOf('>', startIndex); if (close != -1) { startIndex = close + 1; } } endIndex = sectionText.indexOf(endTag, startIndex); if (endIndex == -1) { endIndex = lastIndex; } String text = sectionText.substring(startIndex, endIndex); tags.add(HTMLTools.decodeHtml(text.trim())); endIndex += endLen; if (endIndex > lastIndex) { break; } if (startTag != null) { startIndex = sectionText.indexOf(startTag, endIndex); } else { startIndex = endIndex; } } return tags; } public static String getTextAfterElem(String src, String findStr) { return getTextAfterElem(src, findStr, 0); } public static String getTextAfterElem(String src, String findStr, int skip) { return getTextAfterElem(src, findStr, skip, 0); } /** * Example: src = "<a id="specialID"><br/> <img src="a.gif"/>my text</a> findStr = "specialID" result = "my text" * * @param src html text * @param findStr string to find in src * @param skip count of found texts to skip * @param fromIndex begin index in src * @return string from html text which is plain text without html tags */ public static String getTextAfterElem(String src, String findStr, int skip, int fromIndex) { int beginIndex = src.indexOf(findStr, fromIndex); if (beginIndex == -1) { return StringUtils.EMPTY; } StringTokenizer st = new StringTokenizer(src.substring(beginIndex + findStr.length()), "<"); int i = 0; while (st.hasMoreElements()) { String elem = st.nextToken().replaceAll(" | ", "").trim(); if (elem.length() != 0 && !elem.endsWith(">") && i++ >= skip) { String[] elems = elem.split(">"); if (elems.length > 1) { return HTMLTools.decodeHtml(elems[1].trim()); 
} else { return HTMLTools.decodeHtml(elems[0].trim()); } } } return StringUtils.EMPTY; } public static String removeHtmlTags(String src) { return replaceHtmlTags(src, ""); } public static String replaceHtmlTags(String src, String replacement) { return src.replaceAll("\\<.*?>", replacement); } public static String stripTags(String s) { Pattern stripTagsRegex = Pattern.compile("([^\\<]*)(?:\\<[^\\>]*\\>)?"); Matcher m = stripTagsRegex.matcher(s); StringBuilder res = new StringBuilder(); while (m.find()) { res.append(m.group(1)); } // Replace escaped spaces String finalRes = res.toString().replaceAll(" ", " "); return finalRes.trim(); } }