Java tutorial
package ecar.util;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.validator.UrlValidator;

/**
 * Copyright (c) 2009 Open Lab, Permission is hereby
 * granted, free of charge, to any person obtaining a copy of this software and
 * associated documentation files (the "Software"), to deal in the Software
 * without restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies of the
 * Software, and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/**
 * Tag-whitelist based HTML sanitizer.
 *
 * <p>The input is split into tokens (text, comments, {@code <...>} tags). Tags
 * matching {@link #forbiddenTags} are dropped, tags matching {@link #allowedTags}
 * are rebuilt from their validated attributes, and everything else is HTML
 * encoded. Tags left open by the input are closed at the end of the output.
 *
 * <p>All methods are static and keep no state between calls; the three public
 * {@code Pattern} fields are mutable configuration shared by every caller.
 */
public class HtmlSanitizer {

    /** Tag names that are removed from the output together with their attributes. */
    public static Pattern forbiddenTags =
            Pattern.compile("^(script|object|embed|link|style|form|input|font)$");

    /** Tag names that may be emitted (checked only after {@link #forbiddenTags}). */
    public static Pattern allowedTags = Pattern.compile(
            "^(b|p|i|s|a|img|table|thead|tbody|tfoot|tr|th|td|dd|dl|dt|em|h1|h2|h3|h4|h5|h6|li|ul|ol|span|div|strike|strong|"
                    + "sub|sup|pre|del|code|blockquote|strike|kbd|br|hr|area|map|object|embed|param|link|form|small|big)$");

    /**
     * Style values containing one of these function-call forms are discarded,
     * e.g. {@code expression(....)}. Case-insensitive, because CSS function
     * names are. (Thanks to Ben Summer.)
     */
    public static Pattern forbiddenStylePattern =
            Pattern.compile("(?i)(?:(expression|eval|javascript))\\s*\\(");

    /** Start of a comment: {@code <!--......} */
    private static final Pattern COMMENT_PATTERN = Pattern.compile("<!--.*");

    /** Opening tag: {@code <tag ....props.....>}; group 1 = name, group 2 = attribute text. */
    private static final Pattern TAG_START_PATTERN = Pattern.compile("<(?i)(\\w+\\b)\\s*(.*)/?>$");

    /** Closing tag: {@code </tag>}; group 1 = name. */
    private static final Pattern TAG_CLOSE_PATTERN = Pattern.compile("</(?i)(\\w+\\b)\\s*>$");

    /** Tags that never get a closing tag and therefore are not tracked as open. */
    private static final Pattern STAND_ALONE_TAGS = Pattern.compile("^(img|br|hr)$");

    /** Double-quoted attribute: {@code prop="...."}; group 1 = name, group 2 = value. */
    private static final Pattern ATTRIBUTES_PATTERN = Pattern.compile("(\\w*)\\s*=\\s*\"([^\"]*)\"");

    /** Single style declaration: {@code color:red;}; group 1 = property, group 2 = value. */
    private static final Pattern STYLE_PATTERN = Pattern.compile("([^\\s^:]+)\\s*:\\s*([^;]+);?");

    /**
     * A {@code url(} function inside a style value, quoted or not; group 1 is
     * the referenced URL. The closing parenthesis is deliberately not required
     * so that an unterminated {@code url(...} is still validated.
     */
    private static final Pattern URL_STYLE_PATTERN =
            Pattern.compile("(?i)\\burl\\s*\\(\\s*['\"]?([^'\")]*)");

    /** Accepted {@code width}/{@code height} values: digits, optionally followed by {@code %}. */
    private static final Pattern DIMENSION_PATTERN = Pattern.compile("\\d+%?");

    /** URL schemes accepted in {@code href}, {@code src} and style {@code url()} values. */
    private static final String[] HTTP_SCHEMES = { "http", "https" };

    /**
     * This method should be used to test input.
     *
     * @param html the markup to check
     * @return true if the input is "valid", i.e. nothing had to be removed or encoded
     */
    public static boolean isSanitized(String html) {
        return sanitizer(html).isValid;
    }

    /**
     * Used to clean every html before outputting it in any html page.
     *
     * @param html the markup to clean
     * @return sanitized html
     */
    public static String sanitize(String html) {
        return sanitizer(html).html;
    }

    /**
     * Used to get the text, tags removed or encoded.
     *
     * @param html the markup to extract text from
     * @return sanitized text
     */
    public static String getText(String html) {
        return sanitizer(html).text;
    }

    /**
     * This is the main method of sanitizing. It is used both for validation
     * and cleaning, with the default {@link #allowedTags} and
     * {@link #forbiddenTags}.
     *
     * @param html the markup to process
     * @return a SanitizeResult object
     */
    public static SanitizeResult sanitizer(String html) {
        return sanitizer(html, allowedTags, forbiddenTags);
    }

    /**
     * Sanitizes {@code html} using the given tag patterns.
     *
     * @param html the markup to process; {@code null} is treated as empty input
     * @param allowedTags pattern matched (with {@code find()}) against the
     *        lower-cased tag name to decide whether a tag may be emitted
     * @param forbiddenTags pattern matched against the lower-cased tag name to
     *        decide whether a tag is dropped; takes precedence over {@code allowedTags}
     * @return the sanitized html, the plain text, the cleaned source and the
     *         list of rejected fragments
     */
    public static SanitizeResult sanitizer(String html, Pattern allowedTags, Pattern forbiddenTags) {
        SanitizeResult ret = new SanitizeResult();
        Stack<String> openTags = new Stack<String>();

        // Accumulate in builders; the result fields are assigned once at the end.
        StringBuilder htmlOut = new StringBuilder();
        StringBuilder textOut = new StringBuilder();
        StringBuilder valOut = new StringBuilder();

        // ------------------- LOOP for every token --------------------------
        for (String token : tokenize(html == null ? "" : html)) {
            boolean isAcceptedToken = false;
            String output = token;

            // ---------------------------------------------- COMMENT <!-- ......... -->
            if (COMMENT_PATTERN.matcher(token).find()) {
                // Comments are kept (terminated if necessary) in val only and reported as invalid.
                String comment = token.endsWith("-->") ? token : token + "-->";
                valOut.append(comment);
                ret.invalidTags.add(comment);
                continue;
            }

            Matcher startMatcher = TAG_START_PATTERN.matcher(token);
            Matcher endMatcher = TAG_CLOSE_PATTERN.matcher(token);

            // ---------------------------------------------- OPEN TAG <tag .........>
            if (startMatcher.find()) {
                String tag = startMatcher.group(1).toLowerCase(Locale.ENGLISH);

                // FORBIDDEN TAG <script .........>: dropped from every output
                if (forbiddenTags.matcher(tag).find()) {
                    ret.invalidTags.add("<" + tag + ">");
                    continue;
                }

                // UNKNOWN TAG: kept in val only
                if (!allowedTags.matcher(tag).find()) {
                    ret.invalidTags.add(token);
                    valOut.append(token);
                    continue;
                }

                // WELL KNOWN TAG: first test table consistency
                boolean rowOrSection = "thead".equals(tag) || "tbody".equals(tag)
                        || "tfoot".equals(tag) || "tr".equals(tag);
                boolean cell = "td".equals(tag) || "th".equals(tag);
                if ((rowOrSection && openTags.search("table") < 1)
                        || (cell && openTags.search("tr") < 1)) {
                    ret.invalidTags.add("<" + tag + ">");
                    continue;
                }

                // then test properties, rebuilding the tag from the accepted ones
                StringBuilder cleanToken = new StringBuilder("<").append(tag);
                boolean foundURL = false;
                Matcher attributes = ATTRIBUTES_PATTERN.matcher(startMatcher.group(2));
                while (attributes.find()) {
                    String attr = attributes.group(1).toLowerCase(Locale.ENGLISH);
                    String val = attributes.group(2);

                    if ("a".equals(tag) && "href".equals(attr)) {
                        // <a href="......">: http(s) URL or a mailto: address
                        if (isHttpUrl(val) || isMailto(val)) {
                            foundURL = true;
                        } else {
                            ret.invalidTags.add(attr + " " + val);
                            val = "";
                        }
                    } else if (("img".equals(tag) || "embed".equals(tag)) && "src".equals(attr)) {
                        // <img src="......">
                        if (isHttpUrl(val)) {
                            foundURL = true;
                        } else {
                            ret.invalidTags.add(attr + " " + val);
                            val = "";
                        }
                    } else if ("href".equals(attr) || "src".equals(attr)) {
                        // src/href on any other tag: attribute skipped
                        ret.invalidTags.add(tag + " " + attr + " " + val);
                        continue;
                    } else if ("width".equals(attr) || "height".equals(attr)) {
                        // only numeric or percentage values
                        if (!DIMENSION_PATTERN.matcher(val).matches()) {
                            ret.invalidTags.add(tag + " " + attr + " " + val);
                            continue;
                        }
                    } else if ("style".equals(attr)) {
                        val = sanitizeStyle(tag, attr, val, ret.invalidTags);
                    } else if (attr.startsWith("on")) {
                        // skip all javascript events
                        ret.invalidTags.add(tag + " " + attr + " " + val);
                        continue;
                    } else {
                        // by default encode all properties
                        val = encode(val);
                    }

                    cleanToken.append(' ').append(attr).append("=\"").append(val).append('"');
                }
                cleanToken.append('>');

                // <a>, <img> and <embed> are useless (and suspicious) without a valid URL
                boolean needsUrl = "a".equals(tag) || "img".equals(tag) || "embed".equals(tag);
                if (needsUrl && !foundURL) {
                    output = "";
                } else {
                    isAcceptedToken = true;
                    output = cleanToken.toString();
                    // push the tag if it requires closure
                    if (!STAND_ALONE_TAGS.matcher(tag).find()) {
                        openTags.push(tag);
                    }
                }

                // ---------------------------------------------- CLOSE TAG </tag>
            } else if (endMatcher.find()) {
                String tag = endMatcher.group(1).toLowerCase(Locale.ENGLISH);

                if (forbiddenTags.matcher(tag).find()) {
                    ret.invalidTags.add("/" + tag);
                    continue;
                }
                if (!allowedTags.matcher(tag).find()) {
                    ret.invalidTags.add(token);
                    valOut.append(token);
                    continue;
                }

                // Stack.search is 1-based from the top and -1 when absent, so an
                // unmatched close tag pops nothing and produces an empty token.
                int depth = openTags.search(tag);
                StringBuilder closing = new StringBuilder();
                for (int i = 1; i <= depth; i++) {
                    // pop all elements opened after the tag, then the tag itself
                    closing.append("</").append(openTags.pop()).append('>');
                    isAcceptedToken = true;
                }
                output = closing.toString();
            }

            valOut.append(output);
            if (isAcceptedToken) {
                htmlOut.append(output);
            } else {
                htmlOut.append(htmlEncodeApexesAndTags(output));
                textOut.append(htmlEncodeApexesAndTags(removeLineFeed(output)));
            }
        }

        // must close remaining tags
        while (!openTags.isEmpty()) {
            String closeTag = "</" + openTags.pop() + ">";
            htmlOut.append(closeTag);
            valOut.append(closeTag);
        }

        ret.html = htmlOut.toString();
        ret.text = textOut.toString();
        ret.val = valOut.toString();
        ret.isValid = ret.invalidTags.isEmpty();
        return ret;
    }

    /** Returns true when {@code url} is a valid http or https URL according to UrlValidator. */
    private static boolean isHttpUrl(String url) {
        return new UrlValidator(HTTP_SCHEMES).isValid(url);
    }

    /**
     * Returns true when {@code val} starts with {@code mailto:} (any case) and
     * the part after the first {@code @} forms a valid host when prefixed with
     * {@code http://www.}.
     */
    private static boolean isMailto(String val) {
        int at = val.indexOf('@');
        return at >= 0
                && val.regionMatches(true, 0, "mailto:", 0, "mailto:".length())
                && isHttpUrl("http://www." + val.substring(at + 1));
    }

    /**
     * Rebuilds a {@code style} attribute value keeping only the declarations
     * whose value contains no forbidden function call and no non-http(s)
     * {@code url()}; rejected values are recorded in {@code invalidTags}.
     */
    private static String sanitizeStyle(String tag, String attr, String style, List<String> invalidTags) {
        StringBuilder cleanStyle = new StringBuilder();
        Matcher styles = STYLE_PATTERN.matcher(style);
        while (styles.find()) {
            String styleName = styles.group(1).toLowerCase(Locale.ENGLISH);
            String styleValue = styles.group(2);
            if (forbiddenStylePattern.matcher(styleValue).find() || !hasOnlyHttpUrls(styleValue)) {
                invalidTags.add(tag + " " + attr + " " + styleValue);
                continue;
            }
            cleanStyle.append(styleName).append(':').append(encode(styleValue)).append(';');
        }
        return cleanStyle.toString();
    }

    /** Returns false when any {@code url(} in {@code styleValue} references something other than a valid http(s) URL. */
    private static boolean hasOnlyHttpUrls(String styleValue) {
        Matcher urls = URL_STYLE_PATTERN.matcher(styleValue);
        while (urls.find()) {
            if (!isHttpUrl(urls.group(1).trim())) {
                return false;
            }
        }
        return true;
    }

    /**
     * Splits html into tags/comments ({@code <......>}) and the text between them.
     *
     * @param html the markup to split, never {@code null}
     * @return the tokens in document order; their concatenation equals {@code html}
     */
    private static List<String> tokenize(String html) {
        List<String> tokens = new ArrayList<String>();
        int len = html.length();
        int pos = 0;
        while (pos < len) {
            int tagStart = html.indexOf('<', pos);
            if (tagStart < 0) {
                // trailing text
                tokens.add(html.substring(pos));
                break;
            }
            if (tagStart > pos) {
                // text preceding the tag
                tokens.add(html.substring(pos, tagStart));
            }
            // a comment runs to "-->", any other "<" token to the first ">"
            String marker = html.startsWith("<!--", tagStart) ? "-->" : ">";
            int end = moveToMarkerEnd(tagStart, marker, html);
            tokens.add(html.substring(tagStart, end));
            pos = end;
        }
        return tokens;
    }

    /**
     * Returns the index just past the first occurrence of {@code marker} in
     * {@code s} at or after {@code pos}, or {@code s.length()} if there is none.
     */
    private static int moveToMarkerEnd(int pos, String marker, String s) {
        int i = s.indexOf(marker, pos);
        return i > -1 ? i + marker.length() : s.length();
    }

    /**
     * Contains the sanitizing results.
     * <ul>
     * <li>{@code html} is the sanitized html, encoded and ready to be printed.
     * Unaccepted tags are encoded, text is always encoded. MUST BE USED WHEN
     * PRINTING HTML.</li>
     * <li>{@code text} is the text with tags removed; invalid tags are
     * encoded. SHOULD BE USED TO PRINT EXCERPTS.</li>
     * <li>{@code val} is the html source cleaned from unaccepted tags. It is
     * not encoded: SHOULD BE USED IN SAVE ACTIONS.</li>
     * <li>{@code isValid} is true when every tag is accepted without forcing
     * encoding.</li>
     * <li>{@code invalidTags} is the list of encoded or removed fragments.</li>
     * </ul>
     */
    public static class SanitizeResult {
        public String html = "";
        public String text = "";
        public String val = "";
        public boolean isValid = true;
        public List<String> invalidTags = new ArrayList<String>();
    }

    /**
     * Encodes quotes and angle brackets and turns line feeds into {@code <br>}.
     *
     * @param s the text to encode; {@code null} is treated as the empty string
     * @return the encoded text, never {@code null}
     */
    public static String encode(String s) {
        return convertLineFeedToBR(htmlEncodeApexesAndTags(s == null ? "" : s));
    }

    /**
     * Encodes double quotes, apostrophes, {@code <} and {@code >} as HTML entities.
     *
     * @param source the text to encode
     * @return the encoded text, or {@code null} if {@code source} is {@code null}
     */
    public static final String htmlEncodeApexesAndTags(String source) {
        return htmlEncodeTag(htmlEncodeApexes(source));
    }

    /**
     * Encodes double quotes and apostrophes as HTML entities.
     *
     * @param source the text to encode
     * @return the encoded text, or {@code null} if {@code source} is {@code null}
     */
    public static final String htmlEncodeApexes(String source) {
        if (source == null) {
            return null;
        }
        return replaceAllNoRegex(source, new String[] { "\"", "'" }, new String[] { "&quot;", "&#39;" });
    }

    /**
     * Encodes {@code <} and {@code >} as HTML entities.
     *
     * @param source the text to encode
     * @return the encoded text, or {@code null} if {@code source} is {@code null}
     */
    public static final String htmlEncodeTag(String source) {
        if (source == null) {
            return null;
        }
        return replaceAllNoRegex(source, new String[] { "<", ">" }, new String[] { "&lt;", "&gt;" });
    }

    /**
     * Replaces line feeds and form feeds with {@code <br>} and carriage returns with a space.
     *
     * @param text the text to convert
     * @return the converted text, or {@code null} if {@code text} is {@code null}
     */
    public static String convertLineFeedToBR(String text) {
        if (text == null) {
            return null;
        }
        return replaceAllNoRegex(text, new String[] { "\n", "\f", "\r" }, new String[] { "<br>", "<br>", " " });
    }

    /**
     * Replaces line feeds, form feeds and carriage returns with a space.
     *
     * @param text the text to convert
     * @return the converted text, or {@code null} if {@code text} is {@code null}
     */
    public static String removeLineFeed(String text) {
        if (text == null) {
            return null;
        }
        return replaceAllNoRegex(text, new String[] { "\n", "\f", "\r" }, new String[] { " ", " ", " " });
    }

    /**
     * Applies {@link #replaceAllNoRegex(String, String, String)} for each
     * search/replace pair in order, feeding each result into the next step.
     *
     * @param source the text to process
     * @param searches the literal strings to look for
     * @param replaces the replacements; must have at least as many elements as {@code searches}
     * @return the processed text; {@code source} itself when {@code searches} is empty
     */
    public static final String replaceAllNoRegex(String source, String[] searches, String[] replaces) {
        String tmp = source;
        for (int k = 0; k < searches.length; k++) {
            tmp = replaceAllNoRegex(tmp, searches[k], replaces[k]);
        }
        return tmp;
    }

    /**
     * Replaces every occurrence of the literal {@code search} in {@code source}
     * with {@code replace}, without any regular-expression interpretation.
     *
     * @param source the text to process
     * @param search the literal string to look for
     * @param replace the replacement text
     * @return the processed text; the empty string when {@code source} is
     *         {@code null}; {@code source} unchanged when {@code search} is empty
     */
    public static final String replaceAllNoRegex(String source, String search, String replace) {
        if (source == null) {
            return "";
        }
        if (search.length() == 0) {
            return source;
        }
        int pos = source.indexOf(search);
        if (pos == -1) {
            // nothing to replace
            return source;
        }
        StringBuilder buffer = new StringBuilder(source.length() + 16);
        int oldPos = 0;
        while (pos != -1) {
            buffer.append(source, oldPos, pos);
            buffer.append(replace);
            oldPos = pos + search.length();
            pos = source.indexOf(search, oldPos);
        }
        buffer.append(source, oldPos, source.length());
        return buffer.toString();
    }
}