ecar.util.HtmlSanitizer.java Source code

Introduction

Here is the source code for ecar.util.HtmlSanitizer.java
Source

package ecar.util;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.validator.UrlValidator;

/**
 * Copyright (c) 2009 Open Lab, http://www.open-lab.com/ Permission is hereby
 * granted, free of charge, to any person obtaining a copy of this software and
 * associated documentation files (the "Software"), to deal in the Software
 * without restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies of the
 * Software, and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/**
 * Regex/token based HTML sanitizer.
 *
 * The input is split into tokens (comments, {@code <...>} tags and plain text
 * runs). Every tag token is checked against a list of forbidden and allowed tag
 * names, its double-quoted attributes are filtered one by one, and tags that are
 * still open at the end of the input are closed. Everything that is not an
 * accepted tag is HTML-encoded in the {@code html} result.
 *
 * NOTE(review): the style filter works on the literal attribute text; CSS
 * escapes or comments used to obfuscate keywords are not decoded before the
 * checks are applied.
 */
public class HtmlSanitizer {
    /**
     * Tag names that are always dropped. Checked before {@link #allowedTags},
     * so a name present in both lists is forbidden by default. Kept non-final
     * because it is part of the public, reassignable API.
     */
    public static Pattern forbiddenTags = Pattern.compile("^(script|object|embed|link|style|form|input|font)$");

    /**
     * Tag names that may be emitted. Some entries (object, embed, link, form)
     * only take effect when a caller passes its own forbidden-tag pattern to
     * {@link #sanitizer(String, Pattern, Pattern)}.
     */
    public static Pattern allowedTags = Pattern.compile(
            "^(b|p|i|s|a|img|table|thead|tbody|tfoot|tr|th|td|dd|dl|dt|em|h1|h2|h3|h4|h5|h6|li|ul|ol|span|div|strike|strong|"
                    + "sub|sup|pre|del|code|blockquote|strike|kbd|br|hr|area|map|object|embed|param|link|form|small|big)$");

    private static final Pattern commentPattern = Pattern.compile("<!--.*"); // <!--.........>
    private static final Pattern tagStartPattern = Pattern.compile("<(?i)(\\w+\\b)\\s*(.*)/?>$"); // <tag ....props.....>
    private static final Pattern tagClosePattern = Pattern.compile("</(?i)(\\w+\\b)\\s*>$"); // </tag .........>

    // tags that never get a closing tag
    private static final Pattern standAloneTags = Pattern.compile("^(img|br|hr)$");
    // retained for compatibility; a trailing "/>" is currently not treated specially (see sanitizer)
    private static final Pattern selfClosed = Pattern.compile("<.+/>");

    // only double-quoted attributes are recognised; anything else is silently dropped
    private static final Pattern attributesPattern = Pattern.compile("(\\w*)\\s*=\\s*\"([^\"]*)\""); // prop="...."
    private static final Pattern stylePattern = Pattern.compile("([^\\s^:]+)\\s*:\\s*([^;]+);?"); // color:red;

    // Captures the argument of every url( ... ) in a style value, quoted or not.
    // The closing parenthesis is deliberately not required so that an
    // unterminated url( is validated as well.
    private static final Pattern urlStylePattern = Pattern.compile("(?i)\\burl\\s*\\(\\s*['\"]?([^)'\"]*)"); // url('....')

    /**
     * Style values containing one of these function-like keywords are dropped.
     * Case-insensitive, because CSS keywords are.
     */
    public static Pattern forbiddenStylePattern = Pattern.compile("(?i)(?:(expression|eval|javascript))\\s*\\("); // expression(....)
    // thanks to
    // Ben
    // Summer

    /** URL schemes accepted in href/src attributes and in style url(...) values. */
    private static final String[] URL_SCHEMES = { "http", "https" };

    /**
     * This method should be used to test input.
     *
     * @param html the markup to check; {@code null} is treated as empty input
     * @return true if the input is "valid", i.e. nothing had to be removed or
     *         rejected
     */
    public static boolean isSanitized(String html) {
        return sanitizer(html).isValid;
    }

    /**
     * Used to clean every html before to output it in any html page.
     *
     * @param html the markup to clean; {@code null} is treated as empty input
     * @return sanitized html
     */
    public static String sanitize(String html) {
        return sanitizer(html).html;
    }

    /**
     * Used to get the text, tags removed or encoded.
     *
     * @param html the markup to process; {@code null} is treated as empty input
     * @return sanitized text
     */
    public static String getText(String html) {
        return sanitizer(html).text;
    }

    /**
     * This is the main method of sanitizing. It will be used both for
     * validation and cleaning. Uses the default {@link #allowedTags} and
     * {@link #forbiddenTags}.
     *
     * @param html the markup to process; {@code null} is treated as empty input
     * @return a SanitizeResult object
     */
    public static SanitizeResult sanitizer(String html) {
        return sanitizer(html, allowedTags, forbiddenTags);
    }

    /**
     * Sanitizes {@code html} with caller supplied tag lists.
     *
     * Per token:
     * <ul>
     * <li>comments are kept in {@code val} only and reported as invalid;</li>
     * <li>forbidden tags are dropped and reported as invalid;</li>
     * <li>unknown tags are kept in {@code val} only and reported as invalid;</li>
     * <li>allowed tags are rebuilt from their accepted attributes;</li>
     * <li>everything else (text, rejected tags) is HTML-encoded.</li>
     * </ul>
     * A trailing {@code /} as in {@code <div/>} is not treated specially: the
     * tag is emitted as an ordinary open tag and closed later.
     *
     * @param html          the markup to process; {@code null} is treated as
     *                      empty input
     * @param allowedTags   pattern matched against the lower-cased tag name
     * @param forbiddenTags pattern matched against the lower-cased tag name;
     *                      takes precedence over {@code allowedTags}
     * @return the sanitizing result, never {@code null}
     */
    public static SanitizeResult sanitizer(String html, Pattern allowedTags, Pattern forbiddenTags) {
        SanitizeResult ret = new SanitizeResult();
        Stack<String> openTags = new Stack<String>();

        // collect the three outputs in builders; repeated String concatenation
        // is quadratic on long inputs
        StringBuilder htmlOut = new StringBuilder();
        StringBuilder textOut = new StringBuilder();
        StringBuilder valOut = new StringBuilder();

        UrlValidator urlValidator = new UrlValidator(URL_SCHEMES);

        // ------------------- LOOP for every token --------------------------
        for (String token : tokenize(html)) {

            // COMMENT <!-- ......... -->
            if (commentPattern.matcher(token).find()) {
                String comment = token.endsWith("-->") ? token : token + "-->";
                valOut.append(comment);
                ret.invalidTags.add(comment);
                continue;
            }

            String output = token;
            boolean isAcceptedToken = false;

            Matcher startMatcher = tagStartPattern.matcher(token);
            Matcher endMatcher = tagClosePattern.matcher(token);

            if (startMatcher.find()) {
                // OPEN TAG <tag .........>
                // fixed locale: the default locale must not influence tag matching
                String tag = startMatcher.group(1).toLowerCase(Locale.ENGLISH);

                if (forbiddenTags.matcher(tag).find()) {
                    ret.invalidTags.add("<" + tag + ">");
                    continue;
                }
                if (!allowedTags.matcher(tag).find()) {
                    // unknown tag: survives in val only
                    ret.invalidTags.add(token);
                    valOut.append(token);
                    continue;
                }
                if (isMisplacedTableTag(tag, openTags)) {
                    ret.invalidTags.add("<" + tag + ">");
                    continue;
                }

                output = cleanOpenTag(tag, startMatcher.group(2), urlValidator, ret.invalidTags);
                // an empty result means the tag was rejected as a whole
                isAcceptedToken = output.length() > 0;

                // push the tag if it requires closure and it is accepted
                if (isAcceptedToken && !standAloneTags.matcher(tag).find()) {
                    openTags.push(tag);
                }

            } else if (endMatcher.find()) {
                // CLOSE TAG </tag>
                String tag = endMatcher.group(1).toLowerCase(Locale.ENGLISH);

                if (forbiddenTags.matcher(tag).find()) {
                    ret.invalidTags.add("/" + tag);
                    continue;
                }
                if (!allowedTags.matcher(tag).find()) {
                    ret.invalidTags.add(token);
                    valOut.append(token);
                    continue;
                }

                // 1-based distance from the top of the stack, -1 when not open
                int pos = openTags.search(tag);
                StringBuilder closing = new StringBuilder();
                // close every tag opened after this one, then the tag itself
                for (int i = 1; i <= pos; i++) {
                    closing.append("</").append(openTags.pop()).append('>');
                }
                output = closing.toString();
                isAcceptedToken = pos >= 1;
            }

            valOut.append(output);

            if (isAcceptedToken) {
                htmlOut.append(output);
            } else {
                htmlOut.append(htmlEncodeApexesAndTags(output));
                textOut.append(htmlEncodeApexesAndTags(removeLineFeed(output)));
            }
        }

        // must close remaining tags
        while (!openTags.isEmpty()) {
            String poppedTag = openTags.pop();
            htmlOut.append("</").append(poppedTag).append('>');
            valOut.append("</").append(poppedTag).append('>');
        }

        ret.html = htmlOut.toString();
        ret.text = textOut.toString();
        ret.val = valOut.toString();
        ret.isValid = ret.invalidTags.isEmpty();

        return ret;
    }

    /**
     * Table consistency check: thead/tbody/tfoot/tr need an open table, td/th
     * need an open tr.
     */
    private static boolean isMisplacedTableTag(String tag, Stack<String> openTags) {
        if ("thead".equals(tag) || "tbody".equals(tag) || "tfoot".equals(tag) || "tr".equals(tag)) {
            return openTags.search("table") < 1;
        }
        if ("td".equals(tag) || "th".equals(tag)) {
            return openTags.search("tr") < 1;
        }
        return false;
    }

    /**
     * Rebuilds an allowed open tag from its accepted attributes.
     *
     * @return the cleaned tag, or an empty string when the tag is a, img or
     *         embed and carries no valid URL
     */
    private static String cleanOpenTag(String tag, String tokenBody, UrlValidator urlValidator,
            List<String> invalidTags) {
        StringBuilder cleanToken = new StringBuilder("<").append(tag);
        boolean isAnchor = "a".equals(tag);
        boolean isImgOrEmbed = "img".equals(tag) || "embed".equals(tag);
        boolean foundURL = false; // URL flag

        Matcher attributes = attributesPattern.matcher(tokenBody);
        while (attributes.find()) {
            String attr = attributes.group(1).toLowerCase(Locale.ENGLISH);
            String val = attributes.group(2);

            if (isAnchor && "href".equals(attr)) { // <a href="......">
                if (isAcceptableHref(val, urlValidator)) {
                    foundURL = true;
                } else {
                    invalidTags.add(attr + " " + val);
                    val = "";
                }

            } else if (isImgOrEmbed && "src".equals(attr)) { // <img src="......">
                if (urlValidator.isValid(val)) {
                    foundURL = true;
                } else {
                    invalidTags.add(attr + " " + val);
                    val = "";
                }

            } else if ("href".equals(attr) || "src".equals(attr)) { // <tag src/href="......">
                // not accepted on any other tag
                invalidTags.add(tag + " " + attr + " " + val);
                continue;

            } else if ("width".equals(attr) || "height".equals(attr)) { // <tag width/height="......">
                // only plain numbers or percentages
                if (!val.matches("\\d+%|\\d+$")) {
                    invalidTags.add(tag + " " + attr + " " + val);
                    continue;
                }

            } else if ("style".equals(attr)) { // <tag style="......">
                val = cleanStyle(tag, attr, val, urlValidator, invalidTags);

            } else if (attr.startsWith("on")) { // skip all javascript events
                invalidTags.add(tag + " " + attr + " " + val);
                continue;

            } else { // by default encode all properties
                // encode() is not used here: it would turn line feeds into
                // literal <br> markup inside the attribute value
                val = htmlEncodeApexesAndTags(val);
            }

            cleanToken.append(' ').append(attr).append("=\"").append(val).append('"');
        }

        // <a>, <img> and <embed> are useless (and rejected) without a valid URL
        if ((isAnchor || isImgOrEmbed) && !foundURL) {
            return "";
        }
        return cleanToken.append('>').toString();
    }

    /**
     * Accepts http/https URLs, and mailto: links whose domain part yields a
     * valid URL when prefixed with {@code http://www.}.
     */
    private static boolean isAcceptableHref(String val, UrlValidator urlValidator) {
        if (urlValidator.isValid(val)) {
            return true;
        }
        // may be it is a mailto? e.g. mailto:pippo@pippo.com?subject=....
        int at = val.indexOf('@');
        if (at >= 0 && val.regionMatches(true, 0, "mailto:", 0, "mailto:".length())) {
            return urlValidator.isValid("http://www." + val.substring(at + 1));
        }
        return false;
    }

    /**
     * Filters a style attribute declaration by declaration; rejected
     * declarations are reported in {@code invalidTags} and left out.
     */
    private static String cleanStyle(String tag, String attr, String style, UrlValidator urlValidator,
            List<String> invalidTags) {
        StringBuilder cleanStyle = new StringBuilder();
        Matcher styles = stylePattern.matcher(style);

        while (styles.find()) {
            String styleName = styles.group(1).toLowerCase(Locale.ENGLISH);
            String styleValue = styles.group(2);

            // suppress forbidden keywords and non http(s) urls
            if (forbiddenStylePattern.matcher(styleValue).find() || !hasOnlyValidUrls(styleValue, urlValidator)) {
                invalidTags.add(tag + " " + attr + " " + styleValue);
                continue;
            }

            cleanStyle.append(htmlEncodeApexesAndTags(styleName)).append(':')
                    .append(htmlEncodeApexesAndTags(styleValue)).append(';');
        }
        return cleanStyle.toString();
    }

    /** Returns false as soon as one url(...) argument in the value is not a valid http/https URL. */
    private static boolean hasOnlyValidUrls(String styleValue, UrlValidator urlValidator) {
        Matcher urls = urlStylePattern.matcher(styleValue);
        while (urls.find()) {
            if (!urlValidator.isValid(urls.group(1).trim())) {
                return false;
            }
        }
        return true;
    }

    /**
     * Splits html tag and tag content <......>.
     *
     * Produces three kinds of tokens, in input order: comments (from
     * {@code <!--} up to and including {@code -->}), tags (from {@code <} up to
     * and including the next {@code >}) and the text runs in between. An
     * unterminated comment or tag extends to the end of the input.
     *
     * @param html the markup to split; {@code null} yields an empty list
     * @return a list of token
     */
    private static List<String> tokenize(String html) {
        List<String> tokens = new ArrayList<String>();
        if (html == null) {
            return tokens;
        }

        int len = html.length();
        int pos = 0;
        while (pos < len) {
            int tagStart = html.indexOf('<', pos);

            // no more markup: the rest is one text token
            if (tagStart < 0) {
                tokens.add(html.substring(pos));
                break;
            }

            // store the text preceding the "<"
            if (tagStart > pos) {
                tokens.add(html.substring(pos, tagStart));
            }

            // search the end of <......> or <!--......-->
            String marker = html.startsWith("<!--", tagStart) ? "-->" : ">";
            int end = moveToMarkerEnd(tagStart, marker, html);
            tokens.add(html.substring(tagStart, end));
            pos = end;
        }

        return tokens;
    }

    /**
     * Returns the index just past the first occurrence of {@code marker} at or
     * after {@code pos}, or the length of {@code s} when the marker is absent.
     */
    private static int moveToMarkerEnd(int pos, String marker, String s) {
        int i = s.indexOf(marker, pos);
        return i > -1 ? i + marker.length() : s.length();
    }

    /**
     * Contains the sanitizing results.
     */
    public static class SanitizeResult {
        /**
         * The sanitized html, ready to be printed. Unaccepted tags and the text
         * between tags are encoded. MUST BE USED WHEN PRINTING HTML.
         */
        public String html = "";
        /**
         * The encoded text found outside accepted tags, line feeds replaced by
         * blanks. SHOULD BE USED TO PRINT EXCERPTS.
         */
        public String text = "";
        /**
         * The html source cleaned from unaccepted tags. It is not encoded:
         * SHOULD BE USED IN SAVE ACTIONS.
         */
        public String val = "";
        /** True when nothing had to be rejected. */
        public boolean isValid = true;
        /** The rejected tags, attributes and comments. */
        public List<String> invalidTags = new ArrayList<String>();
    }

    /**
     * Encodes quotes and angle brackets and then turns line feeds into
     * {@code <br>}. Intended for text content, not for attribute values.
     *
     * @param s the text to encode; {@code null} is treated as an empty string
     * @return the encoded text
     */
    public static String encode(String s) {
        return convertLineFeedToBR(htmlEncodeApexesAndTags(s == null ? "" : s));
    }

    /**
     * Encodes double quotes, single quotes and angle brackets as entities.
     *
     * @param source the text to encode
     * @return the encoded text, or {@code null} when {@code source} is null
     */
    public static final String htmlEncodeApexesAndTags(String source) {
        return htmlEncodeTag(htmlEncodeApexes(source));
    }

    /**
     * Encodes double and single quotes as {@code &quot;} and {@code &#39;}.
     *
     * @param source the text to encode
     * @return the encoded text, or {@code null} when {@code source} is null
     */
    public static final String htmlEncodeApexes(String source) {
        if (source == null) {
            return null;
        }
        return replaceAllNoRegex(source, new String[] { "\"", "'" }, new String[] { "&quot;", "&#39;" });
    }

    /**
     * Encodes angle brackets as {@code &lt;} and {@code &gt;}.
     *
     * @param source the text to encode
     * @return the encoded text, or {@code null} when {@code source} is null
     */
    public static final String htmlEncodeTag(String source) {
        if (source == null) {
            return null;
        }
        return replaceAllNoRegex(source, new String[] { "<", ">" }, new String[] { "&lt;", "&gt;" });
    }

    /**
     * Replaces line feeds and form feeds by {@code <br>} and carriage returns
     * by a blank.
     *
     * @param text the text to convert
     * @return the converted text, or {@code null} when {@code text} is null
     */
    public static String convertLineFeedToBR(String text) {
        if (text == null) {
            return null;
        }
        return replaceAllNoRegex(text, new String[] { "\n", "\f", "\r" }, new String[] { "<br>", "<br>", " " });
    }

    /**
     * Replaces line feeds, form feeds and carriage returns by a blank.
     *
     * @param text the text to convert
     * @return the converted text, or {@code null} when {@code text} is null
     */
    public static String removeLineFeed(String text) {
        if (text == null) {
            return null;
        }
        return replaceAllNoRegex(text, new String[] { "\n", "\f", "\r" }, new String[] { " ", " ", " " });
    }

    /**
     * Applies the literal replacements {@code searches[k] -> replaces[k]} one
     * after the other, each on the result of the previous one.
     *
     * @param source   the text to process
     * @param searches the literal strings to look for
     * @param replaces the replacements; must have at least as many entries as
     *                 {@code searches}
     * @return the processed text
     */
    public static final String replaceAllNoRegex(String source, String[] searches, String[] replaces) {
        String tmp = source;
        for (int k = 0; k < searches.length; k++) {
            tmp = replaceAllNoRegex(tmp, searches[k], replaces[k]);
        }
        return tmp;
    }

    /**
     * Replaces every occurrence of the literal {@code search} by
     * {@code replace}, without any regular expression semantics.
     *
     * @param source  the text to process; {@code null} yields an empty string
     * @param search  the literal string to look for; when empty the source is
     *                returned unchanged
     * @param replace the replacement
     * @return the processed text
     */
    public static final String replaceAllNoRegex(String source, String search, String replace) {
        if (source == null) {
            return "";
        }
        if (search.length() == 0) {
            return source;
        }

        int pos = source.indexOf(search);
        if (pos == -1) {
            // nothing to replace, no copy needed
            return source;
        }

        StringBuilder buffer = new StringBuilder(source.length() + 16);
        int oldPos = 0;
        while (pos != -1) {
            buffer.append(source, oldPos, pos).append(replace);
            oldPos = pos + search.length();
            pos = source.indexOf(search, oldPos);
        }
        buffer.append(source, oldPos, source.length());
        return buffer.toString();
    }

}