 * ***** BEGIN LICENSE BLOCK *****
 * Zimbra Collaboration Suite Server
 * Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Synacor, Inc.
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software Foundation,
 * version 2 of the License.
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with this program.
 * If not, see <https://www.gnu.org/licenses/>.
 * ***** END LICENSE BLOCK *****

package com.zimbra.cs.html;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.NamespaceContext;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLResourceIdentifier;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.cyberneko.html.filters.DefaultFilter;
import org.owasp.html.PolicyFactory;
import org.owasp.html.Sanitizers;

import com.zimbra.common.localconfig.DebugConfig;
import com.zimbra.common.util.StringUtil;
import com.zimbra.common.util.ZimbraLog;
import com.zimbra.cs.servlet.ZThreadLocal;

 * Heavily modified version of the element filter from CyberNeko HTML.
 * Changed accepted/removed elements to static hashmaps for one-time
 * initialization, switched from Hashtable to HashMap, sanitize
 * attributes, etc.
 * TODO: more checks:
 * allow limited use of <meta> tags? like for Content-Type?
 * make sure any clicked links pop up in new window
 * figure out how to block images by default, and how to re-enable them. styles?
 * strict attr value checking?
 *  don't allow id attr in tags if we aren't putting html into an iframe (I'm assuming we are, and id's in iframes don't conflict with iframes elsewhere)
public class DefangFilter extends DefaultFilter {

    private static final int ASCII_DATA_VALUE = 127;

      * enable all form/input type tags
    private static final boolean ENABLE_INPUT_TAGS = true;

     * enable table tags
    private static final boolean ENABLE_TABLE_TAGS = true;

     * enable phrase tags (EM, STRONG, CITE, DFN, CODE, SAMP, KBD, VAR, ABBR, ACRONYM)
    private static final boolean ENABLE_PHRASE_TAGS = true;

     * enable list tags (UL, OL, LI, DL, DT, DD, DIR, MENU)
    private static final boolean ENABLE_LIST_TAGS = true;

     * enable font style tags (TT, I, B, BIG, SMALL, STRIKE, S, U)
    private static final boolean ENABLE_FONT_STYLE_TAGS = true;

    /**
     * The Host header received in the request. Populated by the constructor from
     * {@code ZThreadLocal.getRequestContext().getVirtualHost()} when a request
     * context exists; otherwise it stays {@code null}.
     */
    private String reqVirtualHost = null;

    /**
     * Toggle read from {@code DebugConfig.defang_block_form_same_host_post_req}.
     * When true (and {@code reqVirtualHost} is known), {@code sanitizeAttrValue}
     * rewrites a form {@code action} whose host equals the request's virtual host.
     */
    private static boolean sameHostFormPostCheck = DebugConfig.defang_block_form_same_host_post_req;

    // Constants

    /** A "null" object; {@code removeElement} stores it as the placeholder value for every removed element. */
    protected static final Object NULL = new Object();
    /** OWASP sanitizer policy combining the IMAGES and LINKS policies; applied by {@code sanitize}. */
    private static final PolicyFactory sanitizer = Sanitizers.IMAGES.and(Sanitizers.LINKS);

    // regexes inside of attr values to strip out
    // All patterns below are compiled once from regex strings supplied by DebugConfig.
    // NOTE(review): several declarations in this group end on a trailing comma or are otherwise
    // unterminated; their continuation lines (flags / initializer) are not present here.
    private static final Pattern AV_JS_ENTITY = Pattern.compile(DebugConfig.defangAvJsEntity);
    private static final Pattern AV_SCRIPT_TAG = Pattern.compile(DebugConfig.defangAvScriptTag,
    private static final Pattern AV_JAVASCRIPT = Pattern.compile(DebugConfig.defangAvJavascript,
    private static final Pattern AV_VBSCRIPT = Pattern.compile(DebugConfig.defangAvVbscript,
    // AV_TAB matches are deleted (replaced with "") by sanitize() before the javascript check.
    private static final Pattern AV_TAB = Pattern.compile(DebugConfig.defangAvTab, Pattern.CASE_INSENSITIVE);

    // regex for URLs href. TODO: beef this up
    private static final Pattern VALID_EXT_URL = Pattern.compile(DebugConfig.defangValidExtUrl,
    private static final Pattern VALID_IMG_FILE = Pattern.compile(DebugConfig.defangValidImgFile);
    private static final Pattern VALID_INT_IMG = Pattern.compile(DebugConfig.defangValidIntImg,
    // Attribute names (compared lower-cased) for which sanitizeAttrValue passes
    // isAllowedScript=true to sanitize(), enabling the javascript/vbscript/data-URI blocking.
    private static List<String> ATTRIBUTES_CAN_ALLOW_SCRIPTS = Arrays

    // matches the file format that convertd uses so it doesn't get 'pnsrc'ed
    private static final Pattern VALID_CONVERTD_FILE = Pattern.compile(DebugConfig.defangValidConvertdFile);
    //matches cid:1040f05975d4d4b8fcf8747be3eb9ae3c08e5cd4@
    // Values matching this pattern bypass the OWASP sanitizer pass in sanitize().
    private static final Pattern IMG_SKIP_OWASPSANITIZE = Pattern.compile(DebugConfig.defangImgSkipOwaspSanitize,
    // Data

    // information

    /**
     * attr Set cache: maps the exact attribute-list string passed to
     * {@code acceptElement} to the set built from it, so elements registered
     * with the same string share one set instance.
     */
    private static HashMap<String, HashSet<String>> mAttrSetCache = new HashMap<String, HashSet<String>>();

    /** Accepted elements: lower-cased element name to the set of lower-cased attribute names kept for it. */
    private static HashMap<String, HashSet<String>> mAcceptedElements = new HashMap<String, HashSet<String>>();

    /** Removed elements: lower-cased element name to the {@code NULL} placeholder; only the keys are consulted. */
    private static HashMap<String, Object> mRemovedElements = new HashMap<String, Object>();

    // state

    // Value of the href attribute of the most recent <base> tag seen by handleOpenTag, and
    // its parsed form (assigned only when new URI(mBaseHref) succeeds). Used by fixUrlBase.
    private String mBaseHref = null;
    private URI mBaseHrefURI = null;

    /**
     * Strip images: when true, handleOpenTag calls neuterTag for src/background
     * attributes, which copies their value to a prefixed attribute ("df"/"pn" + name).
     */
    boolean mNeuterImages;

    /** The name of the element in the process of being removed; {@code null} when nothing is being removed. */
    protected String mRemovalElementName;

    /** Tracks the recursive nesting level of the element being removed.
     *  Since we're skipping from the element's open-tag to its close-tag,
     *  we need to make sure not to stop skipping if another element of
     *  the same type was nested in the first.  For instance,
     *  <pre>
     *    &lt;skipme>&lt;foo>&lt;skipme>XX&lt;/skipme>&lt;/foo>&lt;/skipme>
     *  </pre> should not stop skipping at the first <tt>&lt;/skipme></tt>
     *  but rather after the second. */
    protected int mRemovalElementCount;

    /** The style element depth; while greater than zero, characters() sanitizes text as CSS. */
    protected int mStyleDepth;

    //private static String[] STD_CORE = { "id", "class", "title", "style" };
    // Comma-separated attribute-name lists used to build the acceptElement() arguments.
    // Each list ends with a trailing comma so that further names can be appended directly.
    private static String CORE = "id,class,title,style,";
    private static String LANG = "dir,lang,xml:lang,";
    private static String CORE_LANG = CORE + LANG;
    private static String KBD = "accesskey,tabindex,";

    // One-time registration of the element whitelist: each acceptElement() call names an
    // element and the comma-separated attributes that may be kept on it.
    static {
        // set which elements to accept
        // NOTE: KBD already ends with a comma, so this string contains an empty entry (",,");
        // acceptElement() skips zero-length names. "a" is registered with CORE only, not LANG.
        acceptElement("a", CORE + KBD + ",charset,coords,href,hreflang,name,rel,rev,shape,target,type");
        acceptElement("address", CORE_LANG);
        //acceptElement("base", "href"); //,target");
        acceptElement("bdo", CORE_LANG);
        acceptElement("blockquote", CORE_LANG + "cite");
        acceptElement("body", CORE_LANG + "background"); //+"alink,background,bgcolor,link,text,vlink");
        acceptElement("br", CORE + "clear");
        acceptElement("center", CORE_LANG);
        acceptElement("del", CORE_LANG + "cite,datetime");
        acceptElement("div", CORE_LANG + "align");
        acceptElement("head", LANG); // profile attr removed
        acceptElement("h1", CORE_LANG + "align");
        acceptElement("h2", CORE_LANG + "align");
        acceptElement("h3", CORE_LANG + "align");
        acceptElement("h4", CORE_LANG + "align");
        acceptElement("h5", CORE_LANG + "align");
        acceptElement("h6", CORE_LANG + "align");
        acceptElement("hr", CORE_LANG + "align,noshade,size,width");
        acceptElement("html", LANG + "xmlns");
        // dfsrc and data-mce-src are accepted on img in addition to the standard attributes.
        acceptElement("img", CORE_LANG
                + "align,alt,border,height,hspace,ismap,longdesc,src,usemap,vspace,width,dfsrc,data-mce-src");
        acceptElement("ins", CORE_LANG + "cite");
        acceptElement("label", CORE_LANG + "for");
        //acceptElement("link", CORE_LANG+"charset,href,hreflang,media,ntarget,rel,rev,type");

        // NOTE: comment out noframes so its text shows up, since we are nuke frame-related tags
        //acceptElement("noframes", CORE_LANG);
        // NOTE: comment out noscript so its text shows up, since we are nuking script tags
        //acceptElement("noscript", CORE_LANG); // maybe convert to always execute if we are stripping script?
        acceptElement("p", CORE_LANG + "align");
        acceptElement("pre", CORE_LANG + "width");
        acceptElement("q", CORE_LANG + "cite");
        acceptElement("span", CORE_LANG);

        acceptElement("style", CORE_LANG);
        acceptElement("sub", CORE_LANG);
        acceptElement("sup", CORE_LANG);

        //acceptElement("title", CORE_LANG);
        // title keeps no attributes at all.
        acceptElement("title", "");

        // NOTE(review): the condition guarding this font-style group is not present here;
        // presumably it is ENABLE_FONT_STYLE_TAGS, by analogy with the groups below — TODO confirm.
            acceptElement("b", CORE_LANG);
            acceptElement("basefont", CORE_LANG + "color,face,size");
            acceptElement("big", CORE_LANG);
            acceptElement("font", CORE_LANG + "color,face,size");
            acceptElement("i", CORE_LANG);
            acceptElement("s", CORE_LANG);
            acceptElement("small", CORE_LANG);
            acceptElement("strike", CORE_LANG);
            acceptElement("tt", CORE_LANG);
            acceptElement("u", CORE_LANG);
        } else {
            // allow the text, just strip the tags

        if (ENABLE_LIST_TAGS) {
            acceptElement("dir", CORE_LANG + "compact");
            acceptElement("dl", CORE_LANG);
            acceptElement("dt", CORE_LANG);
            acceptElement("li", CORE_LANG + "type,value");
            acceptElement("ol", CORE_LANG + "compact,start,type");
            acceptElement("ul", CORE_LANG + "compact,type");
            acceptElement("dd", CORE_LANG);
            acceptElement("menu", CORE_LANG + "compact");
        } else {
            // allow the text, just strip the tags

        if (ENABLE_PHRASE_TAGS) {
            acceptElement("abbr", CORE_LANG);
            acceptElement("acronym", CORE_LANG);
            acceptElement("cite", CORE_LANG);
            acceptElement("code", CORE_LANG);
            acceptElement("dfn", CORE_LANG);
            acceptElement("em", CORE_LANG);
            acceptElement("kbd", CORE_LANG);
            acceptElement("samp", CORE_LANG);
            acceptElement("strong", CORE_LANG);
            acceptElement("var", CORE_LANG);
        } else {
            // allow the text, just strip the tags

        if (ENABLE_TABLE_TAGS) {
            acceptElement("caption", CORE_LANG + "align");
            acceptElement("col", CORE_LANG + "alink,background,char,charoff,span,valign,width");
            acceptElement("colgroup", CORE_LANG + "alink,background,char,charoff,span,valign,width");
            acceptElement("table", CORE_LANG
                    + "align,valign,background,bgcolor,border,cellpadding,cellspacing,frame,rules,summary,width");
            acceptElement("tbody", CORE_LANG + "align,background,char,charoff,valign");
            // NOTE: the td list contains an empty entry ("scope,,valign"); acceptElement() ignores it.
            acceptElement("td", CORE_LANG
                    + "abbr,align,axis,background,bgcolor,char,charoff,colspan,headers,height,nowrap,rowspan,scope,,valign,width");
            acceptElement("tfoot", CORE_LANG + "align,background,char,charoff,valign");
            acceptElement("th", CORE_LANG
                    + "abbr,align,axis,background,bgcolor,char,charoff,colspan,headers,height,nowrap,rowspan,scope,valign,width");
            acceptElement("thead", CORE_LANG + "align,background,char,charoff,valign");
            acceptElement("tr", CORE_LANG + "align,background,bgcolor,char,charoff,valign");
        } else {
            // allow the text, just strip the tags

        if (ENABLE_INPUT_TAGS) {
            acceptElement("area", CORE_LANG + KBD + "alt,coords,href,nohref,shape,target");
            acceptElement("button", CORE_LANG + KBD + "disabled,name,type,value");
            acceptElement("fieldset", CORE_LANG);
            acceptElement("form", CORE_LANG + "action,accept,acceptcharset,enctype,method,name,target");
            // NOTE(review): the next line is the tail of a call whose first line is not present
            // here; presumably acceptElement("input", ...) — TODO confirm.
                    CORE_LANG + "accept,align,alt,checked,disabled,maxlength,name,readonly,size,src,type,value");
            acceptElement("legend", CORE_LANG + "align");
            acceptElement("map", CORE_LANG + "name");
            acceptElement("optgroup", CORE_LANG + "disabled,label");
            acceptElement("option", CORE_LANG + KBD + "disabled,label,selected,value");
            acceptElement("select", CORE_LANG + KBD + "disabled,multiple,name,size");
            acceptElement("textarea", CORE_LANG + "cols,disabled,name,readonly,rows");
        } else {

        // completely remove these elements and all enclosing tags/text
        // NOTE(review): the removeElement() registrations these two comments introduce are
        // not present here.

        // don't remove "content" of these tags since they have none.

     * @param neuterImages
    // Creates a filter. neuterImages is stored in mNeuterImages and later consulted by
    // handleOpenTag to decide whether src/background attributes are neutered.
    public DefangFilter(boolean neuterImages) {
        mNeuterImages = neuterImages;
        // Capture the request's virtual host only when a request context is bound to this
        // thread; otherwise reqVirtualHost keeps its null default.
        if (ZThreadLocal.getRequestContext() != null) {
            this.reqVirtualHost = ZThreadLocal.getRequestContext().getVirtualHost();

     * Specifies that the given element should be accepted and, optionally,
     * which attributes of that element should be kept.
     * @param element The element to accept.
     * @param attributes The comma-separated list of attributes to be kept or null if no
     *                   attributes should be kept for this element.
     * see #removeElement
    public static void acceptElement(String element, String attributes) {
        // Element names are stored lower-cased; lookups lower-case their key as well.
        element = element.toLowerCase();
        // The cache is keyed on the attribute string exactly as passed (not lower-cased).
        HashSet<String> set = mAttrSetCache.get(attributes);
        if (set != null) {
            //System.out.println(element+" cached set "+set.size());
            // Cache hit: the element shares the previously built set instance.
            mAcceptedElements.put(element, set);
        // Cache miss: build a new set from the lower-cased, comma-separated names.
        // NOTE(review): attributes is dereferenced below, so passing null would throw a
        // NullPointerException — confirm no caller relies on a null attribute list.
        set = new HashSet<String>();
        String attrs[] = attributes.toLowerCase().split(",");
        if (attrs != null && attrs.length > 0) {
            for (int i = 0; i < attrs.length; i++) {
                //deal with consecutive commas
                // NOTE(review): the statement guarded by this check is not present here;
                // presumably it adds attrs[i] to the set — TODO confirm.
                if (attrs[i].length() > 0)
        // Register the element and remember the set for later calls with the same string.
        mAcceptedElements.put(element, set);
        mAttrSetCache.put(attributes, set);

     * Specifies that the given element should be completely removed. If an
     * element is encountered during processing that is on the remove list,
     * the element's start and end tags as well as all of content contained
     * within the element will be removed from the processing stream.
     * @param element The element to completely remove.
    public static void removeElement(String element) {
        // Keys are lower-cased to match the lower-casing done by elementRemoved().
        String key = element.toLowerCase();
        // The map is used as a set: the value is always the shared NULL placeholder.
        Object value = NULL;
        mRemovedElements.put(key, value);

    // XMLDocumentHandler methods

    // since Xerces-J 2.2.0

    /**
     * Start document. Resets the removal nesting counter to zero and then
     * forwards the event to the superclass.
     */
    public void startDocument(XMLLocator locator, String encoding, NamespaceContext nscontext, Augmentations augs)
            throws XNIException {
        mRemovalElementCount = 0;
        super.startDocument(locator, encoding, nscontext, augs);

    // old methods

    /**
     * Start document (older signature without a namespace context). Delegates to
     * the four-argument overload, passing {@code null} as the namespace context.
     */
    public void startDocument(XMLLocator locator, String encoding, Augmentations augs) throws XNIException {
        startDocument(locator, encoding, null, augs);

    /** Start prefix mapping. Forwarded to the superclass only while no element is being removed. */
    public void startPrefixMapping(String prefix, String uri, Augmentations augs) throws XNIException {
        if (mRemovalElementName == null) {
            super.startPrefixMapping(prefix, uri, augs);

    /**
     * Start element. Outside a removal, the tag is passed through handleOpenTag
     * and forwarded to the superclass only if that returns true. Inside a
     * removal, the element name is compared (case-insensitively) with the
     * element being removed.
     */
    public void startElement(QName element, XMLAttributes attributes, Augmentations augs) throws XNIException {
        String name = element.localpart;
        if (mRemovalElementName == null) {
            if (handleOpenTag(element, attributes))
                super.startElement(element, attributes, augs);
        } else {
            // NOTE(review): the statement guarded by this check is not present here;
            // presumably it increments mRemovalElementCount for a nested same-name element — TODO confirm.
            if (name.equalsIgnoreCase(mRemovalElementName))
        // NOTE(review): the guarded statement is not present here; presumably it increments
        // mStyleDepth so that characters() treats following text as CSS — TODO confirm.
        if (name.equalsIgnoreCase("style"))

    /**
     * Empty element. Forwarded to the superclass only when no element is being
     * removed and handleOpenTag accepts the tag (handleOpenTag is not called at
     * all while a removal is in progress).
     */
    public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs) throws XNIException {
        if (mRemovalElementName == null && handleOpenTag(element, attributes)) {
            super.emptyElement(element, attributes, augs);

    /** Comment. No statement here forwards the event, so comments do not reach the superclass. */
    public void comment(XMLString text, Augmentations augs) throws XNIException {
        // we can safely ignore comments
        // they can only provide loop holes for hackers to exploit
        // e.g. CDATA sections are reported as comments with our HTML parser configuration

    /** Processing instruction. Forwarded to the superclass only while no element is being removed. */
    public void processingInstruction(String target, XMLString data, Augmentations augs) throws XNIException {
        if (mRemovalElementName == null) {
            super.processingInstruction(target, data, augs);

    /**
     * Characters. While no element is being removed, text is forwarded to the
     * superclass; text seen inside a style element (mStyleDepth &gt; 0) is first
     * sanitized as CSS and forwarded as a new XMLString.
     */
    public void characters(XMLString text, Augmentations augs) throws XNIException {
        if (mRemovalElementName == null) {
            if (mStyleDepth > 0) {
                String result = null;
                // Text with non-ASCII characters goes through extractAndSanitizeAsciiData;
                // pure-ASCII text is sanitized directly by sanitizeStyleValue.
                if (!StringUtil.isAsciiString(text.toString())) {
                    result = extractAndSanitizeAsciiData(text.toString());
                } else {
                    result = sanitizeStyleValue(text.toString());
                super.characters(new XMLString(result.toCharArray(), 0, result.length()), augs);
            } else {
                // Not inside a style element: pass the text through unchanged.
                super.characters(text, augs);

    // Patterns applied by sanitizeStyleValue, compiled from regex strings supplied by DebugConfig.
    private static final Pattern COMMENT = Pattern.compile(DebugConfig.defangComment);
    protected static final Pattern STYLE_UNWANTED_FUNC = Pattern.compile(DebugConfig.defangStyleUnwantedFunc,
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    // NOTE(review): this declaration ends on a trailing comma; its flags line is not present here.
    private static final Pattern STYLE_UNWANTED_IMPORT = Pattern.compile(DebugConfig.defangStyleUnwantedImport,

    // Returns value with every match of COMMENT, then STYLE_UNWANTED_FUNC, then
    // STYLE_UNWANTED_IMPORT deleted, in that order. Used for both style attribute
    // values and the text content of style elements.
    private static String sanitizeStyleValue(String value) {
        // remove comments
        value = COMMENT.matcher(value).replaceAll("");
        // strip off unwanted functions
        value = STYLE_UNWANTED_FUNC.matcher(value).replaceAll("");
        // strip off any @import
        return STYLE_UNWANTED_IMPORT.matcher(value).replaceAll("");

    /** Ignorable whitespace. Forwarded to the superclass only while no element is being removed. */
    public void ignorableWhitespace(XMLString text, Augmentations augs) throws XNIException {
        if (mRemovalElementName == null) {
            super.ignorableWhitespace(text, augs);

    /** Start general entity. Forwarded to the superclass only while no element is being removed. */
    public void startGeneralEntity(String name, XMLResourceIdentifier id, String encoding, Augmentations augs)
            throws XNIException {
        if (mRemovalElementName == null) {
            super.startGeneralEntity(name, id, encoding, augs);

    /** Text declaration. Forwarded to the superclass only while no element is being removed. */
    public void textDecl(String version, String encoding, Augmentations augs) throws XNIException {
        if (mRemovalElementName == null) {
            super.textDecl(version, encoding, augs);

    /** End general entity. Forwarded to the superclass only while no element is being removed. */
    public void endGeneralEntity(String name, Augmentations augs) throws XNIException {
        if (mRemovalElementName == null) {
            super.endGeneralEntity(name, augs);

    /** Start CDATA section. Acts only while no element is being removed. */
    public void startCDATA(Augmentations augs) throws XNIException {
        // NOTE(review): the guarded statement is not present here; presumably it forwards
        // to super.startCDATA(augs) like the sibling handlers — TODO confirm.
        if (mRemovalElementName == null) {

    /** End CDATA section. Acts only while no element is being removed. */
    public void endCDATA(Augmentations augs) throws XNIException {
        // NOTE(review): the guarded statement is not present here; presumably it forwards
        // to super.endCDATA(augs) like the sibling handlers — TODO confirm.
        if (mRemovalElementName == null) {

    /**
     * End element. Outside a removal, the end tag is forwarded only for accepted
     * elements. Inside a removal, a matching end tag decrements the nesting
     * counter, and the removal ends when the counter reaches zero.
     */
    public void endElement(QName element, Augmentations augs) throws XNIException {
        String name = element.localpart;
        if (mRemovalElementName == null) {
            // Acceptance is checked on rawname, while the comparisons below use localpart.
            if (elementAccepted(element.rawname))
                super.endElement(element, augs);
        } else {
            // The pre-decrement only runs when the name matches (short-circuit &&).
            if (name.equalsIgnoreCase(mRemovalElementName) && --mRemovalElementCount == 0)
                mRemovalElementName = null;
        // NOTE(review): the guarded statement is not present here; presumably it decrements
        // mStyleDepth to mirror startElement — TODO confirm.
        if (name.equalsIgnoreCase("style"))

    /** End prefix mapping. Forwarded to the superclass only while no element is being removed. */
    public void endPrefixMapping(String prefix, Augmentations augs) throws XNIException {
        if (mRemovalElementName == null) {
            super.endPrefixMapping(prefix, augs);

    // Protected methods

    /**
     * Returns true if the specified element is accepted, i.e. its lower-cased
     * name is a key of the accepted-elements map.
     */
    protected static boolean elementAccepted(String element) {
        String key = element.toLowerCase();
        return mAcceptedElements.containsKey(key);

    /**
     * Returns true if the specified element should be removed, i.e. its
     * lower-cased name is a key of the removed-elements map.
     */
    protected static boolean elementRemoved(String element) {
        String key = element.toLowerCase();
        return mRemovedElements.containsKey(key);

    /**
     * Handles an open tag. Records the document base from a {@code base} tag,
     * filters and sanitizes the attributes of accepted elements, and starts a
     * removal for elements on the removed list.
     *
     * @return true for an accepted element (the caller then forwards the tag);
     *         the final statement returns false
     */
    protected boolean handleOpenTag(QName element, XMLAttributes attributes) {
        String eName = element.rawname.toLowerCase();
        // Remember the href of a <base> tag so fixUrlBase() can resolve relative URLs.
        if (eName.equals("base")) {
            int index = attributes.getIndex("href");
            if (index != -1) {
                mBaseHref = attributes.getValue(index);
                if (mBaseHref != null) {
                    try {
                        mBaseHrefURI = new URI(mBaseHref);
                    } catch (URISyntaxException e) {
                        // Unparseable base: keep the raw string, with a trailing slash appended
                        // if missing, for the string-concatenation fallback in fixUrlBase().
                        if (!mBaseHref.endsWith("/"))
                            mBaseHref += "/";
        if (elementAccepted(element.rawname)) {
            HashSet<String> value = mAcceptedElements.get(eName);
            // NOTE(review): mAcceptedElements holds only HashSet values and NULL is a plain
            // Object put only into mRemovedElements, so this comparison looks always true.
            if (value != NULL) {
                HashSet<String> anames = value;
                int attributeCount = attributes.getLength();
                for (int i = 0; i < attributeCount; i++) {
                    String aName = attributes.getQName(i).toLowerCase();
                    // remove the attribute if it isn't in the list of accepted names
                    // or it has invalid content
                    // NOTE(review): the removal statement for this branch is not present here.
                    if (!anames.contains(aName) || removeAttrValue(eName, aName, attributes, i)) {
                    } else {
                        sanitizeAttrValue(eName, aName, attributes, i);
            } else {

            // Resolve relative URLs against the recorded base: src for img/input, href for
            // a/area, and background.
            if (eName.equals("img") || eName.equals("input")) {
                fixUrlBase(attributes, "src");
            } else if (eName.equals("a") || eName.equals("area")) {
                fixUrlBase(attributes, "href");
            fixUrlBase(attributes, "background");

            // NOTE(review): the guarded statement is not present here; presumably it calls
            // fixATag(attributes) — TODO confirm.
            if (eName.equals("a") || eName.equals("area")) {
            if (mNeuterImages) {
                String srcValue = Strings.nullToEmpty(attributes.getValue("src"));
                if (eName.equals("img") || eName.equals("input")) {
                    // "df" prefix: src is an external URL, or is neither an internal image
                    // reference nor a plain image file name.
                    if (VALID_EXT_URL.matcher(srcValue).find() || (!VALID_INT_IMG.matcher(srcValue).find()
                            && !VALID_IMG_FILE.matcher(srcValue).find())) {
                        neuterTag(attributes, "src", "df");
                    // "pn" prefix: src is a plain image file name that is neither an internal
                    // image reference nor a convertd-style file name.
                    } else if (!VALID_INT_IMG.matcher(srcValue).find() && VALID_IMG_FILE.matcher(srcValue).find()
                            && !VALID_CONVERTD_FILE.matcher(srcValue).find()) {
                        neuterTag(attributes, "src", "pn");
                neuterTag(attributes, "background", "df");
            return true;
        } else if (elementRemoved(element.rawname)) {
            // Begin skipping: everything up to the matching end tag is dropped by the handlers.
            mRemovalElementName = element.rawname;
            mRemovalElementCount = 1;
        return false;

    // Rewrites the named attribute, when present, so that a scheme-less value is made
    // absolute against the base href recorded from a <base> tag.
    private void fixUrlBase(XMLAttributes attributes, String attrName) {
        int index = attributes.getIndex(attrName);
        if (index != -1) {
            String value = attributes.getValue(index);
            // NOTE(review): value is dereferenced here before the "value != null" test two
            // lines below, so a null attribute value would throw — confirm whether
            // getValue(index) can return null for a present attribute.
            if (!value.startsWith("/")) {
                value = "/" + value;
            // Only values without a ':' (no scheme) are rewritten, and only when a base is known.
            if (mBaseHref != null && value != null && value.indexOf(":") == -1) {
                if (mBaseHrefURI != null) {
                    try {
                        attributes.setValue(index, mBaseHrefURI.resolve(value).toString());
                    } catch (IllegalArgumentException e) {
                        // ignore and do string-logic
                // String fallback: plain concatenation of base and value.
                // NOTE(review): no return after the successful resolve is present here; if there
                // is none, this line would overwrite the resolved value — TODO confirm.
                attributes.setValue(index, mBaseHref + value);

     * @param attributes
    private void neuterTag(XMLAttributes attributes, String aName, String prefix) {
        // Name of the neutered copy, e.g. "df" + "src" -> "dfsrc".
        String df_aName = prefix + aName;
        int dfIndex = attributes.getIndex(df_aName);
        int index = attributes.getIndex(aName);
        if (index != -1) {
            String aValue = attributes.getValue(index);
            // Store the original value under the prefixed name, overwriting an existing
            // prefixed attribute or adding a new CDATA attribute.
            if (dfIndex != -1) {
                attributes.setValue(dfIndex, aValue);
            } else {
                attributes.addAttribute(new QName("", df_aName, df_aName, null), "CDATA", aValue);
            // remove dups if there are multiple src attributes
            // NOTE(review): the statement that removes the attribute at index is not present
            // here; without one, this loop would never terminate.
            index = attributes.getIndex(aName);
            while (index != -1) {
                index = attributes.getIndex(aName);

     * make sure all <a> tags have a target="_blank" attribute set.
     * @param attributes
    private void fixATag(XMLAttributes attributes) {
        // BEGIN: bug 7927
        int index = attributes.getIndex("href");
        // NOTE(review): the statements guarded by the two checks below are not present here;
        // presumably each is an early return — TODO confirm.
        if (index == -1) // links that don't have a href don't need target="_blank"
        String href = attributes.getValue(index);
        if (href.indexOf('#') == 0) // LOCAL links don't need target="_blank"
        // END: bug 7927
        // Force target="_blank": overwrite an existing target or add one.
        index = attributes.getIndex("target");
        if (index != -1) {
            attributes.setValue(index, "_blank");
        } else {
            attributes.addAttribute(new QName("", "target", "target", null), "CDATA", "_blank");

     * Checks to see if an attr value should just be removed
     * @param eName The element name
     * @param aName The attribute name
     * @param attributes The set of the attributes
     * @param i The index of the attribute
     * @return true if the attr should be removed, false if not
    private boolean removeAttrValue(String eName, String aName, XMLAttributes attributes, int i) {
        String value = attributes.getValue(i);
        // get rid of any spaces that might throw off the regex
        // NOTE(review): a null value is preserved here, but the matcher(value) calls below
        // would throw on null — confirm whether getValue(i) can return null.
        value = value == null ? null : value.trim();

        if (aName.equalsIgnoreCase("href")) {
            // A href that looks like a valid external URL is kept as is; any other href is
            // kept too, but sanitized in place first.
            if (VALID_EXT_URL.matcher(value).find()) {
                return false;
            sanitizeAttrValue(eName, aName, attributes, i);
        } else if (aName.equalsIgnoreCase("longdesc") || aName.equalsIgnoreCase("usemap")) {
            // These are the only attributes for which a "remove" (true) result is returned here.
            if (!VALID_EXT_URL.matcher(value).find()) {
                return true;
        // We'll treat the SRC a little different since deleting it
        // may annoy the front end. Here, we'll check for
        // a valid url as well as just a valid filename in the
        // case that its an inline image
        if (aName.equals("src") || aName.equals("dfsrc") || aName.equals("data-mce-src")) {
            if (!(VALID_EXT_URL.matcher(value).find() || VALID_INT_IMG.matcher(value).find()
                    || VALID_IMG_FILE.matcher(value).find())) {
                // Invalid source: keep the attribute but replace its value with "#".
                attributes.setValue(i, "#");
                return false;
        return false;

    // Sanitizes an attribute value and returns the result.
    //   result          - the raw attribute value
    //   isAllowedScript - when true, the javascript:/vbscript:/data: blocking below is applied
    public static String sanitize(String result, boolean isAllowedScript) {
        // Normalize an obfuscated scheme prefix (whitespace / HTML entities before the first ':').
        result = removeAnySpacesAndEncodedChars(result);
        // Values matching IMG_SKIP_OWASPSANITIZE bypass the OWASP policy pass.
        if (!(IMG_SKIP_OWASPSANITIZE.matcher(result).find())) {
            result = sanitizer.sanitize(result);
        result = AV_JS_ENTITY.matcher(result).replaceAll("JS-ENTITY-BLOCKED");
        result = AV_SCRIPT_TAG.matcher(result).replaceAll("SCRIPT-TAG-BLOCKED");

        if (isAllowedScript) {
            // Delete AV_TAB matches first so they cannot hide the patterns checked next.
            if (AV_TAB.matcher(result).find()) {
                result = AV_TAB.matcher(result).replaceAll("");
            // data: URIs are rewritten only when no javascript match was found and the value
            // is not an internal image reference.
            if (AV_JAVASCRIPT.matcher(result).find())
                result = AV_JAVASCRIPT.matcher(result).replaceAll("JAVASCRIPT-BLOCKED:");
            else if (!VALID_INT_IMG.matcher(result).find()) {
                result = result.replaceAll("(?i)data\\s*:", "DATAURI-BLOCKED:");
            if (AV_VBSCRIPT.matcher(result).find()) {
                result = AV_VBSCRIPT.matcher(result).replaceAll("VBSCRIPT-BLOCKED:");
        return result;

     * @param result
     * @return
    public static String removeAnySpacesAndEncodedChars(String result) {
        // By default the input is returned unchanged.
        String sanitizedStr = result;
        StringBuilder sb = new StringBuilder();
        int index = result.indexOf(":");

        // Only the part before the first ':' (the would-be scheme) is examined.
        if (index > -1) {
            String jsString = result.substring(0, index);
            char[] chars = jsString.toCharArray();
            for (int i = 0; i < chars.length; ++i) {
                // NOTE(review): the guarded statement is not present here; presumably it appends
                // the non-space character to sb — TODO confirm. Character.isSpace is deprecated
                // in the JDK in favour of Character.isWhitespace.
                if (!Character.isSpace(chars[i])) {
        // Decode HTML entities in the collected prefix.
        String temp = sb.toString();
        temp = StringEscapeUtils.unescapeHtml(temp);
        // Only when the decoded prefix contains a script scheme is the input rewritten, as
        // the decoded prefix followed by the rest of the input from the ':' onward.
        if (index != -1 && (temp.toLowerCase().contains("javascript") || temp.toLowerCase().contains("vbscript"))) {
            sanitizedStr = temp + result.substring(index);
        return sanitizedStr;

     * sanitize an attr value. For now, this means stripping out JavaScript entity tags &{...},
     * and <script> tags.
    private void sanitizeAttrValue(String eName, String aName, XMLAttributes attributes, int i) {
        String value = attributes.getValue(i);
        // Script-scheme blocking is applied only for attributes listed in ATTRIBUTES_CAN_ALLOW_SCRIPTS.
        boolean canAllowScript = ATTRIBUTES_CAN_ALLOW_SCRIPTS.contains(aName.toLowerCase());
        String result = sanitize(value, canAllowScript);

        // NOTE(review): for style attributes the sanitize() result above is discarded and
        // replaced by sanitizeStyleValue() of the original value — confirm this is intended.
        if (aName.equalsIgnoreCase("style")) {
            result = sanitizeStyleValue(value);

        // Write back only when sanitizing actually changed the value.
        if (!result.equals(value)) {
            attributes.setValue(i, result);

        // Optional block of form posts back to the host that served this request.
        if (aName.equalsIgnoreCase("action") && sameHostFormPostCheck == true && this.reqVirtualHost != null) {
            try {
                // NOTE(review): this parses the original value, not the sanitized result, and
                // the setValue below would overwrite the sanitized value written above.
                URL url = new URL(value);
                String formActionHost = url.getHost().toLowerCase();

                if (formActionHost.equalsIgnoreCase(reqVirtualHost)) {
                    // NOTE(review): String.replace is case-sensitive and formActionHost was
                    // lower-cased, so a host written with upper-case letters in value would
                    // match the check above but not be replaced here.
                    value = value.replace(formActionHost, "SAMEHOSTFORMPOST-BLOCKED");
                    attributes.setValue(i, value);
            } catch (MalformedURLException e) {
                // NOTE(review): the call that takes this message (its first line) is not present here.
      "Failure while trying to block mailicious code. Check for URL "
                        + " match between the host and the action URL of a FORM."
                        + "Error parsing URL, possible relative URL." + e.getMessage());
                // An unparseable (possibly relative) action is replaced entirely.
                attributes.setValue(i, "SAMEHOSTFORMPOST-BLOCKED");


     * @param string
     * @return
    String extractAndSanitizeAsciiData(String data) {
        // Called by characters() for style text that is not pure ASCII. Walks the input one
        // char at a time, separating chars <= ASCII_DATA_VALUE (127) from the rest.
        char c[] = data.toCharArray();
        StringBuilder sanitizedStrg = new StringBuilder();
        StringBuilder asciiData = new StringBuilder();
        for (int i = 0; i < c.length; ++i) {
            // NOTE(review): the body of this branch is not present here; presumably it appends
            // c[i] to asciiData — TODO confirm.
            if (c[i] <= ASCII_DATA_VALUE) {

            } else {
                // A non-ASCII char ends the current ASCII run: sanitize the run as CSS and
                // start a new one.
                // NOTE(review): the statements appending the sanitized run and the non-ASCII
                // char to sanitizedStrg are not present here.
                String temp = asciiData.toString();
                if (!StringUtil.isNullOrEmpty(temp)) {
                    temp = sanitizeStyleValue(temp);
                    asciiData = new StringBuilder();
        //Append the asciiData to the sanitizedStrg
        return sanitizedStrg.toString();
