Sanitize any potentially dangerous tags from the provided raw HTML input using a whitelist-based approach

 

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.ComponentModel;

namespace NearForums
{
  /// <summary>
  /// Utility helpers. Currently exposes whitelist-based HTML sanitization.
  /// </summary>
  public static class Utils
  {
    /// <summary>
    /// Default alternation of element names accepted by <see cref="SanitizeHtml"/>.
    /// </summary>
    private const string DefaultWhiteListTags = "b(lockquote)?|code|d(d|t|l|el)|em|h(1|2|3|4)|i|kbd|li|ol|p(re)?|s(ub|up|trong|trike)?|ul|a|img";

    // The patterns below never change between calls, so they are built once and shared.
    // Regex instances are safe to use concurrently for matching.

    /// <summary>Matches every tag-like run: from an opening angle bracket to the next closing one (or end of input).</summary>
    private static readonly Regex TagsRegex = new Regex("<[^>]*(>|$)",
      RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);

    /// <summary>
    /// Matches attributes inside a tag other than href, class, rel, title, width, height, alt and src,
    /// plus the class="MsoNormal" attribute that Word puts on paragraphs.
    /// </summary>
    private static readonly Regex CleanupRegex = new Regex("((?<=<\\w+[^>]*)(?!\\shref|\\sclass|\\srel|\\stitle|\\sclass|\\swidth|\\sheight|\\salt|\\ssrc)(\\s[\\w-]+)=[\"']?((?:.(?![\"']?\\s+(?:\\S+)=|[>\"']))+.)[\"']?)|((?<=<p[^>]*)\\sclass=\"MsoNormal\")",
      RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>Matches an anchor tag whose href is a fragment or an http/https/ftp URL, with optional class, title and rel="nofollow".</summary>
    private static readonly Regex WhitelistAnchorRegex = new Regex(@"
      ^<a\s
      href=""(\#\w+|(https?|ftp)://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+)""
      (
      (\sclass=""([\w-]+)"")|(\stitle=""[^""<>]+"")|
      (\srel=""nofollow""))*
      \s?>$|
      ^</a>$",
      RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace);

    /// <summary>Matches an img tag whose src is an http/https URL, with optional width, height, alt and title.</summary>
    private static readonly Regex WhitelistImageRegex = new Regex(@"
      ^<img\s
      src=""https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+""
      ((\swidth=""\d{1,3}"")|
      (\sheight=""\d{1,3}"")|
      (\salt=""[^""<>]*"")|
      (\stitle=""[^""<>]*""))*
      \s?/?>$",
      RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace);

    /// <summary>Pre-built whitelist for the default tag set, so the common call path builds no regex at all.</summary>
    private static readonly Regex DefaultWhitelistRegex = BuildWhitelistRegex(DefaultWhiteListTags);

    /// <summary>
    /// Builds the regex that accepts attribute-less opening/closing tags whose name matches
    /// <paramref name="whiteListTags"/>, plus br and hr (optionally self-closed).
    /// </summary>
    private static Regex BuildWhitelistRegex(string whiteListTags)
    {
      return new Regex("^</?(" + whiteListTags + ")>$|^<(b|h)r\\s?/?>$",
        RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace);
    }

    /// <summary>
    /// Sanitize any potentially dangerous tags from the provided raw HTML input using
    /// a whitelist based approach, leaving the "safe" HTML tags.
    /// CODESNIPPET:4100A61A-1711-4366-B0B0-144D1179A937 / http://refactormycode.com/codes/333-sanitize-html
    /// </summary>
    /// <param name="html">Html to sanitize</param>
    /// <param name="whiteListTags">
    /// Regex containing the allowed name of the html elements. For example: em|h(2|3|4)|strong|p.
    /// Anchor and image tags carrying attributes are only kept when this whitelist also accepts
    /// the bare "a" / "img" element respectively.
    /// </param>
    /// <returns>
    /// The input with every non-whitelisted tag removed (the text between tags is kept);
    /// null or empty input is returned unchanged.
    /// </returns>
    public static string SanitizeHtml(string html, string whiteListTags = DefaultWhiteListTags)
    {
      // Nothing to sanitize: bail out before doing any regex work.
      if (String.IsNullOrEmpty(html))
      {
        return html;
      }

      Regex whitelistRegex = whiteListTags == DefaultWhiteListTags
        ? DefaultWhitelistRegex
        : BuildWhitelistRegex(whiteListTags);

      // The attribute-bearing <a> and <img> forms must respect the caller's whitelist:
      // they are only honoured when the bare element is itself allowed.
      bool anchorsAllowed = whitelistRegex.IsMatch("<a>");
      bool imagesAllowed = whitelistRegex.IsMatch("<img>");

      // First pass: strip attributes that are not allowed (including markup coming from Word).
      html = CleanupRegex.Replace(html, "");

      // Match every HTML tag in the input. Walk the matches from last to first so that
      // removing a tag never shifts the indexes of the matches still to be processed.
      MatchCollection tags = TagsRegex.Matches(html);
      for (int i = tags.Count - 1; i > -1; i--)
      {
        Match tag = tags[i];
        string tagname = tag.Value.ToLowerInvariant();

        bool allowed = whitelistRegex.IsMatch(tagname)
          || (anchorsAllowed && WhitelistAnchorRegex.IsMatch(tagname))
          || (imagesAllowed && WhitelistImageRegex.IsMatch(tagname));

        if (!allowed)
        {
          html = html.Remove(tag.Index, tag.Length);
          System.Diagnostics.Debug.WriteLine("tag sanitized: " + tagname);
        }
      }

      return html;
    }

  }
}

Related examples in the same category

1.	Get Links From HTML
2.	Parses the value information from any INPUT tag in an HTML string where the name="" attribute matched the tagID parameter
3.	Html Utilities
4.	Convert HTML To Text
5.	Converts a FontUnit to a size for the HTML FONT tag
6.	Strip HTML
7.	Remove tags from a html string
8.	Get Type As Html
9.	HTML-encodes a string and returns the encoded string.
10.	Strips all HTML tags from the specified string.
11.	Removes the HTML whitespace.
12.	Array To Html Breaked String
13.	Show Html Page in String with Process

Sanitize any potentially dangerous tags from the provided raw HTML input using a whitelist based approach : HTML « Network « C# / C Sharp

Related examples in the same category