Sanitize any potentially dangerous tags from the provided raw HTML input using a whitelist based approach : HTML « Network « C# / C Sharp






Sanitize any potentially dangerous tags from the provided raw HTML input using a whitelist based approach

 

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.ComponentModel;

namespace NearForums
{
  /// <summary>
  /// Utility helpers. Currently exposes whitelist-based HTML sanitizing.
  /// </summary>
  public static class Utils
  {
    // Default whitelist of element names (a regex alternation). Kept as a constant so it can
    // serve both as the optional-parameter default and as the key for the cached default regex.
    private const string DefaultWhiteListTags = "b(lockquote)?|code|d(d|t|l|el)|em|h(1|2|3|4)|i|kbd|li|ol|p(re)?|s(ub|up|trong|trike)?|ul|a|img";

    // The regexes below do not depend on the input, so they are built once and shared.
    // Constructing a Regex (especially with RegexOptions.Compiled) on every call is expensive.

    // Matches every tag-like run: "<" up to the next ">" or the end of the input.
    private static readonly Regex s_tagsRegex = new Regex("<[^>]*(>|$)",
      RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);

    // Matches attributes inside a tag whose name is NOT one of href/class/rel/title/width/height/alt/src,
    // plus the class="MsoNormal" marker on <p> elements.
    private static readonly Regex s_cleanupRegex = new Regex("((?<=<\\w+[^>]*)(?!\\shref|\\sclass|\\srel|\\stitle|\\sclass|\\swidth|\\sheight|\\salt|\\ssrc)(\\s[\\w-]+)=[\"']?((?:.(?![\"']?\\s+(?:\\S+)=|[>\"']))+.)[\"']?)|((?<=<p[^>]*)\\sclass=\"MsoNormal\")",
      RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Accepts <a href="..."> with a #fragment or http/https/ftp URL and optional class/title/rel="nofollow",
    // as well as the closing </a>.
    private static readonly Regex s_whitelistAnchorRegex = new Regex(@"
      ^<a\s
      href=""(\#\w+|(https?|ftp)://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+)""
      (
      (\sclass=""([\w-]+)"")|(\stitle=""[^""<>]+"")|
      (\srel=""nofollow""))*
      \s?>$|
      ^</a>$",
      RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace);

    // Accepts <img src="http(s)://..."> with optional width/height/alt/title attributes.
    private static readonly Regex s_whitelistImageRegex = new Regex(@"
      ^<img\s
      src=""https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+""
      ((\swidth=""\d{1,3}"")|
      (\sheight=""\d{1,3}"")|
      (\salt=""[^""<>]*"")|
      (\stitle=""[^""<>]*""))*
      \s?/?>$",
      RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace);

    // Pre-built whitelist regex for the default tag list (the common case).
    private static readonly Regex s_defaultWhitelistRegex = BuildWhitelistRegex(DefaultWhiteListTags);

    /// <summary>
    /// Builds the regex that accepts attribute-less opening/closing tags whose name matches
    /// <paramref name="whiteListTags"/>, plus &lt;br&gt; and &lt;hr&gt; in their optional self-closing forms.
    /// </summary>
    private static Regex BuildWhitelistRegex(string whiteListTags)
    {
      return new Regex("^</?(" + whiteListTags + ")>$|^<(b|h)r\\s?/?>$",
        RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace);
    }

    /// <summary>
    /// Sanitizes any potentially dangerous tags from the provided raw HTML input using
    /// a whitelist based approach, leaving the "safe" HTML tags.
    /// CODESNIPPET:4100A61A-1711-4366-B0B0-144D1179A937 / http://refactormycode.com/codes/333-sanitize-html
    /// </summary>
    /// <param name="html">Html to sanitize</param>
    /// <param name="whiteListTags">Regex containing the allowed name of the html elements. For example: em|h(2|3|4)|strong|p</param>
    /// <returns>
    /// The input with every tag that is not whitelisted removed (the text between tags is kept).
    /// A null or empty input is returned unchanged.
    /// </returns>
    /// <exception cref="ArgumentException">
    /// <paramref name="whiteListTags"/> does not form a valid regular expression.
    /// </exception>
    public static string SanitizeHtml(string html, string whiteListTags = DefaultWhiteListTags)
    {
      // Nothing to sanitize: bail out before doing any regex work.
      if (String.IsNullOrEmpty(html))
      {
        return html;
      }

      // Reuse the cached regex for the default list; only custom lists need a fresh one.
      Regex whitelistRegex = String.Equals(whiteListTags, DefaultWhiteListTags, StringComparison.Ordinal)
        ? s_defaultWhitelistRegex
        : BuildWhitelistRegex(whiteListTags);

      // Preliminary cleanup: strip attributes that are not allowed (e.g. markup pasted from Word),
      // so that otherwise acceptable tags can still pass the strict whitelist patterns below.
      html = s_cleanupRegex.Replace(html, "");

      // Visit every tag once and drop those that match none of the whitelists.
      // A single Replace pass avoids allocating a new string for each removed tag.
      return s_tagsRegex.Replace(html, tag =>
      {
        // Whitelist patterns are written in lower case; the tag itself is emitted untouched.
        string tagname = tag.Value.ToLowerInvariant();

        if (whitelistRegex.IsMatch(tagname) || s_whitelistAnchorRegex.IsMatch(tagname) || s_whitelistImageRegex.IsMatch(tagname))
        {
          return tag.Value;
        }

        System.Diagnostics.Debug.WriteLine("tag sanitized: " + tagname);
        return String.Empty;
      });
    }

  }
}

   
  








Related examples in the same category

1.Get Links From HTML
2.Parses the value information from any INPUT tag in an HTML string where the name="" attribute matched the tagID parameter
3.Html Utilities
4.Convert HTML To Text
5.Converts a FontUnit to a size for the HTML FONT tag
6.Strip HTML
7.Remove tags from a html string
8.Get Type As Html
9.HTML-encodes a string and returns the encoded string.
10.Strips all HTML tags from the specified string.
11.Removes the HTML whitespace.
12.Array To Html Breaked String
13.Show Html Page in String with Process