private static Regex _tags = new Regex("<[^>]*(>|$)",
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);
private static Regex _whitelist = new Regex(@"
^</?(b(lockquote)?|code|d(d|t|l|el)|em|h(1|2|3)|i|kbd|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)>$|
^<(b|h)r\s?/?>$",
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
private static Regex _whitelist_a = new Regex(@"
^<a\s
href=""(\#\d+|(https?|ftp)://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+)""
(\stitle=""[^""<>]+"")?\s?>$|
^</a>$",
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
private static Regex _whitelist_img = new Regex(@"
^<img\s
src=""https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+""
(\swidth=""\d{1,3}"")?
(\sheight=""\d{1,3}"")?
(\salt=""[^""<>]*"")?
(\stitle=""[^""<>]*"")?
\s?/?>$",
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
/// <summary>
/// sanitize any potentially dangerous tags from the provided raw HTML input using
/// a whitelist based approach, leaving the "safe" HTML tags
/// CODESNIPPET:4100A61A-1711-4366-B0B0-144D1179A937
/// </summary>
public static string Sanitize(string html)
{
if (String.IsNullOrEmpty(html)) return html;
string tagname;
Match tag;
// match every HTML tag in the input
MatchCollection tags = _tags.Matches(html);
for (int i = tags.Count - 1; i > -1; i--)
{
tag = tags[i];
tagname = tag.Value.ToLowerInvariant();
if(!(_whitelist.IsMatch(tagname) || _whitelist_a.IsMatch(tagname) || _whitelist_img.IsMatch(tagname)))
{
html = html.Remove(tag.Index, tag.Length);
System.Diagnostics.Debug.WriteLine("tag sanitized: " + tagname);
}
}
return html;
}
Refactorings
No refactoring yet !
Rob Allen
June 20, 2008, June 20, 2008 13:23, permalink
Here is a blacklist to test your A and IMG tags against which includes HTML comments and all of the JavaScript event handlers. At first blush, I would think splitting the m string on the "=" when a blacklist item is detected and stepping through to scrub might be the way to go. I'll fill in more when I get some time.
var blacklist = //JavaScrip events which are valid on IMG and A tags
@"<!--|-->|onabort|onblur|onchange|onclick|ondblclick|onerror|onfocus|onkeydown|onkeypress|onkeyup|onload|
onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onresize|onselect|onsubmit|onunload";
if (!Regex.IsMatch(m.ToString(), blacklist, RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace))
{
String[] mSplit = m.split("=");
foreach (string str in mSplit)
{
}
}
Nick Berardi
June 20, 2008, June 20, 2008 14:59, permalink
Here is my white list approach. It is to basically strip out all the HTML tags and rebuild the HTML that is put in to the database with my whitelist of tags and attributes. Let me know what you think. The key of the dictionary is the whitelisted tag. The list in the value of the dictionary are allowed attributes.
private static readonly Regex HtmlTagExpression = new Regex(@"(?'tag_start'</?)(?'tag'\w+)((\s+(?'attr'(?'attr_name'\w+)(\s*=\s*(?:"".*?""|'.*?'|[^'"">\s]+)))?)+\s*|\s*)(?'tag_end'/?>)", RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex WhiteSpaceBetweenHtmlTagsExpression = new Regex(@">(/w+)<", RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex HtmlLineBreakExpression = new Regex(@"<br(/s+)/>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Dictionary<string, List<string>> ValidHtmlTags = new Dictionary<string, List<string>> {
{ "p", new List<string>() },
{ "br", new List<string>() },
{ "strong", new List<string>() },
{ "b", new List<string>() },
{ "em", new List<string>() },
{ "i", new List<string>() },
{ "u", new List<string>() },
{ "strike", new List<string>() },
{ "ol", new List<string>() },
{ "ul", new List<string>() },
{ "li", new List<string>() },
{ "a", new List<string> { "href" } },
{ "img", new List<string> { "src", "height", "width", "alt" } },
{ "q", new List<string> { "cite" } },
{ "cite", new List<string>() },
{ "abbr", new List<string>() },
{ "acronym", new List<string>() },
{ "del", new List<string>() },
{ "ins", new List<string>() }
};
/// <summary>
/// Toes the safe HTML.
/// </summary>
/// <param name="text">The text.</param>
/// <returns></returns>
public static string ToSafeHtml(this string text)
{
return text
.RemoveInvalidHtmlTags();
}
/// <summary>
/// Removes the invalid HTML tags.
/// </summary>
/// <param name="text">The text.</param>
/// <returns></returns>
public static string RemoveInvalidHtmlTags(this string text)
{
return HtmlTagExpression.Replace(text, new MatchEvaluator((Match m) => {
if (!ValidHtmlTags.ContainsKey(m.Groups["tag"].Value))
return String.Empty;
string generatedTag = String.Empty;
System.Text.RegularExpressions.Group tagStart = m.Groups["tag_start"];
System.Text.RegularExpressions.Group tagEnd = m.Groups["tag_end"];
System.Text.RegularExpressions.Group tag = m.Groups["tag"];
System.Text.RegularExpressions.Group tagAttributes = m.Groups["attr"];
generatedTag += (tagStart.Success ? tagStart.Value : "<");
generatedTag += tag.Value;
foreach (Capture attr in tagAttributes.Captures)
{
int indexOfEquals = attr.Value.IndexOf('=');
// don't proceed any futurer if there is no equal sign or just an equal sign
if (indexOfEquals < 1)
continue;
string attrName = attr.Value.Substring(0, indexOfEquals);
// check to see if the attribute name is allowed and write attribute if it is
if (ValidHtmlTags[tag.Value].Contains(attrName))
generatedTag += " " + attr.Value;
}
// add nofollow to all hyperlinks
if (tagStart.Success && tagStart.Value == "<" && tag.Value.Equals("a", StringComparison.OrdinalIgnoreCase))
generatedTag += " rel=\"nofollow\"";
generatedTag += (tagEnd.Success ? tagEnd.Value : ">");
return generatedTag;
}));
}
Nick
June 20, 2008, June 20, 2008 15:16, permalink
I just updated it to use StringBuilder. What I like about this method, that I am using, is that I can add special processing in the code, such as adding a rel="nofollow" when an "a" tag is found. I am using this with out much trouble on http://www.ideapipe.com. It is very fast, because for the most part my WYSIWYG editor is limiting most of the HTML anyways. Personally I like the whitelist everything even the attributes approach, because browsers allow you to add onclick, onfocus, onwhatever to every tag, and they will all execute JavaScript.
return HtmlTagExpression.Replace(text, new MatchEvaluator((Match m) => {
if (!ValidHtmlTags.ContainsKey(m.Groups["tag"].Value))
return String.Empty;
StringBuilder generatedTag = new StringBuilder(m.Length);
System.Text.RegularExpressions.Group tagStart = m.Groups["tag_start"];
System.Text.RegularExpressions.Group tagEnd = m.Groups["tag_end"];
System.Text.RegularExpressions.Group tag = m.Groups["tag"];
System.Text.RegularExpressions.Group tagAttributes = m.Groups["attr"];
generatedTag.Append(tagStart.Success ? tagStart.Value : "<");
generatedTag.Append(tag.Value);
foreach (Capture attr in tagAttributes.Captures)
{
int indexOfEquals = attr.Value.IndexOf('=');
// don't proceed any futurer if there is no equal sign or just an equal sign
if (indexOfEquals < 1)
continue;
string attrName = attr.Value.Substring(0, indexOfEquals);
// check to see if the attribute name is allowed and write attribute if it is
if (ValidHtmlTags[tag.Value].Contains(attrName))
{
generatedTag.Append(' ');
generatedTag.Append(attr.Value);
}
}
// add nofollow to all hyperlinks
if (tagStart.Success && tagStart.Value == "<" && tag.Value.Equals("a", StringComparison.OrdinalIgnoreCase))
generatedTag.Append(" rel=\"nofollow\"");
generatedTag.Append(tagEnd.Success ? tagEnd.Value : ">");
return generatedTag.ToString();
}));
Rob Allen
June 20, 2008, June 20, 2008 17:50, permalink
@Jeff, there was a post from Owen on the Stack Overflow blog that linked to this class: http://code.google.com/p/subsonicforums/source/browse/trunk/SubSonic.Forums.Data/HtmlScrubber.cs?r=61 which takes this challenge from the other side of the coin. Instead of stripping out what isn't allowable, it finds the allowable stuff and dumps the rest. For me, that sounds like a more effective way to approach this given the immense number of possible attacks and the relatively small number of safe attributes.
Chris Jester-Young
June 20, 2008, June 20, 2008 22:50, permalink
Reading the ha.ckers page (as linked to from the Stack Overflow blog), I must say, I'm almost ready to give up. :-P The original code (from mental evaluation; I haven't run the code) seems to allow this: <script src="javascript:alert('XSS')" (any src that contains the XSS is fine, as long as the right-angle-bracket is missing)
Also, WRT Nick's solution (which constrains usable attributes), the following exploits are possible: <a href="javascript:alert('XSS')"><img src="javascript:alert('XSS')" /></a> (I don't know if a URI-scheme filter is enough to filter out all URI-based attacks---I'll read further in that page to see....)
OJ
June 21, 2008, June 21, 2008 00:13, permalink
I agree with Rob's last comment. One of the assumptions that's been made so far is that the input contains <em>valid</em> HTML markup. There's nothing stopping people from abusing poor/inavlid markup which may just make it past your validation and end up in the browser -- and depending on the browser you're using you may see a poor attempt at rendering that malformed HTML.
PS. This site is great! I'll be using it to code share/refactor in the future. Cheers!
Chris Jester-Young
June 21, 2008, June 21, 2008 02:57, permalink
The no-closing-angle form of the tag still needs to be caught---search "no closing script tags" in the ha.ckers page, and see the next few entries below it, up to "double open angle brackets". This is especially important if people drop a <script (or <img, or whatever) tag at the end of their text input (and therefore all further tags are outside the scrutiny of the sanitiser).
For example, say your page has this sort of template:
<div class="comment">...</div>
If there is a <script src="..." (or <img src="javascript:..." for IE) at the end of the user input, you'd end up with:
<div class="comment">...<script src="..." </div>
This would trigger "half open HTML" on IE (if the ending tag is an unclosed <img tag), or "double open angle brackets" on Firefox (if the ending tag is an unclosed <script tag). Post two comments, one with each exploit, and you can win both cases. :-P
Chris Jester-Young
June 21, 2008, June 21, 2008 03:15, permalink
Another vulnerability, owing to the fact that System.String.Replace() replaces all instances, not just the first: <foo><scr<foo>ipt src="..." />. Or, perhaps: <foo><scr<foo>ipt>...</scr<foo>ipt>.
One way to solve this would be, rather than strip out bad tags, reject the whole expression entirely when a bad tag is seen, either by throwing exceptions, or (if you're from the Joel Spolsky school of error handling) returning null. :-)
Jeff Atwood
June 21, 2008, June 21, 2008 04:48, permalink
Hi Chris -- those are both *excellent* points, and I verified both. Working on it.. the second one is fairly easy (just replace the one instance) but the unclosed tag thing is rough. Suggestions?
Chris Jester-Young
June 21, 2008, June 21, 2008 05:14, permalink
My initial suggestion would be to use both > and end-of-line as matching end-of-tag, as shown in the Code box. I haven't investigated fully as to whether this actually solves the problem in all cases (perhaps when I'm less tired I can have another go...), but the sample tests I did seemed to work.
var tags = Regex.Matches(html, "<[^>]*?(>|$)", RegexOptions.Singleline);
Stefan Ciobaca
June 21, 2008, June 21, 2008 07:01, permalink
[quote]
Hi Chris -- those are both *excellent* points, and I verified both. Working on it.. the second one is fairly easy (just replace the one instance) but the unclosed tag thing is rough. Suggestions?
[/quote]
No, it's not *that* easy. By replacing the first match you introduce another class of (more subtle) vulnerabilities (it's your homework for this weekend to think what it is; hint: it's almost like the bug of replacing everything, but a bit more difficult). The game: "here's my fix; can you find a new bug?" goes on forever. It's ok to want to learn about XSS; the proof that you learned is that you will not allow the use of HTML in the comments.
Garry Shutler
June 21, 2008, June 21, 2008 10:09, permalink
Just a thought. In the example given by someone the comment goes in <div class="comment">...</div>
You could do with a check that makes sure they don't start their comment with </div> and break out of your styling. This would work as part of a deeper check that all tags are in pairs or end />
Nick
June 21, 2008, June 21, 2008 13:59, permalink
Jeff I think we are forgetting about something. You have all of these tags which are valid, but what if somebody copies and pastes from the web? Where <h /> tags and <p /> tags and probably everything else contain a class, style, or etc attribute. You need to strip out the attributes that you don't care about, while keeping the general tag. I provided a solution to this a couple items back.
Jeff Atwood
June 21, 2008, June 21, 2008 16:35, permalink
> but what if somebody copies and pastes from the web
Nick, I don't think that's the use case here.
1) If you are pasting a code sample it'll be inside a <pre><code> block (in the WMD editor, highlight the code snippet and use CTRL+K or press the "code" toolbar button) and thus fully escaped, so it'll appear as-is.
2) If you are composing question or answer text, you'd probably paste from the textual content of the web page, not the view source.
> By replacing the first match you introduce another class of (more subtle) vulnerabilities
You're implying that a later bad tag would somehow appear earlier in the string as a substring, but I cannot for the life of me come up with an example of that working. If a tag is contained within another tag, it'll get stripped by the new tag match that uses either > or $ (end of text) as the end match.
Can you provide an example of this that works, given the new tag match regex? I can't come up with one.
Stefan Ciobaca
June 21, 2008, June 21, 2008 22:16, permalink
>You're implying that a later bad tag would somehow appear earlier in the string as
>a substring, but I cannot for the life of me come up with an example of that working.
I imply quite the opposite, really.
<foo<bar><scr<bar>ipt/>
I'm just guessing; since you write C#, I can't actually run your code. I'm supposing something like this will happen:
remove first <foo<bar> ==> <scr<bar>ipt/>
remove first <bar> ==> <script/>
remove first <scr<bar> ==> <script/>
remove first <bar> => <script/>
I'm not sure Matches works like this, because the MSDN documentation on it is at best childish.
Again, a wrong solution to this is, for example, to forbid < in the regexp that searches for tags.
Also, RegexOptions.SingleLine is a terrible name for what is does.
Jon Galloway
June 21, 2008, June 21, 2008 23:25, permalink
I've played with refactoring some code posted on the Html Agility Pack forums (http://www.codeplex.com/htmlagilitypack/Thread/View.aspx?ThreadId=24346) to use a whitelist approach instead of a blacklist. Seems to work well, and it handles tag nesting, too.
public string ConvertHtml(string html)
{
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(html);
//Modified - using a whitelist of allowed elements
string[] allowedElements = "a|p|img|pre".Split('|');
HtmlNodeCollection nc = doc.DocumentNode.ChildNodes;
//This should work: nc=doc.DocumentNode.SelectNodes("//*[not(a|img|p)]");
if (nc != null)
{
foreach (HtmlNode node in nc)
{
if(!allowedElements.Contains(node.Name))
node.ParentNode.RemoveChild(node, false);
}
}
//remove hrefs to java/j/vbscript URLs
nc = doc.DocumentNode.SelectNodes("//a[starts-with(@href, 'javascript')]|//a[starts-with(@href, 'jscript')]|//a[starts-with(@href, 'vbscript')]");
if (nc != null)
{
foreach (HtmlNode node in nc)
{
node.SetAttributeValue("href", "protected");
}
}
//remove img with refs to java/j/vbscript URLs
nc = doc.DocumentNode.SelectNodes("//img[starts-with(@src, 'javascript')]|//img[starts-with(@src, 'jscript')]|//img[starts-with(@src, 'vbscript')]");
if (nc != null)
{
foreach (HtmlNode node in nc)
{
node.SetAttributeValue("src", "protected");
}
}
//remove on<Event> handlers from all tags
nc = doc.DocumentNode.SelectNodes("//*[@onclick or @onmouseover or @onfocus or @onblur or @onmouseout or @ondoubleclick or @onload or @onunload]");
if (nc != null)
{
foreach (HtmlNode node in nc)
{
node.Attributes.Remove("onFocus");
node.Attributes.Remove("onBlur");
node.Attributes.Remove("onClick");
node.Attributes.Remove("onMouseOver");
node.Attributes.Remove("onMouseOut");
node.Attributes.Remove("onDoubleClick");
node.Attributes.Remove("onLoad");
node.Attributes.Remove("onUnload");
}
}
return doc.DocumentNode.WriteTo();
}
Jeff Atwood
June 22, 2008, June 22, 2008 02:29, permalink
> I'm just guessing; since you write C#, I can't actually run your code. I'm supposing something like this will happen:
Well, you don't need C#, you could use a regex testing tool of your choice to see what actually happens. Remember, our intial tag match is "<[^>]*?(>|$)"
"<foo<bar><scr<bar>ipt/>"
remove first <foo<bar> ==> "<scr<bar>ipt/>"
remove first <scr<bar> ==> "ipt/>"
resulting string is "ipt/>".
(and anyway, "first match" is moot; I added 2 lines of code so it actually replaces the correct location for each match now, to prevent any possibility of unwanted side-effect IndexOf() matches.)
Stefan Ciobaca
June 22, 2008, June 22, 2008 07:16, permalink
> Well, you don't need C#, you could use a regex testing tool of your choice to see what
> actually happens. Remember, our intial tag match is "<[^>]*?(>|$)"
The problem here is the Matches functions (which it seems doesn't find overlapping regexps), not the regexp parser ;-)
The code looks ok to me now, but here is an example that does not work as you'd expect:
"<a><" => "a><"
But it does not seem easily exploitable anymore.
What is the ? in the middle of the tag regexp doing there?
Very good idea to ask for community help on this. Very good use of refactormycode.
Chris Jester-Young
June 22, 2008, June 22, 2008 07:53, permalink
@Stefan:
Yes, that does work as expected (I just tested it): "<a><" does get completely stripped out. There would be two matches under this scheme: "<a>", and "<", and neither of which match the allowed whitelist. I dare say it works _because_ it doesn't find overlapping regexes; if it does find overlapping regexes, then the offset-counting mechanism in the 3-arg version of Strip() would be broken.
The ? in the regex does a minimal match (http://msdn.microsoft.com/en-us/library/3206d374.aspx), i.e., it would try to match as few [^>]'s as possible, rather than at most. However, in this case, it's redundant, due to the need to match either a > or end of string right afterwards. Thus "<[^>]*(>|$)" would behave identically to "<[^>]*?(>|$)".
I agree that it's a very good use of RmC...I look forward to seeing more submissions here, especially from the Stack Overflow/Coding Horror readership. :-)
Stefan Ciobaca
June 22, 2008, June 22, 2008 08:49, permalink
Sorry, I meant "<p><". (actually I meant "<a><", because I had the impression "<a>" was allowed)
The result of cleaning "<p><" should be "p><".
Stefan Ciobaca
June 22, 2008, June 22, 2008 09:00, permalink
Here's a good example that passes through from http://ha.ckers.org/xss.html.
<IMG SRC="http://www.thesiteyouareon.com/somecommand.php?somevariables=maliciouscode">
Chris Jester-Young
June 22, 2008, June 22, 2008 09:39, permalink
The result of cleaning "<p><" is "<p>". :-)
The attack that your last comment mentions (about valid HTML, with a valid link, pointing to page that causes the server to take a possibly-inappropriate action) is not something for the sanitiser to fix up. Instead, web applications need to take care in validating any requests sent to them. However, you brought up a good point, and it should be discussed.
LiveJournal was stung by something similar a few years ago, and the steps they took to fix it up were very instructional.
1. Any requests that _change state_ on a user's behalf need to check that requests are POST requests, not GET requests.
2. Even POST requests can be "spoofed", as LiveJournal found out. So what LiveJournal changed was that all their POST forms have a timestamp as well as a secure hash of the timestamp (based on a secret key) as hidden fields on the form. If the timestamp is over an hour old, say, or the hash does not verify correctly, the request is rejected.
They have other security measures too, which I no longer recall very clearly. Other LiveJournal users are welcome to shed more light on this. :-)
WRT implementing point 2, .NET has some HMAC classes (up to SHA512 now: http://msdn.microsoft.com/en-us/library/system.security.cryptography.hmacsha512.aspx) which can be very useful for this. (If I remember correctly, to mitigate hash-spoofing attacks even further, LiveJournal also regenerates their keys every few hours, and only the most recent two or so are used for validation. Again, I haven't studied their code for a while, so don't quote me on this. :-P)
Nick
June 22, 2008, June 22, 2008 12:10, permalink
Hi Jeff, point well taken from your last comment. I had forgotten that you were using WMD, I am use to working with the WYSIWYG editors on the web. While looking over your code I did actually have a bug in some of your assumptions. You assume that anchors are going to be written like <a href="..." title="..." />, but what if somebody does <a title="..." href="..." /> or <a href="title" rel="friend" />? It doesn't look like your regex can handle that. Same with the <img /> tag. Except this one has 25 different ways it can be written with just the attributes you allow, and hundreds with all the valid attributes.
I know I have said this before, it was probably missed with all the flurry of comments, but you might want to consider also whitelisting the attributes, so they can appear in any order.
Rob Allen
June 22, 2008, June 22, 2008 18:44, permalink
@Nick
We can handle the order issue by splitting the string then looping the resulting array through as switch - this is written right in the code editor below so take is as an example and assume it doesn't work as typed:
String[] mArray = m.split(" ");
String output = new String();
output = "";
foreach (String str in mArray)
{
String item = str.ToLower().substring(0,4);
switch (item){
case ("src="):
output += item;
break;
case ("href"):
output += item;
break;
case ("titl"):
output += item;
break;
case ("rel="):
output += item;
break;
}
}
Jonathan Holland
June 22, 2008, June 22, 2008 20:31, permalink
I used a fairly simple method on my own internal blog engine. It used a blacklist to remove tags that I did not want, its not as robust as your solution but it worked well for me:
/// <summary>
/// Escapes HTML entities that are vulnerable to attack
/// </summary>
/// <param name="commentText">Comment Text</param>
/// <returns>Escaped Comment Text</returns>
public static string EscapeMarkup(string commentText)
{
string returnText = string.Empty;
// Remove HTML Comments
commentText = commentText.Replace("<!--", "");
// strip out comments and doctype
Regex docType = new Regex("<!DOCTYPE[.]*>");
returnText = docType.Replace(commentText, "");
commentText = Regex.Replace(commentText, "(.*)", @"$5");
// strip out most known tags except:
// (a|b|br|blockquote|em|h1|h2|h3|h4|h5|h6|hr|i|li|ol|p|u|ul|strong|sub|sup)
Regex badTags = new Regex("< [/]{0,1}(abbr|acronym|address|applet|area|base|basefont|bdo|big|body|button|caption|center|cite|code|col|colgroup|dd|del|dir|div|dfn|dl|dt|embed|fieldset|font|form|frame|frameset|head|html|iframe|img|input|ins|isindex|kbd|label|legend|link|map|menu|meta|noframes|noscript|object|optgroup|option|param|pre|q|s|samp|script|select|small|span|strike|style|table|tbody|td|textarea|tfoot|th|thead|title|tr|tt|var|xmp){1}[.]*>",RegexOptions.IgnoreCase);
returnText = badTags.Replace(returnText, "");
// Convert line breaks into <BR/> tags
returnText = returnText.Replace(Environment.NewLine,"<br/>");
return returnText;
}
Chris Jester-Young
June 22, 2008, June 22, 2008 23:14, permalink
@Nick: The way Jeff has it is actually some sort of secure. To actually allow all sorts of reordering would either require a more complicated regex (e.g., <img( src="https?://[^"]+"| alt="[^"]*"| title="[^"]*")* ?/?> (I left out width and height on purpose, because I don't think people should be able to say width="5000"), which then has a risk that people might not put in a src attribute (harmless but annoying)), or use some sort of attribute-parsing system.
I don't like the latter idea at all: the code Jeff had has one very important virtue: it's simple, and thus easier to vet than most of the others I've seen here. I would be unhappy to see the code depart from that simplicity.
Remember that the regex enforces the use of double-quotes as attribute delimiter (not single quotes, not backquotes, and definitely not no quotes), with no spaces around the equal sign, and lower-case attribute names only. People who use an attribute-parsing system might be tempted to relax on any of those. Lenient HTML == vulnerable HTML.
@Rob: Your system might work but the matching must be done using a regex, not just the first 4 chars. In particular the src must still require a http:/https: scheme (and definitely not let through javascript: URIs).
Chris Jester-Young
June 22, 2008, June 22, 2008 23:30, permalink
@Jonathan: New elements come out as standards evolve, but more importantly, each browser has its own set of proprietary tags. Keeping up with the list of blacklisted elements is a Sisyphean task.
Also, I just tested your program, it doesn't seem to strip any of my test cases at all. What test cases do you use for your function? I'd be happy to send you some of mine.
Nick
June 23, 2008, June 23, 2008 00:35, permalink
@Chris: it is actually simpler than you make it out to be, you don't actually create an expression for each tag you want to check, you just create one expression that will strip out everything you need. And then you reconstruct the tag using a white list of attributes. I have posted the regex below. I don't want to spam the board with my code but you can find the original here http://refactormycode.com/codes/333-sanitize-html#refactor_11281 that will do all this for you. Very simple and easy to implement code.
Also @Chris you mentioned some very trivial things such as case and quote use, both can easily be corrected when you can pull out the attributes and values from tags. I am reconstructing the tag with out the blacklisted attributes so I can just as easily call ToLower on the attribute and wrap it with quotes.
Remember you are not trying to make the system a burden the users, if they enter in a link tag with some attributes that don't fall in white list you shouldn't just ignore the whole link, because 99 times out of 100 they didn't really care about the extra attribute but they most definitely cared about the link. Same can be argued for <blockquote /> where somebody might use the cite attribute.
Remember the purpose of this is to sanitize the HTML, not provide an iron fist. An Iron fist will just confuse the user as to why their link was not entered in to the system.
@"(?'tag_start'</?)(?'tag'\w+)((\s+(?'attr'(?'attr_name'\w+)(\s*=\s*(?:"".*?""|'.*?'|[^'"">\s]+)))?)+\s*|\s*)(?'tag_end'/?>)"
Rob Allen
June 23, 2008, June 23, 2008 12:40, permalink
@Chris - thanks for the feedback. My hacker hat was in the wash when I wrote that and I completely forgot that IMG Src attributes can come in many forms - not the least of which are scripts with mime headers which I have used in the past to serve images from a database (i.e., <img src="image.php?pid=12"/>)
<img src="image.php?pid=12"/>
mmp1
June 23, 2008, June 23, 2008 12:46, permalink
All the regex stuff looks really small and compact code wise, but you are basically trying to write a simple HTML parser in regex. Also, the code might look small, but it might not be the fastest code. ideas to consider->since you are .NET (I assume you are using an MS .net and not say MONO on linux) why not use say lightweight MSHTML, get a DOM model and then iter. over the DOM. (or another HTML parser with a DOM type model). Then pull out the tags and attribs you want using the whitelist. Once you have it in the dom its easy to do other things (and run transforms etc). Also, only have to parse the file once (not mulitple times). It might seem a little bit of work, but it might be the most bullet proof.
(see here for lightweight mshtml example. http://www.codeguru.com/cpp/i-n/ieprogram/article.php/c4385/#more ). Note sure how this would handl high loads (ie. any memory leaks etc).
Or maybe this is more of a c-programmers solution ;-) (write a parser, use a data structure....etc).
Rob Allen
June 23, 2008, June 23, 2008 14:42, permalink
@mmp
Since this script is for guarding against XSS attacks in forum/blog/wiki posts, accessing the DOM is too late in the process. This script would be run before the content of the request is posted to the DB and the dom doesn't have the new data for it yet.
Stefan Ciobaca
June 23, 2008, June 23, 2008 17:38, permalink
> Why not iterate your match collection in reverse to avoid the index/offset code?
Yes, a lot cleaner.
> The result of cleaning "<p><" is "<p>". :-)
Not in the version where you remove the first occurence of the bad tag.
> The attack that your last comment mentions (about valid HTML, with a valid link, pointing > to page that causes the server to take a possibly-inappropriate action) is not something > for the sanitiser to fix up.
Why not? I myself would forbid image links, because it is a weak form of programming (the server automatically reacts somehow to the image request). I don't think it correct to argue that the server needs to check if the request it is serving is malicious in this sense.
Chris Jester-Young
June 25, 2008, June 25, 2008 00:41, permalink
@Stefan: The result of cleaning is <p> for the version of the code Jeff last posted. It matches the first-pass regex twice: <p>, and <. The second-pass regex accepts the <p>, but rejects the <.
Also, there are cases where linking to images is actually quite appropriate, such as showing screenshots. Also, because not every site bans image links, you would have to guard against people putting image links on those sites that point to your webapp anyway. So, your webapp must have the kinds of defences mentioned in my earlier post, no matter what.
i.e., the server can only "automatically react" for POST requests, and only if the timestamp/hash thing checks out. (Referring to that last post, LiveJournal has even more defences than that...I will have to post a summary of it sometime, once I look at the code further.)
Chris Jester-Young
June 25, 2008, June 25, 2008 00:50, permalink
@Rob, @mmp1: (Re use of a DOM)
Technically, using a DOM is possible. Just not the HTML DOM. You write out a restricted DTD, containing only the elements and attributes you want (start with the XHTML DTD, and strip it down) and ask the XML processing system to validate against that. The restricted DTD can use a root tag of, say, rhtml ("restricted" HTML), and you wrap <rhtml>...</rhtml> around the string to validate. Then, strip off inappropriate URI schemes (i.e., anything but http/https for images, or http/https/ftp for links) via the DOM, write the tree back out to a string, strip off the rhtml tags, and you're done.
But, that is a lot of work. Also, it rejects any invalid input, rather than just stripping it off or escaping it, which is what Jeff was going for.
Chris Jester-Young
June 25, 2008, June 25, 2008 01:22, permalink
@Mike: That sounds like a good idea. However, not being a .NET person myself, I can't help but think: surely there is a way to make a reverse iterator from a collection, and be able to use the "for (Type x in foo)"-type loop. e.g., rbegin/rend in the C++ STL, or descendingIterator in the Java Collections Framework. (I've had a cursory look at the .NET Framework documentation and couldn't find anything suitable....)
Stefan Ciobaca
June 25, 2008, June 25, 2008 23:13, permalink
> @Stefan: The result of cleaning is <p> for the version of the code Jeff last posted. It matches the first-pass regex twice: <p>, and <. The second-pass regex accepts the <p>, but rejects the <.
Correct.
> Also, there are cases where linking to images is actually quite appropriate, such as showing screenshots.
You can implement this otherwise, if you want to store the screenshot on your site.
> Also, because not every site bans image links, you would have to guard against people putting image links on those sites that point to your webapp anyway.
HUGE Difference!
> So, your webapp must have the kinds of defences mentioned in my earlier post, no matter what.
Don't agree.
> i.e., the server can only "automatically react" for POST requests, and only if the timestamp/hash thing checks out. (Referring to that last post, LiveJournal has even more defences than that...I will have to post a summary of it sometime, once I look at the code further.)
Using timestamps creates more problems than it solves... I don't think it's a smart/good solution, only more complicated.
Stephen
June 29, 2008, June 29, 2008 13:30, permalink
I disagree stefan, A sanitizer cannot be considering URL security, what URLs it should allow or not, what about sites that share authentication.. you'd have to cover those as well.. or even sites that are well known for having persistent authentication..
It's up to each site to secure its actions so that they can be identified as real user actions.. or spoofed actions.
Weeble
June 30, 2008, June 30, 2008 11:16, permalink
Even if this code works, I find it worrying in principle. Whitelists are superior to blacklists because they catch attacks that you don't know about. But even though you use a whitelist, your overall approach is more like a blacklist: look inside this string for "bad" stuff and erase it. I *think* it works, but I don't feel sure. I would be a lot happier if it constructed the output string from scratch, and every single thing that went into that string was either a string literal that you know is safe, or the output of HtmlEncode. A bug in your code should "fail-safe" and output nothing at all, and not "fail-deadly" and output the input unprocessed. For example, what if somebody later accidentally introduces a bug that causes the main loop to run the wrong number of times? (Hopefully your unit tests would spot it, of course, but it can be difficult to test for all eventualities.) The code would then fail-deadly and output all or part of the input unsanitized. At the very least, you should not be putting the sanitized string in the same variable as the unsanitized one.
Also, you say 'The only remaining issue is orphaned tags, but that (IMHO) is outside the scope of this function as you can have orphaned tags with completely "safe" HTML.' I take issue with your logic here. You can have Javascript in completely safe HTML. Just because it's possible to use orphaned tags in safe HTML doesn't mean that they are out of scope for a sanitization function. It worries me that this is effectively a "partial sanitizer", and that you presumably intend to have a chain of such functions that each clean up some sort of potential problem. You then have to track various different sorts of "partially tainted" data passing through your program.
cpdog.myopenid.com
June 30, 2008, June 30, 2008 15:54, permalink
This is just the wrong way to do this.
var html = "this is a < b> test <a href = \"http://www.google.com\"> Hello 1</a>";
html += "<a href =\"http://www.google.com\"> Hello 2</a>";
html += "<a href ='http://www.google.com'> Hello 3</a>";
html += "<img src='http://www.google.com'> Hello 4";
html += "<img src=\"http://www.google.com\"> Hello 5";
html += "<img src =\"http://www.google.com\" /> Hello 6";
html += "<img src = \"http://www.google.com\" /> Hello 7";
html = SanitizeHtml(html);
Output: this is a test Hello 1</a> Hello 2</a> Hello 3</a> Hello 4 Hello 5 Hello 6 Hello 7
People are going to go nuts trying to figure out how to get their HTML to work. Do this the right way: HtmlEncode the entire string then do a string replace of the entities that you want. E.g., html.Replace("<p>","<p>");
Mario Pareja
July 5, 2008, July 05, 2008 15:54, permalink
Obviously this method is very susceptible to change. I would consider wrapping it in a class and interface and applying the Strategy pattern along with some form of run-time dependency injection. This way, if the algorithm is compromised while the application is on production, you can alter a configuration setting to temporarily use a different algorithm that disables all HTML without bringing the site down.
Jeff Atwood
July 11, 2008, July 11, 2008 10:28, permalink
> your overall approach is more like a blacklist: look inside this string for "bad" stuff and erase it
I see what you mean vs. HTMLEncoding() the whole thing, and then writing regular expressions to replace the escaped whitelisted entities with valid HTML. I just never thought of it this way.
The other advantage of that approach is that content is not deleted.
Bealer
July 23, 2008, July 23, 2008 13:21, permalink
I've been looking at Nick's implementation.
The problem with it, is that the html must be clean. There are cases where you can pass in hacked html, and it won't match it, in which case it'll return the hacked html string as it won't match anything to replace. This isn't strictly whitelisting.
A different approach would be to HtmlEncode the whole string. Then look for matches, if you find any to HtmlDecode them back. I was looking at this approach.
For now I've gone with using Tidy.Net to clean the Html, and then I pass it into a modified version of Nick's code. Another weakness was that it allowed one of the most common XSS attacks, src="javascript.... I have for now, put in case statements for src and href attributes to check for https?://
I'll post my code once it's finished. Am working on some unit test based on the XSS attacks listed here:
http://ha.ckers.org/xss.html
Dinah
July 23, 2008, July 23, 2008 20:17, permalink
I'm working on one project which requires this but img and anchor attributes can end up in an odd order due to a generator. So I made this variation on the code at the top of the page. I mainly posted this for others like myself who don't want attribute order to be relevant.
class SpecialTag
{
private string _tag;
private string _requiredAttribute;
private List<string> _whitelist;
private Regex _regex;
private string _requiredAttribRegex;
public string Tag
{
get { return _tag; }
}
public SpecialTag(string tag, string requiredAttribute, string requiredAttribRegex, params string[] list)
{
_tag = tag;
_requiredAttribute = requiredAttribute;
_requiredAttribRegex = requiredAttribRegex;
_whitelist = new List<string>(list);
// make sure the main required attribute is included in the whitelist
if (!_whitelist.Contains(_requiredAttribute))
_whitelist.Add(_requiredAttribute);
_regex = new Regex(@"
<" + _tag + @"
(?<attrib>\s
(?<name>[a-z]+)
\s*=\s*
""(?<value>[^""]*)""
)+\s?>",
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
}
public string Parse(string html, Match tag)
{
MatchCollection attribs = _regex.Matches(tag.Value.ToLower());
// verify general structure
if (attribs.Count == 0)
{
html = html.Remove(tag.Index, tag.Length);
return html;
}
// there will only be 1 capture
GroupCollection attribGroups = attribs[0].Groups;
// handle href 1st
int hrefIndex = -1;
for (int j = 0; j < attribGroups["name"].Captures.Count; j++)
{
if (attribGroups["name"].Captures[j].Value == _requiredAttribute)
{
hrefIndex = j;
break;
}
}
// if no href or if it doesn't match the regex, kill the whole ta <a> g
if (hrefIndex < 0 || !Purify.IsMatch(attribGroups["attrib"].Captures[hrefIndex].Value.Trim(), _requiredAttribRegex))
{
html = html.Remove(tag.Index, tag.Length);
return html;
}
for (int k = attribGroups["attrib"].Captures.Count - 1; k > -1; k--)
{
Capture c = attribGroups["attrib"].Captures[k];
string attrib = c.Value;
string aName = attribGroups["name"].Captures[k].Value;
string aValue = attribGroups["value"].Captures[k].Value;
//case "href": // don't forget to allow href here
if (!_whitelist.Contains(aName))
html = html.Remove(tag.Index + c.Index, c.Length);
}
return html;
}
}
class Purify
{
private static Regex _htmltags = new Regex("<[^>]+(>|$)",
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);
private static Regex _whitelist = new Regex(@"
</?a|(b(lockquote)?|code|em|h(1|2|3|4)|i|li|ol|p(re)?|s(ub|uper|trong|trike)?|ul)>
|<(b|h)r\s?/?>
|<a[^>]+>
|<img[^>]+/?>",
RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace |
RegexOptions.ExplicitCapture | RegexOptions.Compiled);
private static SpecialTag[] _specialTags;
static Purify()
{
_specialTags = new SpecialTag[] {
new SpecialTag(
// tag
"a",
// required attribute
"href",
// regex for restricting required attribute's value
@"href\s*=\s*""(https?|ftp)://[^""]+""",
// attribute whitelist
"style", "id", "target", "class", "type", "title", "tabindex", "name"),
new SpecialTag("img", "src", @"src\s*=\s*""https?://[^""]+""",
"hspace", "height", "border", "align", "width", "vspace", "style", "class", "longdesc", "title", "id", "alt")
};
}
public static string Sanitize(string html)
{
MatchCollection htmltags = _htmltags.Matches(html);
// iterate through input tags
for (int i = htmltags.Count - 1; i > -1; i--)
{
Match htmltag = htmltags[i];
string tagname = htmltag.Value.ToLower();
// not in whitelist
if (!_whitelist.IsMatch(tagname))
{
html = html.Remove(htmltag.Index, htmltag.Length);
continue;
}
foreach (SpecialTag specialTag in _specialTags)
{
if (tagname.StartsWith("<" + specialTag.Tag))
{
html = specialTag.Parse(html, htmltag);
break;
}
}
}
return html;
}
public static bool IsMatch(string s, string pattern)
{
return Regex.IsMatch(s, pattern, RegexOptions.Singleline | RegexOptions.IgnoreCase |
RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture);
}
}
Bealer
July 24, 2008, July 24, 2008 12:16, permalink
Also, from looking at both Jeff's and Nick's versions they both won't stop "url string invasion" attacks, and are a little weak around the src/href's value for anchors and images.
I'm still working on my solution (got other stuff in the iteration to do now), but in my solution I firstly use Tidy.Net to clean the HTML.
This is preferable to using regex's as it'll build a full object model and work it's way through the html. Save me having to write a hugely complex regex, especially as some of the hacks as based around malformed html so it's incredibly difficult to match everything.
Regex's are then good to use when removing "valid" html, that is potentially dangerous. Although Tidy.Net is quite good at removing a number of XSS attacks.
I use Microsoft's AntiXss library (v1.5) to HtmlEncode the string Tidy.Net returns me. I then go through the string matching all the encoded data. Anything good (stuff that is allowed) that I find, I html decode it back. That way I am truly whitelisting, anything I miss will remain encoded.
The hardest part is reading "href" and "src" values of img/anchor elements and checking for hex, octal, dword etc... hacks.
I'm unit testing for each type of attack. Got about 14 done so far. Still another 100 or so to go.
Bealer
July 27, 2008, July 27, 2008 12:41, permalink
Here's my code so far. I use Tidy.Net to clean the HTML first, then use a slightly modified version of Nick's code.
A few things I'm considering:
1) It would be nice to remove the "RemoveTidyAdditions" and not have Tidy add it in, in the first place.
2) I HtmlEncode the whole string upon entering the method. I then match allowed html, and decode it back. That way I would be truly whitelisting, as anything I miss or don't match has already been encoded.
I'm writing unit tests for all the possible XSS attacks. I've currently got about 20. There's another 30 I think I'll write. However some are currently failing. The ones I'm struggling with most are url invasion attacks. Examples would be using "src" or "href", then calling javascript using a hex encoded string. I can check for "https?" but there are other instances where it's a problem, such as style=background(xss here).
With regards to the code, I've made it a bit more readable (don't like 1 line regex's) and tried to keep it broken down nicely. I'm currently using static methods to copy HttpUtility.HtmlEncode, but may make the class an instance class if unit testing becomes tricky with more broken down methods.
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.Security.Application;
using TidyNet;
namespace Bealer.Web
{
public class HtmlUtility
{
private static readonly Regex HtmlTagExpression = new Regex(@"
(?'tag_start'</?)
(?'tag'\w+)((\s+
(?'attribute'(\w+)(\s*=\s*(?:"".*?""|'.*?'|[^'"">\s]+)))?)+\s*|\s*)
(?'tag_end'/?>)",
RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex HtmlAttributeExpression = new Regex(@"
(?'attribute'\w+)
(\s*=\s*)
(""(?'value'.*?)""|'(?'value'.*?)')",
RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Dictionary<string, List<string>> ValidHtmlTags = new Dictionary<string, List<string>>
{
{"p", new List<string> {"style", "class", "align"}},
{"div", new List<string> {"style", "class", "align"}},
{"span", new List<string> {"style", "class"}},
{"br", new List<string> {"style", "class"}},
{"hr", new List<string> {"style", "class"}},
{"label", new List<string> {"style", "class"}},
{"h1", new List<string> {"style", "class"}},
{"h2", new List<string> {"style", "class"}},
{"h3", new List<string> {"style", "class"}},
{"h4", new List<string> {"style", "class"}},
{"h5", new List<string> {"style", "class"}},
{"h6", new List<string> {"style", "class"}},
{"font", new List<string> {"style", "class", "color", "face", "size"}},
{"strong", new List<string> {"style", "class"}},
{"b", new List<string> {"style", "class"}},
{"em", new List<string> {"style", "class"}},
{"i", new List<string> {"style", "class"}},
{"u", new List<string> {"style", "class"}},
{"strike", new List<string> {"style", "class"}},
{"ol", new List<string> {"style", "class"}},
{"ul", new List<string> {"style", "class"}},
{"li", new List<string> {"style", "class"}},
{"blockquote", new List<string> {"style", "class"}},
{"code", new List<string> {"style", "class"}},
{"a", new List<string> {"style", "class", "href", "title", "target"}},
{"img", new List<string> {"style", "class", "src", "height", "width", "alt", "title", "hspace", "vspace", "border"}},
{"table", new List<string> {"style", "class"}},
{"thead", new List<string> {"style", "class"}},
{"tbody", new List<string> {"style", "class"}},
{"tfoot", new List<string> {"style", "class"}},
{"th", new List<string> {"style", "class", "scope"}},
{"tr", new List<string> {"style", "class"}},
{"td", new List<string> {"style", "class", "colspan"}},
{"q", new List<string> {"style", "class", "cite"}},
{"cite", new List<string> {"style", "class"}},
{"abbr", new List<string> {"style", "class"}},
{"acronym", new List<string> {"style", "class"}},
{"del", new List<string> {"style", "class"}},
{"ins", new List<string> {"style", "class"}}
};
/// <summary>
/// Removes the invalid HTML tags.
/// </summary>
/// <param name="input">The text.</param>
/// <returns></returns>
public static string RemoveInvalidHtmlTags(string input)
{
var html = TidyHtml(input);
if (string.IsNullOrEmpty(html))
return AntiXss.HtmlEncode(input);
return HtmlTagExpression.Replace(html, new MatchEvaluator(match =>
{
var builder = new StringBuilder(match.Length);
var tagStart = match.Groups["tag_start"];
var tagEnd = match.Groups["tag_end"];
var tag = match.Groups["tag"].Value;
var attributes = match.Groups["attribute"];
if (false == ValidHtmlTags.ContainsKey(tag))
{
builder.Append(tagStart.Success ? tagStart.Value : "<");
builder.Append(tag);
builder.Append(tagEnd.Success ? tagEnd.Value : ">");
return AntiXss.HtmlEncode(builder.ToString());
}
builder.Append(tagStart.Success ? tagStart.Value : "<");
builder.Append(tag);
foreach (Capture attribute in attributes.Captures)
{
builder.Append(MatchHtmlAttribute(tag, attribute));
}
// add nofollow to all hyperlinks
if (tagStart.Success && tagStart.Value == "<" && tag.Equals("a", StringComparison.OrdinalIgnoreCase))
builder.Append(" rel=\"nofollow\"");
builder.Append(tagEnd.Success ? tagEnd.Value : ">");
return builder.ToString();
}));
}
private static string MatchHtmlAttribute(string tag, Capture capture)
{
var output = string.Empty;
var match = HtmlAttributeExpression.Match(capture.Value);
var attribute = match.Groups["attribute"].Value;
var value = match.Groups["value"].Value;
if (ValidHtmlTags[tag].Contains(attribute))
{
switch (attribute)
{
case "src":
case "href":
if (Regex.IsMatch(value, @"https?://[^""]+"))
output = string.Format(" {0}=\"{1}\"", attribute, AntiXss.UrlEncode(value));
break;
default:
output = string.Format(" {0}=\"{1}\"", attribute, value);
break;
}
}
return output;
}
private static string TidyHtml(string text)
{
var doc = new Tidy();
var messages = new TidyMessageCollection();
var input = new MemoryStream();
var output = new MemoryStream();
var array = Encoding.UTF8.GetBytes(text);
input.Write(array, 0, array.Length);
input.Position = 0;
doc.Options.DocType = DocType.Strict;
doc.Options.Xhtml = true;
doc.Options.CharEncoding = CharEncoding.UTF8;
doc.Options.LogicalEmphasis = true;
doc.Options.MakeClean = false;
doc.Options.SmartIndent = false;
doc.Options.IndentContent = false;
doc.Options.TidyMark = false;
doc.Options.DropFontTags = false;
doc.Options.QuoteAmpersand = true;
doc.Options.DropEmptyParas = true;
doc.Options.CharEncoding = CharEncoding.UTF8;
doc.Parse(input, output, messages);
return RemoveTidyAdditions(Encoding.UTF8.GetString(output.ToArray()));
}
private static string RemoveTidyAdditions(string text)
{
if (string.IsNullOrEmpty(text))
return string.Empty;
var start = text.IndexOf("<body>");
var end = text.IndexOf("</body>");
if (start != -1 && end > start && end < text.Length)
text = text.Substring(start + 6, end - (start + 6));
else
return string.Empty;
return Regex.Replace(text, "[\r\n\t]*", string.Empty);
}
}
}
Steve Downing
October 3, 2008, October 03, 2008 03:51, permalink
Here's is a fairly quick'n'dirty implementation based on combining Weeble's idea with Jeff code (i.e HtmlEncode everything, then HTMLDecode the safe tags).
My needs were a bit more relaxed, so the more stringent checks on "img" and
"a href" are not present in this version. And this has a noted FIXME for when the inner text of a <a> tag might contain > or < (i.e <a href=".."> Click <here></a>
/// <summary>
/// A regex that matches things that look like a HTML tag after HtmlEncoding. Splits the input so we can get discrete
/// chunks that start with < and ends with either end of line or >
/// </summary>
private static Regex _tags = new Regex("<(?!>).+?(>|$)", RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);
/// <summary>
/// A regex that will match tags on the whitelist, so we can run them through
/// HttpUtility.HtmlDecode
/// FIXME - Could be improved, since this might decode > etc in the middle of
/// an a/link tag (i.e. in the text in between the opening and closing tag)
/// </summary>
private static Regex _whitelist = new Regex(@"
^</?(a|b(lockquote)?|code|em|h(1|2|3)|i|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)>$
|^<(b|h)r\s?/?>$
|^<a(?!>).+?>$
|^<img(?!>).+?/?>$",
RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace |
RegexOptions.ExplicitCapture | RegexOptions.Compiled);
/// <summary>
/// HtmlDecode any potentially safe HTML tags from the provided HtmlEncoded HTML input using
/// a whitelist based approach, leaving the dangerous tags Encoded HTML tags
/// </summary>
private static string Sanitize(string html)
{
string tagname = "";
Match tag;
MatchCollection tags = _tags.Matches(html);
string safeHtml = "";
// iterate through all HTML tags in the input
for (int i = tags.Count - 1; i > -1; i--)
{
tag = tags[i];
tagname = tag.Value.ToLower();
if (_whitelist.IsMatch(tagname))
{
// If we find a tag on the whitelist, run it through
// HtmlDecode, and re-insert it into the text
safeHtml = HttpUtility.HtmlDecode(tag.Value);
html = html.Remove(tag.Index, tag.Length);
html = html.Insert(tag.Index, safeHtml);
}
}
return html;
}
Stephen
October 19, 2008, October 19, 2008 16:57, permalink
The job of a html sanitizer is to sanitize the html against rules you state, its dependency is that the its given html - before you sanitize there should be something else that ensures the correctness of the html..
otherwise you expect the sanitizer to sanitize, clean, correct.. drive your car for you.. take over the world?
Matt Presson
October 20, 2008, October 20, 2008 13:33, permalink
Be wary of the style attribute when accepting html. There are some IE specific attacks that can still execute javascript in the style tag through the use of expression();
<a href="#" style="x:expression(alert(0))">Click me</a>
grom
November 24, 2008, November 24, 2008 23:32, permalink
Here is an actual parser (written in PHP) that only accepts white listed input, http://refactormycode.com/codes/557-html-filter
Orson Roy
March 2, 2009, March 02, 2009 16:45, permalink
htmLawed -- for PHP users, a very good sanitization/filtering script: http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed
Matt
April 8, 2009, April 08, 2009 03:50, permalink
Marcus, nice code! Snip below fails. It thinks that the second img isnt valid and encodes it as html.
<p><img src="a.jpg" /><img src="b.jpg" /></p>
Marcus McConnell
April 8, 2009, April 08, 2009 14:14, permalink
Thanks Matt! There was one small bug that was causing the second img tag to encode instead of rebuild. However, the expected behavior of the code is to remove any SRC or HREF attribute values that aren't valid URIs. This is because there are many XSS attacks on the SRC and HREF attribute. By requiring users to enter a fully qualified domain name for images I can use the URI class to validate the value.
// Marcus McConnell - 2009-06-01
// Copyright 2009 by Marcus McConnell and BV Software.
// All rights reserved.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace BVSoftware.Web
{
public class HtmlSanitizer
{
private class TagAttribute
{
public string Name { get; set; }
public string Value {get;set;}
public TagAttribute()
{
}
public TagAttribute(string name, string value)
{
Name = name;
Value = value;
}
public static bool IsValidForTag(TagAttribute att, AcceptableTag tag)
{
bool result = false;
if (tag.AcceptableAttributes.Contains(att.Name))
{
result = true;
}
return result;
}
public string ValueAsEncodedUrl()
{
return System.Web.HttpUtility.UrlEncode(Value);
}
public string ValueAsEncodedHtml()
{
return System.Web.HttpUtility.HtmlEncode(Value);
}
}
private class AcceptableTag
{
public string Tagname { get; set; }
public List<string> AcceptableAttributes { get; set; }
public bool SelfClosing { get; set; }
public AcceptableTag()
{
AcceptableAttributes = new List<string>();
}
public AcceptableTag(string name, bool selfclosing, params string[] attributes)
{
Tagname = name;
SelfClosing = selfclosing;
AcceptableAttributes = new List<string>();
foreach (string att in attributes)
{
AcceptableAttributes.Add(att);
}
}
}
private static Dictionary<string, AcceptableTag> AcceptedTags()
{
Dictionary<string, AcceptableTag> result = new Dictionary<string,AcceptableTag>();
// self closing tags
result.Add("br", new AcceptableTag("br", true, "id","class"));
result.Add("hr", new AcceptableTag("hr", true, "id","class"));
result.Add("img", new AcceptableTag("img", true, "id","class","src","alt"));
result.Add("b",new AcceptableTag("b",false, "id","class"));
result.Add("i",new AcceptableTag("i",false, "id","class"));
result.Add("u",new AcceptableTag("u",false, "id","class"));
result.Add("em",new AcceptableTag("em",false, "id","class"));
result.Add("strong",new AcceptableTag("strong",false, "id","class"));
result.Add("p",new AcceptableTag("p",false, "id","class"));
result.Add("div",new AcceptableTag("div",false, "id","class"));
result.Add("span",new AcceptableTag("span",false, "id","class"));
result.Add("h1",new AcceptableTag("h1",false, "id","class"));
result.Add("h2",new AcceptableTag("h2",false, "id","class"));
result.Add("h3",new AcceptableTag("h3",false, "id","class"));
result.Add("h4",new AcceptableTag("h4",false, "id","class"));
result.Add("h5",new AcceptableTag("h5",false, "id","class"));
result.Add("h6",new AcceptableTag("h6",false, "id","class"));
result.Add("a",new AcceptableTag("a",false, "id","class","href","target","title", "name"));
result.Add("strike",new AcceptableTag("strike",false, "id","class"));
result.Add("ul",new AcceptableTag("ul",false, "id","class"));
result.Add("ol",new AcceptableTag("ol",false, "id","class"));
result.Add("li",new AcceptableTag("li",false, "id","class"));
result.Add("pre",new AcceptableTag("pre",false, "id","class"));
result.Add("blockquote",new AcceptableTag("blockquote",false, "id","class"));
result.Add("address",new AcceptableTag("address",false, "id","class"));
result.Add("sup",new AcceptableTag("sup",false, "id","class"));
result.Add("sub", new AcceptableTag("sub", false, "id", "class"));
return result;
}
// This method html encodes the input string but
// allows certain safe HTML tags by
// rewriting them with safe attribute tags
//
// b,i,u,em,strong,p,div,span,h1-h6,a,strike
// ul,ol,li,hr,pre,blockquote,address,br
// img,sup,sub
//
public static string MakeHtmlSafe(string input)
{
string result = string.Empty;
Queue<string> tokens = Tokenize(input);
StringBuilder sb = new StringBuilder();
Queue<string> tokenQueue = new Queue<string>();
ProcessTags(tokens, sb);
result = sb.ToString();
return result;
}
private static Queue<string> Tokenize(string input)
{
Queue<string> result = new Queue<string>();
bool in_tag = false;
StringBuilder sb = new StringBuilder();
foreach (char c in input.ToCharArray())
{
if (in_tag == false)
{
if (c == '<')
{
// starting html tag
// dump any plaintext
if (sb.ToString().Length > 0)
{
result.Enqueue(sb.ToString());
}
sb = new StringBuilder();
in_tag = true;
sb.Append(c);
}
else
{
// plain text only
sb.Append(c);
}
}
else
{
// searching for ending char
if (c == '>')
{
sb.Append(c);
if (sb.ToString().Length > 0)
{
result.Enqueue(sb.ToString());
}
sb = new StringBuilder();
in_tag = false;
}
else
{
sb.Append(c);
}
}
}
if (sb.ToString().Length > 0)
{
result.Enqueue(sb.ToString());
}
return result;
}
private static void ProcessTags(Queue<string> tokens, StringBuilder sb)
{
bool parsingTag = false;
Queue<string> subqueue = new Queue<string>();
string startToken = string.Empty;
while (tokens.Count > 0)
{
string currentToken = tokens.Dequeue();
if (parsingTag)
{
if (IsClosingTag(currentToken,ParseTagName(startToken)))
{
// Yes, this tag closed the starting tag
sb.Append(RebuildOpenTag(startToken));
ProcessTags(subqueue, sb);
sb.Append(CloseOpenTag(ParseTagName(startToken)));
// reset everything since the tag is parsed below
parsingTag = false;
startToken = "";
subqueue = new Queue<string>();
}
else
{
// Nope, no closing yet, just enqueue the token
subqueue.Enqueue(currentToken);
}
}
else
{
// we're not parsing, check for a tag start
if (currentToken.StartsWith("<"))
{
if (IsAcceptedTag(currentToken))
{
parsingTag = true;
if (IsSelfClosed(currentToken))
{
// Tag is self closed, just parse it
sb.Append(RebuildClosedTag(currentToken));
parsingTag = false; // Added 2009-04-08
}
else
{
// store the starting token for later parsing
startToken = currentToken;
}
}
else
{
// not an accepted tag, just encode the sucker
sb.Append(EnsureHtmlEncode(currentToken));
}
}
else
{
// not starting a tag, just dump the encoded output
sb.Append(EnsureHtmlEncode(currentToken));
}
}
}
if (parsingTag)
{
if (startToken.Length > 0)
{
sb.Append(EnsureHtmlEncode(startToken));
}
}
// if the subque has items in it because we didn't find a closing tag, dump them encoded
if (subqueue.Count > 0)
{
while (subqueue.Count > 0)
{
string subqueuetoken = subqueue.Dequeue();
sb.Append(EnsureHtmlEncode(subqueuetoken));
}
}
}
private static string CloseOpenTag(string tagName)
{
return "</" + tagName + ">";
}
private static string RebuildOpenTag(string startToken)
{
string tagName = ParseTagName(startToken);
return "<" + tagName + ParseAttributes(startToken) + ">";
}
private static bool IsClosingTag(string currentToken, string forTagName)
{
if (currentToken == "</" + forTagName + ">" ||
currentToken == "</ " + forTagName + ">")
{
return true;
}
else
{
return false;
}
}
private static string RebuildClosedTag(string currentToken)
{
string result = string.Empty;
if (IsAcceptedTag(currentToken))
{
result = "<" + ParseTagName(currentToken) + ParseAttributes(currentToken) + "/>";
}
else
{
result = EnsureHtmlEncode(currentToken);
}
return result;
}
private static string ParseAttributes(string currentToken)
{
StringBuilder sb = new StringBuilder();
string tagname = ParseTagName(currentToken);
AcceptableTag foundTag = null;
if (AcceptedTags().TryGetValue(tagname, out foundTag))
{
List<TagAttribute> atts = ParseAttributeList(currentToken, foundTag);
if (atts.Count > 0)
{
foreach (TagAttribute t in atts)
{
sb.Append(" ");
sb.Append(t.Name + "=\"");
if (t.Name == "href" || t.Name == "src")
{
//sb.Append(t.Value);
try
{
Uri url = null;
if (t.Value.StartsWith("/"))
{
url = new Uri("http://localhost" + t.Value);
sb.Append(url.PathAndQuery);
}
else
{
url = new Uri(t.Value);
sb.Append(url.ToString());
}
}
catch
{
sb.Append(" ");
}
}
else
{
sb.Append(t.ValueAsEncodedHtml());
}
sb.Append("\"");
}
}
}
return sb.ToString();
}
private static List<TagAttribute> ParseAttributeList(string input, AcceptableTag tag)
{
string temp = input.Substring(tag.Tagname.Length + 1, input.Length - (tag.Tagname.Length + 2));
if (temp.EndsWith("/"))
temp = temp.TrimEnd('/');
temp = temp.Trim();
List<TagAttribute> result = new List<TagAttribute>();
bool isParsingAttribute = false;
bool isParsingAttributeValue = false;
TagAttribute currentAttribute = null;
// loop through all characters, splitting of attributes
char[] characters = temp.ToCharArray();
for (int i = 0; i < temp.Length; i++)
{
char current = temp[i];
if (isParsingAttribute)
{
if (isParsingAttributeValue)
{
// append the current character
currentAttribute.Value += current;
// check to see if we're done with the attribute
if (currentAttribute.Value.Length >= 2)
{
if (currentAttribute.Value.EndsWith("\""))
{
isParsingAttributeValue = false;
isParsingAttribute = false;
if (TagAttribute.IsValidForTag(currentAttribute, tag))
{
currentAttribute.Value = currentAttribute.Value.TrimStart('"');
currentAttribute.Value = currentAttribute.Value.TrimEnd('"');
if (currentAttribute.Name == "src" || currentAttribute.Name == "href")
{
if (currentAttribute.Value.IndexOf("javascript", StringComparison.InvariantCultureIgnoreCase) > -1)
{
currentAttribute.Value = currentAttribute.Value.ToLowerInvariant().Replace("javascript", "");
}
if (currentAttribute.Value.IndexOf("vbscript", StringComparison.InvariantCultureIgnoreCase) > -1)
{
currentAttribute.Value = currentAttribute.Value.ToLowerInvariant().Replace("vbscript", "");
}
}
result.Add(currentAttribute);
}
currentAttribute = null;
}
}
}
else
{
// we're not parsing the value yet so check for "="
if (current == '=')
{
// skip this charater but enable attribute value parsing;
isParsingAttributeValue = true;
}
else
{
currentAttribute.Name += current;
}
}
}
else
{
// not parsing right now, check to see if we need to start
if (!char.IsWhiteSpace(current))
{
// not white space so let's start our attribute name
currentAttribute = new TagAttribute(current.ToString(),"");
isParsingAttribute = true;
}
}
}
return result;
}
private static bool IsSelfClosed(string currentToken)
{
AcceptableTag foundTag = null;
if (AcceptedTags().TryGetValue(ParseTagName(currentToken), out foundTag))
{
// Found a matching tag.
if (foundTag.SelfClosing && currentToken.EndsWith("/>"))
{
return true;
}
}
return false;
}
private static bool IsAcceptedTag(string currentToken)
{
AcceptableTag foundTag = null;
if (AcceptedTags().TryGetValue(ParseTagName(currentToken), out foundTag))
{
// simple check to make sure tag is closed
if (currentToken.EndsWith(">") || currentToken.EndsWith("/>"))
{
return true;
}
}
return false;
}
private static string ParseTagName(string tag)
{
string result = string.Empty;
string temp = tag.TrimStart('<');
temp = temp.TrimEnd('>');
if (temp.EndsWith("/")) temp = temp.TrimEnd('/');
// temp is now a raw tag without open/close brackets
string[] parts = temp.Split(' ');
if (parts != null)
{
if (parts.Count() > 0)
{
result = parts[0];
}
}
return result;
}
private static string EnsureHtmlEncode(string s)
{
string result = "";
if (!String.IsNullOrEmpty(s))
result = System.Web.HttpUtility.HtmlEncode(s);
return result;
}
}
}
Pietro Polsinelli
November 5, 2009, November 05, 2009 12:39, permalink
We did a version in Java, actually more extended:
http://patapage.com/applications/pataPage/site/test/testSanitize.jsp
a
December 5, 2009, December 05, 2009 04:12, permalink
// Marcus McConnell - 2009-06-01
// Copyright 2009 by Marcus McConnell and BV Software.
// All rights reserved.
Is this really so much copyrighted, or rather i can use it in my public domain software?
rickstone
February 24, 2010, February 24, 2010 03:12, permalink
doc.Parse(input, output, messages); always return output stream with a zero size.
doc.Parse(input, output, messages);
return RemoveTidyAdditions(Encoding.UTF8.GetString(output.ToArray()));
Jaundalynn
April 23, 2011, April 23, 2011 12:13, permalink
I'm impressed! You've manaegd the almost impossible.
I'm impressed! You've manaegd the almost impossible.
Jaundalynn
April 23, 2011, April 23, 2011 12:14, permalink
I'm impressed! You've manaegd the almost impossible.
I'm impressed! You've manaegd the almost impossible.
Kerriann
May 1, 2011, May 01, 2011 19:23, permalink
Glad I've finally found sometihng I agree with!
Glad I've finally found sometihng I agree with!
Kerriann
May 1, 2011, May 01, 2011 19:23, permalink
Glad I've finally found sometihng I agree with!
Glad I've finally found sometihng I agree with!
Kerriann
May 1, 2011, May 01, 2011 19:23, permalink
Glad I've finally found sometihng I agree with!
Glad I've finally found sometihng I agree with!
ridgerunner
December 31, 2011, December 31, 2011 10:06, permalink
The _whitelist regex can be sped up significantly (for the cases where the pattern does NOT match) by implementing Friedl's "Start of string/line anchor optimization" technique. The original _whitelist regex has two global alternatives, each of which begin with ^, but since there are global alternatives, the regex engine is (probably) not smart enough to know that the pattern can only match at the start of the target string, and will (needlessly, wastefully and unsuccessfully) attempt to match the pattern at each and every position within the string. By factoring out and "exposing" the anchors, the regex engine can apply the optimization, and will declare a match failure immediately. Here is an improved equivalent expression:
private static Regex _whitelist = new Regex(@"
# Match a whitelist of allowable HTML element tags.
^ # Anchor to start of string.
< # All HTML element tags start with a <.
(?: # Group for global whitelist element alternatives.
/? # Option 1: Non-empty HTML open & close tags.
(?: # Group for whitelist tag alternatives.
b(?:lockquote)? # Either B or BLOCKQUOTE,
| code # or CODE,
| d(?:[dtl]|el) # or DD, DT, DL or DEL,
| em # or EM,
| h[123] # or H1, H2 or H3,
| i # or I,
| kbd # or KBD,
| li # or LI,
| ol # or OL,
| p(?:re)? # or P or PRE,
| s(?:ub|up|trong|trike)? # or SUB, SUP, STRONG or STRIKE,
|ul # or UL.
) # End group of whitelist tag alternatives.
| # or Global Option 2: Empty HTML elements.
(?: # Group for empty element alternatives.
br # Either BR,
| hr # or HR.
) # End group of empty element alternatives.
\s? # Optional whitespace.
/? # Optional empty element / tag terminator.
) # End group of global whitelist element alternatives.
> # All HTML element tags end with a >.
$ # Anchor to end of string.
", RegexOptions.Singleline | RegexOptions.ExplicitCapture |
RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
spider repellent
January 4, 2012, January 04, 2012 12:45, permalink
I'm not sure exactly why but this website is loading very slow for me. Is anyone else having this issue or is it a issue on my end? I'll check back later and see if the problem still exists.
puertas de herreria para exterior
January 18, 2012, January 18, 2012 20:24, permalink
Es serio? No!~ Es no posible!
Takes a provided HTML string and removes any potentially dangerous XSS HTML tags using a whitelist approach. Useful when you want to allow a small subset of "safe" HTML tags in user content.
UPDATED: July 11th to reflect all refactorings, plus optimizing for speed. Now 2x faster!
UPDATED: Sept 1st, bugfixes
UPDATED: May 14th, simplify and improve whitelist strictness