#!/usr/bin/php
<?php
/**
* @author Cameron Zemek <cameronz@bundaberg.qld.gov.au>
*/
/**
* Callback handler for HtmlParser
*/
interface HtmlParserHandler {
/**
* Callback for open tag
*
* @param $tagName string Tag name
* @param $attributes array Attributes as an associative array of name => value
*/
public function openTag($tagName, $attributes);
/**
* Callback for close tag
*
* @param $tagName string Tag name
*/
public function closeTag($tagName);
/**
* Callback for comment tags
*
* @param $comment string Comment
*/
public function comment($comment);
/**
* Callback for text
*
* @param $text string Escaped text
*/
public function text($text);
}
/**
* Handles HTML special characters (ie. < & > ") thereby making it easier
* for the filter to remove XSS attacks.
*/
class HtmlParser {
/**
* Parse HTML snippet
*
* @param $html string HTML snippet
* @param $handler HtmlParserHandler Callback handler
*/
public function parse($html, HtmlParserHandler $handler) {
$this->input = $html;
$this->pos = 0;
$this->len = strlen($html);
$this->handler = $handler;
$text = '';
while ($this->pos < $this->len) {
$char = $this->look();
if ($char === '<') {
if ($this->lookMatches('<!')) {
// Handle HTML comment
$this->commentBlock();
} else {
// Process tag
$this->fireText($text);
$text = '';
$this->tag();
}
} else {
$text .= $this->char();
}
}
if ($text !== '') {
$this->fireText($text);
}
}
protected function fireOpenTag($tagName, $attributes) {
$this->handler->openTag($tagName, $attributes);
}
protected function fireCloseTag($tagName) {
$this->handler->closeTag($tagName);
}
protected function fireComment($comment) {
$this->handler->comment($comment);
}
protected function fireText($text) {
if (strlen($text) == '') {
return;
}
$this->handler->text($text);
}
/**
* Parse HTML comment block
*/
private function commentBlock() {
$comment = '';
$this->matches('<!');
while ($this->look() !== '>' && $this->pos < $this->len) {
if ($this->lookMatches('--')) {
$comment .= $this->comment();
} else {
$this->matchAny(); // Ignore characters outside comment
}
}
$this->match('>');
$this->fireComment($comment);
}
/**
* Parse HTML comment
*/
private function comment() {
$comment = '';
$this->matches('--');
while (!$this->lookMatches('--') && $this->pos < $this->len) {
// Convert characters to stop comment hacks <!------><script>alert('xss')</script>-->
$comment .= htmlspecialchars($this->matchAny());
}
$this->matches('--');
return $comment;
}
/**
* Parse HTML tag
*/
private function tag() {
$output = $this->match('<');
// Check if close tag
$closeTag = false;
if ($this->look() === '/') {
$closeTag = true;
$output .= $this->match('/');
}
$tagName = strtolower($this->matchWord());
// If not valid tag, escape output
if (strlen($tagName) === 0) {
$this->fireText(htmlspecialchars($output));
return;
}
if ($closeTag) {
$this->match('>');
$this->fireCloseTag($tagName);
return;
}
$this->matchWhitespace();
$attributes = array();
while ($this->look() !== '>' && $this->pos < $this->len) {
$attribute = $this->attribute();
// Invalid attribute, finish tag
if (strlen($attribute) === 0) {
break;
}
list($attributeName, $attributeValue) = $attribute;
$attributes[$attributeName] = $attributeValue;
$this->matchWhitespace();
}
if ($this->look() === '>') {
$this->match('>');
}
$this->fireOpenTag($tagName, $attributes);
}
/**
* Parse attribute
*/
private function attribute() {
$attributeName = strtolower($this->matchWord());
if (strlen($attributeName) === 0) {
return '';
}
$this->matchWhitespace();
if ($this->look() === '=') {
$this->match('=');
$this->matchWhitespace();
$attributeValue = $this->attributeValue();
} else {
$attributeValue = null;
}
return array($attributeName, $attributeValue);
}
/**
* Parse attribute value
*/
private function attributeValue() {
if ($this->look() === '"' || $this->look() === "'") {
$quoteChar = $this->look();
$this->match($quoteChar);
$attributeValue = $this->matchUntil($quoteChar);
$this->match($quoteChar);
} else {
$attributeValue = '';
while (!ctype_space($this->look()) && $this->look() != '>' && $this->pos < $this->len) {
$attributeValue .= $this->char();
}
}
if (strlen($attributeValue) === 0) {
return null;
}
return $attributeValue;
}
/**
* Match character, handling special characters and character entities
*/
private function char() {
if ($this->look() === '&') {
return $this->entity();
} else {
return htmlspecialchars($this->matchAny());
}
}
/**
* Parse HTML entity
*/
private function entity() {
if ($this->lookMatches('&#x')) {
return $this->entityHex();
} elseif ($this->lookMatches('&#')) {
return $this->entityNumber();
} else {
return $this->entityName();
}
}
/**
* Parse HTML entity name
*/
private function entityName() {
$entity = $this->match('&') . $this->matchWord();
if (strlen($entity) === 1) {
// Invalid entity, escape &
return htmlspecialchars($entity);
}
if ($this->look() === ';') {
$entity .= $this->match(';');
} else {
$entity .= ';';
}
return $entity;
}
/**
* Parse HTML entity in number format. Eg. ©
*/
private function entityNumber() {
$entity = $this->matches('&#');
$entity .= $this->matchNumber();
$len = strlen($entity);
if ($len <= 2 || $len > 6) {
// Invalid entity, escape &
return htmlspecialchars($entity);
}
if ($this->look() === ';') {
$entity .= $this->match(';');
} else {
$entity .= ';';
}
return $entity;
}
/**
* Parse HTML entity in hex format. Eg. j
*/
private function entityHex() {
$entity = $this->matches('&#x', true);
$entity .= $this->matchHexNumber();
$len = strlen($entity);
if ($len <= 3 || $len > 7) {
// Invalid entity, escape &
return htmlspecialchars($entity);
}
if ($this->look() === ';') {
$entity .= $this->match(';');
} else {
$entity .= ';';
}
return $entity;
}
private function look() {
return $this->input[ $this->pos ];
}
private function lookMatches($str, $ignoreCase = true) {
$input = substr($this->input, $this->pos, strlen($str));
if ($ignoreCase) {
$input = strtolower($input);
}
return $str === $input;
}
private function matchAny() {
return $this->input[ $this->pos++ ];
}
private function match($char, $ignoreCase = false) {
$input = $this->look();
if ($ignoreCase) {
$input = strtolower($input);
}
if ($input !== $char) {
// This means we have a logic coding error
throw new Exception('Invalid match');
}
return $this->input[ $this->pos++ ];
}
private function matches($str, $ignoreCase = false) {
$input = substr($this->input, $this->pos, strlen($str));
if ($ignoreCase) {
$input = strtolower($input);
}
if ($str !== $input) {
// This means we have a logic coding error
throw new Exception('Invalid match');
}
$this->pos += strlen($str);
return $str;
}
private function matchUntil($char) {
$match = '';
while ($this->look() !== $char && $this->pos < $this->len) {
$match .= $this->char();
}
return $match;
}
private function matchWord() {
$word = '';
while (ctype_alpha($this->input[ $this->pos ]) && $this->pos < $this->len) {
$word .= $this->input[ $this->pos++ ];
}
return $word;
}
private function matchNumber() {
$num = '';
while (ctype_digit($this->input[ $this->pos ]) && $this->pos < $this->len) {
$num .= $this->input[ $this->pos++ ];
}
return $num;
}
private function matchHexNumber() {
$num = '';
while (ctype_xdigit($this->input[ $this->pos ]) && $this->pos < $this->len) {
$num .= $this->input[ $this->pos++ ];
}
return $num;
}
private function matchWhitespace() {
$ws = '';
while (ctype_space($this->input[ $this->pos ]) && $this->pos < $this->len) {
$ws .= $this->input[ $this->pos++ ];
}
return $ws;
}
private $pos; // Position in input
private $len; // Length of input
private $input;
private $handler;
}
class HtmlFilter implements HtmlParserHandler {
/**
* Allowed tags
*/
private $allowedTags = array('a', 'b', 'i', 'u', 'strong', 'em',
'sub', 'sup', 'br', 'ul', 'ol', 'li', 'table', 'colgroup', 'col',
'thead', 'tbody', 'tr', 'td', 'img');
/**
* Attributes that are allowed
*/
private $allowedAttributes = array(
'a' => array('href'),
'col' => array('width'),
'td' => array('rowspan', 'colspan', 'bgcolor', 'align'),
'img' => array('src', 'width', 'height', 'alt')
);
/**
* Tag stack is used to balance tags
*/
private $tagStack = array();
/**
* Tags which should always be self-closing (eg. "<img />")
*/
private $selfCloseTags = array('img', 'br', 'col');
/**
* Attributes which contain URLs, and should be checked for valid protocols
*/
private $protocolAttributes = array('src', 'href');
/**
* Protocols which are allowed
*/
private $allowedProtocols = array('http');
/**
* Should comments be removed?
*/
private $stripComments = true;
private $output = ''; // Safe HTML
public function openTag($tagName, $attributes) {
// Ignore tags that are not white listed
if (!in_array($tagName, $this->allowedTags)) {
return;
}
array_push($this->tagStack, $tagName);
$this->output .= '<' . $tagName;
$allowedAttributes = $this->allowedAttributes[$tagName];
if (isset($allowedAttributes) && is_array($allowedAttributes)) {
foreach ($attributes as $name => $value) {
if (in_array($name, $allowedAttributes)) {
// If its a protocol attribute, check if its an allowed protocol
if (in_array($name, $this->protocolAttributes)) {
foreach ($this->allowedProtocols as $protocol) {
$testOn = substr($value, 0, strlen($protocol));
if ($testOn === $protocol) {
$this->output .= ' ' . $name . '="' . $value . '"';
}
}
} else {
$this->output .= ' ' . $name . '="' . $value . '"';
}
}
}
}
if (in_array($tagName, $this->selfCloseTags)) {
$this->output .= ' /';
}
$this->output .= '>';
}
public function closeTag($tagName) {
if (!in_array($tagName, $this->tagStack)) {
// Orphan close tag, ignore
return;
}
while (true) {
if (count($this->tagStack) === 0) {
break;
}
$popTag = array_pop($this->tagStack);
if ($popTag === $tagName) {
break;
}
$this->output .= '</' . $popTag . '>';
}
$this->output .= '</' . $tagName . '>';
}
public function comment($comment) {
if ($this->stripComments) {
return;
}
$this->output .= '<!--' . $comment . '-->';
}
public function text($text) {
$this->output .= $text;
}
private $parser;
public function __construct() {
$this->parser = new HtmlParser;
}
public function filter($html) {
$this->parser->parse($html, $this);
return $this->output;
}
}
$filename = $argv[1];
$contents = file_get_contents($filename);
$filter = new HtmlFilter;
$safeHTML = $filter->filter($contents);
// Compress html
//$safeHTML = preg_replace('/\s+/', ' ', $safeHTML);
echo $safeHTML . "\n";
Refactorings
No refactoring yet !
Nick Ramsay
January 19, 2009, January 19, 2009 04:23, permalink
I'm a having the following problem with line breaks. Is there anything I can safely tweak to stop this?
Otherwise, thank you so much for the effort you've put into this!
<br /> becomes <br />/> Unparsed: hello <br /> Parsed: hello <br>/>
Nick Ramsay
January 19, 2009, January 19, 2009 05:51, permalink
Ah, a quick search and replace was all I needed:
public function filter($html) {
$html = str_replace("<br />", "<br>", $html);
$this->parser->parse($html, $this);
return $this->output;
}
grom
February 27, 2009, February 27, 2009 04:49, permalink
@Nick Ramsay, or better yet change tag function to allow self closing tags as follows:
<?php
private function tag() {
$output = $this->match('<');
// Check if close tag
$closeTag = false;
if ($this->look() === '/') {
$closeTag = true;
$output .= $this->match('/');
}
$tagName = strtolower($this->matchWord());
// If not valid tag, escape output
if (strlen($tagName) === 0) {
$this->fireText(htmlspecialchars($output));
return;
}
if ($closeTag) {
$this->match('>');
$this->fireCloseTag($tagName);
return;
}
$this->matchWhitespace();
$attributes = array();
while (!$this->lookMatches("/>") && $this->look() !== '>' && $this->pos < $this->len) {
$attribute = $this->attribute();
// Invalid attribute, finish tag
if (strlen($attribute) === 0) {
break;
}
list($attributeName, $attributeValue) = $attribute;
$attributes[$attributeName] = $attributeValue;
$this->matchWhitespace();
}
if ($this->look() === '/') {
$this->match('/');
}
if ($this->look() === '>') {
$this->match('>');
}
$this->fireOpenTag($tagName, $attributes);
}
grom
February 27, 2009, February 27, 2009 05:32, permalink
I have updated version available at http://grom.zeminvaders.net/html-sanitizer . It handles self-closing tags and closes tags that are left open. Or below is the diff to the original posting.
--- filter.php 2009-02-27 15:34:00.000000000 +1000
+++ filter2.php 2009-02-27 15:33:29.000000000 +1000
@@ -1,7 +1,11 @@
#!/usr/bin/php
<?php
/**
- * @author Cameron Zemek <cameronz@bundaberg.qld.gov.au>
+ * Script for sanitizing HTML input to only allow what is in the whitelist.
+ * Tested against majority of the hacks listed at http://ha.ckers.org/xss.html
+ *
+ * @author Cameron Zemek <grom@zeminvaders.net>
+ * @license http://opensource.org/licenses/mit-license.php MIT License
*/
/**
@@ -141,6 +145,10 @@
}
$tagName = strtolower($this->matchWord());
+ if ($tagName === 'h' && $this->look() >= '1' && $this->look() <= '6') {
+ $tagName .= $this->matchAny();
+ }
+ $output .= $tagName;
// If not valid tag, escape output
if (strlen($tagName) === 0) {
@@ -149,14 +157,22 @@
}
if ($closeTag) {
+ if ($this->look() !== '>') {
+ $this->fireText(htmlspecialchars($output));
+ return;
+ }
$this->match('>');
$this->fireCloseTag($tagName);
return;
}
- $this->matchWhitespace();
+ $ws = $this->matchWhitespace();
+ if ($ws === '' && !($this->look() === '/' || $this->look() === '>')) {
+ $this->fireText(htmlspecialchars($output));
+ return;
+ }
$attributes = array();
- while ($this->look() !== '>' && $this->pos < $this->len) {
+ while (!$this->lookMatches('/>') && $this->look() !== '>' && $this->pos < $this->len) {
$attribute = $this->attribute();
// Invalid attribute, finish tag
@@ -168,18 +184,25 @@
$attributes[$attributeName] = $attributeValue;
$this->matchWhitespace();
}
-
+ if ($this->look() === '/') {
+ $closeTag = true;
+ $this->match('/');
+ $this->match('>');
+ }
if ($this->look() === '>') {
$this->match('>');
}
$this->fireOpenTag($tagName, $attributes);
+ if ($closeTag) {
+ $this->fireCloseTag($tagName);
+ }
}
/**
* Parse attribute
*/
private function attribute() {
- $attributeName = strtolower($this->matchWord());
+ $attributeName = strtolower($this->matchHyphenWords());
if (strlen($attributeName) === 0) {
return '';
}
@@ -316,7 +339,6 @@
$input = strtolower($input);
}
if ($input !== $char) {
- // This means we have a logic coding error
throw new Exception('Invalid match');
}
return $this->input[ $this->pos++ ];
@@ -328,7 +350,6 @@
$input = strtolower($input);
}
if ($str !== $input) {
- // This means we have a logic coding error
throw new Exception('Invalid match');
}
$this->pos += strlen($str);
@@ -343,6 +364,23 @@
return $match;
}
+ private function matchHyphenWords() {
+ $words = $this->matchWord();
+ while ($this->look() === '-' && $this->pos < $this->len) {
+ // Is there another character after the hypen?
+ if ($this->pos + 1 >= $this->len) {
+ break;
+ }
+ // Is next character after hypen part of a word?
+ if (!ctype_alpha($this->input [ $this->pos + 1 ])) {
+ break;
+ }
+ // There is another word, so match the hyphen and word
+ $words .= $this->match('-') . $this->matchWord();
+ }
+ return $words;
+ }
+
private function matchWord() {
$word = '';
while (ctype_alpha($this->input[ $this->pos ]) && $this->pos < $this->len) {
@@ -410,9 +448,9 @@
private $selfCloseTags = array('img', 'br', 'col');
/**
- * Attributes which contain URLs, and should be checked for valid protocols
+ * Attributes which contain URLs
*/
- private $protocolAttributes = array('src', 'href');
+ private $urlAttributes = array('src', 'href');
/**
* Protocols which are allowed
@@ -420,6 +458,11 @@
private $allowedProtocols = array('http');
/**
+ * Are URL paths allowed, that is no protocol scheme is specified. Eg. /images/photo.jpg
+ */
+ private $urlPathAllowed = true;
+
+ /**
* Should comments be removed?
*/
private $stripComments = true;
@@ -431,20 +474,21 @@
if (!in_array($tagName, $this->allowedTags)) {
return;
}
-
- array_push($this->tagStack, $tagName);
+ if (!in_array($tagName, $this->selfCloseTags)) {
+ array_push($this->tagStack, $tagName);
+ }
$this->output .= '<' . $tagName;
$allowedAttributes = $this->allowedAttributes[$tagName];
if (isset($allowedAttributes) && is_array($allowedAttributes)) {
foreach ($attributes as $name => $value) {
if (in_array($name, $allowedAttributes)) {
// If its a protocol attribute, check if its an allowed protocol
- if (in_array($name, $this->protocolAttributes)) {
- foreach ($this->allowedProtocols as $protocol) {
- $testOn = substr($value, 0, strlen($protocol));
- if ($testOn === $protocol) {
- $this->output .= ' ' . $name . '="' . $value . '"';
- }
+ if (in_array($name, $this->urlAttributes)) {
+ $urlComponents = parse_url($value);
+ if ((isset($urlComponents['scheme']) &&
+ in_array($urlComponents['scheme'], $this->allowedProtocols)) ||
+ (!isset($urlComponents['scheme']) && $this->urlPathAllowed)) {
+ $this->output .= ' ' . $name . '="' . $value . '"';
}
} else {
$this->output .= ' ' . $name . '="' . $value . '"';
@@ -495,6 +539,10 @@
public function filter($html) {
$this->parser->parse($html, $this);
+ // Close any remaining tags on the stack
+ while ($tagName = array_pop($this->tagStack)) {
+ $this->output .= '</' . $tagName . '>';
+ }
return $this->output;
}
}
Only allow subset of HTML. I tested this against a lot of the XSS attacks listed at http://ha.ckers.org/xss.html, and it stop all the ones I tested.