<?xml version="1.0" encoding="UTF-8"?>
<feed xml:lang="en-US" xmlns="http://www.w3.org/2005/Atom">
  <id>tag:www.refactormycode.com,2007:users1112</id>
  <link type="application/atom+xml" href="http://www.refactormycode.com/users/1112" rel="self"/>
  <title>grom</title>
  <updated>2009-02-27T05:32:07-08:00</updated>
  <entry>
    <id>tag:www.refactormycode.com,2007:Refactor148941</id>
    <published>2009-02-27T05:32:07-08:00</published>
    <title>[PHP] On HTML Filter</title>
    <content type="html">&lt;p&gt;I have an updated version available at &lt;a href="http://grom.zeminvaders.net/html-sanitizer" target="_blank"&gt;http://grom.zeminvaders.net/html-sanitizer&lt;/a&gt;. It handles self-closing tags and closes tags that are left open. Alternatively, below is the diff against the original posting.&lt;/p&gt;

&lt;pre&gt;--- filter.php	2009-02-27 15:34:00.000000000 +1000
+++ filter2.php	2009-02-27 15:33:29.000000000 +1000
@@ -1,7 +1,11 @@
 #!/usr/bin/php
 &amp;lt;?php
 /**
- * @author Cameron Zemek &amp;lt;cameronz@bundaberg.qld.gov.au&amp;gt;
+ * Script for sanitizing HTML input to only allow what is in the whitelist.
+ * Tested against majority of the hacks listed at http://ha.ckers.org/xss.html
+ *
+ * @author Cameron Zemek &amp;lt;grom@zeminvaders.net&amp;gt;
+ * @license http://opensource.org/licenses/mit-license.php MIT License
  */
 
 /**
@@ -141,6 +145,10 @@
         }
 
         $tagName = strtolower($this-&amp;gt;matchWord());
+        if ($tagName === 'h' &amp;amp;&amp;amp; $this-&amp;gt;look() &amp;gt;= '1' &amp;amp;&amp;amp; $this-&amp;gt;look() &amp;lt;= '6') {
+            $tagName .= $this-&amp;gt;matchAny();
+        }
+        $output .= $tagName;
 
         // If not valid tag, escape output
         if (strlen($tagName) === 0) {
@@ -149,14 +157,22 @@
         }
 
         if ($closeTag) {
+            if ($this-&amp;gt;look() !== '&amp;gt;') {
+                $this-&amp;gt;fireText(htmlspecialchars($output));
+                return;
+            }
             $this-&amp;gt;match('&amp;gt;');
             $this-&amp;gt;fireCloseTag($tagName);
             return;
         }
 
-        $this-&amp;gt;matchWhitespace();
+        $ws = $this-&amp;gt;matchWhitespace();
+        if ($ws === '' &amp;amp;&amp;amp; !($this-&amp;gt;look() === '/' || $this-&amp;gt;look() === '&amp;gt;')) {
+            $this-&amp;gt;fireText(htmlspecialchars($output));
+            return;
+        }
         $attributes = array();
-        while ($this-&amp;gt;look() !== '&amp;gt;' &amp;amp;&amp;amp; $this-&amp;gt;pos &amp;lt; $this-&amp;gt;len) {
+        while (!$this-&amp;gt;lookMatches('/&amp;gt;') &amp;amp;&amp;amp; $this-&amp;gt;look() !== '&amp;gt;' &amp;amp;&amp;amp; $this-&amp;gt;pos &amp;lt; $this-&amp;gt;len) {
             $attribute = $this-&amp;gt;attribute();
 
             // Invalid attribute, finish tag
@@ -168,18 +184,25 @@
             $attributes[$attributeName] = $attributeValue;
             $this-&amp;gt;matchWhitespace();
         }
-
+        if ($this-&amp;gt;look() === '/') {
+            $closeTag = true;
+            $this-&amp;gt;match('/');
+            $this-&amp;gt;match('&amp;gt;');
+        }
         if ($this-&amp;gt;look() === '&amp;gt;') {
             $this-&amp;gt;match('&amp;gt;');
         }
         $this-&amp;gt;fireOpenTag($tagName, $attributes);
+        if ($closeTag) {
+            $this-&amp;gt;fireCloseTag($tagName);
+        }
     }
 
     /**
      * Parse attribute
      */
     private function attribute() {
-        $attributeName = strtolower($this-&amp;gt;matchWord());
+        $attributeName = strtolower($this-&amp;gt;matchHyphenWords());
         if (strlen($attributeName) === 0) {
             return '';
         }
@@ -316,7 +339,6 @@
             $input = strtolower($input);
         }
         if ($input !== $char) {
-            // This means we have a logic coding error
             throw new Exception('Invalid match');
         }
         return $this-&amp;gt;input[ $this-&amp;gt;pos++ ];
@@ -328,7 +350,6 @@
             $input = strtolower($input);
         }
         if ($str !== $input) {
-            // This means we have a logic coding error
             throw new Exception('Invalid match');
         }
         $this-&amp;gt;pos += strlen($str);
@@ -343,6 +364,23 @@
         return $match;
     }
 
+    private function matchHyphenWords() {
+        $words = $this-&amp;gt;matchWord();
+        while ($this-&amp;gt;look() === '-' &amp;amp;&amp;amp; $this-&amp;gt;pos &amp;lt; $this-&amp;gt;len) {
+            // Is there another character after the hyphen?
+            if ($this-&amp;gt;pos + 1 &amp;gt;= $this-&amp;gt;len) {
+                break;
+            }
+            // Is the next character after the hyphen part of a word?
+            if (!ctype_alpha($this-&amp;gt;input [ $this-&amp;gt;pos + 1 ])) {
+                break;
+            }
+            // There is another word, so match the hyphen and word
+            $words .= $this-&amp;gt;match('-') . $this-&amp;gt;matchWord();
+        }
+        return $words;
+    }
+
     private function matchWord() {
         $word = '';
         while (ctype_alpha($this-&amp;gt;input[ $this-&amp;gt;pos ]) &amp;amp;&amp;amp; $this-&amp;gt;pos &amp;lt; $this-&amp;gt;len) {
@@ -410,9 +448,9 @@
     private $selfCloseTags = array('img', 'br', 'col');
 
     /**
-     * Attributes which contain URLs, and should be checked for valid protocols
+     * Attributes which contain URLs
      */
-    private $protocolAttributes = array('src', 'href');
+    private $urlAttributes = array('src', 'href');
 
     /**
      * Protocols which are allowed
@@ -420,6 +458,11 @@
     private $allowedProtocols = array('http');
 
     /**
+     * Are URL paths allowed, that is no protocol scheme is specified. Eg. /images/photo.jpg
+     */
+    private $urlPathAllowed = true;
+
+    /**
      * Should comments be removed?
      */
     private $stripComments = true;
@@ -431,20 +474,21 @@
         if (!in_array($tagName, $this-&amp;gt;allowedTags)) {
             return;
         }
-
-        array_push($this-&amp;gt;tagStack, $tagName);
+        if (!in_array($tagName, $this-&amp;gt;selfCloseTags)) {
+            array_push($this-&amp;gt;tagStack, $tagName);
+        }
         $this-&amp;gt;output .= '&amp;lt;' . $tagName;
         $allowedAttributes = $this-&amp;gt;allowedAttributes[$tagName];
         if (isset($allowedAttributes) &amp;amp;&amp;amp; is_array($allowedAttributes)) {
             foreach ($attributes as $name =&amp;gt; $value) {
                 if (in_array($name, $allowedAttributes)) {
                     // If its a protocol attribute, check if its an allowed protocol
-                    if (in_array($name, $this-&amp;gt;protocolAttributes)) {
-                        foreach ($this-&amp;gt;allowedProtocols as $protocol) {
-                            $testOn = substr($value, 0, strlen($protocol));
-                            if ($testOn === $protocol) {
-                                $this-&amp;gt;output .= ' ' . $name . '=&amp;quot;' . $value . '&amp;quot;';
-                            }
+                    if (in_array($name, $this-&amp;gt;urlAttributes)) {
+                        $urlComponents = parse_url($value);
+                        if ((isset($urlComponents['scheme']) &amp;amp;&amp;amp;
+                            in_array($urlComponents['scheme'], $this-&amp;gt;allowedProtocols)) ||
+                            (!isset($urlComponents['scheme']) &amp;amp;&amp;amp; $this-&amp;gt;urlPathAllowed)) {
+                            $this-&amp;gt;output .= ' ' . $name . '=&amp;quot;' . $value . '&amp;quot;';
                         }
                     } else {
                         $this-&amp;gt;output .= ' ' . $name . '=&amp;quot;' . $value . '&amp;quot;';
@@ -495,6 +539,10 @@
 
     public function filter($html) {
         $this-&amp;gt;parser-&amp;gt;parse($html, $this);
+        // Close any remaining tags on the stack
+        while ($tagName = array_pop($this-&amp;gt;tagStack)) {
+            $this-&amp;gt;output .= '&amp;lt;/' . $tagName . '&amp;gt;';
+        }
         return $this-&amp;gt;output;
     }
 }&lt;/pre&gt;</content>
    <author>
      <name>grom</name>
      <email>grom@zeminvaders.net</email>
    </author>
    <link type="text/html" href="http://www.refactormycode.com/codes/557-html-filter/refactors/148941" rel="alternate"/>
  </entry>
  <entry>
    <id>tag:www.refactormycode.com,2007:Refactor148940</id>
    <published>2009-02-27T04:49:21-08:00</published>
    <title>[PHP] On HTML Filter</title>
    <content type="html">&lt;p&gt;@Nick Ramsay, or better yet, change the tag function to allow self-closing tags as follows:&lt;/p&gt;

&lt;pre&gt;&amp;lt;?php
    private function tag() {
        $output = $this-&amp;gt;match('&amp;lt;');

        // Check if close tag
        $closeTag = false;
        if ($this-&amp;gt;look() === '/') {
            $closeTag = true;
            $output .= $this-&amp;gt;match('/');
        }

        $tagName = strtolower($this-&amp;gt;matchWord());

        // If not valid tag, escape output
        if (strlen($tagName) === 0) {
            $this-&amp;gt;fireText(htmlspecialchars($output));
            return;
        }

        if ($closeTag) {
            $this-&amp;gt;match('&amp;gt;');
            $this-&amp;gt;fireCloseTag($tagName);
            return;
        }

        $this-&amp;gt;matchWhitespace();
        $attributes = array();
        while (!$this-&amp;gt;lookMatches(&amp;quot;/&amp;gt;&amp;quot;) &amp;amp;&amp;amp; $this-&amp;gt;look() !== '&amp;gt;' &amp;amp;&amp;amp; $this-&amp;gt;pos &amp;lt; $this-&amp;gt;len) {
            $attribute = $this-&amp;gt;attribute();

            // Invalid attribute, finish tag
            if (strlen($attribute) === 0) {
                break;
            }

            list($attributeName, $attributeValue) = $attribute;
            $attributes[$attributeName] = $attributeValue;
            $this-&amp;gt;matchWhitespace();
        }
        if ($this-&amp;gt;look() === '/') {
            $this-&amp;gt;match('/');
        }
        if ($this-&amp;gt;look() === '&amp;gt;') {
            $this-&amp;gt;match('&amp;gt;');
        }
        $this-&amp;gt;fireOpenTag($tagName, $attributes);
    }&lt;/pre&gt;</content>
    <author>
      <name>grom</name>
      <email>grom@zeminvaders.net</email>
    </author>
    <link type="text/html" href="http://www.refactormycode.com/codes/557-html-filter/refactors/148940" rel="alternate"/>
  </entry>
  <entry>
    <id>tag:www.refactormycode.com,2007:Refactor84535</id>
    <published>2008-11-24T23:32:27-08:00</published>
    <title>[C#] On Sanitize HTML</title>
    <content type="html">&lt;p&gt;Here is an actual parser (written in PHP) that only accepts whitelisted input: &lt;a href="http://refactormycode.com/codes/557-html-filter" target="_blank"&gt;http://refactormycode.com/codes/557-html-filter&lt;/a&gt;&lt;/p&gt;

&lt;pre&gt;&lt;/pre&gt;</content>
    <author>
      <name>grom</name>
      <email>grom@zeminvaders.net</email>
    </author>
    <link type="text/html" href="http://www.refactormycode.com/codes/333-sanitize-html/refactors/84535" rel="alternate"/>
  </entry>
  <entry>
    <id>tag:www.refactormycode.com,2007:Code557</id>
    <published>2008-10-23T23:00:25-07:00</published>
    <updated>2009-02-27T05:32:07-08:00</updated>
    <title>[PHP] HTML Filter</title>
    <content type="html">&lt;p&gt;Only allows a subset of HTML. I tested this against many of the XSS attacks listed at &lt;a href="http://ha.ckers.org/xss.html" target="_blank"&gt;http://ha.ckers.org/xss.html&lt;/a&gt;, and it stopped all the ones I tested.&lt;/p&gt;

&lt;pre&gt;#!/usr/bin/php
&amp;lt;?php
/**
 * @author Cameron Zemek &amp;lt;cameronz@bundaberg.qld.gov.au&amp;gt;
 */

/**
 * Callback handler for HtmlParser
 */
interface HtmlParserHandler {
    /**
     * Callback for open tag
     *
     * @param $tagName string Tag name
     * @param $attributes array Attributes as an associative array of name =&amp;gt; value
     */
    public function openTag($tagName, $attributes);

    /**
     * Callback for close tag
     *
     * @param $tagName string Tag name
     */
    public function closeTag($tagName);

    /**
     * Callback for comment tags
     *
     * @param $comment string Comment
     */
    public function comment($comment);

    /**
     * Callback for text
     *
     * @param $text string Escaped text
     */    
    public function text($text);
}

/**
 * Handles HTML special characters (ie. &amp;lt; &amp;amp; &amp;gt; &amp;quot;) thereby making it easier
 * for the filter to remove XSS attacks.
 */
class HtmlParser {
    /**
     * Parse HTML snippet
     *
     * @param $html string HTML snippet
     * @param $handler HtmlParserHandler Callback handler
     */
    public function parse($html, HtmlParserHandler $handler) {
        $this-&amp;gt;input = $html;        
        $this-&amp;gt;pos = 0;
        $this-&amp;gt;len = strlen($html);
        $this-&amp;gt;handler = $handler;

        $text = '';
        while ($this-&amp;gt;pos &amp;lt; $this-&amp;gt;len) {
            $char = $this-&amp;gt;look();
            if ($char === '&amp;lt;') {
                if ($this-&amp;gt;lookMatches('&amp;lt;!')) {
                    // Handle HTML comment
                    $this-&amp;gt;commentBlock();
                } else {
                    // Process tag
                    $this-&amp;gt;fireText($text);
                    $text = '';
                    $this-&amp;gt;tag();
                }
            } else {
                $text .= $this-&amp;gt;char();
            }
        }
        if ($text !== '') {
            $this-&amp;gt;fireText($text);
        }
    }

    protected function fireOpenTag($tagName, $attributes) {
        $this-&amp;gt;handler-&amp;gt;openTag($tagName, $attributes);
    }

    protected function fireCloseTag($tagName) {
        $this-&amp;gt;handler-&amp;gt;closeTag($tagName);
    }

    protected function fireComment($comment) {
        $this-&amp;gt;handler-&amp;gt;comment($comment);
    }

    protected function fireText($text) {
        if (strlen($text) == '') {
            return;
        }
        $this-&amp;gt;handler-&amp;gt;text($text);
    }

    /**
     * Parse HTML comment block
     */
    private function commentBlock() {
        $comment = '';
        $this-&amp;gt;matches('&amp;lt;!');
        while ($this-&amp;gt;look() !== '&amp;gt;' &amp;amp;&amp;amp; $this-&amp;gt;pos &amp;lt; $this-&amp;gt;len) {
            if ($this-&amp;gt;lookMatches('--')) {
                $comment .= $this-&amp;gt;comment();
            } else {
                $this-&amp;gt;matchAny(); // Ignore characters outside comment
            }
        }
        $this-&amp;gt;match('&amp;gt;');
        $this-&amp;gt;fireComment($comment);
    }

    /**
     * Parse HTML comment
     */
    private function comment() {
        $comment = '';
        $this-&amp;gt;matches('--');
        while (!$this-&amp;gt;lookMatches('--') &amp;amp;&amp;amp; $this-&amp;gt;pos &amp;lt; $this-&amp;gt;len) {
            // Convert characters to stop comment hacks &amp;lt;!------&amp;gt;&amp;lt;script&amp;gt;alert('xss')&amp;lt;/script&amp;gt;--&amp;gt;
            $comment .= htmlspecialchars($this-&amp;gt;matchAny());
        }
        $this-&amp;gt;matches('--');
        return $comment;
    }

    /**
     * Parse HTML tag
     */
    private function tag() {
        $output = $this-&amp;gt;match('&amp;lt;');

        // Check if close tag
        $closeTag = false;
        if ($this-&amp;gt;look() === '/') {
            $closeTag = true;
            $output .= $this-&amp;gt;match('/');
        }

        $tagName = strtolower($this-&amp;gt;matchWord());

        // If not valid tag, escape output
        if (strlen($tagName) === 0) {
            $this-&amp;gt;fireText(htmlspecialchars($output));
            return;
        }

        if ($closeTag) {
            $this-&amp;gt;match('&amp;gt;');
            $this-&amp;gt;fireCloseTag($tagName);
            return;
        }

        $this-&amp;gt;matchWhitespace();
        $attributes = array();
        while ($this-&amp;gt;look() !== '&amp;gt;' &amp;amp;&amp;amp; $this-&amp;gt;pos &amp;lt; $this-&amp;gt;len) {
            $attribute = $this-&amp;gt;attribute();

            // Invalid attribute, finish tag
            if (strlen($attribute) === 0) {
                break;
            }

            list($attributeName, $attributeValue) = $attribute;
            $attributes[$attributeName] = $attributeValue;
            $this-&amp;gt;matchWhitespace();
        }

        if ($this-&amp;gt;look() === '&amp;gt;') {
            $this-&amp;gt;match('&amp;gt;');
        }
        $this-&amp;gt;fireOpenTag($tagName, $attributes);
    }

    /**
     * Parse attribute
     */
    private function attribute() {
        $attributeName = strtolower($this-&amp;gt;matchWord());
        if (strlen($attributeName) === 0) {
            return '';
        }
        $this-&amp;gt;matchWhitespace();
        if ($this-&amp;gt;look() === '=') {
            $this-&amp;gt;match('=');
            $this-&amp;gt;matchWhitespace();
            $attributeValue = $this-&amp;gt;attributeValue();
        } else {
            $attributeValue = null;
        }
        return array($attributeName, $attributeValue);
    }

    /**
     * Parse attribute value
     */
    private function attributeValue() {
        if ($this-&amp;gt;look() === '&amp;quot;' || $this-&amp;gt;look() === &amp;quot;'&amp;quot;) {
            $quoteChar = $this-&amp;gt;look();
            $this-&amp;gt;match($quoteChar);
            $attributeValue = $this-&amp;gt;matchUntil($quoteChar);
            $this-&amp;gt;match($quoteChar);
        } else {
            $attributeValue = '';
            while (!ctype_space($this-&amp;gt;look()) &amp;amp;&amp;amp; $this-&amp;gt;look() != '&amp;gt;' &amp;amp;&amp;amp; $this-&amp;gt;pos &amp;lt; $this-&amp;gt;len) {
                $attributeValue .= $this-&amp;gt;char();
            }
        }
        if (strlen($attributeValue) === 0) {
            return null;
        }
        return $attributeValue;
    }

    /**
     * Match character, handling special characters and character entities
     */
    private function char() {
        if ($this-&amp;gt;look() === '&amp;amp;') {
            return $this-&amp;gt;entity();
        } else {
            return htmlspecialchars($this-&amp;gt;matchAny());
        }
    }

    /**
     * Parse HTML entity
     */
    private function entity() {        
        if ($this-&amp;gt;lookMatches('&amp;amp;#x')) {
            return $this-&amp;gt;entityHex();
        } elseif ($this-&amp;gt;lookMatches('&amp;amp;#')) {
            return $this-&amp;gt;entityNumber();
        } else {
            return $this-&amp;gt;entityName();
        }
    }

    /**
     * Parse HTML entity name
     */
    private function entityName() {
        $entity = $this-&amp;gt;match('&amp;amp;') . $this-&amp;gt;matchWord();
        if (strlen($entity) === 1) {
            // Invalid entity, escape &amp;amp;
            return htmlspecialchars($entity);
        }
        if ($this-&amp;gt;look() === ';') {
            $entity .= $this-&amp;gt;match(';');
        } else {
            $entity .= ';';
        }
        return $entity;
    }

    /**
     * Parse HTML entity in number format. Eg. &amp;amp;#169;
     */
    private function entityNumber() {
        $entity = $this-&amp;gt;matches('&amp;amp;#');
        $entity .= $this-&amp;gt;matchNumber();
        $len = strlen($entity);
        if ($len &amp;lt;= 2 || $len &amp;gt; 6) {
            // Invalid entity, escape &amp;amp;
            return htmlspecialchars($entity);
        }
        if ($this-&amp;gt;look() === ';') {
            $entity .= $this-&amp;gt;match(';');
        } else {
            $entity .= ';';
        }
        return $entity;        
    }

    /**
     * Parse HTML entity in hex format. Eg. &amp;amp;#x6A;
     */
    private function entityHex() {
        $entity = $this-&amp;gt;matches('&amp;amp;#x', true);
        $entity .= $this-&amp;gt;matchHexNumber();
        $len = strlen($entity);
        if ($len &amp;lt;= 3 || $len &amp;gt; 7) {
            // Invalid entity, escape &amp;amp;
            return htmlspecialchars($entity);
        }
        if ($this-&amp;gt;look() === ';') {
            $entity .= $this-&amp;gt;match(';');
        } else {
            $entity .= ';';
        }
        return $entity; 
    }

    private function look() {
        return $this-&amp;gt;input[ $this-&amp;gt;pos ];
    }

    private function lookMatches($str, $ignoreCase = true) {
        $input = substr($this-&amp;gt;input, $this-&amp;gt;pos, strlen($str));
        if ($ignoreCase) {
            $input = strtolower($input);
        }
        return $str === $input;
    }

    private function matchAny() {
        return $this-&amp;gt;input[ $this-&amp;gt;pos++ ];
    }

    private function match($char, $ignoreCase = false) {
        $input = $this-&amp;gt;look();
        if ($ignoreCase) {
            $input = strtolower($input);            
        }
        if ($input !== $char) {
            // This means we have a logic coding error
            throw new Exception('Invalid match');
        }
        return $this-&amp;gt;input[ $this-&amp;gt;pos++ ];
    }

    private function matches($str, $ignoreCase = false) {
        $input = substr($this-&amp;gt;input, $this-&amp;gt;pos, strlen($str));
        if ($ignoreCase) {
            $input = strtolower($input);
        }
        if ($str !== $input) {
            // This means we have a logic coding error
            throw new Exception('Invalid match');
        }
        $this-&amp;gt;pos += strlen($str);
        return $str;
    }

    private function matchUntil($char) {
        $match = '';
        while ($this-&amp;gt;look() !== $char &amp;amp;&amp;amp; $this-&amp;gt;pos &amp;lt; $this-&amp;gt;len) {
            $match .= $this-&amp;gt;char();
        }
        return $match;
    }

    private function matchWord() {
        $word = '';
        while (ctype_alpha($this-&amp;gt;input[ $this-&amp;gt;pos ]) &amp;amp;&amp;amp; $this-&amp;gt;pos &amp;lt; $this-&amp;gt;len) {
            $word .= $this-&amp;gt;input[ $this-&amp;gt;pos++ ];
        }
        return $word;
    }

    private function matchNumber() {
        $num = '';
        while (ctype_digit($this-&amp;gt;input[ $this-&amp;gt;pos ]) &amp;amp;&amp;amp; $this-&amp;gt;pos &amp;lt; $this-&amp;gt;len) {
            $num .= $this-&amp;gt;input[ $this-&amp;gt;pos++ ];
        }
        return $num;
    }

    private function matchHexNumber() {
        $num = '';
        while (ctype_xdigit($this-&amp;gt;input[ $this-&amp;gt;pos ]) &amp;amp;&amp;amp; $this-&amp;gt;pos &amp;lt; $this-&amp;gt;len) {
            $num .= $this-&amp;gt;input[ $this-&amp;gt;pos++ ];
        }
        return $num;
    }

    private function matchWhitespace() {
        $ws = '';
        while (ctype_space($this-&amp;gt;input[ $this-&amp;gt;pos ]) &amp;amp;&amp;amp; $this-&amp;gt;pos &amp;lt; $this-&amp;gt;len) {
            $ws .= $this-&amp;gt;input[ $this-&amp;gt;pos++ ];
        }
        return $ws;
    }

    private $pos; // Position in input
    private $len; // Length of input
    private $input;
    private $handler;
}

class HtmlFilter implements HtmlParserHandler {
    /**
     * Allowed tags
     */
    private $allowedTags = array('a', 'b', 'i', 'u', 'strong', 'em',
        'sub', 'sup', 'br', 'ul', 'ol', 'li', 'table', 'colgroup', 'col',
        'thead', 'tbody', 'tr', 'td', 'img');

    /**
     * Attributes that are allowed
     */
    private $allowedAttributes = array(
        'a' =&amp;gt; array('href'),
        'col' =&amp;gt; array('width'),
        'td' =&amp;gt; array('rowspan', 'colspan', 'bgcolor', 'align'),
        'img' =&amp;gt; array('src', 'width', 'height', 'alt')
    );

    /**
     * Tag stack is used to balance tags
     */
    private $tagStack = array();

    /**
     * Tags which should always be self-closing (eg. &amp;quot;&amp;lt;img /&amp;gt;&amp;quot;)
     */
    private $selfCloseTags = array('img', 'br', 'col');

    /**
     * Attributes which contain URLs, and should be checked for valid protocols
     */
    private $protocolAttributes = array('src', 'href');

    /**
     * Protocols which are allowed
     */
    private $allowedProtocols = array('http');

    /**
     * Should comments be removed?
     */
    private $stripComments = true;

    private $output = ''; // Safe HTML

    public function openTag($tagName, $attributes) {
        // Ignore tags that are not white listed
        if (!in_array($tagName, $this-&amp;gt;allowedTags)) {
            return;
        }

        array_push($this-&amp;gt;tagStack, $tagName);
        $this-&amp;gt;output .= '&amp;lt;' . $tagName;
        $allowedAttributes = $this-&amp;gt;allowedAttributes[$tagName];
        if (isset($allowedAttributes) &amp;amp;&amp;amp; is_array($allowedAttributes)) {
            foreach ($attributes as $name =&amp;gt; $value) {
                if (in_array($name, $allowedAttributes)) {
                    // If its a protocol attribute, check if its an allowed protocol
                    if (in_array($name, $this-&amp;gt;protocolAttributes)) {
                        foreach ($this-&amp;gt;allowedProtocols as $protocol) {
                            $testOn = substr($value, 0, strlen($protocol));
                            if ($testOn === $protocol) {
                                $this-&amp;gt;output .= ' ' . $name . '=&amp;quot;' . $value . '&amp;quot;';
                            }
                        }
                    } else {
                        $this-&amp;gt;output .= ' ' . $name . '=&amp;quot;' . $value . '&amp;quot;';
                    }
                }
            }
        }
        if (in_array($tagName, $this-&amp;gt;selfCloseTags)) {
            $this-&amp;gt;output .= ' /';
        }
        $this-&amp;gt;output .= '&amp;gt;';
    }

    public function closeTag($tagName) {
        if (!in_array($tagName, $this-&amp;gt;tagStack)) {
            // Orphan close tag, ignore
            return;
        }
        while (true) {
            if (count($this-&amp;gt;tagStack) === 0) {
                break;
            }
            $popTag = array_pop($this-&amp;gt;tagStack);
            if ($popTag === $tagName) {
                break;
            }
            $this-&amp;gt;output .= '&amp;lt;/' . $popTag . '&amp;gt;';
        }
        $this-&amp;gt;output .= '&amp;lt;/' . $tagName . '&amp;gt;';
    }

    public function comment($comment) {
        if ($this-&amp;gt;stripComments) {
            return;
        }
        $this-&amp;gt;output .= '&amp;lt;!--' . $comment . '--&amp;gt;';
    }

    public function text($text) {
        $this-&amp;gt;output .= $text;
    }

    private $parser;
   
    public function __construct() {
        $this-&amp;gt;parser = new HtmlParser;
    }

    public function filter($html) {
        $this-&amp;gt;parser-&amp;gt;parse($html, $this);
        return $this-&amp;gt;output;
    }
}

$filename = $argv[1];
$contents = file_get_contents($filename);

$filter = new HtmlFilter;
$safeHTML = $filter-&amp;gt;filter($contents);

// Compress html
//$safeHTML = preg_replace('/\s+/', ' ', $safeHTML);

echo $safeHTML . &amp;quot;\n&amp;quot;;&lt;/pre&gt;</content>
    <author>
      <name>grom</name>
      <email>grom@zeminvaders.net</email>
    </author>
    <link type="text/html" href="http://www.refactormycode.com/codes/557-html-filter" rel="alternate"/>
  </entry>
</feed>

