Version 0 of Stephen Uhler's HTML parser in 10 lines

Updated 2005-07-27 00:05:53

EKB: This is a follow-up to the discussion on the page "Is Tcl Different!".

Here's the wonderful HTML parser in 10 lines as posted on that page:

    ############################################
    # Turn HTML into TCL commands
    #   html    A string containing an html document
    #   cmd     A command (prefix) to run for each html tag found.
    #           It is invoked as:  cmd tag slash params text
    #             tag     the tag name
    #             slash   "/" for a closing tag, empty for an opening tag
    #             params  the raw text between the tag name and the ">"
    #             text    the document text that follows, up to the next tag
    #   start   The name of the dummy html start/stop tags
    #
    # Open braces, close braces and backslashes in the document reach cmd
    # as the entities &ob; &cb; and &bsl; respectively, because the
    # document is converted into a brace-quoted Tcl script and evaluated.
    # The backslash escaping is an addition to the originally posted
    # 10-line version: without it a backslash directly in front of a tag
    # (or ending a tag's parameters) escapes the generated close brace and
    # the eval fails with a missing close-brace error.
    #
    # Side effect: (re)defines the helper proc HMcl on every call.

    proc HMparse_html {html {cmd HMtest_parse} {start hmstart}} {
        # Neutralise every character that is special inside a brace-quoted
        # word, so document text can never unbalance or escape the quoting.
        regsub -all \{ $html {\&ob;} html
        regsub -all \} $html {\&cb;} html
        regsub -all {\\} $html {\&bsl;} html
        set w " \t\r\n"        ;# white space
        # HMcl wraps its argument in square brackets, letting the pattern
        # below be written without backslash-quoting each bracket.
        proc HMcl x {return "\[$x\]"}
        # Sub-matches: 1 = optional "/", 2 = tag name, 3 = tag parameters.
        set exp <(/?)([HMcl ^$w>]+)[HMcl $w]*([HMcl ^>]*)>
        # Each tag closes the text argument of the previous command and
        # starts a new command whose last argument is the text that follows.
        set sub "\}\n$cmd {\\2} {\\1} {\\3} \{"
        regsub -all $exp $html $sub html
        # The dummy start tag supplies the first command, which receives
        # any text that precedes the first real tag.
        eval "$cmd {$start} {} {} \{ $html \}"
        eval "$cmd {$start} / {} {}"
    }

The post did not include HMtest_parse, the procedure named as the default value of cmd, so I wrote one and applied the parser to a sample bit of HTML:

   # Default callback for HMparse_html: prints one report per tag.
   #   tag     the tag name
   #   state   empty for an opening tag, non-empty for a closing tag
   #   props   the tag's parameter text (may be empty)
   #   body    the text that follows an opening tag
   proc HMtest_parse {tag state props body} {
       # Closing tags only announce themselves.
       if {[string length $state] > 0} {
           puts "End $tag"
           return
       }
       # Opening tags list their parameters, if any, then the text.
       set report "Start $tag"
       if {[string length $props] > 0} {
           append report " with args: $props"
       }
       append report "\n$body"
       puts $report
   }

   # Run the parser on a small sample document. Only the html argument is
   # given, so cmd and start take their defaults (HMtest_parse and hmstart).
   # The whitespace inside the quoted document is part of the input and is
   # handed to the callback as text.
   HMparse_html {
      <html>
        <p class="bubba">
        This is my very first paragraph. How do you
        like it? I think it has a lot to recommend it.
        </p>
        <p class="louielouie">
        This is my second paragraph, which is OK,
        but not as nice as my first one.
        </p>
      </html>
   }

This gives the following output:

 Start hmstart


 Start html


 Start p with args: class="bubba"

        This is my very first paragraph. How do you
        like it? I think it has a lot to recommend it.

 End p
 Start p with args: class="louielouie"

        This is my second paragraph, which is OK,
        but not as nice as my first one.

 End p
 End html
 End hmstart

In fact, the code is not HTML-specific: it can also handle simple XML, as long as the XML does not use the self-closing <tag/> form. It's like a mini-SAX.