Parsing HTML

jcw - Here's a little script which turns a set of HTML/XML tags into a series of array changes. By setting a trace on the entire array or on specific elements, one can turn this into a SAX-like series of events:

 # htmlutil.tcl - by Jean-Claude Wippler, September 2001
  package provide htmlutil 0.1

 # htmlparse - scan HTML/XML text, reporting it as a series of array writes
 #
 # Arguments:
 #   text        the markup to scan
 #   aref        name of an array in the caller's scope (default "html")
 #   ignorecase  when true (the default), tag names are folded to upper case
 #
 # Array elements written, in document order:
 #   ()        set to "" once, before scanning starts
 #   (<text>)  the text preceding each tag (may be empty)
 #   (TAG)     an opening tag; the value is whatever followed the tag name
 #   (/TAG)    a tag being closed; the value is what its opening tag carried
 #
 # Closing a tag also closes any tags still open inside it.  A closing tag
 # which matches no open tag is ignored.  Returns an empty string.
  proc htmlparse {text {aref html} {ignorecase 1}} {
    upvar $aref avar
    set avar() ""

    # drop comments, then add an empty closing tag as a sentinel: it flushes
    # the trailing text and closes whatever is still open at the end
    regsub -all {<!--.*?-->} $text {} text
    append text </>

    set tags ""   ;# stack of currently open tag names
    set hist ""   ;# attribute text of each open tag, parallel to tags

    foreach {match before tag} [regexp -all -inline {(.*?)<(.*?)>} $text] {
      set avar(<text>) $before

      # split "name attributes" into its two parts; a tag without
      # whitespace is left untouched and gets empty attribute text
      set attrs ""
      regexp {^(\w+)\s(.*)} $tag - tag attrs
      if {$ignorecase} {
        set tag [string toupper $tag]
      }

      if {[regexp {^/(.*)} $tag - closing]} {
        set closing [string trim $closing]
        # Only unwind when the tag is really open, or for the empty end
        # sentinel.  Unwinding for a stray closing tag would close every
        # open element and lose the genuine closing tags that follow.
        if {[string equal $closing ""] ||
            [lsearch -exact $tags $closing] >= 0} {
          while {[llength $tags]} {
            set top [lindex $tags end]
            set avar(/$top) [lindex $hist end]
            set tags [lreplace $tags end end]
            set hist [lreplace $hist end end]
            if {[string equal $top $closing]} break
          }
        }
        # to report unbalanced closing tags instead of ignoring them, set
        # avar($tag) to an empty string when the test above fails
      } else {
        set avar($tag) $attrs
        lappend tags $tag
        lappend hist $attrs
      }
    }
  }

 # code below runs when this is launched as the main script
 # (argv0 is checked for existence first, since an interpreter which
 # loads this file as a package need not define it)
  if {[info exists argv0] && [file root [file tail $argv0]] eq "htmlutil"} {
    # write-trace callback: print each array change as a "set" command.
    #   r   name of the traced array as seen by the code doing the write
    #   e   name of the element which was written
    #   op  the traced operation (not used)
    proc show {r e op} {
      upvar $r a
      puts [list set html($e) $a($e)]
    }

    # report every write to the default result array used by htmlparse
    trace add variable html write show

    set in {a<b c>d<e f>g<e h>i</e>j</e>k<e l>m</b>n</o>p}
    puts "Parsing: $in"

    # htmlparse returns an empty string, so this puts only adds a blank
    # line after the traced output
    puts [htmlparse $in]
  }

Output:

  Parsing: a<b c>d<e f>g<e h>i</e>j</e>k<e l>m</b>n</o>p
  set html() {}
  set html(<text>) a
  set html(B) c
  set html(<text>) d
  set html(E) f
  set html(<text>) g
  set html(E) h
  set html(<text>) i
  set html(/E) h
  set html(<text>) j
  set html(/E) f
  set html(<text>) k
  set html(E) l
  set html(<text>) m
  set html(/E) l
  set html(/B) c
  set html(<text>) n
  set html(<text>) p

Dec 2003: Found Carsten Zerbst's 2002 article about tDOM in Linux Magazine, which also shows an HTML -> DOM -> XPath example: http://www.linux-magazine.com/issue/20/tDOM.pdf

Tclgumbo is a Tcl extension for parsing HTML.

Category Internet

Category Package

Category Parsing