Version 2 of TAX: A Tiny API for XML

Updated 2005-07-30 02:35:47

TAX was inspired by Stephen Uhler's HTML parser in 10 lines. In fact, the code is almost exactly the same, with just a couple of extra bells and whistles.

Here's the essential code:

    ############################################################
    #
    # Based heavily on Stephen Uhler's HTML parser in 10 lines
    # Modified by Eric Kemp-Benedict for XML
    #
    # Turn XML into TCL commands
    #   xml     A string containing an XML document
    #   cmd     A command to run for each tag found
    #   start   The name of the dummy start/stop tag
    #
    # Namespace "tax" stands for "Tiny API for XML"
    #

    namespace eval tax {}

    # tax::parse --
    #
    #   Rewrites an XML string into a Tcl script and evaluates it, so that
    #   the handler is invoked once per tag as
    #       cmd tag cl selfcl props body
    #
    # Arguments:
    #   cmd     Command prefix invoked for every tag found.
    #   xml     String containing the XML text.
    #   start   Name of the dummy tag reported before the first real tag
    #           and again, as a closing tag, after the last one
    #           (default: docstart).
    #
    # Side effects:
    #   Whatever cmd does; the generated script is run with eval.
    proc tax::parse {cmd xml {start docstart}} {
        # Literal braces would unbalance the generated script, so they are
        # replaced by the placeholders &ob; and &cb; before anything else.
        regsub -all \{ $xml {\&ob;} xml
        regsub -all \} $xml {\&cb;} xml
        # Groups: 1 = slash of a closing tag, 2 = tag name,
        # 3 = attribute text, 4 = slash of a self-closing tag.
        set exp {<(/?)([^\s/>]+)\s*([^/>]*)(/?)>}
        # Each tag ends the brace of the preceding body, emits a call to cmd
        # and opens a brace for the text that follows. The inner regsub
        # turns whitespace runs and equals signs in the attribute text into
        # single spaces, giving a name/value list.
        set sub "\}\n$cmd {\\2} \[expr \{{\\1} ne \"\"\}\] \[expr \{{\\4} ne \"\"\}\] \
            \[regsub -all -- \{\\s+|(\\s*=\\s*)\} {\\3} \" \"\] \{"
        regsub -all $exp $xml $sub xml
        eval "$cmd {$start} 0 0 {} \{ $xml \}"
        # The final dummy call is a closing tag, not a self-closing one.
        eval "$cmd {$start} 1 0 {} {}"
    }

To use it, create a parser command, cmd, that will handle any tag found in the string xml. The parser calls cmd in the following way:

 cmd tag cl selfcl props body

where

  • tag is the tag name (e.g., p, br, or h1 in HTML) or the special tag "docstart", which is reported once before the first tag and once, as a closing tag, after the last
  • cl is a boolean saying if this is a closing tag (e.g., like </p>)
  • selfcl is a boolean saying if this is a self-closing tag (e.g., <br/> for XHTML)
  • props is a list of name/value pairs that can be passed to an array using array set
  • body is the text that follows the tag, up to the next tag (e.g., for <p>My text</p>, "My text" is the body passed with the opening <p>); literal braces in the input appear in it as &ob; and &cb;

Here's an example of use (that also uses snit to build the parser -- there's one snit method for each tag).

    package require snit

    ############################################################
    #
    # Based heavily on Stephen Uhler's HTML parser in 10 lines
    # Modified by Eric Kemp-Benedict for XML
    #
    # Turn XML into TCL commands
    #   xml     A string containing an XML document
    #   cmd     A command to run for each tag found
    #   start   The name of the dummy start/stop tag
    #
    # Namespace "tax" stands for "Tiny API for XML"
    #

    namespace eval tax {}

    # tax::parse --
    #
    #   Converts the XML text into a sequence of handler calls of the form
    #       cmd tag cl selfcl props body
    #   and evaluates them, bracketed by an opening and a closing call for
    #   the dummy tag named by start.
    #
    # Arguments:
    #   cmd     Command prefix run for each tag.
    #   xml     The XML text to process.
    #   start   Name of the dummy document tag (default: docstart).
    proc tax::parse {cmd xml {start docstart}} {
        # Swap literal braces for placeholders so that they cannot unbalance
        # the script assembled below.
        set xml [string map [list \{ {&ob;} \} {&cb;}] $xml]

        # Groups: 1 = closing slash, 2 = tag name, 3 = attribute text,
        # 4 = self-closing slash.
        set tagPattern {<(/?)([^\s/>]+)\s*([^/>]*)(/?)>}

        # Pieces of the per-tag call, kept literal until the script is run.
        set closingFlag {[expr {{\1} ne ""}]}
        set selfClosingFlag {[expr {{\4} ne ""}]}
        set attrList {[regsub -all -- {\s+|(\s*=\s*)} {\3} " "]}

        # A tag ends the previous body, calls the handler, and opens the
        # brace that collects the text up to the next tag.
        set callTemplate "\}\n$cmd {\\2} $closingFlag $selfClosingFlag $attrList \{"

        set script [regsub -all $tagPattern $xml $callTemplate]
        eval "$cmd {$start} 0 0 {} \{ $script \}"
        eval "$cmd {$start} 1 0 {} {}"
    }

   snit::type parser {
        # Handler for the dummy document tag: prints a banner when the
        # document starts and another when it ends. The remaining handler
        # arguments are accepted but not used.
        method docstart {cl args} {
            if {$cl} {
                puts "\n...End document"
            } else {
                puts "Start document...\n"
            }
        }

        # Handler for para tags. On the opening tag, prints the body with
        # whitespace runs collapsed to single spaces, prefixed by as many
        # spaces as the indent attribute requests. Prints nothing for the
        # closing tag.
        method para {cl selfcl props body} {
            if {$props ne ""} {
                array set temp $props
            }
            if {!$cl} {
                set outstring [regsub -all -- {\s+} [string trim $body] " "]
                if {[info exists temp(indent)]} {
                    set outstring "[string repeat { } $temp(indent)]$outstring"
                }
                puts $outstring
            }
        }

        # Handler for meta tags. Prints every attribute as "Name: value",
        # then an empty line for a self-closing tag or the whitespace-
        # collapsed body otherwise. There is no check on cl, so the call
        # made for a closing tag prints the text that follows that tag.
        method meta {cl selfcl props body} {
            if {$props ne ""} {
                array set temp $props
            }
            foreach item [array names temp] {
                puts "[string totitle $item]: $temp($item)"
            }
            if {!$selfcl} {
                puts [regsub -all -- {\s+} [string trim $body] " "]
            } else {
                puts ""
            }
        }
   }

   # Create a parser instance; its name is passed below as the handler
   # command.
   parser myparser

   # Parse the sample document. tax::parse calls myparser once per tag with
   # the tag name as first argument, so each tag is handled by the method of
   # the same name.
   tax::parse myparser {
    <meta author="Anne Onymous"/>
    <meta>
        Composed in haste for purposes of demonstration.
    </meta>
    <para indent="3">
      This is an indented paragraph. Only the first line
      is indented, which you can tell if the paragraph goes
      on long enough.
    </para>
    <para>
      This is an ordinary paragraph. No line is indented. Not
      one. None at all, which you can tell if the paragraph
      goes on long enough.
    </para>
   }

It gives this output:


Start document...

Author: Anne Onymous

Composed in haste for purposes of demonstration.

   This is an indented paragraph. Only the first line is indented, which you can tell if the paragraph goes on long enough.

This is an ordinary paragraph. No line is indented. Not one. None at all, which you can tell if the paragraph goes on long enough.

...End document


Category XML Category Word and Text Processing Category Internet