[Keith Vetter] 2004-03-01 : Here's yet another way to parse an [XML] or HTML file. See also [Parsing HTML], [A little XML parser], [XML Shallow Parsing with Regular Expressions], [Playing SAX] and [Regular Expressions Are Not A Good Idea for Parsing XML, HTML, or e-mail Addresses]. This one, however, is written in pure tcl without needing any extensions. It probably doesn't handle all the XML corner cases but it's worked on all the valid XML I've thrown at it--including handling CDATA data. It's a SAX-like interface where every call to it returns four values: '''type''', '''value''', '''attr''', and '''etype''' where ''type'' is one of "XML", "PI", "TXT" or "EOF"; ''value'' is either the xml entity value or the entities' text; ''attr'' is the value of any attributes associated with the current XML entity; and '''etype''' is the type of entity--either "START", "END" or "EMPTY". ---- ##+########################################################################## # # xml.tcl -- Simple XML parser # by Keith Vetter, March 2004 # namespace eval ::XML { variable XML "" loc 0} proc ::XML::Init {xmlData} { variable XML variable loc set XML [string trim $xmlData]; regsub -all {} $XML {} XML ;# Remove all comments set loc 0 } # Returns {XML|TXT|EOF|PI value attributes START|END|EMPTY} proc ::XML::NextToken {{peek 0}} { variable XML variable loc set n [regexp -start $loc -indices {(.*?)\s*?<(/?)(.*?)(/?)>} \ $XML all txt stok tok etok] if {! $n} {return [list EOF]} foreach {all0 all1} $all {txt0 txt1} $txt \ {stok0 stok1} $stok {tok0 tok1} $tok {etok0 etok1} $etok break if {$txt1 >= $txt0} { ;# Got text set txt [string range $XML $txt0 $txt1] if {! $peek} {set loc [expr {$txt1 + 1}]} return [list TXT $txt] } set token [string range $XML $tok0 $tok1] ;# Got something in brackets if {! $peek} {set loc [expr {$all1 + 1}]} if {[regexp {^!\[CDATA\[(.*)\]\]} $token => txt]} { ;# Is it CDATA stuff? return [list TXT $txt] } # Check for Processing Instruction set type XML if {[regexp {^\?(.*)\?$} $token => token]} { set type PI } set attr "" regexp {^(.*?)\s+(.*?)$} $token => token attr set etype START ;# Entity type if {$etok0 <= $etok1} { if {$stok0 <= $stok1} { set token "/$token"} ;# Bad XML set etype EMPTY } elseif {$stok0 <= $stok1} { set etype END } return [list $type $token $attr $etype] } # ::XML::IsWellFormed # checks if the XML is well-formed )http://www.w3.org/TR/1998/REC-xml-19980210) # # Returns "" if well-formed, error message otherwise # missing: # characters: doesn't check valid extended characters # attributes: doesn't check anything: quotes, equals, unique, etc. # text stuff: references, entities, parameters, etc. # doctype internal stuff # proc ::XML::IsWellFormed {} { set result [::XML::_IsWellFormed] set ::XML::loc 0 return $result } ;proc ::XML::_IsWellFormed {} { array set emsg { XMLDECLFIRST "The XML declaration must come first" MULTIDOCTYPE "Only one DOCTYPE is allowed" INVALID "Invalid document structure" MISMATCH "Ending tag '$val' doesn't match starting tag" BADELEMENT "Bad element name '$val'" EOD "Only processing instructions allowed at end of document" BADNAME "Bad name '$val'" BADPI "No processing instruction starts with 'xml'" } # [1] document ::= prolog element Misc* # [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? # [27] Misc ::= Comment | PI | S # [28] doctypedecl ::= # [16] PI ::= set seen 0 ;# 1 xml, 2 pi, 4 doctype while {1} { foreach {type val attr etype} [::XML::NextToken] break if {$type eq "PI"} { if {! [regexp {^[a-zA-Z_:][a-zA-Z0-9.-_:\xB7]+$} $val]} { return [subst $emsg(BADNAME)] } if {$val eq "xml"} { ;# XMLDecl if {$seen != 0} { return $emsg(XMLDECLFIRST) } # TODO: check version number exist and only encoding and # standalone attributes are allowed incr seen ;# Mark as seen XMLDecl continue } if {[string equal -nocase "xml" $val]} {return $emsg(BADPI)} set seen [expr {$seen | 2}] ;# Mark as seen PI continue } elseif {$type eq "XML" && $val eq "!DOCTYPE"} { ;# Doctype if {$seen & 4} { return $emsg(MULTIDOCTYPE) } set seen [expr {$seen | 4}] continue } break } # [39] element ::= EmptyElemTag | STag content ETag # [40] STag ::= < Name (S Attribute)* S? > # [42] ETag ::= # [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)* # [44] EmptyElemTag ::= < Name (S Attribute)* S? /> # set stack {} set first 1 while {1} { if {! $first} { ;# Skip first time in foreach {type val attr etype} [::XML::NextToken] break } else { if {$type ne "XML" && $type ne "EOF"} { return $emsg(INVALID) } set first 0 } if {$type eq "EOF"} break ;# TODO: check attributes: quotes, equals and unique if {$type eq "TXT"} continue if {! [regexp {^[a-zA-Z_:][a-zA-Z0-9.-_:\xB7]+$} $val]} { return [subst $emsg(BADNAME)] } if {$type eq "PI"} { if {[string equal -nocase xml $val]} { return $emsg(BADPI) } continue } if {$etype eq "START"} { ;# Starting tag lappend stack $val } elseif {$etype eq "END"} { ;# if {$val ne [lindex $stack end]} { return [subst $emsg(MISMATCH)] } set stack [lrange $stack 0 end-1] if {[llength $stack] == 0} break ;# Empty stack } elseif {$etype eq "EMPTY"} { ;# } } # End-of-Document can only contain processing instructions while {1} { foreach {type val attr etype} [::XML::NextToken] break if {$type eq "EOF"} break if {$type eq "PI"} { if {[string equal -nocase xml $val]} { return $emsg(BADPI) } continue } return $emsg(EOD) } return "" } ################################################################ # # Demo code # set xml { Geocache http://www.geocaching.com/seek/cache_details.aspx?wp=GCGPXK Geocache http://www.geocaching.com/seek/cache_details.aspx?wp=GC19DF } ::XML::Init $xml set wellFormed [::XML::IsWellFormed] if {$wellFormed ne ""} { puts "The xml is not well-formed: $wellFormed" exit 1 } puts "The xml is well-formed" while {1} { foreach {type val attr etype} [::XML::NextToken] break puts "looking at: $type '$val' '$attr' '$etype'" if {$type == "EOF"} break } ---- [Steve Ball]: Comment #1: This looks like an xmlTextReader-style interface (ie. looping and reading one token at a time). That's interesting to me because I'm now using libxml2's xmlTextReader interface in TclXML and am considering introducing it at the scripting level. Comment #2: Your code above will work fine as long as the input XML is well-formed. It has absolutely no error checking at all! Error checking is where all the hard work is... [KPV] True on both accounts. I was working with validated XML so error checking was not important. But what do you expect from 25 lines of code? [MSW] It's tcl, so I expect a fully featured all-in-one device suitable for every purpose (aka ``eierlegende Wollmilchsau'' :) ---- [KPV] In response to comment #2 above, I added a routine to check for well-formedness. It checks for most of the major well-formed constraints such as properly nesting tags (that's easy), proper prolog, valid names, etc. It '''doesn't''' handle: names with extended unicode characters, anything to do with attributes (quotes, unique, character sets, etc.), text stuff like references, entities, parameters, etc., nor doctype internal stuff. Now I remember how much I hate XML. At first blush, XML seems so simple, but once you dig deep there's so many weird gotchas. Checking for well-formedness quadrupled the size of the code and even so it's not complete. I love XML's design goal #6 [http://www.w3.org/TR/2004/REC-xml-20040204/#sec-origin-goals] ''XML documents should be human-legible and reasonably clear''. ---- [Category XML] | [Category Internet] | [Category Package]