Version 3 of Parsing XML

Updated 2004-03-01 16:32:00

Keith Vetter 2004-03-01 : Here's yet another way to parse an XML or HTML file. See also "Parsing HTML", "A little XML parser", "XML Shallow Parsing with Regular Expressions", "Playing SAX", and "Regular Expressions Are Not A Good Idea for Parsing XML, HTML, or e-mail Addresses".

This one, however, is written in pure Tcl without needing any extensions. It probably doesn't handle all the XML corner cases, but it has worked on all the valid XML I've thrown at it -- including CDATA sections.

It's a SAX-like interface where every call returns three values: type, value, and attr. Here type is one of "XML", "TXT" or "EOF"; value is either the XML tag name (for "XML") or the text content (for "TXT"); and attr is the attribute string, if any, of the current XML tag.


 # ::XML -- a small pure-Tcl tokenizer for XML text.
 #
 # Namespace state:
 #   XML  the document text as prepared by ::XML::Init
 #   loc  index into XML at which the next scan begins
 namespace eval ::XML { variable XML "" loc 0}

 # ::XML::Init --
 #   Load a document and reset the scan position.
 #
 # Arguments:
 #   xmlData  the XML text to tokenize
 #
 # Side effects:
 #   Stores the trimmed text in ::XML::XML with every <!-- ... --> span
 #   removed, and sets ::XML::loc to 0.  The removal is applied to the
 #   whole string, so a comment-like span inside a CDATA section is
 #   removed too.
 proc ::XML::Init {xmlData} {
    variable XML
    variable loc

    set XML [string trim $xmlData]
    regsub -all {<!--.*?-->} $XML {} XML        ;# Remove all comments
    set loc 0
 }

 # ::XML::NextToken --
 #   Return the next token of the document loaded by ::XML::Init.
 #
 # Arguments:
 #   peek  when true, report the token without advancing the scan
 #         position (default 0)
 #
 # Results:
 #   A three element list: type value attr
 #     XML  value is the bracketed text up to its first whitespace (a
 #          closing tag keeps its leading slash); attr is whatever
 #          follows that whitespace, or empty
 #     TXT  value is character data or the body of a CDATA section;
 #          attr is empty
 #     EOF  no further bracketed token was found; value and attr are empty
 proc ::XML::NextToken {{peek 0}} {
    variable XML
    variable loc

    # txt captures any text in front of the next "<" (whitespace just
    # before the "<" is left out); tok captures what sits between the
    # "<" and the first ">" after it.
    set n [regexp -start $loc -indices {(.*?)\s*?<(.*?)/?>} $XML all txt tok]
    if {! $n} {return [list EOF "" ""]}
    foreach {all0 all1} $all {txt0 txt1} $txt {tok0 tok1} $tok break

    if {$txt1 >= $txt0} {                       ;# Got text
        set txt [string range $XML $txt0 $txt1]
        if {! $peek} {set loc [expr {$txt1 + 1}]}
        return [list TXT $txt ""]
    }

    # A CDATA body may itself contain ">" or "<", so the token boundary
    # found above (first ">") cannot be trusted for it.  When the token
    # starts with the CDATA opener, look for the real terminator instead.
    set cdataOpen {![CDATA[}
    set openLen [string length $cdataOpen]
    set head [string range $XML $tok0 [expr {$tok0 + $openLen - 1}]]
    if {[string equal $head $cdataOpen]} {
        set bodyStart [expr {$tok0 + $openLen}]
        set bodyEnd [string first {]]>} $XML $bodyStart]
        if {$bodyEnd >= 0} {
            if {! $peek} {set loc [expr {$bodyEnd + 3}]}
            set txt [string range $XML $bodyStart [expr {$bodyEnd - 1}]]
            return [list TXT $txt ""]
        }
        # No terminator anywhere: fall through to the token-based handling.
    }

    set token [string range $XML $tok0 $tok1]   ;# Got something in brackets
    if {! $peek} {set loc [expr {$all1 + 1}]}
    if {[regexp {^!\[CDATA\[(.*)\]\]} $token => txt]} { ;# Is it CDATA stuff?
        return [list TXT $txt ""]
    }
    set attr ""
    # Split "name attributes" at the first run of whitespace; with no
    # whitespace the regexp fails and token/attr keep their values.
    regexp {^(.*?)\s+(.*?)$} $token => token attr
    return [list XML $token $attr]
 }


 # Demo: run the tokenizer over a small two-waypoint document
 set xml {<?xml version="1.0" encoding="ISO-8859-1"?>
 <loc version="1.0" src="Groundspeak">
 <waypoint>
  <name id="GCGPXK"><![CDATA[Playing Poker with the Squirrels by Rino 'n Rinette]]></name>
  <coord lat="40.1548166" lon="-82.5202833"/>
  <type>Geocache</type>
  <link text="Cache Details">http://www.geocaching.com/seek/cache_details.aspx?wp=GCGPXK</link> 
 </waypoint><waypoint>
  <name id="GC19DF"><![CDATA[Great Playground Caper by Treasure Hunters Inc.]]></name>
  <coord lat="40.0667166666667" lon="-82.5358"/>
  <type>Geocache</type>
  <link text="Cache Details">http://www.geocaching.com/seek/cache_details.aspx?wp=GC19DF</link>
 </waypoint>
 </loc>
 }

 # Print every token in turn; the EOF token is printed too, then the
 # loop condition ends the run.
 ::XML::Init $xml
 set type ""
 while {![string equal $type "EOF"]} {
    foreach {type val attr} [::XML::NextToken] break
    puts "looking at: $type '$val' '$attr'"
 }

Category XML | Category Internet | Category Package