Version 1 of Another minimalistic XML parser

Updated 2019-04-11 16:38:32 by AMG

wdb Yet another XML parser. Very minimalistic, just 2 (two!) procedures. Recognizes also singleton like <br />.

Procedure xmlToTree builds list of tokens, then calls procedure tokensVarToTree which destructively processes this list, resulting a dictionary.

Exempli gratia: xmlToTree <p>Hi!</p>type element name p attribute {} content {{type pcdata content Hi!}}

proc xmlToTree src1 {
  # remove comments
  set src [regsub -all <!--{1,1}?.*--> $src1 ""]
  # list of tag index pairs
  set pairs\
    [regexp -inline -indices -all {</?[[:alnum:]]+[^>]*/?>} $src]
  # list of tag strings
  set tagList [lmap pair $pairs {string range $src {*}$pair}]
  # indices of pcdata
  set strIdx [lrange [concat {*}$pairs] 1 end-1]
  # list of pcdata strings
  set strList {}
  foreach {i j} $strIdx {
    lappend strList [string range $src $i+1 $j-1]
  }
  # tokens alternating: tag, pcdata, tag, ...
  set tokens {}
  foreach tag $tagList str $strList {
    lappend tokens $tag
    if {![string is space $str]} then {
      lappend tokens $str
    }
  }
  # process list
  tokensVarToTree tokens
}

proc tokensVarToTree _tokens {
  upvar $_tokens tokens
  set openPattern {<([[:alnum:]:]+)[^>]*>}
  set emptyPattern {<([[:alnum:]:]+)[^>]*/>}
  set closePattern {</([[:alnum:]:]+)>}
  set dataPattern {^[^<]}
  # destructive
  set tokens [lassign $tokens token]
  if {[regexp $dataPattern $token]} then {
    # PCDATA
    dict set result type pcdata
    dict set result content $token
    return $result
  } elseif {[regexp $openPattern $token - name]} then {
    # TAG opening or empty
    dict set result type element
    dict set result name $name
    dict set result attribute {}
    dict set result content ""
    # attributes
    foreach {- key val}\
      [regexp -inline -all {([[:alnum:]]+)="(.*?)"} $token] {
      dict set result attribute $key $val
    }
    foreach {- key val}\
      [regexp -inline -all {([[:alnum:]]+)='(.*?)'} $token] {
      dict set result attribute $key $val
    }
  }
  if {[regexp $emptyPattern $token]} then {
    # TAG empty: done
    return $result
  }
  while {![regexp $closePattern [lindex $tokens 0]] &&
         [llength $tokens] > 0} {
    # TAG non-empty: fill contents
    dict lappend result content [tokensVarToTree tokens]
  }
  # remove closing TAG 
  set tokens [lrange $tokens 1 end]
  set result
}