XML as dict structure

wdb This tool translates XML elements to Tcl dicts where the keys are element, attribute, and content. Key element carries name such as rss; key attribute carries attributes such as href or id; key contents list of contained elements or dicts with key cdata and value text.

  • procedure xmlToDict (src) transforms src to dict structure.
  • procedure getElementByName (dict name) returns first contained element named name.
  • procedure getElementById (dict id) returns element of dict with attribute id.
  • procedure getAllElementsByName (dict name) returns all contained elements named name.

Currently no write procedures. License OLL like always.

#
# file: XmlDict-0.1.tm
#

package provide XmlDict 0.1

namespace eval XmlDict {
  namespace import ::tcl::mathop::+ ::tcl::mathop::-
  namespace export get* xml*
}

proc ::XmlDict::splitByPattern {src {pat {<!\[CDATA\[(?!\]\]>).*?\]\]>}}} {
  set ranges [regexp -inline -indices -all $pat $src]
  if {$ranges eq ""} then {
    set src
  } else {
    lappend result0 [string range $src 0 [- [lindex $ranges 0 0] 1]]
    foreach\
      range0 [lrange $ranges 0 end-1]\
      range1 [lrange $ranges 1 end] {
      lappend result0 [string range $src {*}$range0]
      lassign $range0 - a
      lassign $range1 b c 
      lappend result0 [string range $src [+ $a 1] [- $b 1]]
    }
    lappend result0 [string range $src {*}[lindex $ranges end]]\
      [string range $src [+ [lindex $ranges end end] 1] end]
    foreach str $result0 {
      if {$str ne ""} then {
        lappend result $str
      }
    }
    lappend result
  }
}

proc ::XmlDict::splitByMarkup src {
  set src [regsub -all {<{1,1}?!--.*?-->} $src ""]
  set parts [splitByPattern $src]
  foreach part $parts {
    if {[string match {<!\[CDATA\[*\]\]>} $part]} then {
      lappend result $part
    } else {
      lappend result {*}[splitByPattern $part {<[^>]+>}]
    }
  }
  lappend result
}

proc ::XmlDict::attributes tag {
  set atts0 [regexp -inline -all {\S+="[^"]*"} $tag]
  set atts1 [regexp -inline -all {\S+='[^']*'} $tag]
  lappend result
  foreach attStr $atts0 {
    regexp {^(\S+)="([^"]+)"$} $attStr - tag val
    lappend result $tag $val
  }
  foreach attStr $atts1 {
    regexp {^(\S+)='([^']+)'$} $attStr - tag val
    lappend result $tag $val
  }
  set result
}

proc ::XmlDict::tokensDump src {
  lappend result
  foreach part [splitByMarkup $src] {
    if {![string match <* $part]} then {
      lappend result [list cdata $part]
    } elseif {[regexp {^<!\[CDATA\[(.*)\]\]>$} $part - cdata]} then {
      lappend result [list cdata [string map {& &amp; < &lt; > &gt;} $cdata]]
    } elseif {[string match </* $part]} then {
      lappend result close
    } elseif {[regexp {<([[:alnum:]]+)[^>]*?>} $part - name]} then {
      lappend result [list element $name atts [attributes $part]]
      if {[string match */> $part]} then {
        lappend result close
      }
    }
  }
  set result
}

proc ::XmlDict::dumpToList dump {
  lappend result
  while {[llength $dump] > 0} {
    set dump [lassign $dump first]
    switch -exact -- [lindex $first 0] {
      element {
        set name [dict get $first element]
        set atts [dict get $first atts]
        lassign [dumpToList $dump] dump cont
        lappend result [list element $name attribute $atts content $cont]
      }
      cdata {
        lappend result $first
      }
      close {
        return [list $dump $result]
      }
    }
  }
  list $result $dump
}

proc ::XmlDict::xmlToDict xmlSrc {
  lassign [dumpToList [tokensDump $xmlSrc]] dump
  while {[llength $dump] > 0 &&
         [lindex $dump 0 0] ne "element"} {
    set dump [lrange $dump 1 end]
  }
  lindex $dump 0
}

proc ::XmlDict::childElements tree {
  if {[dict exists $tree content]} then {
    dict get $tree content
  }
}

proc ::XmlDict::getElementByName {tree name} {
  if {[dict exists $tree element] &&
      [string match $name [dict get $tree element]]} then {
    set tree
  } else {
    foreach child [childElements $tree] {
      set result [getElementByName $child $name]
      if {$result ne ""} then {
        return $result
      }
    }
  }
}

proc ::XmlDict::getElementById {tree id} {
  if {[dict exists $tree attribute id] &&
      [string match $id [dict get $tree attribute id]]} then {
    return $tree
  } else {
    foreach child [childElements $tree] {
      set result [getElementById $child $id]
      if {$result ne ""} then {
        return $result
      }
    }
  }
}

proc ::XmlDict::getAllElementsByName {tree name} {
  lappend result
  if {[dict exists $tree element] &&
      [string match $name [dict get $tree element]]} then {
    lappend result $tree
  }
  foreach el [childElements $tree] {
    lappend result {*}[getAllElementsByName $el $name]
  }
  set result
}

proc ::XmlDict::decodeEntities cdata {
  set entities\
    [lsort -unique [regexp -inline -all {&[^;]*;} $cdata]]
  lappend map "&quot;" \u0022 {*}{&amp; & &apos; ' &lt; <; &gt; >}
  foreach entity $entities {
    if {[regexp {&#([[:digit:]]+);} $entity - dec]} then {
      lappend map $entity [format %c $dec]
    } elseif {[regexp {&#x([[:xdigit:]]+);} $entity - hex]} then {
      set dec [scan $hex %x]
      lappend map $entity [format %c $dec]
    }
  }
  string map $map $cdata
}

proc ::XmlDict::getCdataFromElement tree {
  set result ""
  foreach el [dict get $tree content] {
    if {[dict exists $el cdata]} then {
      append result [decodeEntities [dict get $el cdata]]
    } else {
      append result [getCdataFromElement $el]
    }
  }
  set result
}

namespace import ::XmlDict::*