[wdb] On base of [Another minimalistic XML parser] Iʼve built a tiny XML package, intended to be productive, providing these [proc]edures:
% set tree [xml parse $src]
type element name rss attribute {} content { ... }
% _
parses $src to [dict]ionary $tree like .
% set path [xml findElementById $tree important]
4 0 2
% _
returns path af child element with attribute id="important." $path is used to gain access:
% set element [xml getElement $tree {*}$path]
type element name p attribute {id important} content {{type pcdata content hi!}}
% _
Simplified for text data:
% set text [xml getText $trelement {*}$path]
hi!
% _
To find all children with name item:
% set itemPaths [xml findElementsByName $tree item]
{1 3} {1 4} {1 5}
% _
Helpful when debuging:
% xml unparse $element -pp
shows $element with proper indentation.
As mentioned above, itʼs intended for production. Iʼve built a RSS processor dealing with feeds from outer world. Badass.
======
#
# file: XML-0.1.tm
#
# XMl -- minimalistic but working XML parser
# usage: package require XML
#
# xml parse $src ?-space yes|no?
# xml getElement $tree ?n1 ?n2 ...??
# xml getText $tree ?n1 ?n2 ...??
# xml findElementById $tree $id
# xml findElementsByName $tree $name
# xml unparse $tree ?-pp?
#
package require Tcl 8.6.1
package provide xml 0.1
namespace eval xml namespace export\
parse\
unparse\
getElement\
getText\
findElementById\
findElementsByName
namespace eval xml namespace import ::tcl::mathop::+
proc ::xml::parse {src args} {
set option [dict merge {
-space no
} {*}[lmap {a b} $args {list $a $b}]]
# remove comments
set src [regsub -all <!--{1,1}?.*--> $src ""]
# encode <![CDATA[http://www.taz.de//!p4608/]]>
set cdataMap [concat {*}[lsort -unique [lmap {a b}\
[regexp -inline -all {<!\[CDATA\[(.*?)\]\]>} $src] {
list $a [string map {< < & & > >} $b]
}]]]
set src [string map $cdataMap $src]
# list of tag index pairs
set pairs\
[regexp -inline -indices -all {</?[[:alnum:]:-]+[^>]*/?>} $src]
# list of tag strings
set tagList [lmap pair $pairs {string range $src {*}$pair}]
# indices of pcdata
set strIdx [lrange [concat {*}$pairs] 1 end-1]
# list of pcdata strings
set strList {}
foreach {i j} $strIdx {
lappend strList [string range $src $i+1 $j-1]
}
# tokens alternating: tag, pcdata, tag, ...
set tokens {}
foreach tag $tagList str $strList {
lappend tokens $tag
if {[dict get $option -space]} then {
if {$str ne ""} then {
lappend tokens $str
}
} else {
if {![string is space $str]} then {
lappend tokens [string trim $str]
}
}
}
# process list
tokensVarToTree tokens
}
proc ::xml::tokensVarToTree _tokens {
lassign [info level 0] recurse
upvar $_tokens tokens
set openPattern {<([[:alnum:]:-]+)[^>]*>}
set emptyPattern {<([[:alnum:]:-]+)[^>]*/>}
set closePattern {</([[:alnum:]:-]+)>}
set dataPattern {^[^<]}
# destructive
set tokens [lassign $tokens token]
if {[regexp $dataPattern $token]} then {
# PCDATA
dict set result type pcdata
dict set result content $token
return $result
} elseif {[regexp $openPattern $token - name]} then {
# TAG opening or empty
dict set result type element
dict set result name $name
dict set result attribute {}
dict set result content ""
# attributes
foreach {- key val}\
[regexp -inline -all {([[:alnum:]]+)="([^"]*?)"} $token] {
dict set result attribute $key $val
}
foreach {- key val}\
[regexp -inline -all {([[:alnum:]]+)='([^']*?)'} $token] {
dict set result attribute $key $val
}
}
if {[regexp $emptyPattern $token]} then {
# TAG empty: done
return $result
}
while {![regexp $closePattern [lindex $tokens 0]] &&
[llength $tokens] > 0} {
# TAG non-empty: fill contents
dict lappend result content [$recurse tokens]
}
# remove closing TAG
if {[regexp $closePattern [lindex $tokens 0]]} then {
set tokens [lrange $tokens 1 end]
}
set result
}
proc ::xml::getElement {tree args} {
lassign [info level 0] recurse
if {$args eq "" ||
[dict get $tree type] eq "pcdata"} then {
set tree
} else {
set args [lassign $args index]
if {$index < [llength [dict get $tree content]]} then {
$recurse [lindex [dict get $tree content] $index] {*}$args
}
}
}
proc ::xml::findElementById {tree id args} {
lassign [info level 0] recurse
if {[dict get $tree type] eq "element"} then {
set i 0
foreach child [dict get $tree content] {
if {[dict get $child type] eq "element" &&
[dict exists $child attribute id] &&
[dict get $child attribute id] eq $id} then {
return [concat $args $i]
}
set path [$recurse $child $id {*}$args $i]
if {$path ne ""} then {
return $path
}
incr i
}
}
}
proc ::xml::findElementsByNameRoutine {tree name args} {
lassign [info level 0] recurse
if {[dict get $tree type] eq "element"} then {
set i 0
foreach child [dict get $tree content] {
if {[dict get $child type] eq "element" &&
[dict get $child name] eq $name} then {
yield [concat $args $i]
}
$recurse $child $name {*}$args $i
incr i
}
}
}
proc ::xml::findElementsByName {tree name} {
lassign [info level 0] recurse
set i 1
while {[info commands c$i] ne ""} {
incr i
}
set coroutine c$i
coroutine $coroutine apply [list {tree name} {
yield [info coroutine]
findElementsByNameRoutine $tree $name
} [namespace current]] $tree $name
set result {}
while true {
set path [$coroutine]
if {$path ne ""} then {
lappend result $path
} else {
return $result
}
}
}
proc ::xml::unparse {tree {indent 0} args} {
lassign [info level 0] recurse
if {![string is digit $indent]} then {
lappend args $indent
set indent 0
}
set result ""
if {"-pp" in $args} then {
if {$indent > 0} then {
append result \n
}
append result [string repeat " " $indent]
}
if {[dict get $tree type] eq "pcdata"} then {
if {"-pp" in $args} then {
append result [string trim [dict get $tree content]]
} else {
append result [dict get $tree content]
}
} else {
append result <[dict get $tree name]
foreach {key val} [dict get $tree attribute] {
append result " $key="
if {[string first \u0022 $val] < 0} then {
append result \" $val \"
} else {
append result ' $val '
}
}
if {[llength [dict get $tree content]] == 0} then {
append result " />"
} else {
append result >
foreach child [dict get $tree content] {
append result [$recurse $child [+ $indent 1] {*}$args]
}
if {"-pp" in $args} then {
append result \n[string repeat " " $indent]
}
append result </[dict get $tree name]>
}
}
set result
}
proc ::xml::decode txt {
lappend map "<" < ">" > "&" & """ \"
set matches [lsort -unique [regexp -inline -all {&#[0-9]+;} $txt]]
foreach match $matches {
regexp {([0-9]+)} $match - i
lappend map $match [format %c [scan $i %d]]
}
string map $map $txt
}
proc ::xml::getText {tree args} {
set child [getElement $tree {*}$args]
if {[dict get $child type] eq "pcdata"} then {
decode [dict get $child content]
} elseif {[llength [dict get $child content]] > 0} then {
getText $child 0
}
}
namespace eval xml namespace ensemble create
======
<<categories>> XML