'''treeselect''' is a module prototype for accessing [htmlparse] data stored in a [struct::tree%|%tree] using a syntax a lot like that of CSS selectors. There is one semantic difference, however: each selector operates only on the direct descendants of the selected nodes, so `.class1 .class2` works like `.class1 > .class2` in CSS. The selector syntax is `?selector ...?` where each `selector` can be either `tag.class#id:nth-child(n)` (in this order) or `*`, which selects everything. If your data was produces by htmlparse the first selectors should be `hmstart html`. ====== # treeselect-0.1.tm package require Tcl 8.5 package require struct package require htmlparse package require http namespace eval ::treeselect { variable version 0.1.0 variable debug 0 proc url-to-tree {url} { set documentTree [::struct::tree] set conn [::http::geturl $url] set html [::http::data $conn] htmlparse::2tree $html $documentTree return $documentTree } proc parse-attibutes {data} { set attributes {} foreach pair $data { lassign [split $pair =] key value set firstChar [string index $value 0] # Unquote value. if {($firstChar eq [string index $value end]) && (($firstChar eq "'") || ($firstChar eq "\""))} { set value [string range $value 1 end-1] } dict set attributes $key $value } return $attributes } proc matches-selector? {selector tree node} { variable debug if {$selector eq "*"} { return 1 } regexp {([^ .#:]*)?(?:\.([^ .#:]+))?(?:#([^ .#:]+))?} \ $selector _ tag class id nth set requirements {} foreach varName {class id} { set value [set $varName] if {$value ne ""} { dict set requirements $varName $value } } set all [$tree getall $node] set type [dict get $all type] if {($type ne $tag) && ($tag ne "")} { return 0 } set attributes {} if {[dict exists $all data]} { set attributes [parse-attibutes [dict get $all data]] } if ($debug) { puts "matches-selector: $type [list $requirements $attributes]" } foreach {key value} $requirements { if {![dict exists $attributes $key] || ([dict get $attributes $key] ne $value)} { return 0 } } return 1 } # Usage: tree {?selector ...?} where each selector # can be tag.class#id:nth-child(n) or "*". proc query-selector {tree query {start {}}} { variable debug if {$start ne ""} { set nodes $start } else { set nodes [$tree rootname] } while {[llength $query] > 0} { set newNodes {} set selector [lindex $query 0] foreach node $nodes { lappend newNodes {*}[$tree children $node filter \ [list ::treeselect::matches-selector? $selector]] } if {[regexp {[^:]*(?:\:nth-child\(([0-9]+)\))} $selector _ n]} { set newNodes [lindex $newNodes $n] } set nodes $newNodes if ($debug) { puts "query-selector: $nodes" } set query [lrange $query 1 end] } if ($debug) { puts "query-selector: result: $nodes" } return $nodes } } ====== A use example can be found on [Web Scraping with htmlparse]. **See also** * [TreeQL] * [tdom] * [XPath] <>Web | Internet | XML