'''treeselect''' is a module prototype for accessing [htmlparse] data stored in a [struct::tree%|%tree] using a syntax a lot like that of CSS selectors. There is one semantic difference, however: each selector operates only on the direct descendants of the selected nodes, so `.class1 .class2` works like `.class1 > .class2` would in CSS. The query syntax is `?selector ...?` where each `selector` can be either `tag.class#id:nth-child(n)` or `*`, which selects everything. In `tag.class#id:nth-child(n)` each of `tag`, `.class`, `#id` and `:nth-child(n)` can be omitted but the ones used must follow in the specified order. If your data was produced by htmlparse the first selectors will be `hmstart` as in `hmstart html body ...`. ====== # treeselect version 0.3.2 # Copyright (c) 2015, 2021 D. Bohdan # License: MIT package require Tcl 8.5-10 package require struct package require htmlparse package require http package require fileutil catch { package require tls http::register https 443 [list ::tls::socket -tls1 1] } namespace eval ::treeselect { variable version [lindex [split [file root [file tail [info script]]] -] 1] variable debug 0 proc file-to-tree {path} { set documentTree [::struct::tree] set html [::fileutil::cat $path] htmlparse::2tree $html $documentTree return $documentTree } proc url-to-tree {url} { set documentTree [::struct::tree] set conn [::http::geturl $url] set html [::http::data $conn] ::http::cleanup $conn htmlparse::2tree $html $documentTree return $documentTree } proc get {tree nodes key} { set result {} foreach node $nodes { lappend result [$tree get $node $key] } return $result } proc parse-attributes {data} { set attributes {} foreach pair $data { lassign [split $pair =] key value set firstChar [string index $value 0] # Unquote value. if {($firstChar eq [string index $value end]) && (($firstChar eq "'") || ($firstChar eq "\""))} { set value [string range $value 1 end-1] } dict set attributes $key $value } return $attributes } proc matches-selector? {selector tree node} { variable debug if {$selector eq "*"} { return 1 } regexp {([^ .#:]*)?(?:\.([^ .#:]+))?(?:#([^ .#:]+))?} \ $selector _ tag class id nth set requirements {} foreach varName {class id} { set value [set $varName] if {$value ne ""} { dict set requirements $varName $value } } set all [$tree getall $node] set type [dict get $all type] if {($type ne $tag) && ($tag ne "")} { return 0 } set attributes {} if {[dict exists $all data]} { set attributes [parse-attributes [dict get $all data]] } if {$debug} { puts "matches-selector: $type [list $requirements $attributes]" } foreach {key value} $requirements { if {![dict exists $attributes $key] || ([dict get $attributes $key] ne $value)} { return 0 } } return 1 } # Usage: tree {?selector ...?} where each selector can be # either tag.class#id:nth-child(n) with n >= 1 or "*". proc query {tree query {start {}}} { variable debug if {$start eq ""} { set nodes [$tree rootname] } else { set nodes $start } while {[llength $query] > 0} { set newNodes {} set selector [lindex $query 0] foreach node $nodes { lappend newNodes {*}[$tree children $node filter \ [list ::treeselect::matches-selector? $selector]] } if {[regexp {[^:]*(?:\:nth-child\(([1-9][0-9]*)\))} $selector _ n]} { set newNodes [lindex $newNodes [expr {$n - 1}]] } set nodes $newNodes if ($debug) { puts "query: $nodes" } set query [lrange $query 1 end] } if ($debug) { puts "query: result: $nodes" } return $nodes } } ====== Use examples can be found on [Web Scraping with htmlparse] and [Hacker News]. **See also** * [TreeQL] * [tdom] * [XPath] --- [RLE] (2015-01-22): Added http::cleanup to url-to-tree proc to release resources consumed by the http::geturl call. [dbohdan] 2015-01-22: Thanks. I've bumped the version to account for that. <>Internet | Package | Web | XML