treeselect

treeselect is a module prototype for accessing htmlparse data stored in a tree using a syntax a lot like that of CSS selectors. There is one semantic difference, however: each selector operates only on the direct descendants of the selected nodes, so .class1 .class2 works like .class1 > .class2 would in CSS.

The query syntax is ?selector ...? where each selector can be either tag.class#id:nth-child(n) or *, which selects everything. In tag.class#id:nth-child(n) each of tag, .class, #id and :nth-child(n) can be omitted but the ones used must follow in the specified order. If your data was produced by htmlparse the first selectors will be hmstart as in hmstart html body ....

# treeselect version 0.3.2
# Copyright (c) 2015, 2021 D. Bohdan
# License: MIT
package require Tcl 8.5 9
package require struct
package require htmlparse
package require http
package require fileutil
catch {
    package require tls
    http::register https 443 [list ::tls::socket -tls1 1]
}

namespace eval ::treeselect {
    variable version [lindex [split [file root [file tail [info script]]] -] 1]
    variable debug 0

    proc file-to-tree {path} {
        set documentTree [::struct::tree]

        set html [::fileutil::cat $path]

        htmlparse::2tree $html $documentTree

        return $documentTree
    }

    proc url-to-tree {url} {
        set documentTree [::struct::tree]

        set conn [::http::geturl $url]
        set html [::http::data $conn]
        ::http::cleanup $conn

        htmlparse::2tree $html $documentTree

        return $documentTree
    }

    proc get {tree nodes key} {
        set result {}
        foreach node $nodes {
            lappend result [$tree get $node $key]
        }
        return $result
    }

    proc parse-attributes {data} {
        set attributes {}
        foreach pair $data {
            lassign [split $pair =] key value
            set firstChar [string index $value 0]

            # Unquote value.
            if {($firstChar eq [string index $value end]) &&
                    (($firstChar eq "'") || ($firstChar eq "\""))} {
                set value [string range $value 1 end-1]
            }

            dict set attributes $key $value
        }
        return $attributes
    }

    proc matches-selector? {selector tree node} {
        variable debug

        if {$selector eq "*"} {
            return 1
        }
        regexp {([^ .#:]*)?(?:\.([^ .#:]+))?(?:#([^ .#:]+))?} \
                $selector _ tag class id nth
        set requirements {}
        foreach varName {class id} {
            set value [set $varName]
            if {$value ne ""} {
                dict set requirements $varName $value
            }
        }

        set all [$tree getall $node]
        set type [dict get $all type]

        if {($type ne $tag) && ($tag ne "")} {
            return 0
        }

        set attributes {}
        if {[dict exists $all data]} {
            set attributes [parse-attributes [dict get $all data]]
        }

        if {$debug} {
            puts "matches-selector: $type [list $requirements $attributes]"
        }

        foreach {key value} $requirements {
            if {![dict exists $attributes $key] ||
                ([dict get $attributes $key] ne $value)} {
                return 0
            }
        }

        return 1
    }

    # Usage: tree {?selector ...?} where each selector can be
    # either tag.class#id:nth-child(n) with n >= 1 or "*".
    proc query {tree query {start {}}} {
        variable debug

        if {$start eq ""} {
            set nodes [$tree rootname]
        } else {
            set nodes $start
        }

        while {[llength $query] > 0} {

            set newNodes {}
            set selector [lindex $query 0]

            foreach node $nodes {
                lappend newNodes {*}[$tree children $node filter \
                        [list ::treeselect::matches-selector? $selector]]
            }
            if {[regexp {[^:]*(?:\:nth-child\(([1-9][0-9]*)\))} $selector _ n]} {
                set newNodes [lindex $newNodes [expr {$n - 1}]]
            }
            set nodes $newNodes
            if ($debug) {
                puts "query: $nodes"
            }
            set query [lrange $query 1 end]
        }
        if ($debug) {
            puts "query: result: $nodes"
        }
        return $nodes
    }
}

Use examples can be found on Web Scraping with htmlparse and Hacker News.

See also

---

RLE (2015-01-22): Added http::cleanup to url-to-tree proc to release resources consumed by the http::geturl call.

dbohdan 2015-01-22: Thanks. I've bumped the version to account for that.