Version 1 of treeselect

Updated 2015-01-11 19:00:50 by dbohdan

treeselect is a module prototype for accessing htmlparse data stored in a tree using a syntax a lot like that of CSS selectors. There is one semantic difference, however: each selector operated only on the direct descendants of a node, so .class1 .class2 works like .class1 > .class2 in CSS.

# treeselect-0.1.tm
package require Tcl 8.5
package require struct
package require htmlparse
package require http

namespace eval ::treeselect {
    variable version 0.1.0
    variable debug 0

    proc url-to-tree {url} {
        set documentTree [::struct::tree]

        set conn [::http::geturl $url]
        set html [::http::data $conn]

        htmlparse::2tree $html $documentTree

        return $documentTree
    }

    proc parse-attibutes {data} {
        set attributes {}
        foreach pair $data {
            lassign [split $pair =] key value
            set firstChar [string index $value 0]

            # Unquote value.
            if {($firstChar eq [string index $value end]) &&
                    (($firstChar eq "'") || ($firstChar eq "\""))} {
                set value [string range $value 1 end-1]
            }

            dict set attributes $key $value
        }
        return $attributes
    }

    proc matches-selector? {selector tree node} {
        variable debug

        if {$selector eq "*"} {
            return 1
        }
        regexp {([^ .#:]*)?(?:\.([^ .#:]+))?(?:#([^ .#:]+))?} \
                $selector _ tag class id nth
        set requirements {}
        foreach varName {class id} {
            set value [set $varName]
            if {$value ne ""} {
                dict set requirements $varName $value
            }
        }

        set all [$tree getall $node]
        set type [dict get $all type]

        if {($type ne $tag) && ($tag ne "")} {
            return 0
        }

        set attributes {}
        if {[dict exists $all data]} {
            set attributes [parse-attibutes [dict get $all data]]
        }

        if ($debug) {
            puts "matches-selector: $type [list $requirements $attributes]"
        }

        foreach {key value} $requirements {
            if {![dict exists $attributes $key] ||
                ([dict get $attributes $key] ne $value)} {
                return 0
            }
        }

        return 1
    }

    # Usage: tree {?selector ...?} where each selector
    # can be tag.class#id:nth-child(n) or "*".
    proc query-selector {tree query {start {}}} {
        variable debug

        if {$start ne ""} {
            set nodes $start
        } else {
            set nodes [$tree rootname]
        }

        while {[llength $query] > 0} {

            set newNodes {}
            set selector [lindex $query 0]

            foreach node $nodes {
                lappend newNodes {*}[$tree children $node filter \
                        [list ::treeselect::matches-selector? $selector]]
            }
            if {[regexp {[^:]*(?:\:nth-child\(([0-9]+)\))} $selector _ n]} {
                set newNodes [lindex $newNodes $n]
            }
            set nodes $newNodes
            if ($debug) {
                puts "query-selector: $nodes"
            }
            set query [lrange $query 1 end]
        }
        if ($debug) {
            puts "query-selector: result: $nodes"
        }
        return $nodes
    }
}

A use example can be found on Web Scraping with htmlparse.

See also