12 Dec 2009 [JM] This [LemonTree] version takes advantage of:<
> 1. [htmlparse]::2tree functionality that puts html contents on a [tree] structure<
> 2. how well developed [LemonTree] is that is easy to add new item types (Thanks [RS]!) This script could be used as a tool for your [web scraping] coding if you use [htmlparse] as the main parsing engine. see [Web Scraping with htmlparse] usage:<
> - copy & paste URL from your web browser to the entry, or...<
> - browse your file system with the "Browse..." button<
> Then load the html contents with the button "Get html file" and start navigating the tree structure. [LemonTreeQL_1] [JM] 7/27/2019 - You can try some TreeQL code as in the example below, by launching the proc "sample" from the console to color your nodes according to the TreeQL commands. This is a good way to try your queries, or to learn how TreeQL works.<
> This is just a proof of concept right now, but it could help to demonstrate why processing trees generated by htmlparse seems like the killer app for TreeQL [LemonTreeQL_2] ======tcl package require struct package require csv package require report package require htmlparse package require textutil package require http package require tls package require BWidget package require treeql namespace eval LemonTree {variable uniqueID 0} console show #http://www.cs.grinnell.edu/~walker/fluency-book/labs/sample-table.html proc sample {} { global t treeql q1 -tree t treeql q2 -tree t q1 query tree withatt type ul .t opentree _root set ctr 0 foreach elem [q1 result] { .t opentree $elem .t itemconfigure $elem -fill red puts "$ctr - $elem" incr ctr } set ul [lindex [q1 result] 3] q1 query replace $ul children map x { .t itemconfigure $x -fill blue # si funciona q2 query replace $x children withatt type PCDATA foreach elem [q2 result] { set nodeData [t get $elem data] puts [t get $elem type] if {[string match *Bin* $nodeData]} { #.t opentree $elem .t itemconfigure $elem -fill green #.t itemconfigure $elem -image $icon(file) .t itemconfigure $elem -image $LemonTree::icon(file) } } } } proc a {} { global t treeql tq -tree t tq query \ tree \ withatt type a \ get data foreach hijo [tq result] { puts $hijo } puts "=======" tq query \ tree \ withatt type a \ children \ get data foreach hijo [tq result] { puts $hijo } } proc example {} { global t treeql q1 -tree t treeql q2 -tree t q1 query tree withatt type ul .t opentree _root set ctr 0 foreach elem [q1 result] { .t opentree $elem .t itemconfigure $elem -fill red puts "$ctr - $elem" incr ctr } set ul [lindex [q1 result] 9] q1 query replace $ul children map x { .t itemconfigure $x -fill blue # si funciona q2 query replace $x children withatt type a foreach elem [q2 result] { set nodeData [t get $elem data] puts [t get $elem type] if {![string match *rel* $nodeData]} { #.t opentree $elem .t itemconfigure $elem -fill green #.t itemconfigure $elem -image $icon(file) .t itemconfigure $elem -image $LemonTree::icon(file) } } } } proc help {} { puts { treeql q1 -tree $t q1 query tree withatt type ul q1 result .t itemconfigure n1 -fill blue } } proc LemonTree::add {w parent type name {text ""}} { variable uniqueID; variable icon if {$name != "root"} { set val1 [::t get $name data] set val2 [::t get $name type] set val3 [::t index $name] # string range $val1 0 50 if {$text eq ""} {set text "$val3: <$val2> $val1"} #if {$text eq ""} {set text "$val3,$val2,$val1"} set id $name } else { puts "bad name: $name" set id "_root" } #puts $text #puts [string length $text] if {[string length $text] > 50} { set text [string range $text 0 49] } #set id n[incr uniqueID] #tk_messageBox -message "$id: $type,$name" set data [list type $type name $name] set fill [expr {[string match (* $text]? "blue": "black"}] set drawcross [expr {[info proc ::LemonTree::kids($type)] eq ""? "never": "allways"}] $w insert end $parent $id -text "$id - $text" -data $data -drawcross $drawcross -fill $fill if [info exists icon($type)] { $w itemconfigure $id -image $icon($type) } } proc LemonTree::open {w node} { if {[$w itemcget $node -drawcross] eq "allways"} { set data [$w itemcget $node -data] set type [dict'get $data type] foreach {ktype kids} [kids($type) $w $node] { foreach kid $kids {add $w $node $ktype $kid} } $w itemconfigure $node -drawcross auto } } proc LemonTree::kids(html) {w node} { set name [dict'get [$w itemcget $node -data] name] list html [::t children $name] } set path $BWIDGET::LIBRARY/images foreach {type name} {dir folder file file array copy html folder} { set LemonTree::icon($type) [image create photo -file $path/$name.gif] } proc LemonTree::Info {w node} { set type [dict'get [$w itemcget $node -data] type] if {[info proc ::LemonTree::info($type)] ne ""} { balloon $w [info($type) $w $node] } } #-- type-specific info providers: proc LemonTree::info(html) {w node} { #puts $node set name [dict'get [$w itemcget $node -data] name] if {$name != "root"} { set val1 [::t get $name data] set val2 [::t get $name type] set val3 [::t index $name] puts $node puts "\t[t index $name]" foreach nodo [t ancestors $name] { if {$nodo != "root"} { puts "\t[t index $nodo]" } } } else { set val1 "root" } return "$val1" } proc balloon {w text} { set top .balloon catch {destroy $top} toplevel $top -bd 1 pack [message $top.txt -aspect 10000 -bg lightyellow \ -borderwidth 0 -text $text -font {Helvetica 9}] wm overrideredirect $top 1 wm geometry $top +[winfo pointerx $w]+[winfo pointery $w] bind $top <1> [list destroy $top] raise $top } proc dict'get {dict key} { foreach {k value} $dict {if {$k eq $key} {return $value}} } #-- reconstruct a proc's definition as a string: proc procinfo name { set args "" foreach arg [info args $name] { if [info default $name $arg def] {lappend arg $def} lappend args $arg } return "proc $name {$args} {[info body $name]}" } # # # proc main {} { global t set url [.txt get] catch {t destroy} .t delete [.t nodes root] if {$url == ""} { tk_messageBox -message "specify html location" return } ::struct::tree t #set t [::struct::tree] if {[string range $url 0 3] == "http"} { http::register https 443 tls::socket set http [::http::geturl $url] set html [::http::data $http] } else { set html [read [set fh [open $url]]] close $fh } #puts $url htmlparse::2tree $html t htmlparse::removeVisualFluff t htmlparse::removeFormDefs t LemonTree::add .t root html root "(html)" return } # ------------------------------------------- label .lbl -text "URL or path:" entry .txt -width 60 button .btnBrowser -text Browse... -command {.txt insert end [tk_getOpenFile]} button .btnGet -text "html > tree" -command main button .btnSalir -text Exit -command { catch {t destroy} exit } grid .lbl .txt .btnBrowser grid .btnGet -row 1 -column 1 grid .btnSalir -row 1 -column 2 #-- Now to demonstrate and test the whole thing: Tree .t -background white -opencmd {LemonTree::open .t} \ -width 90 -height 30 -yscrollcommand {.y set} .t bindText <1> {LemonTree::Info .t} .t bindImage <1> {LemonTree::Info .t} scrollbar .y -command {.t yview} grid .t -row 2 -column 0 -columnspan 3 grid .y -row 2 -column 3 -sticky ns #.txt insert end "https://wiki.tcl-lang.org/page/RS" .txt insert end "http://www.cs.grinnell.edu/~walker/fluency-book/labs/sample-table.html" #-- Little development helpers: bind . {exec wish $argv0 &; exit} bind . {console show} ====== <> Category Application | Category GUI | Category Tutorial