12 Dec 2009 [JM] This [LemonTree] version takes advantage of:<
>
1. [htmlparse]::2tree functionality that puts html contents on a [tree] structure<
>
2. how well developed [LemonTree] is that is easy to add new item types (Thanks [RS]!)
This script could be used as a tool for your [web scraping] coding if you use htmlparse as the main parsing engine.
usage:<
>
- copy & paste URL from your web browser to the entry, or...<
>
- browse your file system with the "Browse..." button<
>
Then load the html contents with the button "Get html file"
and start navigating the tree structure.
[WikiDbImage LemonTreeBranch.gif]
======tcl
#!/bin/sh
#console show
package require struct
package require csv
package require report
package require htmlparse
package require textutil
package require http
package require BWidget
namespace eval LemonTree {variable uniqueID 0}
#http://www.cs.grinnell.edu/~walker/fluency-book/labs/sample-table.html
if 0 {The Tree widget allows a -data item for each node, which I use for a dict-like
list that contains the node's type and "real name" (as opposed to the "display name"
- for instance, a dir node would display only its [file tail], but the real name is
the full path). This routine adds a node to the LemonTree: }
proc LemonTree::add {w parent type name {text ""}} {
variable uniqueID; variable icon
if {$parent != "root"} {
set val1 [::t get $name data]
set val2 [::t get $name type]
set val3 [::t index $name]
if {$text eq ""} {set text "$val3,$val2,[string range $val1 0 20]"}
}
set id n[incr uniqueID]
set data [list type $type name $name]
#tk_messageBox -message "$type,$name"
set fill [expr {[string match (* $text]? "blue": "black"}]
set drawcross [expr {[info proc ::LemonTree::kids($type)] eq ""?
"never": "allways"}]
$w insert end $parent $id -text $text -data $data -drawcross $drawcross -fill $fill
if [info exists icon($type)] {
$w itemconfigure $id -image $icon($type)
}
}
if 0 {For speed, a Tree isn't drawn fully expanded at the beginning.
Instead, nodes are opened on demand, when the user clicks on the [+] icon.
I use the -drawcross "allways" mode (shudder - should be fixed to "always",
but then older code might break) to indicate that the node hasn't been opened before
- after the first opening, the mode is set to "auto", meaning to draw a cross only if the node has children. }
proc LemonTree::open {w node} {
if {[$w itemcget $node -drawcross] eq "allways"} {
set data [$w itemcget $node -data]
set type [dict'get $data type]
foreach {ktype kids} [kids($type) $w $node] {
foreach kid $kids {add $w $node $ktype $kid}
}
$w itemconfigure $node -drawcross auto
}
}
if 0 {So far for the generic LemonTree - the rest is already customization for specific item types.
The kids($type) call above looks like an array element
- in fact it's a way of dispatching the generic operation of providing the list of children
of an entity of given type, which of course depends on the type. For instance, the children
of a directory are its subdirectories, and then its files (with special-casing for Windows,
so that drive letters are the children of "/"): }
proc LemonTree::kids(dir) {w node} {
set name [dict'get [$w itemcget $node -data] name]
if {$::tcl_platform(platform) eq "windows" && $name eq "/"} {
return [list dir [file volumes]]
}
set dirs [lsort [glob -nocomplain -type d $name/*]]
set files [lsort [glob -nocomplain -type f $name/*]]
list dir $dirs file $files
}
proc LemonTree::kids(html) {w node} {
set name [dict'get [$w itemcget $node -data] name]
list html [::t children $name]
}
if 0 {Namespaces have a hierarchy, but contain collections of commands and variables as well.
So I introduced an intermediate layer (parens around the display name make these "meta-children"
come displayed in blue):}
proc LemonTree::kids(namespace) {w node} {
list ns-commands (Commands) ns-vars (Variables) ns-children (Children)
}
proc LemonTree::kids(ns-children) {w node} {
set ns [dict'get [$w itemcget [$w parent $node] -data] name]
list namespace [lsort [namespace children $ns]]
}
proc LemonTree::kids(ns-commands) {w node} {
set ns [dict'get [$w itemcget [$w parent $node] -data] name]
list command [lsort [info commands ${ns}::*]]
}
proc LemonTree::kids(ns-vars) {w node} {
set ns [dict'get [$w itemcget [$w parent $node] -data] name]
set res ""
foreach var [lsort [info vars ${ns}::*]] {
lappend res [expr {[array exists $var]? "array": "variable"}] $var
}
set res
}
#-- Arrays can also be seen as a one-level subtree:
proc LemonTree::kids(array) {w node} {
set name [dict'get [$w itemcget $node -data] name]
list variable [lsort [array names $name]]
}
proc LemonTree::kids(widget) {w node} {
set name [dict'get [$w itemcget $node -data] name]
list widget [winfo children $name]
}
if 0 {A Tree looks prettier if nodes have icons, so I'm using some of those that BWidget comes with:}
set path $BWIDGET::LIBRARY/images
foreach {type name} {dir folder file file array copy html folder} {
set LemonTree::icon($type) [image create photo -file $path/$name.gif]
}
# Some more icons come from adavis's Icons package:
set LemonTree::icon(widget) [image create photo -data {
R0lGODlhEAAQAIUAAFxaXGRmZFRWVGQmhFwmfFxeXOTm5MTCxLyWzLySzKyC
vKSCvJxyrJRmrIxipIxWpNze3AQCBMTGxJRWtJRatIxOrIRCpHw+pHw6nHQ2
lGwulOzu7Pz+/Pz+9Ozu5Pz+7NzexPz+5Pz+hPz+3NzevPz+BMTCBNzetMTC
DPz+xNze1NzezPwCBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACH5BAEAACwALAAAAAAQABAAAAaV
QEBAMCAUj0aCYFkwHBAJhWLBYDQcD8ghIjhIJhRKxXLBZDQaiYQLABDe8PiS
u+HY7/dOh+PhQvB4eh8fIH6Adh2DHyGFAn+BiQAiISMkhnmSIQAlI5KXHIkf
AiUmpCUnhoKLISgpIikmAlwqtCArkiUlIhwiuSKyEcHCESausMEsycrJEaal
y9ARIizN1NPQ0dfJfkEAIf5oQ3JlYXRlZCBieSBCTVBUb0dJRiBQcm8gdmVy
c2lvbiAyLjUNCqkgRGV2ZWxDb3IgMTk5NywxOTk4LiBBbGwgcmlnaHRzIHJl
c2VydmVkLg0KaHR0cDovL3d3dy5kZXZlbGNvci5jb20AOw==}]
set LemonTree::icon(namespace) [image create photo -data {
R0lGODlhEAAQAIIAAPwCBAQCBPz+xERCBMTCBISCBDQyNAAAACH5BAEAAAAA
LAAAAAAQABAAAANPCLoR+7AJ0SALYkxd79za12FgOTlAQBDhRxUFqrKEG8Py
OqwEfMeKwGDI8zVGul0vFsAFdaxB43ecKZfUKm1lZD6ERZgBZWn0OpYvGeJP
AAAh/mhDcmVhdGVkIGJ5IEJNUFRvR0lGIFBybyB2ZXJzaW9uIDIuNQ0KqSBE
ZXZlbENvciAxOTk3LDE5OTguIEFsbCByaWdodHMgcmVzZXJ2ZWQuDQpodHRw
Oi8vd3d3LmRldmVsY29yLmNvbQA7}]
set LemonTree::icon(command) [image create photo -data {
R0lGODlhEAAQAIIAAPwCBAQCBISChMTCxDQyNFxaXKSipPz+/CH5BAEAAAAA
LAAAAAAQABAAAANdCLobwbAFMciLwBFSihBEFHSG8QnmpQQEBX6loI5G5QTl
cMgrZSmEmsGxKqRWNV3hMrFlBtDoA1eTEaKHJdMYhR6+gxkF++UMGbiDzvDV
ioyHAJSHcchuGLQq4k8AACH+aENyZWF0ZWQgYnkgQk1QVG9HSUYgUHJvIHZl
cnNpb24gMi41DQqpIERldmVsQ29yIDE5OTcsMTk5OC4gQWxsIHJpZ2h0cyBy
ZXNlcnZlZC4NCmh0dHA6Ly93d3cuZGV2ZWxjb3IuY29tADs=}]
set LemonTree::icon(variable) [image create photo -data {
R0lGODlhEAAQAIYAAPwCBFxaVMR+RPzKjNze3AQCBMR6RPzGjPyODPz+/MzO
zPyKDPyKBPz29OTWzPyGDPyGBOx6BOza1OR2BKROBNSOXKRKBBwOBOzu7PTW
xPzizOySZPyCDFxaXOy2lNRyRMxmJCQOBPTm1OzStPTKrMR+XIRWLFxGNCQS
BDQyNIRSNDQuJERGRLyqlNzSvIx6ZKRuVEw6LLSyrLymhKSShBwaFFROTJyW
jMS+vNzW1OTazNzKrHRqXOzezOTOpPTq3OzWvOTStLyedMS+rLy2pMSynMSu
lAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACH5BAEAAAAA
LAAAAAAQABAAAAewgAAAAYSFhoQCA4IBBI2OjgUGBwiLBAmXlpcKkgsMlZcJ
BA0JDpIPEBGVjwkSBgOnExSfmBIVBxAMExYXswkYGRobHLq8gh2PHhoeHyAW
IYKzIiMkJSYnKCnQg5YNHtQqKywtK9qMBC4vMDEBMjIz2dCMDTQ1Njc4OToz
5PEEOzw3ZPToMcLHO23HfogQ0QMIkCA+hPBbhAPHECJFjMyYIUQIvEUpUqwQ
OXKkSEF+AgEAIf5oQ3JlYXRlZCBieSBCTVBUb0dJRiBQcm8gdmVyc2lvbiAy
LjUNCqkgRGV2ZWxDb3IgMTk5NywxOTk4LiBBbGwgcmlnaHRzIHJlc2VydmVk
Lg0KaHR0cDovL3d3dy5kZXZlbGNvci5jb20AOw==}]
if 0 {This thing is more useful if you can get more information about an item by clicking on it
- for a file, its size and date; for a variable, its value; for a proc, its full specification, etc.
As a small first shot, I selected a "balloon" for that purpose. }
proc LemonTree::Info {w node} {
set type [dict'get [$w itemcget $node -data] type]
if {[info proc ::LemonTree::info($type)] ne ""} {
balloon $w [info($type) $w $node]
}
}
#-- type-specific info providers:
proc LemonTree::info(array) {w node} {
set name [dict'get [$w itemcget $node -data] name]
return "$name: array, [array size $name] elements"
}
proc LemonTree::info(command) {w node} {
set name [dict'get [$w itemcget $node -data] name]
if {[info procs $name] ne ""} {
return [procinfo $name]
} else {return "$name: compiled command"}
}
proc LemonTree::info(dir) {w node} {
set name [dict'get [$w itemcget $node -data] name]
set mtime [clock format [file mtime $name] -format %y-%m-%d,%H:%M:%S]
set nfiles [llength [glob -nocomplain $name/*]]
return "$name\n$nfiles files\nModified: $mtime"
}
proc LemonTree::info(html) {w node} {
set name [dict'get [$w itemcget $node -data] name]
set val1 [::t get $name data]
set val2 [::t get $name type]
set val3 [::t index $name]
return "$val1"
}
proc LemonTree::info(file) {w node} {
set name [dict'get [$w itemcget $node -data] name]
set mtime [clock format [file mtime $name] -format %y-%m-%d,%H:%M:%S]
return "$name\n[file size $name] bytes\nModified: $mtime"
}
proc LemonTree::info(namespace) {w node} {
set ns [dict'get [$w itemcget $node -data] name]
return "namespace $ns\n[llength [info commands ${ns}::*]] commands,\
[llength [info vars ${ns}::*]] variables,\
[llength [namespace children $ns]] child(ren)"
}
proc LemonTree::info(variable) {w node} {
set name [dict'get [$w itemcget $node -data] name]
if [info exists $name] {
list $name = [set $name]
} else { #-- array element
set arr [dict'get [$w itemcget [$w parent $node] -data] name]
list ${arr}($name) = [set ${arr}($name)]
}
}
proc LemonTree::info(widget) {w node} {
set name [dict'get [$w itemcget $node -data] name]
return "[winfo class $name] $name [winfo geometry $name]"
}
#-- A simple ballon, modified from Bag of Tk algorithms:
proc balloon {w text} {
set top .balloon
catch {destroy $top}
toplevel $top -bd 1
pack [message $top.txt -aspect 10000 -bg lightyellow \
-borderwidth 0 -text $text -font {Helvetica 9}]
wm overrideredirect $top 1
wm geometry $top +[winfo pointerx $w]+[winfo pointery $w]
bind $top <1> [list destroy $top]
raise $top
}
if 0 {From Tcl 8.5, one would use a real dict, but it's easy to make a replacement
that works roughly the same in 8.4 (it returns "" for non- existing keys instead of throwing an error),
and might be slower, but I won't notice on dicts with two elements ;-}
proc dict'get {dict key} {
foreach {k value} $dict {if {$k eq $key} {return $value}}
}
#-- reconstruct a proc's definition as a string:
proc procinfo name {
set args ""
foreach arg [info args $name] {
if [info default $name $arg def] {lappend arg $def}
lappend args $arg
}
return "proc $name {$args} {[info body $name]}"
}
#
#
#
proc main {} {
set url [.txt get]
if {$url == ""} {
tk_messageBox -message "specify html location"
return
}
::struct::tree t
if {[string range $url 0 3] == "http"} {
set http [::http::geturl $url]
set html [::http::data $http]
} else {
set html [read [set fh [open $url]]]
close $fh
}
#puts $url
htmlparse::2tree $html t
htmlparse::removeVisualFluff t
htmlparse::removeFormDefs t
LemonTree::add .t root html root "(html)"
return
}
# -------------------------------------------
label .lbl -text "URL or path:"
entry .txt -width 60
button .btnBrowser -text Browse... -command {.txt insert end [tk_getOpenFile]}
button .btnGet -text "Get html file" -command main
button .btnSalir -text Exit -command {
catch {t destroy}
exit
}
grid .lbl .txt .btnBrowser
grid .btnGet -row 1 -column 1
grid .btnSalir -row 1 -column 2
#-- Now to demonstrate and test the whole thing:
Tree .t -background white -opencmd {LemonTree::open .t} \
-width 60 -height 30 -yscrollcommand {.y set}
.t bindText <1> {LemonTree::Info .t}
.t bindImage <1> {LemonTree::Info .t}
scrollbar .y -command {.t yview}
grid .t -row 2 -column 0 -columnspan 3
grid .y -row 2 -column 3 -sticky ns
.txt insert end "http://www.cs.grinnell.edu/~walker/fluency-book/labs/sample-table.html"
#-- Little development helpers:
bind . {exec wish $argv0 &; exit}
bind . {console show}
======
<> Category Application | Category GUI | Category Tutorial