LemonTree branch

12 Dec 2009 JM This is a LemonTree variant to explore html trees created by htmlparse
27 Jul 2019 JM an attempt to renovate LemonTree branch with TreeQL example
(Imagine an interactive console to try your TreeQL commands!)

The 2 building blocks:

  1. htmlparse::2tree functionality that puts html contents on a tree structure
  2. LemonTree was developed to be easy to add new item types (Thanks RS!)

This script could be used as a tool for your web scraping coding if you use htmlparse as the main parsing engine.

see Web Scraping with htmlparse

usage:
- copy & paste URL from your web browser to the entry, or...
- browse your file system with the "Browse..." button
Then load the html contents with the button "html > tree" and start navigating the tree structure.

LemonTreeQL_1

JM 7/27/2019 - You can try some TreeQL code as in the example below, by launching the proc "sample" from the console to color your nodes according to the TreeQL commands. This is a good way to try your queries, or to learn how TreeQL works.
This is just a proof of concept right now, but it could help to demonstrate why processing trees generated by htmlparse seems like the killer app for TreeQL

LemonTreeQL_2

package require struct
package require csv
package require report
package require htmlparse
package require textutil
package require http
package require tls
package require BWidget
package require treeql

namespace eval LemonTree {variable uniqueID 0}
console show

#http://www.cs.grinnell.edu/~walker/fluency-book/labs/sample-table.html

proc sample {} {
global t

treeql q1 -tree t
treeql q2 -tree t

q1 query tree withatt type ul

.t opentree _root
set ctr 0
foreach elem [q1 result] {
  .t opentree $elem
  .t itemconfigure $elem -fill red
  puts "$ctr - $elem"
  incr ctr
}

set ul [lindex [q1 result] 3] 
q1 query replace $ul children map x {
        .t itemconfigure $x -fill blue
        
        q2 query replace $x children withatt type PCDATA

        foreach elem [q2 result] {
          set nodeData [t get $elem data]
          puts [t get $elem type]
                if {[string match *Bin* $nodeData]} {
                  #.t opentree $elem
                  .t itemconfigure $elem -fill green
                  #.t itemconfigure $elem -image $icon(file)
                  .t itemconfigure $elem -image $LemonTree::icon(file)
          }
        }
        

}
}

 proc LemonTree::add {w parent type name {text ""}} {
    variable uniqueID; variable icon
 
    
                if {$name != "root"} {
      set val1 [::t get $name data]
      set val2 [::t get $name type]
      set val3 [::t index $name]
      # string range $val1 0 50 
      if {$text eq ""} {set text "$val3: <$val2> $val1"}
      #if {$text eq ""} {set text "$val3,$val2,$val1"}
      set id $name
                } else {
                  puts "ok: $name"
                        set id "_root"
                }

                #puts $text
                #puts [string length $text]
                if {[string length $text] > 50} {
                        set text [string range $text 0 49]
                }

    #set id n[incr uniqueID]

    #tk_messageBox -message "$id: $type,$name"

    set data [list type $type name $name]
    set fill [expr {[string match (* $text]? "blue": "black"}]
    set drawcross [expr {[info proc ::LemonTree::kids($type)] eq ""?
         "never": "allways"}]
    $w insert end $parent $id -text "$id - $text" -data $data -drawcross $drawcross -fill $fill
    if [info exists icon($type)] {
             $w itemconfigure $id -image $icon($type)
    }
 }

proc LemonTree::open {w node} {
    if {[$w itemcget $node -drawcross] eq "allways"} {
        set data [$w itemcget $node -data]
        set type [dict'get $data type]
        foreach {ktype kids} [kids($type) $w $node] {
            foreach kid $kids {add $w $node $ktype $kid}
        }
        $w itemconfigure $node -drawcross auto
    }
 }

 proc LemonTree::kids(html) {w node} {
    set name [dict'get [$w itemcget $node -data] name]
    list html [::t children $name]
 }

set path $BWIDGET::LIBRARY/images

foreach {type name} {dir folder file file array copy html folder} {
   set LemonTree::icon($type) [image create photo -file $path/$name.gif]
}

proc LemonTree::Info {w node} {
    set type [dict'get [$w itemcget $node -data] type]
    if {[info proc ::LemonTree::info($type)] ne ""} {
        balloon $w [info($type) $w $node]
    }
 }
 
#-- type-specific info providers:

 proc LemonTree::info(html) {w node} {
    #puts $node
    set name [dict'get [$w itemcget $node -data] name]
    if {$name != "root"} {
      set val1 [::t get $name data]
      set val2 [::t get $name type]
      set val3 [::t index $name]
      puts $node
      puts "\t[t index $name]"
      foreach nodo [t ancestors $name] {
       if {$nodo != "root"} {
         puts "\t[t index $nodo]"
       }
      }
    } else {
     set val1 "root"
    }
    return "$val1"
 }
 
proc balloon {w text} {
    set top .balloon
    catch {destroy $top}
    toplevel $top -bd 1
    pack [message $top.txt -aspect 10000 -bg lightyellow \
        -borderwidth 0 -text $text -font {Helvetica 9}]
    wm overrideredirect $top 1
    wm geometry $top +[winfo pointerx $w]+[winfo pointery $w]
    bind  $top <1> [list destroy $top]
    raise $top
 }

proc dict'get {dict key} {
    foreach {k value} $dict {if {$k eq $key} {return $value}}
 }
#-- reconstruct a proc's definition as a string:
proc procinfo name {
    set args ""
    foreach arg [info args $name] {
        if [info default $name $arg def] {lappend arg $def}
        lappend args $arg
    }
    return "proc $name {$args} {[info body $name]}"
 }
 
#
#
#
proc main {} {
global t

  set url [.txt get]
  catch {t destroy}
  .t delete [.t nodes root]

  if {$url == ""} {
  tk_messageBox -message "specify html location"
  return
  }
  ::struct::tree t
  #set t [::struct::tree]
  if {[string range $url 0 3] == "http"} {
    http::register https 443 tls::socket
                set http  [::http::geturl $url]
    set html  [::http::data $http]
  } else {
    set html [read [set fh [open $url]]]
    close $fh
  }
  
  #puts $url
  htmlparse::2tree $html t
  htmlparse::removeVisualFluff t
  htmlparse::removeFormDefs t
  LemonTree::add .t root html root "(html)"
  
return
}
 
# -------------------------------------------
label .lbl -text "URL or path:"
entry .txt -width 60
button .btnBrowser -text Browse... -command {.txt insert end [tk_getOpenFile]}
button .btnGet -text "html > tree" -command main
button .btnSalir -text Exit -command {
catch {t destroy}
exit
}
grid .lbl .txt .btnBrowser
grid .btnGet -row 1 -column 1
grid .btnSalir -row 1 -column 2

#-- Now to demonstrate and test the whole thing: 
Tree .t -background white -opencmd {LemonTree::open .t} \
-width 90 -height 30 -yscrollcommand {.y set}
 .t bindText  <1> {LemonTree::Info .t}
 .t bindImage <1> {LemonTree::Info .t}
 
scrollbar .y -command {.t yview}

grid .t -row 2 -column 0 -columnspan 3 
grid .y -row 2 -column 3 -sticky ns

#.txt insert end "https://wiki.tcl-lang.org/page/RS"
.txt insert end "http://www.cs.grinnell.edu/~walker/fluency-book/labs/sample-table.html"

#-- Little development helpers:
bind . <Escape> {exec wish $argv0 &; exit}
bind . <F1> {console show}