Tcllib tree style interface to tDOM

if 0 {

This code provides an interface to the tdom structure that is compatible with the API [L1 ] of tcllib's struct::tree data structure (not compatible with struct 2.0 and later). Not all of tcllib's methods are provided--only the ones I found most useful. This code is not nearly as robust as the tcllib code. The code requires tdom as well as the way-cool object oriented extension xotcl.

The tdom interface is much more flexible than this interface which assumes that document elements are always created with a tag named "node". With tdom any mix of tag names can be used in the tree and text nodes can be inserted as well.

Why? I find the tcllib API pretty easy to work with (mostly because I've used it frequently) especially for playing around in the interpreter. Although, it is much easier to create a dom by creating an xml file and then using dom parse. The real reason I wrote this was so that I could play around with xpath searches in my tkoutline application.

An extra method domDoc is provided to access the domDoc object. This object behaves as documented in tdom. Also the node names in the tree are each domNode objects.

-Brian Theado

Sample session:

 % Tree mytree
 ::mytree

Insert some nodes

 % mytree insert root 0
 domNode4
 % mytree insert root 0
 domNode5
 % mytree insert root 0
 domNode6
 % mytree children root
 domNode6 domNode5 domNode4
 % mytree insert domNode4 0
 domNode7
 % mytree domDoc
 domDoc1
 % [mytree domDoc] documentElement
 domNode2

Perform an xpath query for all the leaves in the tree

 % [[mytree domDoc] documentElement] selectNodes {//*[count(child::*)=0]}
 domNode5 domNode6 domNode7

}


 # Partial wrapping of tDOM functionality with tcllib tree interface
 package require tdom
 package require XOTcl
 namespace import xotcl::*
 Class Tree

 Tree instproc init {{existingDomDoc ""}} {
    [self] instvar domDoc
    if {[string length $existingDomDoc] == 0} {
        set domDoc [dom createDocument node]
    } else {
        set domDoc $existingDomDoc
    }
    [self] parametercmd domDoc
    next
 }
 Tree instproc insertNode {parent idx child} {

    # tcllib tree indexing starts at zero, but DOM starts at one
    incr idx
    set sibling [$parent child $idx]
    if {[string length $sibling] == 0} {
        return [$parent appendChild $child]
    } else {
        return [$parent insertBefore $child $sibling]
    }
 }

 # tcllib tree allows an optional third argument to specify the
 # name of the node.  tDOM doesn't allow node names to be specified
 # The easy way out--don't allow the third argument 
 Tree instproc insert {parent idx} {
    [self] instvar domDoc

    # Create the node
    set newNode [$domDoc createElement node]
    
    # Add the node to the tree
    [self] insertNode $parent $idx $newNode
 }
 Tree instproc move {newParent idx nodeToMove args} {
    foreach node [concat $nodeToMove $args] {
        set oldParent [$node parentNode]
        $oldParent removeChild $node
        [self] insertNode $newParent $idx $node
        incr idx
    }
 }
 Tree instproc keyexists {node -key key} {
    $node hasAttribute $key
 }
 Tree instproc keys {node} {
    $node attributes
 }
 Tree instproc depth {node} {
    return [llength [$node ancestor all]]
 }
 Tree instproc size {node} {
    return [llength [$node descendant all]]
 }
 Tree instproc isleaf {node} {
    return [expr [llength [$node descendant all]] == 0]
 }
 
 # This seems correct, but isn't giving me what I expect
 Tree instproc index {node} {
    return [llength [$node psibling all]]
 }

 # Try this instead
 Tree instproc index {node} {
    return [lsearch [[self] children [[self] parent $node]] $node]
 }

 Tree instproc numchildren {node} {
    return [llength [$node child all]]
 }

 Tree instproc set {node args} {
    switch [llength $args] {
        0 {
            $node getAttribute data
        }
        1 {
            $node setAttribute data [lindex $args 0]
            $node getAttribute data
        }
        2 {
            $node getAttribute [lindex $args 1]
        }
        3 {
            set switch [lindex $args 0]
            set key [lindex $args 1]
            set value [lindex $args 2]
            $node setAttribute $key $value
            $node getAttribute $key
        }
        default {error "wrong number of arguments"}
    }
 }

 # Dynamically construct the simple command that have only a node argument
 foreach {treeCmd domCmd} {children childNodes parent parentNode previous previousSibling next nextSibling delete delete} {
    Tree instproc $treeCmd {node} "\$node $domCmd"
 }
 Tree instproc destroy {args} {
    [self] instvar domDoc
    $domDoc delete
    next
 }

 # The tcllib interface always names the root node "root"
 # tDOM doesn't have a way to specify node names.  Therefore,
 # install this filter to automatically convert "root" to the
 # actual root element
 Tree instproc convertRoot {node args} {
    [self] instvar domDoc

    # Convert input node from name "root"
    if {$node == "root"} {
        set node [$domDoc documentElement]
    }

    # Dispatch the method
    set retVal [eval next $node $args]

    # Convert output node to name "root"
    if {$retVal == [$domDoc documentElement]} {
        return root
    } else {
        return $retVal
    }
 }
 Tree instfilter convertRoot

 # Only call the filter for methods that have node as a first argument
 Tree instfilterguard convertRoot {
    ([lsearch {init destroy} [self calledproc]] < 0) &&
        ([lsearch [Tree info instcommands] [self calledproc]] >= 0)
 }

LV So, what in version 2.0 or newer of struct::tree prevents the above from working - and is there a way to improve the code so that it would work in either version?


male - 2010-03-23 - I had the problem to parse very malformed html, not parsable by tDOM, so I decided to try tcllib's htmlparse package with the current struct::tree package. After parsing the html source text into the struct::tree, I convert the tree into a tDOM DOM to use XPaths to extract the needed information.

Here the code usable with tcl 8.6 due to the usage of try/on/finally:

 package require tdom;
 package require htmlparse;
 package require struct::tree;
 
 namespace eval html2dom {
     proc Attributes {data} {
         set attributes [dict create];
 
         foreach {=> name value} [regexp -inline -all -- {(\w+)(?:=\"?([^\"]*))?\"?} $data] {
             dict set attributes $name [expr {$value eq "" ? $name : $value}];
         }
 
         return $attributes;
     }
 
     proc Walk {tree parentNode node dom parentDomNode} {
         set type [$tree get $node type];
 
         if {$parentNode eq $node} {
             set domNode $parentDomNode;
         } else {
             if {$type eq "PCDATA"} {
                 set domNode [$dom createTextNode $type];
 
                 $domNode nodeValue [$tree get $node data];
                 $parentDomNode appendChild $domNode;
 
                 return;
             }
 
             set domNode [$dom createElement $type];
 
             $parentDomNode appendChild $domNode;
         }
 
         if {[$tree keyexists $node data]} {
             set data [$tree get $node data];
 
             if {$data ne ""} {
                 set attributes [Attributes $data];
 
                 if {$attributes ne ""} {
                     $domNode setAttribute {*}$attributes;
                 }
             }
         }
 
         foreach childNode [$tree children $node] {
             [lindex [info level 0] 0] $tree $node $childNode $dom $domNode;
         }
 
         return;
     }
 
     proc convert {htmlFileName} {
         try {
             set htmlData    [tDOM::xmlReadFile $htmlFileName];
             set tree        [struct::tree];
 
             htmlparse::2tree $htmlData $tree;
             htmlparse::removeVisualFluff $tree;
             htmlparse::removeFormDefs $tree;
 
             set dom         [dom createDocument html];
             set domDoc      [$dom documentElement];
             set treeRoot    [$tree rootname];
 
             Walk $tree $treeRoot $treeRoot $dom $domDoc;
         } on error {reason options} {
             if {[info exists tree]} {
                 $tree destroy;
             }
 
             if {[info exists dom]} {
                 $dom delete;
             }
 
             return -code error -options $options $reason;
         } finally {
             $tree destroy;
         }
 
         return $dom
     }
 
     namespace export -clear {[a-z]*};
     namespace ensemble create;
 }
 
 set dom     [html2dom convert $htmlFileName];
 set domDoc  [$dom documentElement];
 set nodes   [$domDoc selectNodes $xpath];