male: I needed to parse the pages of a web site, but the HTML provided by this web site is so malformed that tDOM has no chance to parse these pages.
So I successfully tried htmlparse, which stores the parsed data in a struct::tree. I then use the struct::tree routines to convert that data into a tDOM DOM tree.
The source below was tested in the context of my need to parse pages from one specific web site, but the packages used should allow this source to work with other web sites as well.
So I published the source here on the wiki.
Using this code it is possible to fetch HTML data (from a URL, a file, or a string) and to convert this HTML data into a tDOM DOM.
html2dom converturl url
Where "url" points to a web resource of one the MIME types "text/html", "text/xhtml", "application/xhtml+xml".
The command returns the tDOM DOM object.
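For example, a minimal sketch, assuming the html2dom ensemble defined below has been sourced; the URL is only a placeholder:

    set dom [html2dom converturl http://www.example.com/index.html];
    set root [$dom documentElement];
    # query the result with the usual tDOM means, e.g. the page title
    set titleNode [$root selectNodes {/html/head/title}];
    if {$titleNode ne ""} {
        puts [$titleNode text];
    }
    # release the DOM when done
    $dom delete;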
html2dom convertfile fileName
Where "fileName" names a local file containing HTML data (read via tDOM::xmlReadFile). The command returns the tDOM DOM object.
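A sketch for a locally saved page; the file name is only an example:

    set dom [html2dom convertfile saved-page.html];
    puts [[$dom documentElement] asXML];
    $dom delete;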
html2dom convertdata htmlData
Where "htmlData" is a string containing the HTML data. The command returns the tDOM DOM object.
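A sketch with an intentionally sloppy snippet, just to illustrate that the input does not have to be well-formed:

    set html {<html><body><p class=intro>Hello <b>world</body></html>};
    set dom [html2dom convertdata $html];
    puts [[$dom documentElement] asXML];
    $dom delete;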
The source:

    package require http;
    package require uri;
    package require tdom;
    package require htmlparse;
    package require struct::tree;

    namespace eval html2dom {

        # Extract the tag attributes from the raw data string htmlparse stores
        # with a node and return them as a dict of name/value pairs.
        proc Attributes {data} {
            regsub -all {\s*=\s*} $data {=} data;
            set attributes [dict create];
            set valueRanges [list];
            # first pass: attributes with double quoted values
            foreach {=> nameRange extValueRange valueRange} \
                    [regexp -inline -indices -all -- {([a-zA-Z]\w+)=(\"([^\"]*)\")} $data] {
                # emacs: "
                lappend valueRanges $extValueRange;
                set name  [string range $data {*}$nameRange];
                set value [string range $data {*}$valueRange];
                dict set attributes $name [expr {$value eq "" ? $name : $value}];
            }
            # second pass: unquoted or value-less attributes, skipping every
            # range already consumed by the first pass
            foreach {=> nameRange valueRange} \
                    [regexp -inline -indices -all -- {([a-zA-Z]\w+)(?:=?(\S*))?} $data] {
                lassign $valueRange curFirst curLast;
                set forget 0;
                foreach range $valueRanges {
                    lassign $range first last;
                    if {$first <= $curFirst && $curLast <= $last} {
                        set forget 1;
                        break;
                    }
                }
                if {$forget} {
                    continue;
                }
                lappend valueRanges $valueRange;
                set name  [string range $data {*}$nameRange];
                set value [string range $data {*}$valueRange];
                dict set attributes $name [expr {$value eq "" ? $name : $value}];
            }
            return $attributes;
        }

        # Recursively copy the struct::tree built by htmlparse into the
        # tDOM document, node by node.
        proc Walk {tree parentNode node dom parentDomNode} {
            set type [$tree get $node type];
            set hasData 0;
            if {[$tree keyexists $node data]} {
                set hasData 1;
            }
            if {![regexp {[a-zA-Z]\w*} $type]} {
                # the type is no usable element name: treat it as character data
                set data "";
                if {$hasData} {
                    set data [$tree get $node data];
                }
                $tree set $node data [concat $type $data];
                set type PCDATA;
            }
            if {$parentNode eq $node} {
                set domNode $parentDomNode;
            } else {
                if {$type eq "PCDATA"} {
                    set domNode [$dom createTextNode $type];
                    $domNode nodeValue [$tree get $node data];
                    $parentDomNode appendChild $domNode;
                    return;
                }
                set domNode [$dom createElement $type];
                $parentDomNode appendChild $domNode;
            }
            if {$hasData} {
                set data [$tree get $node data];
                if {$data ne ""} {
                    set attributes [Attributes $data];
                    if {$attributes ne ""} {
                        $domNode setAttribute {*}$attributes;
                    }
                }
            }
            foreach childNode [$tree children $node] {
                # recurse using the proc's own name
                [lindex [info level 0] 0] $tree $node $childNode $dom $domNode;
            }
            return;
        }

        proc converturl {url} {
            http::config -accept "text/html, text/xhtml, application/xhtml+xml";
            set ncode 0;
            while {$ncode < 200 || $ncode >= 300} {
                # test if the url is valid
                if {[catch {set token [http::geturl $url -validate 1]} reason options] == 1} {
                    http::config -accept "*/*";
                    return -options $options $reason;
                }
                set ncode [http::ncode $token];
                set meta  [http::meta $token];
                # error handling
                if {[http::status $token] eq "error" || $ncode >= 400} {
                    if {[http::status $token] eq "error"} {
                        set reason "[http::error $token] ([http::code $token])";
                    } else {
                        set reason "[http::code $token]";
                    }
                    set errorCode [list $ncode [http::code $token] $meta];
                    http::cleanup $token;
                    http::config -accept "*/*";
                    return -code error -errorcode $errorCode $reason;
                }
                # redirection handling
                if {$ncode >= 300} {
                    set url [uri::resolve $url [dict get [http::meta $token] Location]];
                    http::cleanup $token;
                    continue;
                }
                http::cleanup $token;
                # fetching the HTML data from the url
                if {[catch {set token [http::geturl $url]} reason options] == 1} {
                    http::config -accept "*/*";
                    return -options $options $reason;
                }
                set htmlData [http::data $token];
                http::cleanup $token;
            }
            http::config -accept "*/*";
            # converting the HTML data to the tDOM DOM
            if {[catch {set dom [convertdata $htmlData]} reason options] == 1} {
                return -options $options $reason;
            }
            # some servers deliver an error page with status 200; detect it by the title
            set titleNode [[$dom documentElement] selectNodes {/html/head/title}];
            if {$titleNode ne "" && [regexp {[[:<:]]404[[:>:]]} [$titleNode text]]} {
                $dom delete;
                return -code error \
                    -errorcode [list 404 "HTTP/1.1 404 Not Found" $meta] \
                    "HTTP/1.1 404 Not Found";
            }
            return $dom;
        }

        proc convertfile {htmlFileName} {
            if {[catch {set dom [convertdata [tDOM::xmlReadFile $htmlFileName]]} reason options] == 1} {
                return -options $options $reason;
            }
            return $dom;
        }

        proc convertdata {htmlData} {
            try {
                # create a tree to be filled by htmlparse
                set tree [struct::tree];
                # parse the HTML data and remove "unwanted" data
                htmlparse::2tree $htmlData $tree;
                htmlparse::removeVisualFluff $tree;
                htmlparse::removeFormDefs $tree;
                # convert the HTML data tree into a tDOM DOM tree
                set dom [dom createDocument html];
                set domDoc [$dom documentElement];
                set treeRoot [$tree rootname];
                Walk $tree $treeRoot $treeRoot $dom $domDoc;
            } on error {reason options} {
                # release the partially built DOM; the tree is destroyed in the
                # finally clause below
                if {[info exists dom]} {
                    $dom delete;
                }
                return -options $options $reason;
            } finally {
                if {[info exists tree]} {
                    $tree destroy;
                }
            }
            return $dom;
        }

        namespace export -clear {[a-z]*};
        namespace ensemble create;
    }
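converturl reports HTTP failures through the -errorcode option (a list whose first element is the numeric status code), so a caller can trap specific codes. A minimal sketch, assuming Tcl 8.6 or later for try/trap and using a placeholder URL:

    try {
        set dom [html2dom converturl http://www.example.com/missing.html];
        # ... work with $dom ...
        $dom delete;
    } trap {404} {reason} {
        puts "page not found: $reason";
    } on error {reason} {
        puts "conversion failed: $reason";
    }

The following complete example uses html2dom to collect the utility index of www.nirsoft.net: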
    set url http://www.nirsoft.net/utils/index.html;
    set dom [html2dom converturl $url];
    set domDoc [$dom documentElement];

    set baseNode [$domDoc selectNodes {/html/head/base}];
    if {$baseNode eq ""} {
        set uri [uri::split $url];
        set baseUrl [uri::join {*}[dict set uri path [file dirname [dict get $uri path]]]]/;
    } else {
        set baseUrl [$baseNode @href]/;
    }

    set utils [dict create];
    foreach sectionNode [$domDoc selectNodes {//table[@class="title2"]}] {
        set abbreviation [[$sectionNode selectNodes {preceding-sibling::a}] @name];
        set title [[$sectionNode selectNodes {.//text()[1]}] nodeValue];
        set descriptionNode [$sectionNode selectNodes {../span[@class="special3"]/text()}];
        if {$descriptionNode ne ""} {
            set description [$descriptionNode nodeValue];
        } else {
            set description "";
        }
        set filesTableNode [$sectionNode selectNodes {../table[@class="filestable"]}];
        set section [dict create abbr $abbreviation title $title desc $description utils [dict create]];
        puts section=$abbreviation;
        dict update section utils sectionUtils {
            foreach node [$filesTableNode selectNodes {.//node()[@class="filetitle"]/..}] {
                set longDescription [$node text];
                set anchorNode [$node selectNodes {a[1]}];
                regexp {((?:\w+\s*)*\w+)(?:\s+v(\d+\.\d+))?(?: . (.+))?$} [$anchorNode text] \
                    => name version shortDescription;
                set relativeUrl [$anchorNode @href];
                set relativeUri [uri::split $relativeUrl];
                set absoluteUrl [uri::resolve $baseUrl $relativeUrl];
                set abbr [file rootname [file tail [dict get $relativeUri path]]];
                dict set sectionUtils $abbr [dict create \
                    abbr $abbr \
                    name $name \
                    version $version \
                    shortdesc $shortDescription \
                    longdesc $longDescription \
                    relurl $relativeUrl \
                    absurl $absoluteUrl \
                ];
                puts \tutil=$abbr;
            }
        }
        dict set utils $abbreviation $section;
    }