**This is some code to convert HTML data to a tDOM DOM.** ***Why*** [male]: I needed to parse a web site pages, but the [HTML] provided by this web site is so malformed, that [tDOM] has no chance to parse these pages. So I tried successfully to use [htmlparse], which uses [struct]::[tree] to store the parsed data in. So I use the struct::tree routines to convert its data into a tDOM [DOM] tree. The source below is tested in the context of the my needs to parse pages from a specific web site, but the used packages should allow to use this source a few more websites. So I published the source here on the wiki. ***Functionality*** Using this code it is possible to: * specify an URL to download the HTML data from (following even redirections) * specify a path to a HTML file * specify already loaded/built/... HTML data ... to convert this HTML data into a tDOM DOM. ***Syntax*** ****Download of the HTML data via URL:**** html2dom converturl url Where "url" points to a web resource of one the MIME types "text/html", "text/xhtml", "application/xhtml+xml". The command returns the tDOM DOM object. ****Loading the HTML data from a file:**** html2dom convertfile fileName The command returns the tDOM DOM object. ****Converting directly given HTML data:**** html2dom convertdata htmlData The command returns the tDOM DOM object. ***Dependencies*** * tcl 8.6 (because of the usage of try/error/finally - tcl 8.5, if try/error/finally would be eleminated) * [http] package (from within tcl) * [tcllib] pagckages ** [uri] - to parse URI/URLs ** [htmlparse] - to parse HTML data ** [struct::tree] - to store/handle the HTML data parse result * [tDOM] - the XML DOM package ***The source*** ====== package require http; package require uri; package require tdom; package require htmlparse; package require struct::tree; namespace eval html2dom { proc Attributes {data} { regsub -all {\s*=\s*} $data {=} data; set attributes [dict create]; set valueRanges [list]; foreach {=> nameRange extValueRange valueRange} [regexp -inline -indices -all -- {([a-zA-Z]\w+)=(\"([^\"]*)\")} $data] { lappend valueRanges $extValueRange; set name [string range $data {*}$nameRange]; set value [string range $data {*}$valueRange]; dict set attributes $name [expr {$value eq "" ? $name : $value}]; } foreach {=> nameRange valueRange} [regexp -inline -indices -all -- {([a-zA-Z]\w+)(?:=?(\S*))?} $data] { lassign $valueRange curFirst curLast; set forget 0; foreach range $valueRanges { lassign $range first last; if {$first <= $curFirst && $curLast <= $last} { set forget 1; break; } } if {$forget} { continue; } lappend valuesRange $valueRange; set name [string range $data {*}$nameRange]; set value [string range $data {*}$valueRange]; dict set attributes $name [expr {$value eq "" ? $name : $value}]; } return $attributes; } proc Walk {tree parentNode node dom parentDomNode} { set type [$tree get $node type]; set hasData 0; if {[$tree keyexists $node data]} { set hasData 1; } if {![regexp {[a-zA-Z]\w*} $type]} { if {$hasData} { set data [$tree get $node data]; } $tree set $node data [concat $type $data]; set type PCDATA; } if {$parentNode eq $node} { set domNode $parentDomNode; } else { if {$type eq "PCDATA"} { set domNode [$dom createTextNode $type]; $domNode nodeValue [$tree get $node data]; $parentDomNode appendChild $domNode; return; } set domNode [$dom createElement $type]; $parentDomNode appendChild $domNode; } if {$hasData} { set data [$tree get $node data]; if {$data ne ""} { set attributes [Attributes $data]; if {$attributes ne ""} { $domNode setAttribute {*}$attributes; } } } foreach childNode [$tree children $node] { [lindex [info level 0] 0] $tree $node $childNode $dom $domNode; } return; } proc converturl {url} { http::config -accept "text/html, text/xhtml, application/xhtml+xml"; set ncode 0; while {$ncode < 200 || $ncode >= 300} { # test if the url is valid # if {[catch {set token [http::geturl $url -validate 1];} reason options] == 1} { http::config -accept "*/*"; return -options $options $reason; } # error and redirection handling # set ncode [http::ncode $token]; set meta [http::meta $token]; # error handling # if {[http::status $token] eq "error" || $ncode >= 400} { if {[http::status $token] eq "error"} { set reason "[http::error $token] ([http::code $token])"; } else { set reason "[http::code $token]"; } set errorCode [list $ncode [http::code $token] $meta]; http::cleanup $token; http::config -accept "*/*"; return -code error -errorcode $errorCode $reason; } # redirection handling # if {$ncode >= 300} { set url [uri::resolve $url [dict get [http::meta $token] Location]]; http::cleanup $token; continue; } http::cleanup $token; # fetching the HTML data from the url # if {[catch {set token [http::geturl $url];} reason options] == 1} { http::config -accept "*/*"; return -options $options $reason; } set htmlData [http::data $token]; http::cleanup $token; } http::config -accept "*/*"; # converting the HTML data to the tDOM DOM # if {[catch {set dom [convertdata $htmlData];} reason options] == 1} { return -options $options $reason; } set titleNode [[$dom documentElement] selectNodes {/html/head/title}]; if {$titleNode ne "" && [regexp {[[:<:]]404[[:>:]]} [$titleNode text]]} { $dom delete; return -code error -errorcode [list 404 "HTTP/1.1 404 Not Found" $meta] "HTTP/1.1 404 Not Found" } return $dom; } proc convertfile {htmlFileName} { if {[catch {set dom [convertdata [tDOM::xmlReadFile $htmlFileName]];} reason options] == 1} { return -options $options $reason; } return $dom; } proc convertdata {htmlData} { try { # create a tree to be filled by htmlparse # set tree [struct::tree]; # parse the HTML data and remove "unwanted" data # htmlparse::2tree $htmlData $tree; htmlparse::removeVisualFluff $tree; htmlparse::removeFormDefs $tree; # convert the HTML data tree into a tDOM DOM tree # set dom [dom createDocument html]; set domDoc [$dom documentElement]; set treeRoot [$tree rootname]; Walk $tree $treeRoot $treeRoot $dom $domDoc; } on error {reason options} { if {[info exists tree]} { $tree destroy; } if {[info exists dom]} { $dom delete; } return -options $options $reason; } finally { $tree destroy; } return $dom } namespace export -clear {[a-z]*}; namespace ensemble create; } ====== ***The usage (an example)*** ====== set url http://www.nirsoft.net/utils/index.html; set dom [html2dom converturl $url]; set domDoc [$dom documentElement]; set baseNode [$domDoc selectNodes {/html/head/base}]; if {$baseNode eq ""} { set uri [uri::split $url]; set baseUrl [uri::join {*}[dict set uri path [file dirname [dict get $uri path]]]]/; } else { set baseUrl [$baseNode @href]/; } set utils [dict create]; foreach sectionNode [$domDoc selectNodes {//table[@class="title2"]}] { set abbreviation [[$sectionNode selectNodes {preceding-sibling::a}] @name]; set title [[$sectionNode selectNodes {.//text()[1]}] nodeValue]; set descriptionNode [$sectionNode selectNodes {../span[@class="special3"]/text()}]; if {$descriptionNode ne ""} { set description [$descriptionNode nodeValue]; } else { set description ""; } set filesTableNode [$sectionNode selectNodes {../table[@class="filestable"]}]; set section [dict create abbr $abbreviation title $title desc $description utils [dict create]]; puts section=$abbreviation; dict update section utils sectionUtils { foreach node [$filesTableNode selectNodes {.//node()[@class="filetitle"]/..}] { set longDescription [$node text]; set anchorNode [$node selectNodes {a[1]}]; regexp {((?:\w+\s*)*\w+)(?:\s+v(\d+\.\d+))?(?: . (.+))?$} [$anchorNode text] => name version shortDescription; set relativeUrl [$anchorNode @href] set relativeUri [uri::split $relativeUrl]; set absoluteUrl [uri::resolve $baseUrl $relativeUrl]; set abbr [file rootname [file tail [dict get $relativeUri path]]] dict set sectionUtils $abbr [dict create \ abbr $abbr \ name $name \ version $version \ shortdesc $shortDescription \ longdesc $longDescription \ relurl $relativeUrl \ absurl $absoluteUrl \ ]; puts \tutil=$abbr } } dict set utils $abbreviation $section; } ====== <>HTML | XML