**This is some code to convert HTML data to a tDOM DOM.**
***Why***
[male]: I needed to parse a web site pages, but the [HTML] provided by this web site is so malformed, that [tDOM] has no chance to parse these pages.
So I tried successfully to use [htmlparse], which uses [struct]::[tree] to store the parsed data in. So I use the struct::tree routines to convert its data into a tDOM [DOM] tree.
The source below is tested in the context of the my needs to parse pages from a specific web site, but the used packages should allow to use this source a few more websites.
So I published the source here on the wiki.
***Functionality***
Using this code it is possible to:
* specify an URL to download the HTML data from (following even redirections)
* specify a path to a HTML file
* specify already loaded/built/... HTML data
... to convert this HTML data into a tDOM DOM.
***Syntax***
****Download of the HTML data via URL:****
html2dom converturl url
Where "url" points to a web resource of one the MIME types "text/html", "text/xhtml", "application/xhtml+xml".
The command returns the tDOM DOM object.
****Loading the HTML data from a file:****
html2dom convertfile fileName
The command returns the tDOM DOM object.
****Converting directly given HTML data:****
html2dom convertdata htmlData
The command returns the tDOM DOM object.
***Dependencies***
* tcl 8.6 (because of the usage of try/error/finally - tcl 8.5, if try/error/finally would be eleminated)
* [http] package (from within tcl)
* [tcllib] pagckages
** [uri] - to parse URI/URLs
** [htmlparse] - to parse HTML data
** [struct::tree] - to store/handle the HTML data parse result
* [tDOM] - the XML DOM package
***The source***
======
package require http;
package require uri;
package require tdom;
package require htmlparse;
package require struct::tree;
namespace eval html2dom {
proc Attributes {data} {
regsub -all {\s*=\s*} $data {=} data;
set attributes [dict create];
set valueRanges [list];
foreach {=> nameRange extValueRange valueRange} [regexp -inline -indices -all -- {([a-zA-Z]\w+)=(\"([^\"]*)\")} $data] {
lappend valueRanges $extValueRange;
set name [string range $data {*}$nameRange];
set value [string range $data {*}$valueRange];
dict set attributes $name [expr {$value eq "" ? $name : $value}];
}
foreach {=> nameRange valueRange} [regexp -inline -indices -all -- {([a-zA-Z]\w+)(?:=?(\S*))?} $data] {
lassign $valueRange curFirst curLast;
set forget 0;
foreach range $valueRanges {
lassign $range first last;
if {$first <= $curFirst && $curLast <= $last} {
set forget 1;
break;
}
}
if {$forget} {
continue;
}
lappend valuesRange $valueRange;
set name [string range $data {*}$nameRange];
set value [string range $data {*}$valueRange];
dict set attributes $name [expr {$value eq "" ? $name : $value}];
}
return $attributes;
}
proc Walk {tree parentNode node dom parentDomNode} {
set type [$tree get $node type];
set hasData 0;
if {[$tree keyexists $node data]} {
set hasData 1;
}
if {![regexp {[a-zA-Z]\w*} $type]} {
if {$hasData} {
set data [$tree get $node data];
}
$tree set $node data [concat $type $data];
set type PCDATA;
}
if {$parentNode eq $node} {
set domNode $parentDomNode;
} else {
if {$type eq "PCDATA"} {
set domNode [$dom createTextNode $type];
$domNode nodeValue [$tree get $node data];
$parentDomNode appendChild $domNode;
return;
}
set domNode [$dom createElement $type];
$parentDomNode appendChild $domNode;
}
if {$hasData} {
set data [$tree get $node data];
if {$data ne ""} {
set attributes [Attributes $data];
if {$attributes ne ""} {
$domNode setAttribute {*}$attributes;
}
}
}
foreach childNode [$tree children $node] {
[lindex [info level 0] 0] $tree $node $childNode $dom $domNode;
}
return;
}
proc converturl {url} {
http::config -accept "text/html, text/xhtml, application/xhtml+xml";
set ncode 0;
while {$ncode < 200 || $ncode >= 300} {
# test if the url is valid
#
if {[catch {set token [http::geturl $url -validate 1];} reason options] == 1} {
http::config -accept "*/*";
return -options $options $reason;
}
# error and redirection handling
#
set ncode [http::ncode $token];
set meta [http::meta $token];
# error handling
#
if {[http::status $token] eq "error" || $ncode >= 400} {
if {[http::status $token] eq "error"} {
set reason "[http::error $token] ([http::code $token])";
} else {
set reason "[http::code $token]";
}
set errorCode [list $ncode [http::code $token] $meta];
http::cleanup $token;
http::config -accept "*/*";
return -code error -errorcode $errorCode $reason;
}
# redirection handling
#
if {$ncode >= 300} {
set url [uri::resolve $url [dict get [http::meta $token] Location]];
http::cleanup $token;
continue;
}
http::cleanup $token;
# fetching the HTML data from the url
#
if {[catch {set token [http::geturl $url];} reason options] == 1} {
http::config -accept "*/*";
return -options $options $reason;
}
set htmlData [http::data $token];
http::cleanup $token;
}
http::config -accept "*/*";
# converting the HTML data to the tDOM DOM
#
if {[catch {set dom [convertdata $htmlData];} reason options] == 1} {
return -options $options $reason;
}
set titleNode [[$dom documentElement] selectNodes {/html/head/title}];
if {$titleNode ne "" && [regexp {[[:<:]]404[[:>:]]} [$titleNode text]]} {
$dom delete;
return -code error -errorcode [list 404 "HTTP/1.1 404 Not Found" $meta] "HTTP/1.1 404 Not Found"
}
return $dom;
}
proc convertfile {htmlFileName} {
if {[catch {set dom [convertdata [tDOM::xmlReadFile $htmlFileName]];} reason options] == 1} {
return -options $options $reason;
}
return $dom;
}
proc convertdata {htmlData} {
try {
# create a tree to be filled by htmlparse
#
set tree [struct::tree];
# parse the HTML data and remove "unwanted" data
#
htmlparse::2tree $htmlData $tree;
htmlparse::removeVisualFluff $tree;
htmlparse::removeFormDefs $tree;
# convert the HTML data tree into a tDOM DOM tree
#
set dom [dom createDocument html];
set domDoc [$dom documentElement];
set treeRoot [$tree rootname];
Walk $tree $treeRoot $treeRoot $dom $domDoc;
} on error {reason options} {
if {[info exists tree]} {
$tree destroy;
}
if {[info exists dom]} {
$dom delete;
}
return -options $options $reason;
} finally {
$tree destroy;
}
return $dom
}
namespace export -clear {[a-z]*};
namespace ensemble create;
}
======
***The usage (an example)***
======
set url http://www.nirsoft.net/utils/index.html;
set dom [html2dom converturl $url];
set domDoc [$dom documentElement];
set baseNode [$domDoc selectNodes {/html/head/base}];
if {$baseNode eq ""} {
set uri [uri::split $url];
set baseUrl [uri::join {*}[dict set uri path [file dirname [dict get $uri path]]]]/;
} else {
set baseUrl [$baseNode @href]/;
}
set utils [dict create];
foreach sectionNode [$domDoc selectNodes {//table[@class="title2"]}] {
set abbreviation [[$sectionNode selectNodes {preceding-sibling::a}] @name];
set title [[$sectionNode selectNodes {.//text()[1]}] nodeValue];
set descriptionNode [$sectionNode selectNodes {../span[@class="special3"]/text()}];
if {$descriptionNode ne ""} {
set description [$descriptionNode nodeValue];
} else {
set description "";
}
set filesTableNode [$sectionNode selectNodes {../table[@class="filestable"]}];
set section [dict create abbr $abbreviation title $title desc $description utils [dict create]];
puts section=$abbreviation;
dict update section utils sectionUtils {
foreach node [$filesTableNode selectNodes {.//node()[@class="filetitle"]/..}] {
set longDescription [$node text];
set anchorNode [$node selectNodes {a[1]}];
regexp {((?:\w+\s*)*\w+)(?:\s+v(\d+\.\d+))?(?: . (.+))?$} [$anchorNode text] => name version shortDescription;
set relativeUrl [$anchorNode @href]
set relativeUri [uri::split $relativeUrl];
set absoluteUrl [uri::resolve $baseUrl $relativeUrl];
set abbr [file rootname [file tail [dict get $relativeUri path]]]
dict set sectionUtils $abbr [dict create \
abbr $abbr \
name $name \
version $version \
shortdesc $shortDescription \
longdesc $longDescription \
relurl $relativeUrl \
absurl $absoluteUrl \
];
puts \tutil=$abbr
}
}
dict set utils $abbreviation $section;
}
======
<>HTML | XML