Version 0 of HTML to DOM via http/htmlparse/struct::tree packages

Updated 2010-03-23 12:48:09 by male

This is some code to convert HTML data to a tDOM DOM.

Why

I needed to parse a web site pages, but the HTML provided by this web site is so malformed, that tDOM has no chance to parse these pages.

So I tried successfully to use htmlparse, which uses struct::tree to store the parsed data in. So I use the struct::tree to convert its data into a tDOM DOM tree.

The source below is tested in the context of the my needs to parse pages from a specific web site, but the used packages should allow to use this source a few more websites.

So I published the source here on the wiki.

Functionality

Using this code it is possible to:

  • specify an URL to download the HTML data from (following even redirections)
  • specify a path to a HTML file
  • specify already loaded/built/... HTML data

... to convert this HTML data into a tDOM DOM.

Syntax

Download of the HTML data via URL:

 html2dom converturl url

Where "url" points to a web resource of one the MIME types "text/html", "text/xhtml", "application/xhtml+xml".

The command returns the tDOM DOM object.

Loading the HTML data from a file:

 html2dom convertfile fileName

The command returns the tDOM DOM object.

Converting directly given HTML data:

 html2dom convertdata htmlData

The command returns the tDOM DOM object.

Dependencies

  • tcl 8.6 (because of the usage of try/error/finally - tcl 8.5, if try/error/finally would be eleminated)
  • http package (from within tcl)
  • tcllib pagckages
  • tDOM - the XML DOM package

The source

 package require http;
 package require uri;
 package require tdom;
 package require htmlparse;
 package require struct::tree;

 namespace eval html2dom {
     proc Attributes {data} {
         set attributes [dict create];

         foreach {=> name value} [regexp -inline -all -- {(\w+)(?:=\"?([^\"]*))?\"?} $data] {
             dict set attributes $name [expr {$value eq "" ? $name : $value}];
         }

         return $attributes;
     }

     proc Walk {tree parentNode node dom parentDomNode} {
         set type [$tree get $node type];

         if {$parentNode eq $node} {
             set domNode $parentDomNode;
         } else {
             if {$type eq "PCDATA"} {
                 set domNode [$dom createTextNode $type];

                 $domNode nodeValue [$tree get $node data];
                 $parentDomNode appendChild $domNode;

                 return;
             }

             set domNode [$dom createElement $type];

             $parentDomNode appendChild $domNode;
         }

         if {[$tree keyexists $node data]} {
             set data [$tree get $node data];

             if {$data ne ""} {
                 set attributes [Attributes $data];

                 if {$attributes ne ""} {
                     $domNode setAttribute {*}$attributes;
                 }
             }
         }

         foreach childNode [$tree children $node] {
             [lindex [info level 0] 0] $tree $node $childNode $dom $domNode;
         }

         return;
     }

     proc converturl {url} {
         http::config -accept "text/html, text/xhtml, application/xhtml+xml";

         set ncode 0;

         while {$ncode < 200 || $ncode >= 300} {
             # test if the url is valid
             #
             if {[catch {set token [http::geturl $url -validate 1];} reason options] == 1} {
                 http::config -accept "*/*";

                 return -options $options $reason;
             }

             # error and redirection handling
             #
             set ncode   [http::ncode $token];
             set meta    [http::meta $token];

             # error handling
             #
             if {[http::status $token] eq "error" || $ncode >= 400} {
                 if {[http::status $token] eq "error"} {
                     set reason "[http::error $token] ([http::code $token])";
                 } else {
                     set reason "[http::code $token]";
                 }

                 set errorCode [list $ncode [http::code $token] $meta];

                 http::cleanup $token;
                 http::config -accept "*/*";

                 return -code error -errorcode $errorCode $reason;
             }

             # redirection handling
             #
             if {$ncode >= 300} {
                 set url [uri::resolve $url [dict get [http::meta $token] Location]];

                 http::cleanup $token;

                 continue;
             }

             http::cleanup $token;

             # fetching the HTML data from the url
             #
             if {[catch {set token [http::geturl $url];} reason options] == 1} {
                 http::config -accept "*/*";

                 return -options $options $reason;
             }

             set htmlData [http::data $token];

             http::cleanup $token;
         }

         http::config -accept "*/*";

         # converting the HTML data to the tDOM DOM
         #
         if {[catch {set dom [convertdata $htmlData];} reason options] == 1} {
             return -options $options $reason;
         }

         set titleNode [[$dom documentElement] selectNodes {/html/head/title}];

         if {$titleNode ne "" && [regexp {[[:<:]]404[[:>:]]} [$titleNode text]]} {
             $dom delete;

             return -code error -errorcode [list 404 "HTTP/1.1 404 Not Found" $meta] "HTTP/1.1 404 Not Found"
         }

         return $dom;
     }

     proc convertfile {htmlFileName} {
         if {[catch {set dom [convertdata [tDOM::xmlReadFile $htmlFileName]];} reason options] == 1} {
             return -options $options $reason;
         }

         return $dom;
     }

     proc convertdata {htmlData} {
         try {
             # create a tree to be filled by htmlparse
             #
             set tree [struct::tree];

             # parse the HTML data and remove "unwanted" data
             #
             htmlparse::2tree $htmlData $tree;
             htmlparse::removeVisualFluff $tree;
             htmlparse::removeFormDefs $tree;

             # convert the HTML data tree into a tDOM DOM tree
             #
             set dom         [dom createDocument html];
             set domDoc      [$dom documentElement];
             set treeRoot    [$tree rootname];

             Walk $tree $treeRoot $treeRoot $dom $domDoc;
         } on error {reason options} {
             if {[info exists tree]} {
                 $tree destroy;
             }

             if {[info exists dom]} {
                 $dom delete;
             }

             return -options $options $reason;
         } finally {
             $tree destroy;
         }

         return $dom
     }

     namespace export -clear {[a-z]*};
     namespace ensemble create;
 }

The usage (an example)

 set url         http://www.nirsoft.net/utils/index.html;
 set dom         [html2dom converturl $url];
 set domDoc      [$dom documentElement];
 set baseNode    [$domDoc selectNodes {/html/head/base}];

 if {$baseNode eq ""} {
     set uri     [uri::split $url];
     set baseUrl [uri::join {*}[dict set uri path [file dirname [dict get $uri path]]]]/;
 } else {
     set baseUrl [$baseNode @href]/;
 }

 set utils [dict create];

 foreach sectionNode [$domDoc selectNodes {//table[@class="title2"]}] {
     set abbreviation    [[$sectionNode selectNodes {preceding-sibling::a}] @name];
     set title           [[$sectionNode selectNodes {.//text()[1]}] nodeValue];
     set descriptionNode [$sectionNode selectNodes {../span[@class="special3"]/text()}];

     if {$descriptionNode ne ""} {
         set description [$descriptionNode nodeValue];
     } else {
         set description "";
     }

     set filesTableNode      [$sectionNode selectNodes {../table[@class="filestable"]}];
     set section             [dict create abbr $abbreviation title $title desc $description utils [dict create]];

     puts section=$abbreviation;

     dict update section utils sectionUtils {
         foreach node [$filesTableNode selectNodes {.//node()[@class="filetitle"]/..}] {
             set longDescription [$node text];
             set anchorNode      [$node selectNodes {a[1]}];

             regexp {((?:\w+\s*)*\w+)(?:\s+v(\d+\.\d+))?(?: . (.+))?$} [$anchorNode text] => name version shortDescription;

             set relativeUrl [$anchorNode @href]
             set relativeUri [uri::split $relativeUrl];
             set absoluteUrl [uri::resolve $baseUrl $relativeUrl];
             set abbr        [file rootname [file tail [dict get $relativeUri path]]]

             dict set sectionUtils $abbr [dict create \
                 abbr        $abbr \
                 name        $name \
                 version     $version \
                 shortdesc   $shortDescription \
                 longdesc    $longDescription \
                 relurl      $relativeUrl \
                 absurl      $absoluteUrl \
             ];

             puts \tutil=$abbr
         }
     }

     dict set utils $abbreviation $section;
 }

Category HTMLCategory XML