Keith Vetter 2019-03-01 : A couple of recent weekend projects involved scraping and parsing web pages, e.g. ao3. To help reverse engineer the document format, I wanted a tool that would nicely format the HTML with proper indentation. Initially I just used a bunch of Emacs commands, but that got old fast. I tried using tdom, but it doesn't handle invalid HTML. I next tried html-tidy, but I found it too verbose about HTML errors and too intrusive in its attempts to fix the bad HTML.
So I decided to write my own HTML formatter. A long time ago (15 years!) I wrote Parsing XML, so I decided to adapt that in creating an HTML formatter.
One quirk in this code: for my needs, I don't want any JavaScript, so I delete all <script> elements (the tags and everything between them).
##+##########################################################################
#
# pretty_html.tsh -- reformats HTML in a prettier format adding newlines and indentation
# by Keith Vetter 2019-02-26
#

namespace eval ::PrettyHtml {
    # Parser state: the (pre-cleaned) document and the current scan offset into it.
    variable HTML ""
    variable loc 0

    # String emitted once per nesting level when a tag starts on its own line.
    variable INDENT " "

    # NB. "br" should be in INLINE_ELEMENTS but I prefer to treat it like a block element
    variable INLINE_ELEMENTS {
        a abbr acronym b bdo big button cite code dfn em i img input kbd label map
        object output q samp script select small span strong sub sup textarea time tt}

    # Elements that never have a closing tag; always emitted as "<name ... />".
    variable VOID_ELEMENTS {
        area base br col command embed hr img input keygen link meta param source
        track wbr}

    # Tags whose opening tag (resp. closing tag) is NOT moved to a new line.
    variable START_ON_SAME_LINE $INLINE_ELEMENTS
    variable END_ON_SAME_LINE [concat $INLINE_ELEMENTS p td th h1 h2 h3 h4 li title option]

    # Elements removed entirely (tags and content) before formatting.
    variable TAGS_TO_DELETE {script}
}

proc ::PrettyHtml::Pretty {html} {
    # Returns $html reformatted with one tag per line (except for inline
    # elements) and indentation proportional to the nesting depth.
    #
    # html: the raw HTML text; comments and TAGS_TO_DELETE elements are dropped.
    # Returns the formatted HTML with leading/trailing whitespace trimmed.
    variable VOID_ELEMENTS
    variable INDENT

    ::PrettyHtml::_InitParser $html
    set pretty ""
    set openElementsStack {}

    # val/etype/indent of the previous token are remembered as lastTag/lastEType/lastIndent
    set val ""
    set etype ""
    set indent ""
    while {1} {
        lassign [list $val $etype $indent] lastTag lastEType lastIndent
        lassign [::PrettyHtml::_NextToken] type val attr etype
        # puts "DEBUG: type: '$type' attr: '$attr' etype: '$etype' val: '$val'"
        if {$type eq "EOF"} break

        if {$type in {PI DOCTYPE CDATA}} {
            append pretty [::PrettyHtml::MakeTag $type $val $attr $etype]
            continue
        }
        if {$type eq "TXT"} {
            # TODO: string trimleft if lastTag is block type
            if {$lastTag eq "br"} {
                # Text following a <br> goes on its own line, one level deeper than the <br>
                set val [string cat $lastIndent $INDENT [string trimleft $val]]
            }
            append pretty $val
            continue
        }

        # ASSERT $type eq "TAG"
        set tagName [string tolower $val]
        # Remember the normalized name so the lastTag comparisons on the next
        # iteration ("br" check, open/close pairing) are case-insensitive too.
        set val $tagName
        if {$tagName in $VOID_ELEMENTS} {
            set etype "VOID"
        }

        set indentLevel [llength $openElementsStack]
        if {$etype eq "START"} {
            lappend openElementsStack $tagName
        } elseif {$etype eq "END"} {
            # Closing tags are not matched against the stack; we just pop one level
            set openElementsStack [lrange $openElementsStack 0 end-1]
            incr indentLevel -1
        }

        set tag [::PrettyHtml::MakeTag $type $tagName $attr $etype]
        set indent [::PrettyHtml::GetIndent $indentLevel $tagName $etype]

        # Don't indent if we're just closing a tag we just opened
        if {$etype eq "END" && $lastEType eq "START" && $tagName eq $lastTag} {
            set indent ""
        }
        append pretty $indent $tag
    }
    return [string trim $pretty]
}

proc ::PrettyHtml::_InitParser {htmlData} {
    # Stores the document to be tokenized and resets the scan position.
    # Comments and every element listed in TAGS_TO_DELETE are stripped first.
    variable HTML
    variable loc
    variable TAGS_TO_DELETE

    set HTML [string trim $htmlData]
    regsub -all {<!--.*?-->} $HTML {} HTML ;# Remove all comments
    foreach tag $TAGS_TO_DELETE {
        # \M (end of word) keeps "<script" from matching longer tag names;
        # -nocase so <SCRIPT>...</Script> is removed as well.
        set re "<${tag}\\M.*?</${tag}\\s*>"
        regsub -all -nocase -- $re $HTML {} HTML
    }
    set loc 0
}

proc ::PrettyHtml::_NextToken {{peek 0}} {
    # To be called repeatedly until EOF, each time returning the next element of the HTML file
    # Each element consists of list of: type value attributes etype
    #   type: TAG|TXT|EOF|PI|DOCTYPE|CDATA
    #   val: tagname or PI or text
    #   attributes: raw attributes of the tag
    #   etype: START|END|VOID
    #
    # peek: when true the scan position is not advanced, so the same token
    #       is returned again by the next call.
    #
    # NB. parsing html with REGEX is prone to errors--you can construct pathological
    # data such as putting tags inside CDATA fields. But in reality, this works ok.
    variable HTML
    variable loc

    set n [regexp -start $loc -indices {(.*?)\s*?<(/?)(.*?)(/?)>} \
               $HTML all txt stok tok etok]
    if {! $n} {
        # No more tags: whatever non-blank text remains is the final TXT token
        set rest [string trimright [string range $HTML $loc end]]
        if {$rest eq ""} {return [list EOF]}
        if {! $peek} {set loc [string length $HTML]}
        return [list TXT $rest]
    }

    # An empty submatch is reported by -indices as {start start-1}
    lassign $all all0 all1
    lassign $txt txt0 txt1
    lassign $stok stok0 stok1
    lassign $tok tok0 tok1
    lassign $etok etok0 etok1

    if {$txt1 >= $txt0} {                         ;# Got text
        set txt [string range $HTML $txt0 $txt1]
        if {! $peek} {set loc [expr {$txt1 + 1}]}
        return [list TXT $txt]
    }

    set token [string range $HTML $tok0 $tok1]    ;# Got something in brackets

    # CDATA is special, it closes with ']]'
    if {[string range $token 0 7] eq "!\[CDATA\["} {
        if {[regexp -start $loc -indices {!\[CDATA\[(.*?)\]\]>} $HTML cdataAll cdata]} {
            set cdata [string range $HTML {*}$cdata]
            if {! $peek} {set loc [expr {[lindex $cdataAll 1] + 1}]}
            return [list CDATA $cdata]
        }
        # Unterminated CDATA section: fall through and emit it like any
        # other "<!...>" declaration instead of failing.
    }

    if {! $peek} {set loc [expr {$all1 + 1}]}

    # Check for Processing Instruction <?...?>
    set type TAG
    if {[regexp {^\?(.*)\?$} $token => token]} {
        set type PI
    }
    if {[regexp {^!(.*)$} $token => token]} {
        set type DOCTYPE
    }

    # Split "name attributes" at the first run of whitespace
    set attr ""
    regexp {^(.*?)\s+(.*?)$} $token => token attr

    set etype START                               ;# Entity type
    if {$etok0 <= $etok1} {
        if {$stok0 <= $stok1} { set token "/$token"} ;# Bad HTML
        set etype VOID
    } elseif {$stok0 <= $stok1} {
        set etype END
    }
    return [list $type $token $attr $etype]
}

proc ::PrettyHtml::GetIndent {indentLevel tag etype} {
    # Helper function to return how a tag should be indented (if at all)
    #
    # indentLevel: nesting depth of the tag
    # tag:         lowercase tag name
    # etype:       START|END|VOID
    # Returns "" when the tag stays on the current line, otherwise a newline
    # followed by indentLevel copies of INDENT.
    variable START_ON_SAME_LINE
    variable END_ON_SAME_LINE
    variable INDENT

    if {$etype ne "END" && $tag in $START_ON_SAME_LINE} { return "" }
    if {$etype eq "END" && $tag in $END_ON_SAME_LINE} { return "" }

    return [string cat "\n" [string repeat $INDENT $indentLevel]]
}

proc ::PrettyHtml::MakeTag {type tagName attr etype} {
    # Helper function to create the correct tag text, handling processing
    # instructions, CDATA sections, DOCTYPE declarations and regular tags.
    #
    # type:    TAG|PI|DOCTYPE|CDATA
    # tagName: tag name (or the CDATA content when type is CDATA)
    # attr:    raw attribute text, possibly empty
    # etype:   START|END|VOID
    if {$attr ne ""} {
        set attr " $attr"
    }
    if {$type eq "PI"} {
        set tag [string cat "<?" $tagName $attr "?>"]
    } elseif {$type eq "CDATA"} {
        set tag "<!\[CDATA\[$tagName\]\]>"
    } else {
        set prefix [expr {$etype eq "END" ? "</" : "<"}]
        if {$type eq "DOCTYPE"} {
            set prefix "<!"
        }
        set suffix [expr {$etype eq "VOID" ? "/>" : ">"}]
        set tag [string cat $prefix $tagName $attr $suffix]
    }
    return $tag
}

################################################################
#
# command line interface
#
# Only runs when this file is the script being executed, so that the file
# can also be sourced as a library without reading argv or calling exit.
#
if {[info exists ::argv0] && [file normalize $::argv0] eq [file normalize [info script]]} {
    if {[llength $::argv] != 1} {
        puts stderr "usage: pretty_html foo.html"
        exit 1
    }
    set fname [lindex $::argv 0]
    set fin [open $fname r]
    try {
        set html [string trim [read $fin]]
    } finally {
        close $fin
    }
    set pretty [::PrettyHtml::Pretty $html]
    puts $pretty
}
return