Pretty Html

Keith Vetter 2019-03-01 : A couple of recent weekend projects involved scraping and parsing web pages, e.g. ao3. To help reverse engineer the document format, I wanted a tool that would nicely format the HTML with proper indentation. Initially I just used a bunch of Emacs commands, but that got old fast. I tried using tdom, but it doesn't handle invalid HTML. I next tried html-tidy, but I found it too verbose about HTML errors and too intrusive in its attempts to fix the bad HTML.

So I decided to write my own HTML formatter. A long time ago (15 years!) I wrote Parsing XML, so I decided to adapt that in creating an HTML formatter.

One quirk in this code: for my needs, I don't want any JavaScript so I delete all <script> tags.

##+##########################################################################
#
# pretty_html.tsh -- reformats HTML in a prettier format adding newlines and indentation
# by Keith Vetter 2019-02-26
#
namespace eval ::PrettyHtml {
    # Parser state: the document currently being scanned and the offset
    # of the next character to look at
    variable HTML ""
    variable loc 0

    # Text emitted once per nesting level when a tag starts a new line
    variable INDENT "  "

    # Tags that never get a line of their own.
    # NB. "br" should be in INLINE_ELEMENTS but I prefer to treat it like a block element
    variable INLINE_ELEMENTS {
        a abbr acronym b bdo big button cite code dfn
        em i img input kbd label map object output q
        samp script select small span strong sub sup textarea time
        tt
    }

    # Tags that have no closing tag and so never add a nesting level
    variable VOID_ELEMENTS {
        area base br col command embed hr img
        input keygen link meta param source track wbr
    }

    # Opening tags stay on the current line only for inline elements...
    variable START_ON_SAME_LINE $INLINE_ELEMENTS

    # ...while closing tags also stay on the line for these short block elements
    variable END_ON_SAME_LINE $INLINE_ELEMENTS
    lappend END_ON_SAME_LINE p td th h1 h2 h3 h4 li title option

    # Elements removed from the input, contents included, before formatting
    variable TAGS_TO_DELETE {script}
}

proc ::PrettyHtml::Pretty {html} {
    # Reformats HTML so that tags sit on their own lines, indented by nesting depth.
    #   html:     raw HTML text; it does not have to be well formed
    #   returns:  the reformatted HTML with leading/trailing whitespace trimmed
    #
    # Tag names are lowercased in the output. Nesting depth is simply the number
    # of start tags seen minus the number of end tags seen: an end tag always
    # pops the innermost open element, whether or not the names match.

    variable VOID_ELEMENTS
    variable INDENT

    ::PrettyHtml::_InitParser $html

    set pretty ""
    set openElementsStack {}
    set type ""
    set val ""
    set etype ""
    set indent ""

    while {1} {
        # Remember the previous token; a few formatting decisions depend on it
        lassign [list $type $val $etype $indent] lastType lastTag lastEType lastIndent

        lassign [::PrettyHtml::_NextToken] type val attr etype
        # puts "DEBUG: type: '$type' attr: '$attr' etype: '$etype' val: '$val'"

        if {$type eq "EOF"} break
        if {$type in {PI DOCTYPE CDATA}} {
            # These are copied through as-is, with no newline or indentation added
            set tag [::PrettyHtml::MakeTag $type $val $attr $etype]
            append pretty $tag
            continue
        }
        if {$type eq "TXT"} {
            # TODO: string trimleft if lastTag is block type
            # Text following a <br> starts on a fresh line, so line it up one
            # level deeper than the <br> itself
            if {$lastType eq "TAG" && $lastTag eq "br"} {
                set val [string cat $lastIndent $INDENT [string trimleft $val]]
            }
            append pretty $val
            continue
        }

        # ASSERT $type eq "TAG"
        set tagName [string tolower $val]
        # Keep the normalized name: on the next pass it becomes lastTag, which is
        # compared against lowercase names ("br", the end tag's tagName), so
        # <BR> and <DIV></DIV> must behave like their lowercase forms
        set val $tagName

        if {$tagName in $VOID_ELEMENTS} { set etype "VOID" }
        # Depth is sampled before pushing, so a start tag is indented at its
        # parent's depth and its children one level deeper
        set indentLevel [llength $openElementsStack]

        if {$etype eq "START"} {
            lappend openElementsStack $tagName
        } elseif {$etype eq "END"} {
            set openElementsStack [lrange $openElementsStack 0 end-1]
            incr indentLevel -1
        }
        set tag [::PrettyHtml::MakeTag $type $tagName $attr $etype]
        set indent [::PrettyHtml::GetIndent $indentLevel $tagName $etype]

        # Don't indent if we're just closing a tag we just opened
        if {$etype eq "END" && $lastType eq "TAG" && $lastEType eq "START"
            && $tagName eq $lastTag} {
            set indent ""
        }
        append pretty $indent $tag
    }
    set pretty [string trim $pretty]
    return $pretty
}

proc ::PrettyHtml::_InitParser {htmlData} {
    # Loads a document into the parser state and rewinds the scan position.
    #   htmlData:  raw HTML text
    #
    # Before storing the text in HTML, this strips surrounding whitespace, all
    # <!-- ... --> comments, and every element named in TAGS_TO_DELETE together
    # with its contents.

    variable HTML
    variable loc
    variable TAGS_TO_DELETE

    set HTML [string trim $htmlData]
    regsub -all {<!--.*?-->} $HTML {} HTML          ;# Remove all comments
    foreach tag $TAGS_TO_DELETE {
        # \M (end of word) keeps "script" from also matching e.g. <scripture>;
        # \s* tolerates whitespace inside the closing tag, as in "</script >".
        # HTML tag names are case-insensitive, hence -nocase.
        set re "<${tag}\\M.*?</${tag}\\s*>"
        regsub -all -nocase $re $HTML {} HTML
    }
    set loc 0
}

proc ::PrettyHtml::_NextToken {{peek 0}} {
    # To be called repeatedly until EOF, each time returning the next element of the HTML file
    #   peek:        if true, return the token without advancing the scan position
    # Each element consists of list of: type value attributes etype
    #   type:        TAG|TXT|EOF|PI|DOCTYPE|CDATA
    #   val:         tagname or PI or text
    #   attributes:  raw attributes of the tag
    #   etype:       START|END|VOID
    # TXT and CDATA tokens carry only type and value; EOF carries only the type.
    #
    # NB. parsing html with REGEX is prone to errors--you can construct pathological
    # data such as putting tags inside CDATA fields. But in reality, this works ok.

    variable HTML
    variable loc

    # One match yields both the text in front of the next <...> (if any) and
    # the pieces of the bracketed item: leading slash, contents, trailing slash
    set found [regexp -start $loc -indices {(.*?)\s*?<(/?)(.*?)(/?)>} \
                   $HTML all txt stok tok etok]
    if {! $found} {
        # No more tags. Whatever is left is trailing text; hand it back once
        # instead of silently dropping it.
        set rest [string trimright [string range $HTML $loc end]]
        if {$rest eq ""} {return [list EOF]}
        if {! $peek} {set loc [string length $HTML]}
        return [list TXT $rest]
    }

    lassign $all  all0  all1
    lassign $txt  txt0  txt1
    lassign $stok stok0 stok1
    lassign $tok  tok0  tok1
    lassign $etok etok0 etok1

    # An empty submatch is reported with end < start, so this tests for non-empty
    if {$txt1 >= $txt0} {                       ;# Got text
        set txt [string range $HTML $txt0 $txt1]
        # Only step over the text; the tag behind it is picked up by the next call
        if {! $peek} {set loc [expr {$txt1 + 1}]}
        return [list TXT $txt]
    }

    set token [string range $HTML $tok0 $tok1]   ;# Got something in brackets

    # CDATA is special, it closes with ']]'
    if {[string range $token 0 7] eq "!\[CDATA\["} {
        if {[regexp -start $loc -indices {!\[CDATA\[(.*?)\]\]>} $HTML cdAll cdBody]} {
            set cdata [string range $HTML {*}$cdBody]
            set next [expr {[lindex $cdAll 1] + 1}]
        } else {
            # Unterminated CDATA: settle for everything up to the first '>'
            # rather than failing on the missing ']]>'
            set cdata [string range $HTML [expr {$tok0 + 8}] [expr {$all1 - 1}]]
            set next [expr {$all1 + 1}]
        }
        if {! $peek} {set loc $next}
        return [list CDATA $cdata]
    }
    if {! $peek} {set loc [expr {$all1 + 1}]}

    # Check for Processing Instruction <?...?> or a <!...> declaration.
    # The two are mutually exclusive, so the contents of a PI are never
    # re-examined for a leading '!'.
    set type TAG
    if {[regexp {^\?(.*)\?$} $token => token]} {
        set type PI
    } elseif {[regexp {^!(.*)$} $token => token]} {
        set type DOCTYPE
    }

    # Split "name attributes" at the first run of whitespace
    set attr ""
    regexp {^(.*?)\s+(.*?)$} $token => token attr

    set etype START                             ;# Entity type
    if {$etok0 <= $etok1} {
        # Trailing slash: self-closing tag
        if {$stok0 <= $stok1} { set token "/$token"} ;# Bad HTML
        set etype VOID
    } elseif {$stok0 <= $stok1} {
        # Leading slash only: closing tag
        set etype END
    }
    return [list $type $token $attr $etype]
}

proc ::PrettyHtml::GetIndent {indentLevel tag etype} {
    # Works out what to emit in front of a tag.
    #   indentLevel:  nesting depth of the tag
    #   tag:          tag name
    #   etype:        START|END|VOID
    #   returns:      "" if the tag stays on the current line, otherwise a newline
    #                 followed by indentLevel copies of INDENT
    variable START_ON_SAME_LINE
    variable END_ON_SAME_LINE
    variable INDENT

    # Closing tags and everything else are checked against different lists
    if {$etype eq "END"} {
        set sameLineTags $END_ON_SAME_LINE
    } else {
        set sameLineTags $START_ON_SAME_LINE
    }
    if {$tag in $sameLineTags} {
        return ""
    }
    return "\n[string repeat $INDENT $indentLevel]"
}
proc ::PrettyHtml::MakeTag {type tagName attr etype} {
    # Rebuilds the source text of one token.
    #   type:     TAG|PI|DOCTYPE|CDATA
    #   tagName:  tag name, PI target or CDATA contents
    #   attr:     raw attribute text, may be empty
    #   etype:    START|END|VOID
    #   returns:  <?name attr?> for PI, <![CDATA[name]]> for CDATA,
    #             <!name attr> for DOCTYPE, otherwise <name attr>, </name attr>
    #             or <name attr/> depending on etype

    # Attributes, when present, are separated from the name by one space
    set attrPart ""
    if {$attr ne ""} { append attrPart " " $attr }

    switch -exact -- $type {
        PI    { return "<?${tagName}${attrPart}?>" }
        CDATA { return "<!\[CDATA\[${tagName}\]\]>" }
    }

    # DOCTYPE wins over the END marker when choosing the opening delimiter
    if {$type eq "DOCTYPE"} {
        set opener "<!"
    } elseif {$etype eq "END"} {
        set opener "</"
    } else {
        set opener "<"
    }
    if {$etype eq "VOID"} {
        set closer "/>"
    } else {
        set closer ">"
    }
    return "${opener}${tagName}${attrPart}${closer}"
}
################################################################
#
# command line interface
#
# Exactly one argument is expected: the HTML file to reformat
if {[llength $argv] != 1} {
    puts stderr "usage: pretty_html foo.html"
    exit 1
}

# Slurp the whole file, then release the channel before formatting
lassign $argv fname
set fin [open $fname r]
set html [read $fin]
close $fin
set html [string trim $html]

# The formatted document goes to stdout
set pretty [::PrettyHtml::Pretty $html]
puts $pretty

return