imgsizer

Keith Vetter 2007-06-06 : I came across an html utility written by Eric Raymond called imgsizer. It edits web pages inserting missing WIDTH and HEIGHT attributes to image tags [L1 ]. By having these tags, browsers can load pages faster exploiting multithreading image loading; without them, browsers must do sequential image loading.

I wanted to use Eric's tool but it's written in Python. So I just decided to rewrite it in tcl.

I first tried using TDOM to parse web pages, but it was too brittle and couldn't handle malformed html. This version uses regular expressions to find the image tags to update.

If you have TclMagick, then this utility will run in pure tcl. Otherwise, it uses Img for figuring out image size which means that Tk also gets loaded. (Using tcllib's jpeg, gif and png library is left as an exercise for the reader.)


 ##+##########################################################################
 #
 # imgsizer.tsh -- splices in WIDTH and HEIGHT parameters for HTML IMG tags
 # This allows browsers to load pages faster by multi-threading the image
 # loading rather than strict sequential loading.
 #
 # by Keith Vetter, May 2007
 # inspired by Eric Raymond's version: http://www.catb.org/~esr/software.html 
 
 package require http
 package require cmdline
 set OPT(haveMagick) [expr {! [catch {package require TclMagick}]}]
 if {! $OPT(haveMagick)} {package require Img}
 catch {wm withdraw .}
 
 array set OPT {version 0.1 root . keepOld 0 noOverwrite 0 verbose 1}
 
 ##+##########################################################################
 #
 # ImageSizer -- fixes up WIDTH and HEIGHT parameters for IMG tag in one file
 #  uses regexp to find img tags (TDOM was too fragile)
 #
 proc ImageSizer {fname} {
    INFO 1 "$fname\n"
    set fin [open $fname r]
    set data [read $fin]; list
    close $fin
 
    set result {}
    set last 0
    set ::CHANGED 0
    foreach idx [regexp -inline -all -indices {<img.*?>} $data] {
        foreach {start end} $idx break
        append result [string range $data $last [expr {$start-1}]]
        set imgTag [string range $data $start $end]
        append result [HandleImgTag $imgTag]
        set last [expr {$end+1}]
    }
    append result [string range $data $last end]
    SaveResult $result $fname
 }
 ##+##########################################################################
 #
 # HandleImgTag -- fixes up one image tag
 #
 proc HandleImgTag {imgTag} {
    GetAllAttributes $imgTag
 
    set src [GetAttribute src "<none>"]
    if {$src eq "<none>"} {
        WARN "Image tag without src"
        return $imgTag
    }
    INFO 2 "    $src"
 
    if {[file pathtype $src] ne "relative"} {
        set src [file join $::OPT(root) ".$src"]
    }
 
    set w [GetAttribute width "?"]
    set h [GetAttribute height "?"]
    if {[string first "%" $w] != -1 ||
        [string first "%" $h] != -1} {
        INFO 2 " -- skipping: %\n"
        return $imgTag
    }
    if {$::OPT(noOverwrite) && [string is integer -strict $w] &&
        [string is integer -strict $h]} {
        INFO 2 " -- skipping: noOverwrite\n"
        return $imgTag
    }
 
    if {[catch {foreach {w2 h2} [GetImageSize $src] break}]} {
        INFO 2 "\n"
        WARN "ERROR: cannot read image dimensions for '$src'"
        return $imgTag
    }
 
    if {$w ne $w2 || $h ne $h2} {incr ::CHANGED}
 
    set ::ATTR(width) $w2
    set ::ATTR(height) $h2
    INFO 2 "   ($w,$h) => ($w2,$h2)\n"
 
    return [RebuildImgTag]
 }
 ##+##########################################################################
 #
 # RebuildImgTag -- returns new image tag with all its attributes
 #
 proc RebuildImgTag {} {
 
    array set attr [array get ::ATTR]
    set all [concat src width height [array names attr]]
 
    set html "<img"
    foreach arr $all {
        if {! [info exists attr($arr)]} continue
 
        set value $attr($arr)
        set delim "\""
        if {[string is integer -strict $value]} {
            set delim ""
        } elseif {[string first "\"" $value] != -1} {
            set delim "'"
        }
        append html " $arr=$delim$value$delim"
        unset attr($arr)
    }
    append html "/>"
    return $html
 }
 ##+##########################################################################
 #
 # GetAttribute -- returns attribute value, using default if not found
 #
 proc GetAttribute {which default} {
    set which [string tolower $which]
    if {[info exists ::ATTR($which)]} {
        return $::ATTR($which)
    }
    return $default
 }
 ##+##########################################################################
 #
 # GetAllAttributes -- extracts all attributes for a given tag into global ATTR
 #
 proc GetAllAttributes {thisTag} {
    set last 2
    unset -nocomplain ::ATTR
    while {1} {
        set next [regexp -inline -nocase -indices -start $last {\s.*?=.} $thisTag]
        if {$next eq {}} break
 
        foreach {start end} [lindex $next 0] break
        set name [string range $thisTag [expr {$start+1}] [expr {$end-2}]]
        set name [string tolower $name]
 
        set delim [string index $thisTag $end]
        if {$delim eq "'" || $delim eq "\""} {
            set start2 [expr {$end+1}]
            set end2 [string first $delim $thisTag $start2]
        } else {
            set start2 $end
            set idx [regexp -indices -inline -start $start2 {\s|>} $thisTag]
            set end2 [lindex $idx 0 1]
        }
        set value [string range $thisTag $start2 [expr {$end2 - 1}]]
        set last $end2
 
        set ::ATTR($name) $value
    }
 }
 ##+##########################################################################
 #
 # GetImageSize -- returns the size of an image
 #
 proc GetImageSize {iname} {
    if {[info exists ::CACHE($iname)]} {        ;# Cache to avoid downloads
        return $::CACHE($iname)
    }
 
    if {[regexp {(?i)^http:} $iname]} {         ;# Is iname really a URL???
        set ::CACHE($iname) [GetWebImageSize $iname]
        return $::CACHE($iname)
    }
 
    if {$::OPT(haveMagick)} {
        set wand [magick create wand]
        $wand ReadImage $iname
        set w [$wand width]
        set h [$wand height]
    } else {
        set img [image create photo -file $iname]
        set w [image width $img]
        set h [image height $img]
        image delete $img
    }
 
    set ::CACHE($iname) [list $w $h]
    return [list $w $h]
 }
 ##+##########################################################################
 #
 # GetWebImageSize -- returns the size of an image after first downloading
 # from the web
 #
 proc GetWebImageSize {url} {
    set token [::http::geturl $url]
    ::http::wait $token
    set idata [::http::data $token] ; list
    ::http::cleanup $token
 
    if {$::OPT(haveMagick)} {
        set wand [magick create wand]
        $wand ReadImageBlob $idata
        set w [$wand width]
        set h [$wand height]
    } else {
        set img [image create photo -data $idata]
        set w [image width $img]
        set h [image height $img]
        image delete $img
    }
 
    return [list $w $h]
 }
 ##+##########################################################################
 #
 # SaveResult -- safely saves our result while safely moving files around.
 #
 proc SaveResult {html fname} {
    if {$::CHANGED == 0} {
        INFO 2 "    no change\n"
        return
    }
    set tempname [GetTempName $fname ".tmp"]
    set backname [GetTempName $fname ".bak"]
 
    set fout [open $tempname w]
    puts -nonewline $fout $html
    close $fout
 
    file rename $fname $backname
    file rename $tempname $fname
    if {! $::OPT(keepOld)} {
        file delete $backname
    }
 }
 ##+##########################################################################
 #
 # GetTempName -- returns an unused filename based on a given basename and
 # extension. Not bullet-proof, race condition exists but good enough for now.
 #
 proc GetTempName {base extension} {
    set fname "$base$extension"
    if {! [file exists $fname]} { return $fname }
 
    for {set i 1} {$i < 1000} {incr i} {
        set fname [format "%s%s%03d" $base $extension $i]
        if {! [file exists $fname]} { return $fname }
    }
 
    for {set i 1} {$i < 1000} {incr i} {
        set rand [expr {int(rand()*0x7FFFffff)}]
        set fname "$base$extension$rand"
        if {! [file exists $fname]} { return $fname }
    }
    error "Could not create tempfile '$fname' '$extension'"
 }
 proc ParseArgs {} {
    global argc argv OPT
 
    for {set i 0} {$i < $argc} {incr i} {
        set arg [lindex $argv $i]
        switch -regexp -- $arg {
            ^--document-root$ -
            ^-d$ { set OPT(root) [lindex $argv [incr i]]}
            ^-d  { set OPT(root) [string range $arg 2 end]}
            ^--no-overwrite$ -
            ^-n$ { set OPT(noOverwrite) 1 }
            ^--keep-original$ -
            ^-k$ { set OPT(keepOld) 1 }
            ^-q$ { set OPT(verbose) 0 }
            ^-v$ { incr OPT(verbose) }
            ^-V$ { DoHelp version }
            ^-h$ - ^-?$ - ^--help DoHelp
 
            ^--$ { incr i; break }
            ^- { WARN "unknown option: \"$arg\""; DoHelp usage }
            default { break }
        }
    }
    set argc [expr {$argc - $i}]
    if {$argc <= 0 && ! $::tcl_interactive} { DoHelp usage }
    set argv [lrange $argv $i end]
 }
 proc WARN {msg} { puts stderr $msg}
 proc INFO {lvl msg} {if {$lvl <= $::OPT(verbose)} {puts -nonewline $msg }}
 proc DoHelp {{what help}} {
    if {$what eq "version"} {
        puts "imgsizer version $::OPT(version)"
        exit
    }
 
    set txt "imgsizer ?-d documentRoot? ?-n? ?-k? html-files"
    if {$what eq "usage"} {
        puts "usage: $txt"
        exit
    }
 
    append txt "\n\n"
    append txt "The imgsizer script automatically inserts WIDTH and HEIGHT parameters\n"
    append txt "for IMG tags for HTML files. These parameters enable browsers to show\n"
    append txt "pages faster by multi-threading the image loading rather than strict\n"
    append txt "sequential loading.\n\n"
 
    append txt "This script will insert missing WIDTH and HEIGHT parameters and\n"
    append txt "correct existing parameters unless they contain a percent sign (%),\n"
    append txt "or if you gave the '-n' or '--no-overwrite' switch.\n\n"
 
    append txt "Options:\n"
    append txt "  -d, --document-root   Directory for absolute image filenames\n"
    append txt "                        (i.e, ones which contain a leading '/')\n"
    append txt "  -n, --no-overwrite    Do not overwrite existing image size info\n"
    append txt "  -k, --keep-original   Keep original source files with .bak extension\n"
    append txt "  -V                    Show version information\n"
    append txt "  -v                    Verbose output\n"
    append txt "  -q                    Quiet output\n"
 
    puts $txt
    exit
 }
 
 ################################################################
 ParseArgs
 if {$tcl_interactive} return                    ;# For debugging
 
 foreach arg $argv {
    if {$tcl_platform(platform) ne "windows"} {
        ImageSizer $arg
        continue
    }
    foreach fname [glob -nocomplain -- $arg] {  ;# Manual glob'ing
        ImageSizer $fname
    }
 }
 
 exit                                            ;# Img package loads Tk
 return