mime type recognition in pure tcl

CMcC 20040925

I've written a compiler from unix magic(5) files to pure tcl.

The idea is you generate a script which should perform precisely the same function as the unix file command (it gives you information about a file)

This code isn't complete, but handles most common cases - certainly enough to produce useful file recognition. The full unix file command recognises/discriminates thousands of different file types. This'll be useful to generate file type recognition for tclhttpd uploads, for example to stop people uploading viruses.

You'll find that a lot of the magic files out there are buggy, too.

Enjoy!

Update 20040927 - Added an optimiser and generate code from a tree.

A later form of this code was put into Tcllib, under the module fumagic, short for fileutil::magic.

Output from /etc/mime-magic is here mime type discriminator ... about 500 LOC, and you'll need magiclib.tcl from here.


magiccompile.tcl:

    # file to compile the magic file from magic(5) into a tcl program
    package require fileutil
    
    source magiclib.tcl
    source magictree.tcl
    
    namespace eval magic {}
    set magic::debug 0
    
    # parse an individual line
    proc magic::parseline {line {maxlevel 10000}} {
    
        # calculate the line's level
        set unlevel [string trimleft $line >]
        set level [expr [string length $line] - [string length $unlevel]]
        if {$level > $maxlevel} {
            return -code continue "Skip - too high a level"
        }
    
        # regexp parse line into (offset, type, value, command)
        set parse [regexp -expanded -inline {^(\S+)\s+(\S+)\s*((\S|(\B\s))*)\s*(.*)$} $unlevel]
        if {$parse == {}} {
            error "Can't parse: '$unlevel'"
        }
    
        # unpack parsed line
        set value ""
        set command ""
        foreach {junk offset type value junk1 junk2 command} $parse break
    
        # handle trailing spaces
        if {[string index $value end] eq "\\"} {
            append value " "
        }
        if {[string index $command end] eq "\\"} {
            append command " "
        }
    
        if {$value == ""} {
            error "no value"        ;# badly formatted line
        }
    
        variable debug
        if {$debug > 1} {
            puts "level:$level offset:$offset type:$type value:'$value' command:'$command'"
        }
    
        # return the line's fields
        return [list $level $offset $type $value $command]
    }
    
    # process a magic file
    proc magic::process {file {maxlevel 10000}} {
        variable level        ;# level of line
        variable linenum        ;# line number
    
        set level 0
        set script {}
    
        set linenum 0
        ::fileutil::foreachLine line $file {
            incr linenum
            set line [string trim $line " "]
            if {[string index $line 0] eq "#"} {
                continue        ;# skip comments
            } elseif {$line == ""} {
                continue        ;# skip blank lines
            } else {
                # parse line
                if {[catch {parseline $line $maxlevel} parsed]} {
                    continue        ;# skip erroring lines
                }
    
                # got a valid line
                foreach {level offset type value message} $parsed break
    
                # strip comparator out of value field,
                # (they are combined)
                set compare [string index $value 0]
                switch -glob --  $value {
                    [<>]=* {
                        set compare [string range $value 0 1]
                        set value [string range $value 2 end]
                    }
    
                    <* -
                    >* -
                    &* -
                    ^* {
                        set value [string range $value 1 end]
                    }
    
                    =* {
                        set compare "=="
                        set value [string range $value 1 end]
                    }
    
                    !* {
                        set compare "!="
                        set value [string range $value 1 end]
                    }
    
                    x {
                        # this is the 'don't care' match
                        # used for collecting values
                        set value ""
                    }
    
                    default {
                        # the default comparator is equals
                        set compare "=="
                        if {[string match {\\[<!>=]*} $value]} {
                            set value [string range $value 1 end]
                        }
                    }
                }
    
                # process type field
                set qual ""
                switch -glob -- $type {
                    pstring* -
                    string* {
                        # String or Pascal string type
    
                        # extract string match qualifiers
                        set type [split $type /]
                        set qual [lindex $type 1]
                        set type [lindex $type 0]
    
                        # convert pstring to string + qualifier
                        if {$type eq "pstring"} {
                            append qual "p"
                            set type "string"
                        }
    
                        # protect hashes in output script value
                        set value [string map [list "\#" "\\#" \" \\\" \{ \\\{ \} \\\}] $value]
    
                        if {($value eq "\\0") && ($compare eq ">")} {
                            # record 'any string' match
                            set value ""
                            set compare x
                        } elseif {$compare eq "!="} {
                            # string doesn't allow !match
                            set value !$value
                            set compare "=="
                        }
    
                        if {$type ne "string"} {
                            # don't let any odd string types sneak in
                            puts stderr "Reject String: ${file}:$linenum $type - $line"
                            continue
                        }
                    }
    
                    regex {
                        # I am *not* going to handle regex
                        puts stderr "Reject Regex: ${file}:$linenum $type - $line"
                        continue
                    }
    
                    *byte* -
                    *short* -
                    *long* -
                    *date* {
                        # Numeric types
    
                        # extract numeric match &qualifiers
                        set type [split $type &]
                        set qual [lindex $type 1]
    
                        if {$qual ne ""} {
                            # this is an &-qualifier
                            set qual &$qual
                        } else {
                            # extract -qualifier from type
                            set type [split $type -]
                            set qual [lindex $type 1]
                            if {$qual ne ""} {
                                set qual -$qual
                            }
                        }
                        set type [lindex $type 0]
    
                        # perform value adjustments
                        if {$compare ne "x"} {
                            # trim redundant Long value qualifier
                            set value [string trimright $value L]
    
                            if {[catch {set value [expr $value]} x eo]} {
                                # check that value is representable in tcl
                                puts stderr "Reject Value Error: ${file}:$linenum '$value' '$line' - $eo"
                                continue;
                            }
    
                            # coerce numeric value into hex
                            set value [format "0x%x" $value]
                        }
                    }
    
                    default {
                        # this is not a type we can handle
                        puts stderr "Reject Unknown Type: ${file}:$linenum $type - $line"
                        continue
                    }
                }
            }
    
            # collect some summaries
            variable debug
            if {$debug} {
                variable types
                set types($type) $type
                variable quals
                set quals($qual) $qual
            }
    
            #puts "$linenum level:$level offset:$offset type:$type qual:$qual compare:$compare value:'$value' message:'$message'"
    
            # protect hashes in output script message
            set message [string map [list "\#" "\\\#" \" \\\" \} \\\} ( \\( ) \\)] $message]
    
            if {![string match "(*)" $offset]} {
                catch {set offset [expr $offset]}
            }
    
            # record is the complete match command,
            # encoded for tcl code generation
            set record [list $linenum $type $qual $compare $offset $value $message]
            if {$script == {}} {
                # the original script has level 0,
                # regardless of what the script says
                set level 0
            }
            
            if {$level == 0} {
                # add a new 0-level record
                lappend script $record
            } else {
                # find the growing edge of the script
                set depth [lrepeat [expr $level] end]
                while {[catch {
                    # get the insertion point
                    set insertion [lindex $script {*}$depth]
                }]} {
                    # handle scripts which jump levels,
                    # reduce depth to current-depth+1
                    set depth [lreplace $depth end end]
                }
    
                # add the record at the insertion point
                lappend insertion $record
    
                # re-insert the record into its correct position
                lset script {*}$depth $insertion
            }
        }
        #puts "Script: $script"
        return $script
    }
    
    proc magic::install {args} {
        foreach arg $args {
            proc magic::/[file tail $arg] {} [magic::compile $arg]
        }
    }
    
    # compile up magic files or directories of magic files
    proc magic::compile {args} {
        set tcl ""
        set script {}
        foreach arg $args {
            if {[file type $arg] == "directory"} {
                foreach file [glob [file join $arg *]] {
                    set script1 [process $file]
                    lappend script [list file $file] {*}$script1
                    #append tcl "magic::file_start $file" \n
                    #append tcl [run $script1] \n
                }
            } else {
                set file $arg
                set script1 [process $file]
                lappend script [list file $file] {*}$script1
                #append tcl "magic::file_start $file" \n
                #append tcl [run $script1] \n
            }
        }
    
        #puts stderr $script
        puts "\# $args"
        set t [2tree $script]
        set tcl [treegen $t root]
        puts [treedump $t]
        #set tcl [run $script]
    
        return $tcl
    }
    
    proc magic::generate {args} {
        foreach arg $args {
            set out [::open [file tail $arg].tcl w]
    
            puts $out "proc magic::/[file tail $arg] {} \{"
            puts $out [magic::compile $arg]
            puts $out "return {}"
            puts $out \}
    
            ::close $out
        }
    }
    
    #puts [magic::compile {*}$argv]
    #puts stderr "typemap: [array get magic::typemap]"
    magic::generate {*}$argv
    
    #set script [magic::compile {} /usr/share/misc/file/magic]
    #puts "\# types:[array names magic::types]"
    #puts "\# quals:[array names magic::quals]"
    #puts "Script: $script"
    
    return

-----
magiclib.tcl:
    # TODO:
    
    # Required Functionality:
    
    # implement full offset language
    # implement pstring (pascal string, blerk)
    # implement regex form (blerk!)
    # implement string qualifiers
    
    # Optimisations:
    
    # reorder tests according to expected or observed frequency
    # this conflicts with reduction in strength optimisations.
    
    # Rewriting within a  level will require pulling apart the
    # list of tests at that level and reordering them.
    # There is an inconsistency between handling at 0-level and
    # deeper level - this has to be removed or justified.
    
    # Hypothetically, every test at the same level should be
    # mutually exclusive, but this is not given, and should be
    # detected.  If true, this allows reduction in strength to switch
    # on Numeric tests
    
    # reduce Numeric tests at the same level to switches
    #
    # - first pass through clauses at same level to categorise as
    #   variant values over same test (type and offset).
    
    # work out some way to cache String comparisons
    
    # Reduce seek/reads for String comparisons at same level:
    #
    # - first pass through clauses at same level to determine string ranges.
    #
    # - String tests at same level over overlapping ranges can be
    #   written as sub-string comparisons over the maximum range
    #   this saves re-reading the same string from file.
    #
    # - common prefix strings will have to be guarded against, by
    #   sorting string values, then sorting the tests in reverse length order.
    
    namespace eval magic {}
    
    set magic::debug 0
    set magic::optimise 1
    
    # open the file to be scanned
    proc magic::open {file} {
        variable fd
        set fd [::open $file]
        fconfigure $fd -translation binary
    
        # fill the string cache
        # the vast majority of magic strings are in the first 4k of the file.
        variable strbuf
        set strbuf [read $fd 4096]
    
        # clear the fetch cache
        variable cache
        catch {unset cache}
    
        variable result
        set result ""
    
        variable string
        set string ""
    
        variable numeric
        set numeric -9999
    
        return $fd
    }
    
    proc magic::close {file} {
        variable fd
        ::close $fd
    }
    
    # mark the start of a magic file in debugging
    proc magic::file_start {name} {
        variable debug
        if {$debug} {
            puts stderr "File: $name"
        }
    }
    
    # return the emitted result
    proc magic::result {{msg ""}} {
        variable result
        if {$msg != ""} {
            emit $msg
        }
        return -code return $result
    }
    
    # emit a message
    proc magic::emit {msg} {
        variable string
        variable numeric
        set msg [::string map [list \\b "" %s $string %ld $numeric %d $numeric] $msg]
    
        variable result
        append result " " $msg
        set result [string trim $result " "]
    }
    
    # handle complex offsets - TODO
    proc magic::offset {where} {
        variable debug
        #if {$debug} {
            puts stderr "OFFSET: $where"
        #}
        return 0
    }
    
    # fetch and cache a value from the file
    proc magic::fetch {where what scan} {
        variable cache
        variable numeric
    
        if {![info exists cache($where,$what,$scan)]} {
            variable fd
            seek $fd $where
            binary scan [read $fd $what] $scan numeric
            set cache($where,$what,$scan) $numeric
        } else {
            set numeric $cache($where,$what,$scan)
        }
        return $numeric
    }
    
    # maps magic typenames to field characteristics: size, binary scan format
    array set magic::typemap {
        byte {1 c}
        ubyte {1 c}
        short {2 S}
        ushort {2 S}
        beshort {2 S}
        leshort {2 s}
        ubeshort {2 S}
        uleshort {2 s}
        long {4 I}
        belong {4 I}
        lelong {4 i}
        ubelong {4 I}
        ulelong {4 i}
        date {2 S}
        bedate {2 S}
        ledate {2 s}
        ldate {4 I}
        beldate {4 I}
        leldate {4 i}
    }
    
    # generate short form names
    foreach {n v} [array get magic::typemap] {
        foreach {len scan} $v {
            #puts stderr "Adding $scan - [list $len $scan]"
            set magic::typemap($scan) [list $len $scan]
            break
        }
    }
    
    proc magic::Nv {type offset {qual ""}} {
        variable typemap
        variable numeric
    
        # unpack the type characteristics
        foreach {size scan} $typemap($type) break
    
        # fetch the numeric field
        set numeric [fetch $offset $size $scan]
    
        if {$qual != ""} {
            # there's a mask to be applied
            set numeric [expr $numeric $qual]
        }
    
        variable debug
        if {$debug} {
            puts stderr "NV $type $offset $qual: $numeric"
        }
    
        return $numeric
    }
    
    # Numeric - get bytes of $type at $offset and $compare to $val
    # qual might be a mask
    proc magic::N {type offset comp val {qual ""}} {
        variable typemap
        variable numeric
    
        # unpack the type characteristics
        foreach {size scan} $typemap($type) break
    
        # fetch the numeric field
        set numeric [fetch $offset $size $scan]
    
        if {$comp == "x"} {
            # anything matches - don't care
            return 1
        }
    
        # get value in binary form, then back to numeric
        # this avoids problems with sign, as both values are
        # [binary scan]-converted identically
        binary scan [binary format $scan $val] $scan val
    
        if {$qual != ""} {
            # there's a mask to be applied
            set numeric [expr $numeric $qual]
        }
    
        set c [expr $val $comp $numeric]        ;# perform comparison
    
        variable debug
        if {$debug} {
            puts stderr "numeric $type: $val $comp $numeric / $qual - $c"
        }
    
        return $c
    }
    
    proc magic::getString {offset len} {
        # cache the first 1k of the file
        variable string
        set end [expr {$offset + $len - 1}]
        if {$end < 4096} {
            # in the string cache
            variable strbuf
            set string [string range $strbuf $offset $end]
        } else {
            # an unusual one
            variable fd
            seek $fd $offset        ;# move to the offset
            set string [read $fd $len]
        }
        return $string
    }
    
    proc magic::S {offset comp val {qual ""}} {
        variable fd
        variable string
    
        # convert any backslashes
        set val [subst -nocommands -novariables $val]
    
        if {$comp eq "x"} {
            # match anything - don't care, just get the value
            set string ""
    
            seek $fd $offset        ;# move to the offset
            while {([::string length $string] < 100)
                   && [::string is print [set c [read $fd 1]]]} {
                if {[string is space $c]} {
                    break
                }
                append string $c
            }
    
            return 1
        }
    
        # get the string and compare it
        set string [getString $offset [::string length $val]]
        set cmp [::string compare $val $string]
        set c  [expr $cmp $comp 0]
    
        variable debug
        if {$debug} {
            puts "String '$val' $comp '$string' - $c"
            if {$c} {
                puts "offset $offset - $string"
            }
        }
    
        return $c
    }
----
magictree.tcl
    package require struct::list
    package require struct::tree
    
    namespace eval magic {}
    
    proc magic::path {tree} {
        $tree set root path {}
        foreach child [$tree children root] {
            $tree walk $child -type dfs node {
                set path [$tree get [$tree parent $node] path]
                lappend path [$tree index $node]
                $tree set $node path $path
            }
        }
    }
    
    proc magic::tree_el {tree parent file line type qual comp offset val message args} {
        set node [$tree insert $parent end]
        set path [$tree get $parent path]
        lappend path [$tree index $node]
        $tree set $node path $path
    
        # generate a proc call type for the type, Numeric or String
        variable typemap
        switch -glob -- $type {
            *byte* -
            *short* -
            *long* -
            *date* {
                set otype N
                set type [lindex $typemap($type) 1]
            }
            *string {
                set otype S
            }
            default {
                puts stderr "Unknown type: '$type'"
            }
        }
    
        foreach key {line type qual comp offset val message file otype} {
            if {[catch {
                $tree set $node $key [set $key]
            } result eo]} {
                puts "Tree: $eo - $file $line $type"
            }
        }
    
        # now add children
        foreach el $args {
            tree_el $tree $node $file {*}$el
        }
        return $node
    }
    
    proc magic::2tree {script} {
        variable tree;
        set tree [::struct::tree]
    
        $tree set root path ""
        $tree set root otype Root
        $tree set root type root
        $tree set root message "unknown"
    
        # generate a test for each match
        set file "unknown"
        foreach el $script {
            #puts "EL: $el"
            if {[lindex $el 0] eq "file"} {
                set file [lindex $el 1]
            } else {
                append result [tree_el $tree root $file {*}$el]
            }
        }
        optNum $tree root
        #optStr $tree root
        puts stderr "Script contains [llength [$tree children root]] discriminators"
        path $tree
        return $tree
    }
    
    proc magic::isStr {tree node} {
        return [expr {"S" eq [$tree get $node otype]}]
    }
    
    proc magic::sortRegion {r1 r2} {
        set cmp 0
        if {[catch {
            if {[string match (*) $r1] || [string match (*) $r2]} {
                set cmp [string compare $r1 $r2]
            } else {
                set cmp [expr {[lindex $r1 0] - [lindex $r2 0]}]
                if {!$cmp} {
                    set cmp 0
                    set cmp [expr {[lindex $r1 1] - [lindex $r2 1]}]
                }
            }
        } result eo]} {
            set cmp [string compare $r1 $r2]
        }
        return $cmp
    }
    
    proc magic::optStr {tree node} {
        variable regions
        catch {unset regions}
        array set regions {}
        optStr1 $tree $node
        puts stderr "Regions [array statistics regions]"
        foreach region [lsort -index 0 -command magic::sortRegion [array name regions]] {
            puts "$region - $regions($region)"
        }
    }
    
    proc magic::optStr1 {tree node} {
        # traverse each numeric element of this node's children,
        # categorising them
    
        set kids [$tree children $node]
        foreach child $kids {
            optStr1 $tree $child
        }
    
        set strings [$tree children $node filter magic::isStr]
        #puts stderr "optstr: $node: $strings"
    
        variable regions
        foreach el $strings {
            #if {[$tree get $el otype] eq "String"} {
            #    puts "[$tree getall $el] - [string length [$tree get $el val]]"
            #}
    
            if {[$tree get $el comp] eq "x"} {
                continue
            }
    
            set offset [$tree get $el offset]
            set len [string length [$tree get $el val]]
            lappend regions([list $offset $len]) $el
        }
    }
    
    proc magic::isNum {tree node} {
        return [expr {"N" eq [$tree get $node otype]}]
    }
    
    proc magic::switchNSort {tree n1 n2} {
        return [expr {[$tree get $n1 val] - [$tree get $n1 val]}]
    }
    
    proc magic::optNum {tree node} {
        # traverse each numeric element of this node's children,
        # categorising them
    
        set kids [$tree children $node]
        foreach child $kids {
            optNum $tree $child
        }
    
        set numerics [$tree children $node filter magic::isNum]
        #puts stderr "optNum: $node: $numerics"
        if {[llength $numerics] < 2} {
            return
        }
    
        array set offsets {}
        foreach el $numerics {
            if {[$tree get $el comp] ne "=="} {
                continue
            }
            lappend offsets([$tree get $el type],[$tree get $el offset],[$tree get $el qual]) $el
        }
    
        #puts "Offset: stderr [array get offsets]"
        foreach {match nodes} [array get offsets] {
            if {[llength $nodes] < 2} {
                continue
            }
    
            catch {unset matcher}
            foreach n $nodes {
                set nv [expr [$tree get $n val]]
                if {[info exists matcher($nv)]} {
                    puts stderr "Node <[$tree getall $n]> clashes with <[$tree getall $matcher($nv)]>"
                } else {
                    set matcher($nv) $n
                }
            }
    
            foreach {type offset qual} [split $match ,] break
            set switch [$tree insert $node [$tree index [lindex $nodes 0]]]
            $tree set $switch otype Switch
            $tree set $switch message $match
            $tree set $switch offset $offset
            $tree set $switch type $type
            $tree set $switch qual $qual
    
            set nodes [lsort -command [list magic::switchNSort $tree] $nodes]
    
            $tree move $switch end {*}$nodes
            set path [$tree get [$tree parent $switch] path]
            lappend path [$tree index $switch]
            $tree set $switch path $path
        }
    }
    
    proc magic::treedump {tree} {
        set result ""
        $tree walk root -type dfs node {
            set path [$tree get $node path]
            set depth [llength $path]
            append result [string repeat "  " $depth] [list $path] ": " [$tree get $node type]
            if {[$tree keyexists $node offset]} {
                append result ,[$tree get $node offset]
            }
            if {[$tree keyexists $node qual]} {
                set q [$tree get $node qual]
                if {$q ne ""} {
                    append result ,$q
                }
            }
    
            if {[$tree keyexists $node comp]} {
                append result  [$tree get $node comp]
            }
            if {[$tree keyexists $node val]} {
                append result  [$tree get $node val]
            }
    
            if {$depth == 1} {
                set msg [$tree get $node message]
                set n $node
                while {($n != {}) && ($msg == "")} {
                    set n [lindex [$tree children $n] 0]
                    if {$n != {}} {
                        set msg [$tree get $n message]
                    }
                }
                append result " " ( $msg )
                if {[$tree keyexists $node file]} {
                    append result " - " [$tree get $node file]
                }
            }
    
            #append result " <" [$tree getall $node] >
            append result \n
        }
        return $result
    }
    
    proc magic::treegen {tree node} {
        return "[treegen1 $tree $node]\nresult\n"
    }
    
    proc magic::treegen1 {tree node} {
        set result ""
        foreach k {otype type offset comp val qual message} {
            if {[$tree keyexists $node $k]} {
                set $k [$tree get $node $k]
            }
        }
    
        if {$otype eq "N"} {
            set type [list N $type]
        } elseif {$otype eq "S"} {
            set type S
        }
    
        switch $otype {
            N -
            S {
                # this is a complex offset - call the offset interpreter
                if {[string match "(*)" $offset]} {
                    set offset "\[offset $offset\]"
                }
    
                append result "if \{\[$type $offset $comp [list $val] $qual\]\} \{"
    
                if {[$tree isleaf $node]} {
                    if {$message != ""} {
                        append result "emit [list $message]"
                    } else {
                        append result "emit [$tree get $node path]"
                    }
                } else {
                    if {$message != ""} {
                        append result "emit [list $message]\n"
                    }
                    foreach child [$tree children $node] {
                        append result [treegen1 $tree $child]
                    }
                    #append result "\nreturn \$result"
                }
    
                append result "\}\n"
            }
    
            Root {
                foreach child [$tree children $node] {
                    append result [treegen1 $tree $child]
                }
            }
    
            Switch {
                # this is a complex offset - call the offset interpreter
                if {[string match "(*)" $offset]} {
                    set offset "\[offset $offset\]"
                }
    
                append result "switch -- \[Nv $type $offset $qual\] "
    
                variable typemap
                set scan [lindex $typemap($type) 1]
                foreach child [$tree children $node] {
                    binary scan [binary format $scan [$tree get $child val]] $scan val
                    append result "$val \{"
    
                    if {[$tree isleaf $child]} {
                        append result "emit [list [$tree get $child message]]"
                    } else {
                        append result "emit [list [$tree get $child message]]\n"
                        foreach grandchild [$tree children $child] {
                            append result [treegen1 $tree $grandchild]
                        }
                    }
                    append result "\} "
                }
                append result "\n"
            }
        }
        return $result
    }
    
    return

Whew ... let us never speak of file discrimination programs again :)