Version 0 of mime type recognition in pure tcl

Updated 2004-09-25 08:45:38 by CMCc

CMcC 20040925

I've written a compiler from unix magic(5) files to pure tcl.

The idea is you generate a script which should perform precisely the same function as the unix file command (it gives you information about a file)

This code isn't complete, but handles most common cases. You'll find that a lot of the magic files out there are buggy, too.


magiccompile.tcl:

    # file to parse the magic file from magic(5) into a tcl program
    package require fileutil

    source magiclib.tcl

    namespace eval magic {}

    # parse an individual line
    proc magic::parseline {line {maxlevel 10000}} {
        set unlevel [string trimleft $line >]
        set level [expr [string length $line] - [string length $unlevel]]

        if {$level > $maxlevel} {
            return -code continue "Skip - too high a level"
        }
        set parse [regexp -expanded -inline {^(\S+)\s+(\S+)\s*((\S|(\B\s))*)\s*(.*)$} $unlevel]
        if {$parse == {}} {
            error "Can't parse: '$unlevel'"
        }
        set value ""
        set command ""
        foreach {junk offset type value junk1 junk2 command} $parse break
        if {[string index $value end] eq "\\"} {
            append value " "
        }
        if {[string index $command end] eq "\\"} {
            append command " "
        }

        #set value [subst -nocommands -novariables $value]
        #set command [subst -nocommands -novariables $command]
        if {$value == ""} {
            error "no value"
        }

        #puts "level:$level offset:$offset type:$type value:'$value' command:'$command'"
        return [list $level $offset $type $value $command]
    }

    # process a magic file
    proc magic::process {file {maxlevel 10000}} {
        variable level        ;# level of line
        variable linenum        ;# line number
        set level 0
        set script {}

        set linenum 0
        ::fileutil::foreachLine line $file {
            incr linenum
            set line [string trim $line " "]
            if {[string index $line 0] eq "#"} {
                continue        ;# skip comments
            } elseif {$line == ""} {
                continue
            } else {
                # parse line
                if {[catch {parseline $line $maxlevel} parsed]} {
                    continue
                }

                # got a valid line
                foreach {level offset type value message} $parsed break

                set compare [string index $value 0]
                switch -glob --  $value {
                    [<>]=* {
                        set compare [string range $value 0 1]
                        set value [string range $value 2 end]
                    }

                    <* -
                    >* -
                    &* -
                    ^* {
                        set value [string range $value 1 end]
                    }

                    =* {
                        set compare "=="
                        set value [string range $value 1 end]
                    }

                    !* {
                        set compare "!="
                        set value [string range $value 1 end]
                    }

                    x {
                        set value ""
                    }

                    default {
                        set compare "=="
                        if {[string match {\\[<!>=]*} $value]} {
                            set value [string range $value 1 end]
                        }
                    }
                }

                set qual ""

                switch -glob -- $type {
                    pstring* -
                    string* {
                        set type [split $type /]
                        set qual [lindex $type 1]
                        set type [lindex $type 0]
                        if {$type eq "pstring"} {
                            append qual "p"
                            set type "string"
                        }
                        set value [string map [list "\#" "\\\#"] $value]
                        if {($value eq "\\0") && ($compare eq ">")} {
                            set value ""
                            set compare x
                        } elseif {$compare eq "!="} {
                            set value !$value
                            set compare "=="
                        }
                        if {$type ne "string"} {
                            puts stderr "Reject String: ${file}:$linenum $type - $line"
                            continue
                        }
                    }

                    regex {
                        # I am *not* going to handle regex
                        puts stderr "Reject Regex: ${file}:$linenum $type - $line"
                        continue
                    }

                    *byte* -
                    *short* -
                    *long* -
                    *date* {
                        # break up the type
                        set type [split $type &]
                        set qual [lindex $type 1]

                        if {$qual ne ""} {
                            set qual &$qual
                        } else {
                            set type [split $type -]
                            set qual [lindex $type 1]
                            if {$qual ne ""} {
                                set qual -$qual
                            }
                        }
                        set type [lindex $type 0]

                        if {$compare ne "x"} {
                            set value [string trimright $value L] ;# redundant Long
                            if {[catch {set value [expr $value]} x eo]} {
                                puts stderr "Reject Value Error: ${file}:$linenum '$value' '$line' - $eo"
                                continue;
                            }
                            set value [format "0x%x" $value]
                        }
                    }
                    default {
                        puts stderr "Reject Unknown Type: ${file}:$linenum $type - $line"
                        continue
                    }
                }
            }

            # collect some summaries
            variable types
            set types($type) $type
            variable quals
            set quals($qual) $qual

            #puts "$linenum level:$level offset:$offset type:$type qual:$qual compare:$compare value:'$value' message:'$message'"

            set message [string map [list "\#" "\\\#"] $message]

            set record [list $type $qual $compare $offset $value $message]
            if {$script == {}} {
                set level 0
            }

            if {$level == 0} {
                lappend script $record
            } else {
                # find the growing edge
                set depth [lrepeat [expr $level] end]
                while {[catch {
                    set insertion [lindex $script {expand}$depth]
                }]} {
                    set depth [lreplace $depth end end]
                }

                #puts stderr "PRE: $depth - $script - $insertion"
                lappend insertion $record
                lset script {expand}$depth $insertion
                #puts stderr "POST: $script"
            }
        }
        #puts "Script: $script"
        return $script
    }

    # generate code for the alternative tests
    proc magic::run_else {type qual comp offset val message args} {
        set result ""
        if {[string match "(*)" $offset]} {
            set offset "\[magic::offset $offset\]"
        }

        switch -glob -- $type {
            *byte* -
            *short* -
            *long* -
            *date* {
                set type [list Numeric $type]
            }
            *string {
                set type String
            }
        }

        append result "if \{\[magic::$type $offset $comp [list $val] $qual\]\} \{" \n
        append result "\tmagic::emit [list $message]" \n
        if {[llength $args]} {
            append result [run_c {expand}$args] \n
        }
        append result "\} else"
        return $result
    }

    # generate code for the next level tests
    proc magic::run_c {args} {
        set result ""
        foreach el $args {
            append result [run_else {expand}$el]
        }
        if {[llength $args] != 0} {
            append result " \{\}" \n
        }
        return $result
    }

    # generate code for each of a series of tests
    proc magic::run_el {type qual comp offset val message args} {
        set result ""
        if {[string match "(*)" $offset]} {
            set offset "\[magic::offset $offset\]"
        }

        switch -glob -- $type {
            *byte* -
            *short* -
            *long* -
            *date* {
                set type [list numeric $type]
            }
        }

        append result "if \{\[magic::$type $offset $comp [list $val] $qual\]\} \{" \n
        append result "\tmagic::emit [list $message]" \n
        append result [run_c {expand}$args]
        append result "return \[magic::result\]" \n
        append result "\}" \n
        return $result
    }

    # generate code for a magic file's tests
    proc magic::run {script} {
        set result ""
        foreach el $script {
            append result [run_el {expand}$el]
        }
        return $result
    }

    # compile up magic files or directories of magic files
    proc magic::compile {args} {
        set tcl ""
        foreach arg $args {
            if {[file type $arg] == "directory"} {
                foreach file [glob [file join $arg *]] {
                    set script1 [process $file]
                    append tcl "magic::file_start $file" \n
                    append tcl [run $script1] \n
                }
            } else {
                set script1 [process $arg]
                append tcl "magic::file_start $arg" \n
                append tcl [run $script1] \n
            }
        }
        return $tcl
    }

    proc magic::install {args} {
        foreach arg $args {
            proc magic::/[file tail $arg] {} [magic::compile $arg]
        }
    }

    proc magic::generate {args} {
        foreach arg $args {
            set out [::open [file tail $arg].tcl w]

            puts $out "proc magic::/[file tail $arg] {} \{"
            puts $out [magic::compile $arg]
            puts $out "return {}"
            puts $out \}

            close $out
        }
    }

    #puts [magic::compile {expand}$argv]
    magic::generate {expand}$argv

magiclib.tcl

    namespace eval magic {}

    set magic::debug 0

    # open the file to be scanned
    proc magic::open {file} {
        variable fd
        set fd [::open $file]
        fconfigure $fd -translation binary

        # clear the fetch cache
        variable cache
        unset cache
    }

    # mark the start of a magic file
    proc magic::file_start {name} {
        variable debug
        if {$debug} {
            puts stderr "File: $name"
        }
    }

    # return the emitted result
    proc magic::result {} {
        variable result
        return $result
    }

    # emit a message
    proc magic::emit {msg} {
        variable string
        variable numeric
        set msg [::string map [list \\b "" %s $string %d $numeric] $msg]
        variable result
        append result $msg
        return 0
    }

    # handle complex offsets - TODO
    proc magic::offset {where} {
        variable debug
        #if {$debug} {
            puts stderr "OFFSET: $where"
        #}
        return 0
    }

    # 
    proc magic::fetch {where what} {
        variable cache
        if {![info exists cache($where,$what)]} {
            variable fd
            seek $fd $where
            set cache($where,$what) [read $fd $what]
        }
        return $cache($where,$what)
    }

    # maps magic typenames to field characteristics
    array set magic::typemap {
        byte {1 -1 c}
        ubyte {1 0xff c}
        short {2 -1 S}
        ushort {2 0xffff S}
        beshort {2 -1 S}
        leshort {2 -1 s}
        ubeshort {2 0xffff S}
        uleshort {2 0xffff s}
        long {4 -1 I}
        belong {4 -1 I}
        lelong {4 -1 i}
        ubelong {4 0xffffffff I}
        ulelong {4 0xffffffff i}
    }

    proc magic::Numeric {type offset comp val {qual ""}} {
        variable typemap
        variable numeric

        foreach {size mask scan} $typemap($type) break

        set numeric [fetch $offset $size]
        if {[::string bytelength $numeric] == 0} {
            return 0
        }

        if {$comp == "x"} {
            # anything matches - don't care
            binary scan $numeric $scan numeric
            return 1
        }

        set val [binary format $scan $val]

        binary scan $numeric $scan numeric
        binary scan $val $scan val

        if {$qual != ""} {
            set numeric [expr $numeric $qual]
        }

        set c [expr $val $comp $numeric]
        variable debug
        if {$debug} {
            puts stderr "numeric $type: $val $comp $numeric / $qual - $c"
        }

        return $c
    }

    proc magic::String {offset comp val {qual ""}} {
        variable fd
        variable string
        set val [subst -nocommands -novariables $val]
        seek $fd $offset

        if {$comp eq "x"} {
            # anything matches - don't care
            set string ""
            while {([::string length $string] < 100)
                   && ![::string is space [set c [read $fd 1]]]} {
                append string $c
            }
            return 1
        }

        set string [read $fd [::string length $val]]
        set cmp [::string compare $val $string]
        set c  [expr $cmp $comp 0]

        variable debug
        if {$debug} {
            puts "String '$val' $comp '$string' - $c"
            if {$c} {
                puts "offset $offset - $string"
            }
        }
        return $c
    }