CMcC 20040925
I've written a compiler from unix magic(5) files to pure tcl.
The idea is you generate a script which should perform precisely the same function as the unix file command (it gives you information about a file)
This code isn't complete, but handles most common cases - certainly enough to produce useful file recognition. The full unix file command recognises/discriminates thousands of different file types. This'll be useful to generate file type recognition for tclhttpd uploads, for example to stop people uploading viruses.
You'll find that a lot of the magic files out there are buggy, too.
Enjoy!
Update 20040927 - Added an optimiser and generate code from a tree.
A later form of this code was put into Tcllib, under the module fumagic, short for fileutil::magic.
Output from /etc/mime-magic is here mime type discriminator ... about 500 LOC, and you'll need magiclib.tcl from here.
magiccompile.tcl:
# file to compile the magic file from magic(5) into a tcl program package require fileutil source magiclib.tcl source magictree.tcl namespace eval magic {} set magic::debug 0 # parse an individual line proc magic::parseline {line {maxlevel 10000}} { # calculate the line's level set unlevel [string trimleft $line >] set level [expr [string length $line] - [string length $unlevel]] if {$level > $maxlevel} { return -code continue "Skip - too high a level" } # regexp parse line into (offset, type, value, command) set parse [regexp -expanded -inline {^(\S+)\s+(\S+)\s*((\S|(\B\s))*)\s*(.*)$} $unlevel] if {$parse == {}} { error "Can't parse: '$unlevel'" } # unpack parsed line set value "" set command "" foreach {junk offset type value junk1 junk2 command} $parse break # handle trailing spaces if {[string index $value end] eq "\\"} { append value " " } if {[string index $command end] eq "\\"} { append command " " } if {$value == ""} { error "no value" ;# badly formatted line } variable debug if {$debug > 1} { puts "level:$level offset:$offset type:$type value:'$value' command:'$command'" } # return the line's fields return [list $level $offset $type $value $command] } # process a magic file proc magic::process {file {maxlevel 10000}} { variable level ;# level of line variable linenum ;# line number set level 0 set script {} set linenum 0 ::fileutil::foreachLine line $file { incr linenum set line [string trim $line " "] if {[string index $line 0] eq "#"} { continue ;# skip comments } elseif {$line == ""} { continue ;# skip blank lines } else { # parse line if {[catch {parseline $line $maxlevel} parsed]} { continue ;# skip erroring lines } # got a valid line foreach {level offset type value message} $parsed break # strip comparator out of value field, # (they are combined) set compare [string index $value 0] switch -glob -- $value { [<>]=* { set compare [string range $value 0 1] set value [string range $value 2 end] } <* - >* - &* - ^* { set value [string range $value 1 end] } =* { set compare "==" set value [string range $value 1 end] } !* { set compare "!=" set value [string range $value 1 end] } x { # this is the 'don't care' match # used for collecting values set value "" } default { # the default comparator is equals set compare "==" if {[string match {\\[<!>=]*} $value]} { set value [string range $value 1 end] } } } # process type field set qual "" switch -glob -- $type { pstring* - string* { # String or Pascal string type # extract string match qualifiers set type [split $type /] set qual [lindex $type 1] set type [lindex $type 0] # convert pstring to string + qualifier if {$type eq "pstring"} { append qual "p" set type "string" } # protect hashes in output script value set value [string map [list "\#" "\\#" \" \\\" \{ \\\{ \} \\\}] $value] if {($value eq "\\0") && ($compare eq ">")} { # record 'any string' match set value "" set compare x } elseif {$compare eq "!="} { # string doesn't allow !match set value !$value set compare "==" } if {$type ne "string"} { # don't let any odd string types sneak in puts stderr "Reject String: ${file}:$linenum $type - $line" continue } } regex { # I am *not* going to handle regex puts stderr "Reject Regex: ${file}:$linenum $type - $line" continue } *byte* - *short* - *long* - *date* { # Numeric types # extract numeric match &qualifiers set type [split $type &] set qual [lindex $type 1] if {$qual ne ""} { # this is an &-qualifier set qual &$qual } else { # extract -qualifier from type set type [split $type -] set qual [lindex $type 1] if {$qual ne ""} { set qual -$qual } } set type [lindex $type 0] # perform value adjustments if {$compare ne "x"} { # trim redundant Long value qualifier set value [string trimright $value L] if {[catch {set value [expr $value]} x eo]} { # check that value is representable in tcl puts stderr "Reject Value Error: ${file}:$linenum '$value' '$line' - $eo" continue; } # coerce numeric value into hex set value [format "0x%x" $value] } } default { # this is not a type we can handle puts stderr "Reject Unknown Type: ${file}:$linenum $type - $line" continue } } } # collect some summaries variable debug if {$debug} { variable types set types($type) $type variable quals set quals($qual) $qual } #puts "$linenum level:$level offset:$offset type:$type qual:$qual compare:$compare value:'$value' message:'$message'" # protect hashes in output script message set message [string map [list "\#" "\\\#" \" \\\" \} \\\} ( \\( ) \\)] $message] if {![string match "(*)" $offset]} { catch {set offset [expr $offset]} } # record is the complete match command, # encoded for tcl code generation set record [list $linenum $type $qual $compare $offset $value $message] if {$script == {}} { # the original script has level 0, # regardless of what the script says set level 0 } if {$level == 0} { # add a new 0-level record lappend script $record } else { # find the growing edge of the script set depth [lrepeat [expr $level] end] while {[catch { # get the insertion point set insertion [lindex $script {*}$depth] }]} { # handle scripts which jump levels, # reduce depth to current-depth+1 set depth [lreplace $depth end end] } # add the record at the insertion point lappend insertion $record # re-insert the record into its correct position lset script {*}$depth $insertion } } #puts "Script: $script" return $script } proc magic::install {args} { foreach arg $args { proc magic::/[file tail $arg] {} [magic::compile $arg] } } # compile up magic files or directories of magic files proc magic::compile {args} { set tcl "" set script {} foreach arg $args { if {[file type $arg] == "directory"} { foreach file [glob [file join $arg *]] { set script1 [process $file] lappend script [list file $file] {*}$script1 #append tcl "magic::file_start $file" \n #append tcl [run $script1] \n } } else { set file $arg set script1 [process $file] lappend script [list file $file] {*}$script1 #append tcl "magic::file_start $file" \n #append tcl [run $script1] \n } } #puts stderr $script puts "\# $args" set t [2tree $script] set tcl [treegen $t root] puts [treedump $t] #set tcl [run $script] return $tcl } proc magic::generate {args} { foreach arg $args { set out [::open [file tail $arg].tcl w] puts $out "proc magic::/[file tail $arg] {} \{" puts $out [magic::compile $arg] puts $out "return {}" puts $out \} ::close $out } } #puts [magic::compile {*}$argv] #puts stderr "typemap: [array get magic::typemap]" magic::generate {*}$argv #set script [magic::compile {} /usr/share/misc/file/magic] #puts "\# types:[array names magic::types]" #puts "\# quals:[array names magic::quals]" #puts "Script: $script" return ----- magiclib.tcl: # TODO: # Required Functionality: # implement full offset language # implement pstring (pascal string, blerk) # implement regex form (blerk!) # implement string qualifiers # Optimisations: # reorder tests according to expected or observed frequency # this conflicts with reduction in strength optimisations. # Rewriting within a level will require pulling apart the # list of tests at that level and reordering them. # There is an inconsistency between handling at 0-level and # deeper level - this has to be removed or justified. # Hypothetically, every test at the same level should be # mutually exclusive, but this is not given, and should be # detected. If true, this allows reduction in strength to switch # on Numeric tests # reduce Numeric tests at the same level to switches # # - first pass through clauses at same level to categorise as # variant values over same test (type and offset). # work out some way to cache String comparisons # Reduce seek/reads for String comparisons at same level: # # - first pass through clauses at same level to determine string ranges. # # - String tests at same level over overlapping ranges can be # written as sub-string comparisons over the maximum range # this saves re-reading the same string from file. # # - common prefix strings will have to be guarded against, by # sorting string values, then sorting the tests in reverse length order. namespace eval magic {} set magic::debug 0 set magic::optimise 1 # open the file to be scanned proc magic::open {file} { variable fd set fd [::open $file] fconfigure $fd -translation binary # fill the string cache # the vast majority of magic strings are in the first 4k of the file. variable strbuf set strbuf [read $fd 4096] # clear the fetch cache variable cache catch {unset cache} variable result set result "" variable string set string "" variable numeric set numeric -9999 return $fd } proc magic::close {file} { variable fd ::close $fd } # mark the start of a magic file in debugging proc magic::file_start {name} { variable debug if {$debug} { puts stderr "File: $name" } } # return the emitted result proc magic::result {{msg ""}} { variable result if {$msg != ""} { emit $msg } return -code return $result } # emit a message proc magic::emit {msg} { variable string variable numeric set msg [::string map [list \\b "" %s $string %ld $numeric %d $numeric] $msg] variable result append result " " $msg set result [string trim $result " "] } # handle complex offsets - TODO proc magic::offset {where} { variable debug #if {$debug} { puts stderr "OFFSET: $where" #} return 0 } # fetch and cache a value from the file proc magic::fetch {where what scan} { variable cache variable numeric if {![info exists cache($where,$what,$scan)]} { variable fd seek $fd $where binary scan [read $fd $what] $scan numeric set cache($where,$what,$scan) $numeric } else { set numeric $cache($where,$what,$scan) } return $numeric } # maps magic typenames to field characteristics: size, binary scan format array set magic::typemap { byte {1 c} ubyte {1 c} short {2 S} ushort {2 S} beshort {2 S} leshort {2 s} ubeshort {2 S} uleshort {2 s} long {4 I} belong {4 I} lelong {4 i} ubelong {4 I} ulelong {4 i} date {2 S} bedate {2 S} ledate {2 s} ldate {4 I} beldate {4 I} leldate {4 i} } # generate short form names foreach {n v} [array get magic::typemap] { foreach {len scan} $v { #puts stderr "Adding $scan - [list $len $scan]" set magic::typemap($scan) [list $len $scan] break } } proc magic::Nv {type offset {qual ""}} { variable typemap variable numeric # unpack the type characteristics foreach {size scan} $typemap($type) break # fetch the numeric field set numeric [fetch $offset $size $scan] if {$qual != ""} { # there's a mask to be applied set numeric [expr $numeric $qual] } variable debug if {$debug} { puts stderr "NV $type $offset $qual: $numeric" } return $numeric } # Numeric - get bytes of $type at $offset and $compare to $val # qual might be a mask proc magic::N {type offset comp val {qual ""}} { variable typemap variable numeric # unpack the type characteristics foreach {size scan} $typemap($type) break # fetch the numeric field set numeric [fetch $offset $size $scan] if {$comp == "x"} { # anything matches - don't care return 1 } # get value in binary form, then back to numeric # this avoids problems with sign, as both values are # [binary scan]-converted identically binary scan [binary format $scan $val] $scan val if {$qual != ""} { # there's a mask to be applied set numeric [expr $numeric $qual] } set c [expr $val $comp $numeric] ;# perform comparison variable debug if {$debug} { puts stderr "numeric $type: $val $comp $numeric / $qual - $c" } return $c } proc magic::getString {offset len} { # cache the first 1k of the file variable string set end [expr {$offset + $len - 1}] if {$end < 4096} { # in the string cache variable strbuf set string [string range $strbuf $offset $end] } else { # an unusual one variable fd seek $fd $offset ;# move to the offset set string [read $fd $len] } return $string } proc magic::S {offset comp val {qual ""}} { variable fd variable string # convert any backslashes set val [subst -nocommands -novariables $val] if {$comp eq "x"} { # match anything - don't care, just get the value set string "" seek $fd $offset ;# move to the offset while {([::string length $string] < 100) && [::string is print [set c [read $fd 1]]]} { if {[string is space $c]} { break } append string $c } return 1 } # get the string and compare it set string [getString $offset [::string length $val]] set cmp [::string compare $val $string] set c [expr $cmp $comp 0] variable debug if {$debug} { puts "String '$val' $comp '$string' - $c" if {$c} { puts "offset $offset - $string" } } return $c } ---- magictree.tcl package require struct::list package require struct::tree namespace eval magic {} proc magic::path {tree} { $tree set root path {} foreach child [$tree children root] { $tree walk $child -type dfs node { set path [$tree get [$tree parent $node] path] lappend path [$tree index $node] $tree set $node path $path } } } proc magic::tree_el {tree parent file line type qual comp offset val message args} { set node [$tree insert $parent end] set path [$tree get $parent path] lappend path [$tree index $node] $tree set $node path $path # generate a proc call type for the type, Numeric or String variable typemap switch -glob -- $type { *byte* - *short* - *long* - *date* { set otype N set type [lindex $typemap($type) 1] } *string { set otype S } default { puts stderr "Unknown type: '$type'" } } foreach key {line type qual comp offset val message file otype} { if {[catch { $tree set $node $key [set $key] } result eo]} { puts "Tree: $eo - $file $line $type" } } # now add children foreach el $args { tree_el $tree $node $file {*}$el } return $node } proc magic::2tree {script} { variable tree; set tree [::struct::tree] $tree set root path "" $tree set root otype Root $tree set root type root $tree set root message "unknown" # generate a test for each match set file "unknown" foreach el $script { #puts "EL: $el" if {[lindex $el 0] eq "file"} { set file [lindex $el 1] } else { append result [tree_el $tree root $file {*}$el] } } optNum $tree root #optStr $tree root puts stderr "Script contains [llength [$tree children root]] discriminators" path $tree return $tree } proc magic::isStr {tree node} { return [expr {"S" eq [$tree get $node otype]}] } proc magic::sortRegion {r1 r2} { set cmp 0 if {[catch { if {[string match (*) $r1] || [string match (*) $r2]} { set cmp [string compare $r1 $r2] } else { set cmp [expr {[lindex $r1 0] - [lindex $r2 0]}] if {!$cmp} { set cmp 0 set cmp [expr {[lindex $r1 1] - [lindex $r2 1]}] } } } result eo]} { set cmp [string compare $r1 $r2] } return $cmp } proc magic::optStr {tree node} { variable regions catch {unset regions} array set regions {} optStr1 $tree $node puts stderr "Regions [array statistics regions]" foreach region [lsort -index 0 -command magic::sortRegion [array name regions]] { puts "$region - $regions($region)" } } proc magic::optStr1 {tree node} { # traverse each numeric element of this node's children, # categorising them set kids [$tree children $node] foreach child $kids { optStr1 $tree $child } set strings [$tree children $node filter magic::isStr] #puts stderr "optstr: $node: $strings" variable regions foreach el $strings { #if {[$tree get $el otype] eq "String"} { # puts "[$tree getall $el] - [string length [$tree get $el val]]" #} if {[$tree get $el comp] eq "x"} { continue } set offset [$tree get $el offset] set len [string length [$tree get $el val]] lappend regions([list $offset $len]) $el } } proc magic::isNum {tree node} { return [expr {"N" eq [$tree get $node otype]}] } proc magic::switchNSort {tree n1 n2} { return [expr {[$tree get $n1 val] - [$tree get $n1 val]}] } proc magic::optNum {tree node} { # traverse each numeric element of this node's children, # categorising them set kids [$tree children $node] foreach child $kids { optNum $tree $child } set numerics [$tree children $node filter magic::isNum] #puts stderr "optNum: $node: $numerics" if {[llength $numerics] < 2} { return } array set offsets {} foreach el $numerics { if {[$tree get $el comp] ne "=="} { continue } lappend offsets([$tree get $el type],[$tree get $el offset],[$tree get $el qual]) $el } #puts "Offset: stderr [array get offsets]" foreach {match nodes} [array get offsets] { if {[llength $nodes] < 2} { continue } catch {unset matcher} foreach n $nodes { set nv [expr [$tree get $n val]] if {[info exists matcher($nv)]} { puts stderr "Node <[$tree getall $n]> clashes with <[$tree getall $matcher($nv)]>" } else { set matcher($nv) $n } } foreach {type offset qual} [split $match ,] break set switch [$tree insert $node [$tree index [lindex $nodes 0]]] $tree set $switch otype Switch $tree set $switch message $match $tree set $switch offset $offset $tree set $switch type $type $tree set $switch qual $qual set nodes [lsort -command [list magic::switchNSort $tree] $nodes] $tree move $switch end {*}$nodes set path [$tree get [$tree parent $switch] path] lappend path [$tree index $switch] $tree set $switch path $path } } proc magic::treedump {tree} { set result "" $tree walk root -type dfs node { set path [$tree get $node path] set depth [llength $path] append result [string repeat " " $depth] [list $path] ": " [$tree get $node type] if {[$tree keyexists $node offset]} { append result ,[$tree get $node offset] } if {[$tree keyexists $node qual]} { set q [$tree get $node qual] if {$q ne ""} { append result ,$q } } if {[$tree keyexists $node comp]} { append result [$tree get $node comp] } if {[$tree keyexists $node val]} { append result [$tree get $node val] } if {$depth == 1} { set msg [$tree get $node message] set n $node while {($n != {}) && ($msg == "")} { set n [lindex [$tree children $n] 0] if {$n != {}} { set msg [$tree get $n message] } } append result " " ( $msg ) if {[$tree keyexists $node file]} { append result " - " [$tree get $node file] } } #append result " <" [$tree getall $node] > append result \n } return $result } proc magic::treegen {tree node} { return "[treegen1 $tree $node]\nresult\n" } proc magic::treegen1 {tree node} { set result "" foreach k {otype type offset comp val qual message} { if {[$tree keyexists $node $k]} { set $k [$tree get $node $k] } } if {$otype eq "N"} { set type [list N $type] } elseif {$otype eq "S"} { set type S } switch $otype { N - S { # this is a complex offset - call the offset interpreter if {[string match "(*)" $offset]} { set offset "\[offset $offset\]" } append result "if \{\[$type $offset $comp [list $val] $qual\]\} \{" if {[$tree isleaf $node]} { if {$message != ""} { append result "emit [list $message]" } else { append result "emit [$tree get $node path]" } } else { if {$message != ""} { append result "emit [list $message]\n" } foreach child [$tree children $node] { append result [treegen1 $tree $child] } #append result "\nreturn \$result" } append result "\}\n" } Root { foreach child [$tree children $node] { append result [treegen1 $tree $child] } } Switch { # this is a complex offset - call the offset interpreter if {[string match "(*)" $offset]} { set offset "\[offset $offset\]" } append result "switch -- \[Nv $type $offset $qual\] " variable typemap set scan [lindex $typemap($type) 1] foreach child [$tree children $node] { binary scan [binary format $scan [$tree get $child val]] $scan val append result "$val \{" if {[$tree isleaf $child]} { append result "emit [list [$tree get $child message]]" } else { append result "emit [list [$tree get $child message]]\n" foreach grandchild [$tree children $child] { append result [treegen1 $tree $grandchild] } } append result "\} " } append result "\n" } } return $result } return
Whew ... let us never speak of file discrimination programs again :)