A tiny primitive x86 compiler

George Peter Staplin July 3, 2008 -- With the skill of a monkey I created a little compiler this morning. It uses a recursive decent parser, and understands a small set of primitives.

It will compile this simple program:

 proc main {argc argv} {
         puts "hello"
         puts "WOW"

         return 0
 }

 $ tclsh8.5 atc.tcl main.atc output.S
 $ gcc output.S && ./a.out

I use gcc to compile the assembly code, because the C runtime files need to be linked in, and gcc knows how to do that portably better than as/gas.

Someday I would like to finish a compiler like this, written in Tcl, so that I can get rid of my need for C. Unfortunately the further the deviation from C, and assembly, the harder it is, because nearly every OS uses a C ABI. I had a little compiler at one point that used a garbage collector I wrote in assembly. It was frustrating when dealing with OS syscalls. For now the dream of a good compiler written in Tcl will wait until I have the time to create a proper design (mostly likely with reference counted objects this time).

FF 2008-07-03 - How about creating a Tcl-to-C compiler and use the Tcl C library for stuff like lists, dicts and such? IMHO would be more(!) feasible and reliable, and probably would require less effort. Perhaps I miss the point of this compiler? ;p

Scott Beasley 2008-07-04 - Does there always have to be point to code? Back in the 70's and 80's we wrote code to learn, show-off and just because would could! :)


 set state ""
 set token ""
 set cmd [list]
 set lastchar ""
 set bracecount 0
 set labelcount 0
 
 array set commands {
         proc proc-command
         puts puts-command
         return return-command
 }
 
 proc assert-type {obj type} {
         lassign $obj objtype objstr
         if {$type != $objtype} {
                 return -code error "wrong type: $objtype.  expected $type.  object: $obj"
         }        
 }
 
 proc get-label {} {
         global labelcount
         return _alcLabel[incr labelcount]
 }
 
 proc proc-command {tokens} {
         set code ""
         if {3 != [llength $tokens]} {
                 return -code error "wrong # args: proc name args body: $tokens"
         }
 
         lassign $tokens name args body
 
         assert-type $name NORMAL
         set pname [lindex $name 1]
 
         assert-type $body BRACED
 
         append code ".text\n"
         append code ".globl $pname\n"
         append code "${pname}:\n"
         append code "pushl %ebp\n"
         append code "movl %esp,%ebp\n"
         append code [code        [compile [lindex $body 1]]]
         append code "leave\n"
         append code "ret\n"
 
         return $code
 }
 
 proc puts-command {tokens} {
         set code ""
 
         if {1 != [llength $tokens]} {
                 return -code error "wrong # args: puts string: $tokens"
         }
 
         set string [lindex $tokens 0]
 
         #This is dumb.  It only allows for a string literal.  
         #Ideally we could do runtime substitution like Tcl...
         #With this you'd need some sort of <string.h> function to work with strings.
         assert-type $string QUOTE
 
         set label [get-label]
 
         append code ".section .rodata\n"
         append code "${label}: .string \"[lindex $string 1]\"\n"
         append code ".text\n"
         append code "pushl \$$label\n"
         append code "call puts\n"
         append code "addl \$4,%esp\n"
 }
 
 proc return-command {tokens} {
         set value [lindex $tokens 0]
 
         puts TOKENS:$tokens
 
         assert-type $value NORMAL
 
         set code ""
         append code "movl \$[lindex $value 1],%eax\n"
         return $code
 }
 
 proc state-machine {data offset_var tokens_var c} {
         global state token lastchar cmd bracecount
         upvar 1 $offset_var offset
         upvar 1 $tokens_var tokens
 
         if {"BRACES" eq $state} {
                 if {"\}" eq $c} {
                         incr bracecount -1
                 } elseif {"\{" eq $c} {
                         incr bracecount
                 }
 
                 if {$bracecount <= 0} {
                         set state ""
                         lappend cmd [list BRACED $token]
                         set token ""
                 } else {
                         append token $c
                 }                
         } elseif {"QUOTE" eq $state} {
                 if {"\"" eq $c && $lastchar != "\\"} {
                         set state ""
                         lappend cmd [list QUOTE $token]
                         set token ""
                 } else {
                         append token $c
                 }
         } elseif {"\"" eq $c} {
                 set state QUOTE
         } elseif {"\{" eq $c} {
                 set state BRACES
                 incr bracecount
         } elseif {"\[" eq $c} {
                 return [parse $data offset tokens]
         } elseif {"\]" eq $c} {
                 return DORETURN                
         } elseif {" " eq $c || "\t" eq $c} {
                 if {[string length $token]} {
                         lappend cmd [list NORMAL $token]
                         set token ""
                 }
         } elseif {"\n" eq $c || ";" eq $c} {
                 if {[string length $token]} {
                         lappend cmd [list NORMAL $token]
                         set token ""
                 }
 
                 if {[llength $cmd]} {
                         lappend tokens $cmd
                         set cmd [list]
                 }
         } else {
                 append token $c
         }
 
         set lastchar $c
         return ""
 }
 
 proc parse {data offset_var tokens_var} {
         upvar 1 $offset_var offset
         upvar 1 $tokens_var tokens
         set datalen [string length $data]
         global token cmd
 
 
         for {set i $offset} {$i < $datalen} {incr i} {
                 set c [string index $data $i]
                 set result [state-machine $data offset tokens $c]
 
                 if {"DORETURN" eq $result} {
                         return $result
                 }
         }
         
         return ""
 }
 
 proc compile data {
         set offset 0
         set tokens [list]
         parse $data offset tokens
         return $tokens
 }
 
 proc command-to-code cmd {
         global commands
 
         set obj [lindex $cmd 0]
         lassign $obj objtype objstr
         if {![info exists        commands($objstr)] || "NORMAL" ne $objtype} {
                 return -code error "invalid command name: $objstr with type $objtype"
         }
         return [$commands($objstr) [lrange $cmd 1 end]]
 }
 
 proc code tokens {
         set code ""
         puts TOKENS:$tokens
         foreach cmd $tokens {
                 append code [command-to-code $cmd]\n
         }
         return $code
 }
 
 proc main {argc argv} {
         if {2 != $argc} {
                 puts stderr "syntax is: [info script] input.atc output.S"
                 exit 1
         }
 
         set fd [open [lindex $argv 0] r]
         set data [read $fd]
         close $fd
         
         set outfd [open [lindex $argv 1] w]
         puts $outfd [code [compile $data]]
         close $outfd
 
         exit 0
 }
 
 main $argc $argv