Version 5 of Tclas - an assembler

Updated 2007-11-07 13:42:50 by jdc

George Peter Staplin Nov 07, 2007 - This is the start of something that Richard Suchenwirth wanted me to share on the wiki. It's an x86 assembler with AT&Tish/gas syntax. This is just something to play with at the moment, but it will grow. I'm hoping that Richard, Mark and possibly others will help out :) BSD/Tcl licensed of course.

If there's interest I could create a code.google.com project for this, and add developers...

This is revision 4:


 #Tclas (a runtime assembler written in Tcl)
 #By George Peter Staplin

 #INSTRUCTION PREFIXES
 set prefixes(lock) 0xf0
 set prefixes(repne) 0xf2
 set prefixes(repnz) 0xf2
 set prefixes(rep) 0xf3
 set prefixes(repe) 0xf3
 set prefixes(repz) 0xf3

 #SEGMENT OVERRIDE PREFIXES
 set sop(cs) 0x2e
 set sop(ss) 0x36
 set sop(ds) 0x3e
 set sop(es) 0x26
 set sop(fs) 0x64
 set sop(gs) 0x65

 set branch_hints(not_taken) 0x2e
 set branch_hints(taken) 0x3e

 proc reg32 {name value} {
     global regs
     set regs($name) $value
     set regs($name,type) 32
 }

 proc reg16 {name value} {
     global regs 
     set regs($name) $value
     set regs($name,type) 16
 }

 proc reg8 {name value} {
     global regs
     set regs($name) $value
     set regs($name,type) 8
 }

 reg32 eax 0
 reg32 ecx 1
 reg32 edx 2
 reg32 ebx 3
 reg32 esp 4
 reg32 ebp 5
 reg32 esi 6
 reg32 edi 7

 reg16 ax 0
 reg16 cx 1
 reg16 dx 2
 reg16 bx 3
 reg16 sp 4
 reg16 bp 5
 reg16 si 6
 reg16 di 7

 reg8 al 0
 reg8 cl 1
 reg8 dl 2
 reg8 bl 3
 reg8 ah 4
 reg8 ch 5
 reg8 dh 6
 reg8 bh 7

 proc op {name opvec} {
     global ops
     set ops($name) 1
     lappend ops($name,data) $opvec
     interp alias {} $name {} opcall $name
 }

 proc opvec {base reqarg size types} {
     list $base $reqarg $size $types
 }

 # Operands with a % prefix are assumed to be registers.
 # Operands with a ~ prefix are symbols. TODO use dlopen/dlsym to lookup symbols.
 # Operands with a @ prefix are assumed to be absolute addresses.
 # Operands with no prefix are assumed to be immediate values.
 # Operands with possibly a number and ( ending with ) are memory operands.
 proc typeof {o resultvar} {
     upvar $resultvar result
     global regs
     set c [string index $o 0]
     switch -- $c {
         % {
             set regname [string range $o 1 end]
             if {![info exists regs($regname)]} {
                 return -code error "invalid register name: $regname"
             }
             set result $regname
             return register
         }
         ~ {
             set result [string range $o 1 end]
             return symbol
         }
         @ {
             set result [string range $o 1 end]
             return address
         }

         default {
             if {[regexp {([0-9]*)\(%(.*?)\)} $o all offset reg]} {
                 set result [list $offset $reg]
                 return memory
             } else {
                 set result $o
                 return immediate
             }
         }
     } 
 }

 proc find-matching-instruction {name atype btype} {
     global ops
     set l [list $atype $btype]
     foreach oplist $ops($name,data) {
         if {[lindex $oplist 3] eq $l} {
             return $oplist
         }
     }
     return ""
 }

 proc binout out {
     puts BINOUT:0x[format %x $out]
 }

 proc binout-long out {
     puts BINOUT:0x[format %8.8x $out]
 }

 proc opcall-2 {name a b} {
     global ops regs

     puts "OPERANDS:$a $b"
     set atype [typeof $a ares]
     set btype [typeof $b bres]
     if {"" eq [set oplist [find-matching-instruction $name $atype $btype]]} {
         return -code error "invalid instruction pattern: $name with $atype $btype :: $args"
     }
     lassign $oplist code operandcount sizes types
     lassign $sizes asize bsize

     puts "TYPES:$atype $btype"

     switch -- $atype {
         immediate {
             binout [expr {$code + $regs($bres)}]
             if {32 == $bsize} {
                 binout-long $ares
             } else {
                 binout [expr {$ares & 0xff}]
             }
         }
         memory {
             lassign $ares offset areg
             binout $code ;#opcode
             set rm [expr {8 * $regs($bres) + $regs($areg)}]
             if {"" eq $offset} {
                 binout $rm
             } else {
                 #TODO handle (1 << 7) for the 32-bit displacement/offset
                 #or possibly make 32-bit displacement the default.
                 binout [expr {(1 << 6) + $rm}] ;#modrm
                 binout $offset
             }
         }
         register {
             if {"register" eq $btype} {
                 binout $code
                 binout [expr {(1 << 6 | 1 << 7) + (8 * $regs($ares)) + $regs($bres)}] ;#modrm
             } else {
                 #memory
                 lassign $bres offset breg
                 binout $code
                 set rm [expr {(8 * $regs($ares)) + $regs($breg)}] ;#rm
                 if {"" eq $offset} {
                     binout $rm
                 } else {
                     binout [expr {(1 << 6) + $rm}]
                     binout $offset
                 }
             }
         }
     }
 }

 proc opcall-0 name {
     global ops
     lassign [lindex $ops($name,data) 0] code operandcount sizes types
     binout $code
 }

 proc opcall {name args} {
     switch -- [llength $args] {
         0 {
             opcall-0 $name
         }
         2 {
             opcall-2 $name {*}$args
         }
         default {
             return -code error "invalid arguments: $name $args"
         }
     }
 }

 op movb [opvec 0xb0 2 [list 8 8] [list immediate register]]
 op movb [opvec 0x88 2 [list 8 8] [list register register]]
 op movb [opvec 0x8a 2 [list 32 8] [list memory register]]
 op movb [opvec 0x88 2 [list 8 32] [list register memory]]

 op movl [opvec 0xb8 2 [list 32 32] [list immediate register]]
 op movl [opvec 0x89 2 [list 32 32] [list register register]]
 op movl [opvec 0x8b 2 [list 32 32] [list memory register]]
 op movl [opvec 0x89 2 [list 32 32] [list register memory]]

 op movzbl [opvec 0x0fb6 2 [list 8 32] [list register register]]

 op addb [opvec 0x04 2 [list 8 8] [list immediate register]]
 op addb [opvec 0x00 2 [list 8 8] [list register register]]
 op addb [opvec 0x02 2 [list 32 8] [list memory register]]
 op addb [opvec 0x00 2 [list 8 32] [list register memory]]

 op addl [opvec 0x05 2 [list 32 8] [list immediate register]]
 op addl [opvec 0x01 2 [list 32 32] [list register register]]
 op addl [opvec 0x03 2 [list 32 32] [list memory register]]
 op addl [opvec 0x01 2 [list 32 32] [list register memory]]


 op nop [opvec 0x90 0 [list] [list]]

 #Test code
 movl 0xffaaddee %eax
 nop 
 movl (%eax) %eax
 movl (%ecx) %ecx
 nop

 movl %eax %eax
 movl %ecx %eax
 movl %edx %eax
 movl %eax %ecx

 movl %eax (%eax)
 movl %eax (%ecx)
 movl %ecx (%eax)

 movl %ebp (%eax)

 movb %al %al
 movzbl %al %eax

 nop 
 nop
 nop

 movl (%eax) %eax
 movl 4(%eax) %eax
 movl (%ecx) %ecx
 movl 4(%ecx) %ecx

 movl 20(%ecx) %edx
 movl 20(%eax) %edx

 movl 20(%ecx) %eax
 movl 20(%ebp) %eax
 movl %eax 20(%ebp)
 movl %eax %ebp
 movl %ebp %eax

RS 2007-11-07: Interesting! After not having done real assembler for many years, I experimented with the classic

 #include <stdio.h>
 int main(int argc, char* argv[]) {
  printf("hello. world!\n");
  return 0;
 }

which, saved to hello.c and compiled with gcc -S hello.c, gives a file hello.s, which contains:

        .file   "hello.c"
 gcc2_compiled.:
 ___gnu_compiled_c:
        .def    ___main;        .scl    2;      .type   32;     .endef
 .text
 LC0:
        .ascii "hello. world!\12\0"
        .align 4
 .globl _main
        .def    _main;  .scl    2;      .type   32;     .endef
 _main:
        pushl %ebp
        movl %esp,%ebp
        subl $8,%esp
        call ___main
        addl $-12,%esp
        pushl $LC0
        call _printf
        addl $16,%esp
        xorl %eax,%eax
        jmp L10
        .align 4
 L10:
        movl %ebp,%esp
        popl %ebp
        ret
        .def    _printf;        .scl    2;      .type   32;     .endef

George Peter Staplin Nov 7, 2007 - JDC asked some questions about the assembler. This lead to me writing this response about how it will probably work in the future.

  1. a Tcl_Obj command in C allocates memory using mmap (unix) or VirtualAlloc (Windows) with PROT_EXEC permissions
  2. a Tcl_Obj command for writing bytes to the pages allocated is used to write to the allocated memory.
  3. addresses that refer to the mmaped region get added to a linked list of relocation structs.
  4. symbols are looked up with dlopen (NULL, ..); and dlsym();
  5. any relocations as the mmaped section grows are done by iterating the relocation entries, and adjusting the pointers for the new base.

Something like this struct will be used for relocations:

 struct assembly_relocation {
  void *base; /* The base allocation from mmap */
  void *ptr;  /* The value of the pointer prior to any relocation. */
  /* This is the offset from the base to the pointer's location: */
  uintptr_t ptroffset;
  struct assembly_relocation *next;
 };

DKF suggested in the chat that we use a Tcl ByteArray. The problem as I see it with that, is making sure the memory has the proper permissions. We can only mprotect() a page-aligned pointer, so some of the ByteArray would go wasted, and we'd need metadata to indicate where it actually starts having PROT_EXEC. Otherwise we might end up giving PROT_EXEC to something that really shouldn't have it. I'm still thinking about the idea though. -GPS

DKF writes: Actually, I was thinking in terms of using the bytearray to accumulate the machine code during building, and then, when converting to a runnable command (that is the goal, yes?) copying to a suitably-allocated and protected block elsewhere. The "convert to command" ought to do any linking needed too. (I should emphasize that I've not thought hard about this.)


jdc Added support for labels and the jmp instruction:

#Tclas (a runtime assembler written in Tcl)
 #By George Peter Staplin

set binout_offset 0

 #INSTRUCTION PREFIXES
 set prefixes(lock) 0xf0
 set prefixes(repne) 0xf2
 set prefixes(repnz) 0xf2
 set prefixes(rep) 0xf3
 set prefixes(repe) 0xf3
 set prefixes(repz) 0xf3

 #SEGMENT OVERRIDE PREFIXES
 set sop(cs) 0x2e
 set sop(ss) 0x36
 set sop(ds) 0x3e
 set sop(es) 0x26
 set sop(fs) 0x64
 set sop(gs) 0x65

 set branch_hints(not_taken) 0x2e
 set branch_hints(taken) 0x3e

 proc reg32 {name value} {
     global regs
     set regs($name) $value
     set regs($name,type) 32
 }

 proc reg16 {name value} {
     global regs
     set regs($name) $value
     set regs($name,type) 16
 }

 proc reg8 {name value} {
     global regs
     set regs($name) $value
     set regs($name,type) 8
 }

 reg32 eax 0
 reg32 ecx 1
 reg32 edx 2
 reg32 ebx 3
 reg32 esp 4
 reg32 ebp 5
 reg32 esi 6
 reg32 edi 7

 reg16 ax 0
 reg16 cx 1
 reg16 dx 2
 reg16 bx 3
 reg16 sp 4
 reg16 bp 5
 reg16 si 6
 reg16 di 7

 reg8 al 0
 reg8 cl 1
 reg8 dl 2
 reg8 bl 3
 reg8 ah 4
 reg8 ch 5
 reg8 dh 6
 reg8 bh 7

 proc op {name opvec} {
     global ops
     set ops($name) 1
     lappend ops($name,data) $opvec
     interp alias {} $name {} opcall $name
 }

 proc opvec {base reqarg size types} {
     list $base $reqarg $size $types
 }

 # Operands with a % prefix are assumed to be registers.
 # Operands with a ~ prefix are symbols. TODO use dlopen/dlsym to lookup symbols.
 # Operands with a @ prefix are assumed to be absolute addresses.
 # Operands with no prefix are assumed to be immediate values.
 # Operands with possibly a number and ( ending with ) are memory operands.
 proc typeof {o resultvar} {
     upvar $resultvar result
     global regs
     set c [string index $o 0]
     switch -- $c {
 	% {
 	    set regname [string range $o 1 end]
 	    if {![info exists regs($regname)]} {
 		return -code error "invalid register name: $regname"
 	    }
 	    set result $regname
 	    return register
 	}
 	~ {
 	    set result [string range $o 1 end]
 	    return symbol
 	}
 	@ {
 	    set result [string range $o 1 end]
 	    return address
 	}

 	default {
 	    if {[regexp {([0-9]*)\(%(.*?)\)} $o all offset reg]} {
 		set result [list $offset $reg]
 		return memory
	    } elseif { [string is integer -strict $o] } {
 		set result $o
 		return immediate
 	    } else {
		set result $o
		return labelref
	    }
 	}
     }
 }

 proc find-matching-instruction-2 {name atype btype} {
     global ops
     set l [list $atype $btype]
     foreach oplist $ops($name,data) {
 	if {[lindex $oplist 3] eq $l} {
 	    return $oplist
 	}
     }
     return ""
 }

 proc find-matching-instruction-1 {name atype} {
     global ops
     set l [list $atype]
     foreach oplist $ops($name,data) {
 	if {[lindex $oplist 3] eq $l} {
 	    return $oplist
 	}
     }
     return ""
 }

 proc binout out {
     puts BINOUT:0x[format %x $out]
     incr ::binout_offset
 }

 proc binout-long out {
     puts BINOUT:0x[format %8.8x $out]
     incr ::binout_offset 4
 }

 proc opcall-2 {name a b} {
     global ops regs

     puts "OPERANDS:$a $b"
     set atype [typeof $a ares]
     set btype [typeof $b bres]
     if {"" eq [set oplist [find-matching-instruction-2 $name $atype $btype]]} {
 	return -code error "invalid instruction pattern: $name with $atype $btype :: $args"
     }
     lassign $oplist code operandcount sizes types
     lassign $sizes asize bsize

     puts "TYPES:$atype $btype"

     switch -- $atype {
 	immediate {
 	    binout [expr {$code + $regs($bres)}]
 	    if {32 == $bsize} {
 		binout-long $ares
 	    } else {
 		binout [expr {$ares & 0xff}]
 	    }
 	}
 	memory {
 	    lassign $ares offset areg
 	    binout $code ;#opcode
 	    set rm [expr {8 * $regs($bres) + $regs($areg)}]
 	    if {"" eq $offset} {
 		binout $rm
 	    } else {
 		#TODO handle (1 << 7) for the 32-bit displacement/offset
 		#or possibly make 32-bit displacement the default.
 		binout [expr {(1 << 6) + $rm}] ;#modrm
 		binout $offset
 	    }
 	}
 	register {
 	    if {"register" eq $btype} {
 		binout $code
 		binout [expr {(1 << 6 | 1 << 7) + (8 * $regs($ares)) + $regs($bres)}] ;#modrm
 	    } else {
 		#memory
 		lassign $bres offset breg
 		binout $code
 		set rm [expr {(8 * $regs($ares)) + $regs($breg)}] ;#rm
 		if {"" eq $offset} {
 		    binout $rm
 		} else {
 		    binout [expr {(1 << 6) + $rm}]
 		    binout $offset
 		}
 	    }
 	}
     }
 }

 proc opcall-1 {name a } {
     global ops regs

     set atype [typeof $a ares]
     if {"" eq [set oplist [find-matching-instruction-1 $name $atype]]} {
 	return -code error "invalid instruction pattern: $name with $atype $btype :: $args"
     }
     lassign $oplist code operandcount sizes types
     lassign $sizes asize

     switch -- $atype {
 	immediate {
 	    binout [expr {$code}]
 	    if {32 == $asize} {
 		binout-long $ares
 	    } else {
 		binout [expr {$ares & 0xff}]
 	    }
 	}
        labelref {
 	    binout [expr {$code}]
	    set ::relocators($::binout_offset) [list $asize $ares]
 	    if {32 == $asize} {
 		binout-long 0
 	    } else {
 		binout [expr {0 & 0xff}]
 	    }
	}
     }
 }

 proc opcall-0 name {
     global ops
     lassign [lindex $ops($name,data) 0] code operandcount sizes types
     binout $code
 }

 proc opcall {name args} {
     switch -- [llength $args] {
 	0 {
 	    opcall-0 $name
 	}
 	1 {
 	    opcall-1 $name {*}$args
 	}
 	2 {
 	    opcall-2 $name {*}$args
 	}
 	default {
 	    return -code error "invalid arguments: $name $args"
 	}
     }
 }



proc label { name } { 
    puts "label $name @ $::binout_offset"
    set ::label_map($name) $::binout_offset
}

 op movb [opvec 0xb0 2 [list 8 8] [list immediate register]]
 op movb [opvec 0x88 2 [list 8 8] [list register register]]
 op movb [opvec 0x8a 2 [list 32 8] [list memory register]]
 op movb [opvec 0x88 2 [list 8 32] [list register memory]]

 op movl [opvec 0xb8 2 [list 32 32] [list immediate register]]
 op movl [opvec 0x89 2 [list 32 32] [list register register]]
 op movl [opvec 0x8b 2 [list 32 32] [list memory register]]
 op movl [opvec 0x89 2 [list 32 32] [list register memory]]

 op movzbl [opvec 0x0fb6 2 [list 8 32] [list register register]]

 op addb [opvec 0x04 2 [list 8 8] [list immediate register]]
 op addb [opvec 0x00 2 [list 8 8] [list register register]]
 op addb [opvec 0x02 2 [list 32 8] [list memory register]]
 op addb [opvec 0x00 2 [list 8 32] [list register memory]]

 op addl [opvec 0x05 2 [list 32 8] [list immediate register]]
 op addl [opvec 0x01 2 [list 32 32] [list register register]]
 op addl [opvec 0x03 2 [list 32 32] [list memory register]]
 op addl [opvec 0x01 2 [list 32 32] [list register memory]]

 op nop [opvec 0x90 0 [list] [list]]

 op jmp [opvec 0xea 1 [list 32] [list immediate]]
 op jmp [opvec 0xea 1 [list 32] [list labelref]]

 #Test code
 movl 0xffaaddee %eax
 nop
 movl (%eax) %eax
 movl (%ecx) %ecx
 nop

 movl %eax %eax
 movl %ecx %eax
 movl %edx %eax
 movl %eax %ecx

 movl %eax (%eax)
 movl %eax (%ecx)
 movl %ecx (%eax)

 movl %ebp (%eax)

 jmp an_entry

 label start_loop

 movb %al %al
 movzbl %al %eax

 nop
 nop
 nop

 movl (%eax) %eax
 movl 4(%eax) %eax
 movl (%ecx) %ecx
 movl 4(%ecx) %ecx

 label an_entry

 movl 20(%ecx) %edx
 movl 20(%eax) %edx

 movl 20(%ecx) %eax
 movl 20(%ebp) %eax
 movl %eax 20(%ebp)
 movl %eax %ebp
 movl %ebp %eax

 jmp start_loop

 if { [info exists label_map] } {
     parray label_map
 }
 if { [info exists relocators] } { 
     parray relocators
 }