Version 2 of Tclas - an assembler

Updated 2007-11-07 12:59:35 by GPS

George Peter Staplin Nov 07, 2007 - This is the start of something that Richard Suchenwirth wanted me to share on the wiki. It's an x86 assembler with AT&Tish/gas syntax. This is just something to play with at the moment, but it will grow. I'm hoping that Richard, Mark and possibly others will help out :) BSD/Tcl licensed of course.

If there's interest I could create a code.google.com project for this, and add developers...

This is revision 4:


 #Tclas (a runtime assembler written in Tcl)
 #By George Peter Staplin

 #INSTRUCTION PREFIXES
 set prefixes(lock) 0xf0
 set prefixes(repne) 0xf2
 set prefixes(repnz) 0xf2
 set prefixes(rep) 0xf3
 set prefixes(repe) 0xf3
 set prefixes(repz) 0xf3

 #SEGMENT OVERRIDE PREFIXES
 set sop(cs) 0x2e
 set sop(ss) 0x36
 set sop(ds) 0x3e
 set sop(es) 0x26
 set sop(fs) 0x64
 set sop(gs) 0x65

 set branch_hints(not_taken) 0x2e
 set branch_hints(taken) 0x3e

 proc reg32 {name value} {
     global regs
     set regs($name) $value
     set regs($name,type) 32
 }

 proc reg16 {name value} {
     global regs 
     set regs($name) $value
     set regs($name,type) 16
 }

 proc reg8 {name value} {
     global regs
     set regs($name) $value
     set regs($name,type) 8
 }

 reg32 eax 0
 reg32 ecx 1
 reg32 edx 2
 reg32 ebx 3
 reg32 esp 4
 reg32 ebp 5
 reg32 esi 6
 reg32 edi 7

 reg16 ax 0
 reg16 cx 1
 reg16 dx 2
 reg16 bx 3
 reg16 sp 4
 reg16 bp 5
 reg16 si 6
 reg16 di 7

 reg8 al 0
 reg8 cl 1
 reg8 dl 2
 reg8 bl 3
 reg8 ah 4
 reg8 ch 5
 reg8 dh 6
 reg8 bh 7

 proc op {name opvec} {
     global ops
     set ops($name) 1
     lappend ops($name,data) $opvec
     interp alias {} $name {} opcall $name
 }

 proc opvec {base reqarg size types} {
     list $base $reqarg $size $types
 }

 # Operands with a % prefix are assumed to be registers.
 # Operands with a ~ prefix are symbols. TODO use dlopen/dlsym to lookup symbols.
 # Operands with a @ prefix are assumed to be absolute addresses.
 # Operands with no prefix are assumed to be immediate values.
 # Operands with possibly a number and ( ending with ) are memory operands.
 proc typeof {o resultvar} {
     upvar $resultvar result
     global regs
     set c [string index $o 0]
     switch -- $c {
         % {
             set regname [string range $o 1 end]
             if {![info exists regs($regname)]} {
                 return -code error "invalid register name: $regname"
             }
             set result $regname
             return register
         }
         ~ {
             set result [string range $o 1 end]
             return symbol
         }
         @ {
             set result [string range $o 1 end]
             return address
         }

         default {
             if {[regexp {([0-9]*)\(%(.*?)\)} $o all offset reg]} {
                 set result [list $offset $reg]
                 return memory
             } else {
                 set result $o
                 return immediate
             }
         }
     } 
 }

 proc find-matching-instruction {name atype btype} {
     global ops
     set l [list $atype $btype]
     foreach oplist $ops($name,data) {
         if {[lindex $oplist 3] eq $l} {
             return $oplist
         }
     }
     return ""
 }

 proc binout out {
     puts BINOUT:0x[format %x $out]
 }

 proc binout-long out {
     puts BINOUT:0x[format %8.8x $out]
 }

 proc opcall-2 {name a b} {
     global ops regs

     puts "OPERANDS:$a $b"
     set atype [typeof $a ares]
     set btype [typeof $b bres]
     if {"" eq [set oplist [find-matching-instruction $name $atype $btype]]} {
         return -code error "invalid instruction pattern: $name with $atype $btype :: $args"
     }
     lassign $oplist code operandcount sizes types
     lassign $sizes asize bsize

     puts "TYPES:$atype $btype"

     switch -- $atype {
         immediate {
             binout [expr {$code + $regs($bres)}]
             if {32 == $bsize} {
                 binout-long $ares
             } else {
                 binout [expr {$ares & 0xff}]
             }
         }
         memory {
             lassign $ares offset areg
             binout $code ;#opcode
             set rm [expr {8 * $regs($bres) + $regs($areg)}]
             if {"" eq $offset} {
                 binout $rm
             } else {
                 #TODO handle (1 << 7) for the 32-bit displacement/offset
                 #or possibly make 32-bit displacement the default.
                 binout [expr {(1 << 6) + $rm}] ;#modrm
                 binout $offset
             }
         }
         register {
             if {"register" eq $btype} {
                 binout $code
                 binout [expr {(1 << 6 | 1 << 7) + (8 * $regs($ares)) + $regs($bres)}] ;#modrm
             } else {
                 #memory
                 lassign $bres offset breg
                 binout $code
                 set rm [expr {(8 * $regs($ares)) + $regs($breg)}] ;#rm
                 if {"" eq $offset} {
                     binout $rm
                 } else {
                     binout [expr {(1 << 6) + $rm}]
                     binout $offset
                 }
             }
         }
     }
 }

 proc opcall-0 name {
     global ops
     lassign [lindex $ops($name,data) 0] code operandcount sizes types
     binout $code
 }

 proc opcall {name args} {
     switch -- [llength $args] {
         0 {
             opcall-0 $name
         }
         2 {
             opcall-2 $name {*}$args
         }
         default {
             return -code error "invalid arguments: $name $args"
         }
     }
 }

 op movb [opvec 0xb0 2 [list 8 8] [list immediate register]]
 op movb [opvec 0x88 2 [list 8 8] [list register register]]
 op movb [opvec 0x8a 2 [list 32 8] [list memory register]]
 op movb [opvec 0x88 2 [list 8 32] [list register memory]]

 op movl [opvec 0xb8 2 [list 32 32] [list immediate register]]
 op movl [opvec 0x89 2 [list 32 32] [list register register]]
 op movl [opvec 0x8b 2 [list 32 32] [list memory register]]
 op movl [opvec 0x89 2 [list 32 32] [list register memory]]

 op movzbl [opvec 0x0fb6 2 [list 8 32] [list register register]]

 op addb [opvec 0x04 2 [list 8 8] [list immediate register]]
 op addb [opvec 0x00 2 [list 8 8] [list register register]]
 op addb [opvec 0x02 2 [list 32 8] [list memory register]]
 op addb [opvec 0x00 2 [list 8 32] [list register memory]]

 op addl [opvec 0x05 2 [list 32 8] [list immediate register]]
 op addl [opvec 0x01 2 [list 32 32] [list register register]]
 op addl [opvec 0x03 2 [list 32 32] [list memory register]]
 op addl [opvec 0x01 2 [list 32 32] [list register memory]]


 op nop [opvec 0x90 0 [list] [list]]

 #Test code
 movl 0xffaaddee %eax
 nop 
 movl (%eax) %eax
 movl (%ecx) %ecx
 nop

 movl %eax %eax
 movl %ecx %eax
 movl %edx %eax
 movl %eax %ecx

 movl %eax (%eax)
 movl %eax (%ecx)
 movl %ecx (%eax)

 movl %ebp (%eax)

 movb %al %al
 movzbl %al %eax

 nop 
 nop
 nop

 movl (%eax) %eax
 movl 4(%eax) %eax
 movl (%ecx) %ecx
 movl 4(%ecx) %ecx

 movl 20(%ecx) %edx
 movl 20(%eax) %edx

 movl 20(%ecx) %eax
 movl 20(%ebp) %eax
 movl %eax 20(%ebp)
 movl %eax %ebp
 movl %ebp %eax

RS 2007-11-07: Interesting! After not having done real assembler for many years, I experimented with the classic

 #include <stdio.h>
 int main(int argc, char* argv[]) {
  printf("hello. world!\n");
  return 0;
 }

which, saved to hello.c and compiled with gcc -S hello.c, gives a file hello.s, which contains:

        .file   "hello.c"
 gcc2_compiled.:
 ___gnu_compiled_c:
        .def    ___main;        .scl    2;      .type   32;     .endef
 .text
 LC0:
        .ascii "hello. world!\12\0"
        .align 4
 .globl _main
        .def    _main;  .scl    2;      .type   32;     .endef
 _main:
        pushl %ebp
        movl %esp,%ebp
        subl $8,%esp
        call ___main
        addl $-12,%esp
        pushl $LC0
        call _printf
        addl $16,%esp
        xorl %eax,%eax
        jmp L10
        .align 4
 L10:
        movl %ebp,%esp
        popl %ebp
        ret
        .def    _printf;        .scl    2;      .type   32;     .endef

George Peter Staplin Nov 7, 2007 - JDC asked some questions about the assembler. This lead to me writing this response about how it will probably work in the future.

  1. a Tcl_Obj command in C allocates memory using mmap (unix) or VirtualAlloc (Windows) with PROT_EXEC permissions
  2. a Tcl_Obj command for writing bytes to the pages allocated is used to write to the allocated memory.
  3. addresses that refer to the mmaped region get added to a linked list of relocation structs.
  4. symbols are looked up with dlopen (NULL, ..); and dlsym();
  5. any relocations as the mmaped section grows are done by iterating the relocation entries.

Something like this struct will be used for relocations:

 struct assembly_relocation {
  void *base; /* The base allocation from mmap */
  void *ptr;  /* The value of the pointer prior to any relocation. */
  /* This is the offset from the base to the pointer's location: */
  uintptr_t ptroffset;
  struct assembly_relocation *next;
 };

DKF suggested in the chat that we use a Tcl ByteArray. The problem as I see it with that, is making sure the memory has the proper permissions. We can only mprotect() a page-aligned pointer, so some of the ByteArray would go wasted, and we'd need metadata to indicate where it actually starts having PROT_EXEC. Otherwise we might end up giving PROT_EXEC to something that really shouldn't have it. I'm still thinking about the idea though. -GPS