George Peter Staplin Nov 07, 2007 - This is the start of something that Richard Suchenwirth wanted me to share on the wiki. It's an x86 assembler with AT&Tish/gas syntax. This is just something to play with at the moment, but it will grow. I'm hoping that Richard, Mark and possibly others will help out :) BSD/Tcl licensed of course.
If there's interest I could create a code.google.com project for this, and add developers...
Nov 10, 2007, the latest code is available from here:
The latest code is available in tarballs named tclas here: http://www.xmission.com/~georgeps/implementation/software/tcl/
The tarballs include the C extension used to provide PROT_EXEC memory, and other misc. things needed.
This is revision 16:
#Tclas (a runtime assembler written in Tcl) #By George Peter Staplin set dir [file dirname [info script]] load [file join $dir asm.so] namespace eval ::asm { variable prefixes #INSTRUCTION PREFIXES set prefixes(lock) 0xf0 set prefixes(repne) 0xf2 set prefixes(repnz) 0xf2 set prefixes(rep) 0xf3 set prefixes(repe) 0xf3 set prefixes(repz) 0xf3 variable sop #SEGMENT OVERRIDE PREFIXES set sop(cs) 0x2e set sop(ss) 0x36 set sop(ds) 0x3e set sop(es) 0x26 set sop(fs) 0x64 set sop(gs) 0x65 variable branch_hints set branch_hints(not_taken) 0x2e set branch_hints(taken) 0x3e variable regs proc reg32 {name value} { variable regs set regs($name) $value set regs($name,type) 32 } proc reg16 {name value} { variable regs set regs($name) $value set regs($name,type) 16 } proc reg8 {name value} { variable regs set regs($name) $value set regs($name,type) 8 } reg32 eax 0 reg32 ecx 1 reg32 edx 2 reg32 ebx 3 reg32 esp 4 reg32 ebp 5 reg32 esi 6 reg32 edi 7 reg16 ax 0 reg16 cx 1 reg16 dx 2 reg16 bx 3 reg16 sp 4 reg16 bp 5 reg16 si 6 reg16 di 7 reg8 al 0 reg8 cl 1 reg8 dl 2 reg8 bl 3 reg8 ah 4 reg8 ch 5 reg8 dh 6 reg8 bh 7 variable ops proc op {name opvec} { variable ops set ops($name) 1 lappend ops($name,data) $opvec interp alias {} $name {} asm::opcall $name } proc opvec {base reqarg size types args} { list [expr {$base}] $reqarg $size $types $args } # Operands with a % prefix are assumed to be registers. # Operands with a ~ prefix are symbols. TODO use dlopen/dlsym to lookup symbols. # Operands with a @ prefix are assumed to be absolute addresses. # Operands with no prefix are assumed to be immediate values. # Operands with possibly a number and ( ending with ) are memory operands. proc typeof {o resultvar} { upvar $resultvar result variable regs set c [string index $o 0] switch -- $c { % { set regname [string range $o 1 end] if {![info exists regs($regname)]} { return -code error "invalid register name: $regname" } set result $regname return register } ~ { set result [string range $o 1 end] return symbol } @ { set result [string range $o 1 end] return address } default { if {[regexp {([0-9]*)\(%(.*?)\)} $o all offset reg]} { set result [list $offset $reg] return memory } else { set result $o return immediate } } } } proc find-matching-instruction {name args} { variable ops set l $args foreach oplist $ops($name,data) { if {[lindex $oplist 3] eq $l} { return $oplist } } return "" } variable asmobj proc binout out { variable asmobj $asmobj append-bytes [binary format c* $out] puts BINOUT:0x[format %x $out] } proc binout-long out { variable asmobj $asmobj append-long $out puts BINOUT:0x[format %8.8x $out] } proc opcall-2 {name a b} { variable ops variable regs puts "NAME:$name OPERANDS:$a $b" set atype [typeof $a ares] set btype [typeof $b bres] if {"" eq [set oplist [find-matching-instruction $name $atype $btype]]} { return -code error "invalid instruction pattern: $name with $atype $btype :: $args" } lassign $oplist code operandcount sizes types extra lassign $sizes asize bsize puts "TYPES:$atype $btype" switch -- $atype { immediate { if {[lsearch -exact $extra modrm] >= 0} { binout $code set rm [expr {(1 << 7 | 1 << 6 | 1 << 5 | 1 << 3) + $regs($bres)}] binout $rm if {32 == $asize} { binout-long $ares } else { binout [expr {$ares & 0xff}] } } else { binout [expr {$code + $regs($bres)}] if {32 == $asize} { binout-long $ares } else { binout [expr {$ares & 0xff}] } } } memory { lassign $ares offset areg binout $code ;#opcode set rm [expr {8 * $regs($bres) + $regs($areg)}] if {"" eq $offset} { binout $rm } else { #TODO handle (1 << 7) for the 32-bit displacement/offset #or possibly make 32-bit displacement the default. binout [expr {(1 << 6) + $rm}] ;#modrm binout $offset } } register { if {"register" eq $btype} { binout $code binout [expr {(1 << 6 | 1 << 7) + (8 * $regs($ares)) + $regs($bres)}] ;#modrm } else { #memory lassign $bres offset breg binout $code set rm [expr {(8 * $regs($ares)) + $regs($breg)}] ;#rm if {"" eq $offset} { binout $rm } else { binout [expr {(1 << 6) + $rm}] if {$breg eq "esp"} { binout [expr {0x24}] } binout $offset } } } } } proc opcall-1 {name arg} { variable ops variable regs puts "$name $arg" set type [typeof $arg result] if {"" eq [set oplist [find-matching-instruction $name $type]]} { #TODO improve this error message return -code error "unable to find instruction pattern" } lassign $oplist code operandcount sizes types lassign $sizes size if {"register" eq $type} { if {"pushl" eq $name || "popl" eq $name} { #These are special opcodes. binout [expr {$code + $regs($result)}] } else { binout $code #WHY is 1 << 4 set in gas for call and notl? binout [expr {(1 << 6 | 1 << 7 | 1 << 4) + $regs($result)}] } } else { binout $code if {32 == $size} { binout-long $result } else { binout $result } } } proc opcall-0 name { variable ops lassign [lindex $ops($name,data) 0] code operandcount sizes types binout $code } proc opcall {name args} { switch -- [llength $args] { 0 { opcall-0 $name } 1 { opcall-1 $name {*}$args } 2 { opcall-2 $name {*}$args } default { return -code error "invalid arguments: $name $args" } } } variable labels proc label name { variable labels variable asmobj set labels($name) [$asmobj get-offset] } proc set-object obj { variable asmobj set asmobj $obj } #More of these may need modrm. op movb [opvec 0xb0 2 [list 8 8] [list immediate register]] op movb [opvec 0x88 2 [list 8 8] [list register register]] op movb [opvec 0x8a 2 [list 32 8] [list memory register]] op movb [opvec 0x88 2 [list 8 32] [list register memory]] op movl [opvec 0xb8 2 [list 32 32] [list immediate register]] op movl [opvec 0x89 2 [list 32 32] [list register register]] op movl [opvec 0x8b 2 [list 32 32] [list memory register]] op movl [opvec 0x89 2 [list 32 32] [list register memory]] op movzbl [opvec 0x0fb6 2 [list 8 32] [list register register]] op addb [opvec 0x04 2 [list 8 8] [list immediate register]] op addb [opvec 0x00 2 [list 8 8] [list register register]] op addb [opvec 0x02 2 [list 32 8] [list memory register]] op addb [opvec 0x00 2 [list 8 32] [list register memory]] op addl [opvec 0x05 2 [list 32 32] [list immediate register]] op addl [opvec 0x01 2 [list 32 32] [list register register]] op addl [opvec 0x03 2 [list 32 32] [list memory register]] op addl [opvec 0x01 2 [list 32 32] [list register memory]] op subb [opvec 0x80 2 [list 8 8] [list immediate register] modrm] op subb [opvec 0x28 2 [list 8 8] [list register register] modrm] op subb [opvec 0x2a 2 [list 32 8] [list memory register] modrm] op subb [opvec 0x28 2 [list 8 32] [list register memory] modrm] op subl [opvec 0x81 2 [list 32 32] [list immediate register] modrm] op subl [opvec 0x29 2 [list 32 32] [list register register] modrm] op subl [opvec 0x2b 2 [list 32 32] [list memory register] modrm] op subl [opvec 0x29 2 [list 32 32] [list register memory] modrm] op call [opvec 0xff 1 [list 32] [list register]] op call [opvec 0xff 1 [list 32] [list immediate]] op notl [opvec 0xf7 1 [list 32] [list register]] op notb [opvec 0xf6 1 [list 8] [list register]] op pushl [opvec 0x50 1 [list 32] [list register]] op popl [opvec 0x58 1 [list 32] [list register]] op nop [opvec 0x90 0 [list] [list]] op leave [opvec 0xc9 0 [list] [list]] op ret [opvec 0xc3 0 [list] [list]] op lock [opvec $prefixes(lock) 0 [list] [list]] } set aobj [asm] asm::set-object $aobj set addfunc [$aobj get-offset] set Tcl_GetIntFromObj [$aobj get-symbol Tcl_GetIntFromObj] $aobj create-command add $addfunc namespace eval ::asm { pushl %ebp movl %esp %ebp subl 8 %esp # This is objv movl 20(%ebp) %eax addl 4 %eax movl (%eax) %ecx movl %ecx 4(%esp) movl 0 %eax leave ret } add puts DONE if 0 { #Test code movl 0xffaaddee %eax nop movl (%eax) %eax movl (%ecx) %ecx nop movl %eax %eax movl %ecx %eax movl %edx %eax movl %eax %ecx movl %eax (%eax) movl %eax (%ecx) movl %ecx (%eax) movl %ebp (%eax) movb %al %al movzbl %al %eax nop nop nop movl (%eax) %eax movl 4(%eax) %eax movl (%ecx) %ecx movl 4(%ecx) %ecx movl 20(%ecx) %edx movl 20(%eax) %edx movl 20(%ecx) %eax movl 20(%ebp) %eax movl %eax 20(%ebp) movl %eax %ebp movl %ebp %eax call %eax call 0xffaabbcc notl %eax notb %al pushl %eax pushl %ebp pushl %edi popl %eax popl %ebp popl %edi leave ret }
RS 2007-11-07: Interesting! After not having done real assembler for many years, I experimented with the classic
#include <stdio.h> int main(int argc, char* argv[]) { printf("hello. world!\n"); return 0; }
which, saved to hello.c and compiled with gcc -S hello.c, gives a file hello.s, which contains:
.file "hello.c" gcc2_compiled.: ___gnu_compiled_c: .def ___main; .scl 2; .type 32; .endef .text LC0: .ascii "hello. world!\12\0" .align 4 .globl _main .def _main; .scl 2; .type 32; .endef _main: pushl %ebp movl %esp,%ebp subl $8,%esp call ___main addl $-12,%esp pushl $LC0 call _printf addl $16,%esp xorl %eax,%eax jmp L10 .align 4 L10: movl %ebp,%esp popl %ebp ret .def _printf; .scl 2; .type 32; .endef
George Peter Staplin Nov 7, 2007 - JDC asked some questions about the assembler. This led to me writing this response about how it will probably work in the future.
Something like this struct will be used for relocations:
struct assembly_relocation { void *base; /* The base allocation from mmap */ void *ptr; /* The value of the pointer prior to any relocation. */ /* This is the offset from the base to the pointer's location: */ uintptr_t ptroffset; struct assembly_relocation *next; };
DKF suggested in the chat that we use a Tcl ByteArray. The problem as I see it with that, is making sure the memory has the proper permissions. We can only mprotect() a page-aligned pointer, so some of the ByteArray would go wasted, and we'd need metadata to indicate where it actually starts having PROT_EXEC. Otherwise we might end up giving PROT_EXEC to something that really shouldn't have it. I'm still thinking about the idea though. -GPS
DKF writes: Actually, I was thinking in terms of using the bytearray to accumulate the machine code during building, and then, when converting to a runnable command (that is the goal, yes?) copying to a suitably-allocated and protected block elsewhere. The "convert to command" ought to do any linking needed too. (I should emphasize that I've not thought hard about this.)
jdc Added support for labels and the jmp instruction:
#Tclas (a runtime assembler written in Tcl) #By George Peter Staplin set binout_offset 0 #INSTRUCTION PREFIXES set prefixes(lock) 0xf0 set prefixes(repne) 0xf2 set prefixes(repnz) 0xf2 set prefixes(rep) 0xf3 set prefixes(repe) 0xf3 set prefixes(repz) 0xf3 #SEGMENT OVERRIDE PREFIXES set sop(cs) 0x2e set sop(ss) 0x36 set sop(ds) 0x3e set sop(es) 0x26 set sop(fs) 0x64 set sop(gs) 0x65 set branch_hints(not_taken) 0x2e set branch_hints(taken) 0x3e proc reg32 {name value} { global regs set regs($name) $value set regs($name,type) 32 } proc reg16 {name value} { global regs set regs($name) $value set regs($name,type) 16 } proc reg8 {name value} { global regs set regs($name) $value set regs($name,type) 8 } reg32 eax 0 reg32 ecx 1 reg32 edx 2 reg32 ebx 3 reg32 esp 4 reg32 ebp 5 reg32 esi 6 reg32 edi 7 reg16 ax 0 reg16 cx 1 reg16 dx 2 reg16 bx 3 reg16 sp 4 reg16 bp 5 reg16 si 6 reg16 di 7 reg8 al 0 reg8 cl 1 reg8 dl 2 reg8 bl 3 reg8 ah 4 reg8 ch 5 reg8 dh 6 reg8 bh 7 proc op {name opvec} { global ops set ops($name) 1 lappend ops($name,data) $opvec interp alias {} $name {} opcall $name } proc opvec {base reqarg size types} { list $base $reqarg $size $types } # Operands with a % prefix are assumed to be registers. # Operands with a ~ prefix are symbols. TODO use dlopen/dlsym to lookup symbols. # Operands with a @ prefix are assumed to be absolute addresses. # Operands with no prefix are assumed to be immediate values. # Operands with possibly a number and ( ending with ) are memory operands. proc typeof {o resultvar} { upvar $resultvar result global regs set c [string index $o 0] switch -- $c { % { set regname [string range $o 1 end] if {![info exists regs($regname)]} { return -code error "invalid register name: $regname" } set result $regname return register } ~ { set result [string range $o 1 end] return symbol } @ { set result [string range $o 1 end] return address } default { if {[regexp {([0-9]*)\(%(.*?)\)} $o all offset reg]} { set result [list $offset $reg] return memory } elseif { [string is integer -strict $o] } { set result $o return immediate } else { set result $o return labelref } } } } proc find-matching-instruction-2 {name atype btype} { global ops set l [list $atype $btype] foreach oplist $ops($name,data) { if {[lindex $oplist 3] eq $l} { return $oplist } } return "" } proc find-matching-instruction-1 {name atype} { global ops set l [list $atype] foreach oplist $ops($name,data) { if {[lindex $oplist 3] eq $l} { return $oplist } } return "" } proc binout out { puts BINOUT:0x[format %x $out] incr ::binout_offset } proc binout-long out { puts BINOUT:0x[format %8.8x $out] incr ::binout_offset 4 } proc opcall-2 {name a b} { global ops regs puts "OPERANDS:$a $b" set atype [typeof $a ares] set btype [typeof $b bres] if {"" eq [set oplist [find-matching-instruction-2 $name $atype $btype]]} { return -code error "invalid instruction pattern: $name with $atype $btype :: $args" } lassign $oplist code operandcount sizes types lassign $sizes asize bsize puts "TYPES:$atype $btype" switch -- $atype { immediate { binout [expr {$code + $regs($bres)}] if {32 == $bsize} { binout-long $ares } else { binout [expr {$ares & 0xff}] } } memory { lassign $ares offset areg binout $code ;#opcode set rm [expr {8 * $regs($bres) + $regs($areg)}] if {"" eq $offset} { binout $rm } else { #TODO handle (1 << 7) for the 32-bit displacement/offset #or possibly make 32-bit displacement the default. binout [expr {(1 << 6) + $rm}] ;#modrm binout $offset } } register { if {"register" eq $btype} { binout $code binout [expr {(1 << 6 | 1 << 7) + (8 * $regs($ares)) + $regs($bres)}] ;#modrm } else { #memory lassign $bres offset breg binout $code set rm [expr {(8 * $regs($ares)) + $regs($breg)}] ;#rm if {"" eq $offset} { binout $rm } else { binout [expr {(1 << 6) + $rm}] binout $offset } } } } } proc opcall-1 {name a } { global ops regs set atype [typeof $a ares] if {"" eq [set oplist [find-matching-instruction-1 $name $atype]]} { return -code error "invalid instruction pattern: $name with $atype $btype :: $args" } lassign $oplist code operandcount sizes types lassign $sizes asize switch -- $atype { immediate { binout [expr {$code}] if {32 == $asize} { binout-long $ares } else { binout [expr {$ares & 0xff}] } } labelref { binout [expr {$code}] set ::relocators($::binout_offset) [list $asize $ares] if {32 == $asize} { binout-long 0 } else { binout [expr {0 & 0xff}] } } } } proc opcall-0 name { global ops lassign [lindex $ops($name,data) 0] code operandcount sizes types binout $code } proc opcall {name args} { switch -- [llength $args] { 0 { opcall-0 $name } 1 { opcall-1 $name {*}$args } 2 { opcall-2 $name {*}$args } default { return -code error "invalid arguments: $name $args" } } } proc label { name } { puts "label $name @ $::binout_offset" set ::label_map($name) $::binout_offset } op movb [opvec 0xb0 2 [list 8 8] [list immediate register]] op movb [opvec 0x88 2 [list 8 8] [list register register]] op movb [opvec 0x8a 2 [list 32 8] [list memory register]] op movb [opvec 0x88 2 [list 8 32] [list register memory]] op movl [opvec 0xb8 2 [list 32 32] [list immediate register]] op movl [opvec 0x89 2 [list 32 32] [list register register]] op movl [opvec 0x8b 2 [list 32 32] [list memory register]] op movl [opvec 0x89 2 [list 32 32] [list register memory]] op movzbl [opvec 0x0fb6 2 [list 8 32] [list register register]] op addb [opvec 0x04 2 [list 8 8] [list immediate register]] op addb [opvec 0x00 2 [list 8 8] [list register register]] op addb [opvec 0x02 2 [list 32 8] [list memory register]] op addb [opvec 0x00 2 [list 8 32] [list register memory]] op addl [opvec 0x05 2 [list 32 8] [list immediate register]] op addl [opvec 0x01 2 [list 32 32] [list register register]] op addl [opvec 0x03 2 [list 32 32] [list memory register]] op addl [opvec 0x01 2 [list 32 32] [list register memory]] op nop [opvec 0x90 0 [list] [list]] op jmp [opvec 0xea 1 [list 32] [list immediate]] op jmp [opvec 0xea 1 [list 32] [list labelref]] #Test code movl 0xffaaddee %eax nop movl (%eax) %eax movl (%ecx) %ecx nop movl %eax %eax movl %ecx %eax movl %edx %eax movl %eax %ecx movl %eax (%eax) movl %eax (%ecx) movl %ecx (%eax) movl %ebp (%eax) jmp an_entry label start_loop movb %al %al movzbl %al %eax nop nop nop movl (%eax) %eax movl 4(%eax) %eax movl (%ecx) %ecx movl 4(%ecx) %ecx label an_entry movl 20(%ecx) %edx movl 20(%eax) %edx movl 20(%ecx) %eax movl 20(%ebp) %eax movl %eax 20(%ebp) movl %eax %ebp movl %ebp %eax jmp start_loop if { [info exists label_map] } { parray label_map } if { [info exists relocators] } { parray relocators }
jdc Saving the binary output in a list like this:
proc binout out { puts BINOUT:0x[format %x $out] lappend ::binout 0x[format %x $out] incr ::binout_offset } proc binout-long out { puts BINOUT:0x[format %8.8x $out] lappend ::binout 0x[format %x [expr {($out>>24)&0xff}]] 0x[format %x [expr {($out>>16)&0xff}]] 0x[format %x [expr {($out>>8)&0xff}]] 0x[format %x [expr {$out&0xff}]] incr ::binout_offset 4 }
so it can be relocated with this function:
proc relocate { binout addr } { global label_map relocators foreach offset [array names relocators] { lassign $relocators($offset) lsize lname if { ![info exists label_map($lname)] } { error "Could not resolve symbol '$lname'" } set out [expr {$addr + $label_map($lname)}] switch -exact -- $lsize { 32 { set binout [lreplace $binout $offset [expr {$offset+3}] 0x[format %x [expr {($out>>24)&0xff}]] 0x[format %x [expr {($out>>16)&0xff}]] 0x[format %x [expr {($out>>8)&0xff}]] 0x[format %x [expr {$out&0xff}]]] } } } return $binout }
An example:
nop nop nop nop label start_loop nop nop nop nop nop jmp start_loop
Before relocation:
BINOUT:0x90 BINOUT:0x90 BINOUT:0x90 BINOUT:0x90 label start_loop @ 4 BINOUT:0x90 BINOUT:0x90 BINOUT:0x90 BINOUT:0x90 BINOUT:0x90 BINOUT:0xea BINOUT:0x00000000 label_map(start_loop) = 4 relocators(10) = 32 start_loop 0x90 0x90 0x90 0x90 0x90 0x90 0x90 0x90 0x90 0xea 0x0 0x0 0x0 0x0
After relocation to address 10:
puts [relocate $binout 10] 0x90 0x90 0x90 0x90 0x90 0x90 0x90 0x90 0x90 0xea 0x0 0x0 0x0 0xe