George Peter Staplin: Jan 29, 2005 - As an experiment I decided to try to build a prototype for a Tcl VM that uses threaded code. Threaded code is an interesting concept that is fairly well documented on the web. Threaded code has been used in Ken Thompson's original B language, and Charles Moore's Forth.
This prototype converts a series of bytecodes into addresses. The addresses are then used to jump around in the threaded code.
Note: this is not related to POSIX threads, or concurrent programming techniques.
Tcl_Types.h
#define TCL_TYPE_STRING 2 #define TCL_TYPE_INT 4 #define TCL_TYPE_DOUBLE 8
main.c
#include <stdio.h> #include <stdlib.h> #include <string.h> #include "Tcl_Types.h" /* We define an operator table that is used with the symbols for bytecodes to lookup code addresses. */ struct { void *code; } ops[3]; /* These are our bytecodes: */ enum { TCL_ADD, TCL_PUSH, TCL_RETURN }; /* These must match the .equ in vm.S */ typedef struct { int type; int ref; /* 4 */ int i; /* 8 */ char *s; /* 12 */ } Tcl_Obj; /* This is our execution token. It stores a linear series of addresses. */ typedef struct { caddr_t *start; /* Our cursor is the current position after the last address was appended. */ caddr_t *cursor; size_t remaining; size_t total; } Tcl_Xt; extern void Tcl_InitOperators (); extern Tcl_Obj *Tcl_Run (); extern void Tcl_SetStringObj (Tcl_Obj *obj, char *str); /* bytecode to address */ #define BCTOA(bc) ops[bc].code /* append code to xt */ #define ACTOXT(xt,sym) \ *(xt->cursor) = sym; \ xt->cursor += 1 void *Tcl_Alloc (size_t s) { void *r = malloc (s); if (NULL == r) { perror ("malloc"); exit (EXIT_FAILURE); } return r; } void Tcl_Free (void *p) { free (p); } void *Tcl_Realloc (void *old, size_t s) { void *new; new = realloc (old, s); if (NULL == new) { perror ("realloc"); exit (EXIT_FAILURE); } return new; } void Tcl_EnlargeXt (Tcl_Xt *xt) { size_t new_size; size_t used; void *p; new_size = (xt->total * 2); used = (xt->remaining - xt->total); p = Tcl_Realloc (xt->start, new_size); xt->remaining = (new_size - used); xt->total = new_size; xt->cursor = (p + (xt->cursor - xt->start)); xt->start = p; } void Tcl_CompileExampleAdd (Tcl_Obj *objv[], Tcl_Xt *xt) { if ((sizeof (void *) * 7) < xt->remaining) { Tcl_EnlargeXt (xt); } ACTOXT (xt, BCTOA (TCL_PUSH)); ACTOXT (xt, (caddr_t)objv[0]); ACTOXT (xt, BCTOA (TCL_PUSH)); ACTOXT (xt, (caddr_t)objv[1]); ACTOXT (xt, BCTOA (TCL_ADD)); ACTOXT (xt, BCTOA (TCL_RETURN)); xt->remaining -= (sizeof (void *) * 7); } Tcl_Obj *Tcl_NewObj (void) { Tcl_Obj *obj = Tcl_Alloc (sizeof (Tcl_Obj)); obj->type = TCL_TYPE_STRING; obj->ref = 0; return obj; } Tcl_Xt *Tcl_NewXt (void) { Tcl_Xt *xt = Tcl_Alloc (sizeof (Tcl_Xt)); xt->total = xt->remaining = 2000; xt->cursor = xt->start = Tcl_Alloc (xt->remaining); return xt; } void Tcl_SetStringObj (Tcl_Obj *obj, char *str) { obj->type = TCL_TYPE_STRING; obj->s = str; } extern void test_convert (Tcl_Obj *obj); int main () { Tcl_Obj **objv; Tcl_Obj *result; Tcl_Xt *xt; Tcl_InitOperators ( &(ops[TCL_ADD].code), &(ops[TCL_PUSH].code), &(ops[TCL_RETURN].code)); printf ("add_op code is %p\n", ops[TCL_ADD].code); printf ("push_op code is %p\n", ops[TCL_PUSH].code); printf ("return code is %p\n", ops[TCL_RETURN].code); objv = Tcl_Alloc (sizeof (Tcl_Obj *) * 2); objv[0] = Tcl_NewObj (); objv[1] = Tcl_NewObj (); Tcl_SetStringObj (objv[0], "255"); Tcl_SetStringObj (objv[1], "456"); xt = Tcl_NewXt (); Tcl_CompileExampleAdd (objv, xt); #if 0 printf ("type before %d\n", objv[0]->type); test_convert (objv[1]); printf ("type after %d i %d\n", objv[0]->type, objv[0]->i); test_convert (objv[1]); return 0; #endif result = Tcl_Run (xt->start); printf ("RESULT is %d\n", result->i); printf ("after return from Tcl_Run\n"); return EXIT_SUCCESS; }
registers.h
/* We may need to change the register allocation in the future, so we use these macros. */ /* OPERAND STACK */ #define ops esi /* TOP OF STACK */ #define tos edi /* VIRTUAL INSTRUCTION POINTER */ #define vip edx
vm.S
#include "registers.h" #include "Tcl_Types.h" /* We cache the top-of-stack (TOS) in a register. * This is a technique that can result in better performance. */ .macro NEXT movl (%vip),%eax jmp *%eax .endm .macro POP_INTO reg movl %tos,\reg movl (%ops),%tos addl $4,%ops .endm .macro PUSH reg subl $4,%ops movl %tos,(%ops) movl \reg,%tos .endm .macro RESTORE_TOS movl (%ops),%tos addl $4,%ops .endm .macro SAVE_TOS subl $4,%ops movl %tos,(%ops) .endm .macro VM_WORD word \word: addl $4,%vip .endm /* our Tcl_Obj structure layout */ .equ type_offset, 0 .equ ref_offset, 4 .equ int_offset, 8 .equ string_offset, 12 /**** EXPORTED SYMBOLS ****/ .global Tcl_InitOperators .global Tcl_Run /**** SETUP VARIOUS SECTIONS ****/ .data .comm operand_stack,2000 .text .section .rodata emit_hex_fmt: .string "HEX 0x%x\n" emit_invalid_integer: .string "This isn't an integer string.\n" emit_string_fmt: .string "DEBUG STRING %s\n" emit_ptr_fmt: .string "ptr %p\n" emit_internal_error: .string "internal error\n" .text /**** UTILITY CODE ****/ /* convert the integer in the Tcl_Obj stored in %eax (saving any registers that could be clobbered) We use: %eax result %cl low bits for the character %esi our offset into the string %edi our Tcl_Obj (after we move it from %eax) %edx for multiplication */ convert_to_int: pushl %edi pushl %esi pushl %ecx pushl %edx movl %eax,%edi movl string_offset(%eax),%esi /* our initial value */ movl $0,%eax repeat: /* clear our temporary register's high bits */ movl $0,%ecx movb (%esi),%cl cmpb $0,%cl je got_zero_byte /* %eax * 10 */ movl $10,%edx mull %edx cmpb $'0',%cl jl not_integer cmpb $'9',%cl jg not_integer subb $'0',%cl addl %ecx,%eax /* advance to the next char */ addl $1,%esi jmp repeat got_zero_byte: /* save the result in our Tcl_Obj's memory */ movl $TCL_TYPE_INT,type_offset(%edi) movl %eax,int_offset(%edi) movl %edi,%eax /* now restore the registers in reverse order */ popl %edx popl %ecx popl %esi popl %edi ret not_integer: pushl $emit_invalid_integer call printf addl $4,%esp /* XXX lovely error handling. I think in the final version we will use a longjmp. */ call abort /**** ENTRY POINTS ****/ Tcl_InitOperators: movl 4(%esp),%eax movl $add_op,(%eax) movl 8(%esp),%eax movl $push_op,(%eax) movl 12(%esp),%eax movl $exit_Tcl_Run,(%eax) ret /* Tcl_Run (start) */ Tcl_Run: movl 4(%esp),%vip movl $operand_stack,%ops /* our stack grows downward */ addl $2000,%ops NEXT exit_Tcl_Run: /* return our Tcl_Obj result in %eax */ POP_INTO %eax ret .global test_convert test_convert: movl 4(%esp),%eax call convert_to_int pushl int_offset(%eax) pushl $emit_hex_fmt call printf addl $8,%esp ret emit_debug: pusha pushl %eax pushl $emit_hex_fmt call printf addl $8,%esp popa ret /**** OPERATORS ****/ /* The idea is that we have various VM words that get threaded together. * Each word ends with a NEXT macro that expands to jump to the next address. */ /* add $a $b */ VM_WORD add_op /* convert object $b to int if needed */ POP_INTO %eax cmpl $TCL_TYPE_INT,type_offset(%eax) je 1f call convert_to_int 1: /* save the Tcl_Obj for $b */ pushl %eax POP_INTO %eax cmpl $TCL_TYPE_INT,type_offset(%eax) je 1f call convert_to_int 1: movl int_offset(%eax),%ecx popl %eax addl int_offset(%eax),%ecx /* Now we save the critical registers that the C code will possibly clobber. */ pushl %ecx pushl %tos pushl %vip pushl %ops call Tcl_NewObj popl %ops popl %vip popl %tos popl %ecx movl %ecx,int_offset(%eax) movl $TCL_TYPE_INT,type_offset(%eax) PUSH %eax NEXT VM_WORD push_op SAVE_TOS /* set TOS to the Tcl_Obj */ movl (%vip),%tos addl $4,%vip NEXT /*END*/
1. Ken Thompson's B language (the precursor to C): http://cm.bell-labs.com/cm/cs/who/dmr/kbman.html
2. Threaded Code Definition at FOLDOC: http://wombat.doc.ic.ac.uk/foldoc/foldoc.cgi?query=threaded+code
3. My revisions of this code (tcltcvm): http://www.xmission.com/~georgeps/engineering/prototype/
PWQ 31 Jan 05, If the bytecode interpreter was implemented with call threading then it would be easier to add new byte codes. From C it is not easy to implement direct or indirect threading, I wonder how many takers there will be to reimplement the interpreter for new CPU's in assembly.
George Peter Staplin: Jan 30, 2005 - I've examined several implementations of high performance threaded code that are mainly written in C. Some rely on allocating a global register as a virtual instruction pointer (which may not always work with every library, unless the libraries are compiled for this, and the compiler supports this). Others use the gcc feature of labels as values (&& on a label to get a void * for goto *ptr;).
I've been pondering doing an experiment with the Tcl sources. It would consist of implementing enough of the VM instructions in threaded code (written in assembly) to run small test programs. I would like to retain the bytecode code branch for platforms that don't have the assembly.
PWQ, Ideally the functions would call NEXT rather than returning to a main loop, gcc supports function attributes such as naked and noreturn but they are not supported on all targets. The scoping rules for C don't allow functions direct access to their caller variables even if called via goto *ptr (no uplevel equiv). This means passing around a structure pointer for all the state information.
Still this technique may be more efficient than the massive switch statement approach. Plus it can be extended by adding more function pointers to the jump table.
atp: According to the papers by M. Anton Ertl and David Gregg, e.g., "The Structure and Performance of EFFICIENT Interpreters", "switch threading" - aka, the "massive switch statement approach" - is pretty much always slower than direct or indirect threading.
The advantage of the switch statment approach is that it is portable across non-gcc compilers, while doing direct or indirect threading typically means either using gcc's "C Labels as Values" extension to Standard C, or some lower level technique, like assembler.
http://www.complang.tuwien.ac.at/forth/threaded-code.html http://www.complang.tuwien.ac.at/projects/interpreters.html
The Tcl Core Team appears to be aware of this, as this 2002 paper does discuss threaded code as a future possibility for Tcl:
"Tcl bytecode optimization: some experiences" by Kevin Kenny, Miguel Sofer, Jeffrey Hobbs http://aspn.activestate.com/ASPN/Tcl/TclConferencePapers2002/Tcl2002papers/kenny-bytecode/paperKBK.html