Version 26 of llvmtcl

Updated 2010-05-28 14:25:28 by jos

llvmtcl logo

News

Requirements

You need to fix LLVM Bug 7019, which is not fixed in the downloadable LLVM 2.7 package. More info at http://llvm.org/viewvc/llvm-project?view=rev&revision=102959

Building the wrapper

The package is a TEA package, build it like this:

$ cd llvmtcl
$ ./configure --with-tcl=... --with-llvm=...
$ make
$ make install

Using the LLVM API

Building and verifying a LLVM module and function:

lappend auto_path .
package require llvmtcl

# Initialize the JIT
llvmtcl LinkInJIT
llvmtcl InitializeNativeTarget

# Create a module and builder
set m [llvmtcl ModuleCreateWithName "testmodule"]
set bld [llvmtcl CreateBuilder]

# Create a plus10 function, taking one argument and adding 6 and 4 to it
set ft [llvmtcl FunctionType [llvmtcl Int32Type] [list [llvmtcl Int32Type]] 0]
set plus10 [llvmtcl AddFunction $m "plus10" $ft]

# Create constants
set c6 [llvmtcl ConstInt [llvmtcl Int32Type] 6 0]
set c4 [llvmtcl ConstInt [llvmtcl Int32Type] 4 0]

# Create the basic blocks
set entry [llvmtcl AppendBasicBlock $plus10 entry]

# Put arguments on the stack to avoid having to write select and/or phi nodes
llvmtcl PositionBuilderAtEnd $bld $entry
set arg0_1 [llvmtcl GetParam $plus10 0]
set arg0_2 [llvmtcl BuildAlloca $bld [llvmtcl Int32Type] arg0]
set arg0_3 [llvmtcl BuildStore $bld $arg0_1 $arg0_2]

# Do add 10 in two steps to see the optimizer @ work

# Add 6
set arg0_4 [llvmtcl BuildLoad $bld $arg0_2 "arg0"]
set add6 [llvmtcl BuildAdd $bld $arg0_4 $c6 "add6"]

# Add 4
set add4 [llvmtcl BuildAdd $bld $add6 $c4 "add4"]

# Set return
llvmtcl BuildRet $bld $add4

# Show input
puts "----- Input --------------------------------------------------"
puts [llvmtcl DumpModule $m]

# Verify the module
lassign [llvmtcl VerifyModule $m LLVMReturnStatusAction] rt msg
if {$rt} {
    error $msg
}

This results in the following LLVM bit code:

define i32 @plus10(i32) {
entry:
  %arg0 = alloca i32                              ; <i32*> [#uses=2]
  store i32 %0, i32* %arg0
  %arg01 = load i32* %arg0                        ; <i32> [#uses=1]
  %add6 = add i32 %arg01, 6                       ; <i32> [#uses=1]
  %add4 = add i32 %add6, 4                        ; <i32> [#uses=1]
  ret i32 %add4
}

Now execute it:

# Execute
lassign [llvmtcl CreateJITCompilerForModule $m 0] rt EE msg
set i [llvmtcl CreateGenericValueOfInt [llvmtcl Int32Type] 4 0]
set res [llvmtcl RunFunction $EE $plus10 $i]
puts "plus10(4) = [llvmtcl GenericValueToInt $res 0]\n"

Now optimize the LLVM module:

# Optimize
llvmtcl Optimize $m $plus10

Result of optimization:

define i32 @plus10(i32) readnone {
entry:
  %add4 = add i32 %0, 10                          ; <i32> [#uses=1]
  ret i32 %add4
}

Transforming Tcl into LLVM bit code

The LLVM wrapper has limited support for converting Tcl into LLVM bit code and is based on the output of tcl::unsupported::disassemble. Current (stringent) limitation are:

  • all variables are 32 bit integers (no strings, floats, lists, arrays, dicts, ...)
  • all proc's return a single 32 bit integer
  • all proc's must be know at convert time

Simple example

Take this simple Tcl procedure as input:

proc test2 {a b c d e} {
    return [expr {4+$a+6}]
}

The tcl::unsupported::disassemble output of this example looks like this:

ByteCode 0x0x9b4ffe8, refCt 1, epoch 4, interp 0x0x9a9f3b0 (epoch 4)
  Source "\n    return [expr {4+$a+6}]\n"
  Cmds 2, src 28, inst 10, litObjs 2, aux 0, stkDepth 2, code/src 0.00
  Proc 0x0x9b44b38, refCt 1, args 5, compiled locals 5
      slot 0, scalar, arg, "a"
      slot 1, scalar, arg, "b"
      slot 2, scalar, arg, "c"
      slot 3, scalar, arg, "d"
      slot 4, scalar, arg, "e"
  Commands 2:
      1: pc 0-8, src 5-26        2: pc 0-7, src 13-25
  Command 1: "return [expr {4+$a+6}]"
  Command 2: "expr {4+$a+6}"
    (0) push1 0         # "4"
    (2) loadScalar1 %v0         # var "a"
    (4) add 
    (5) push1 1         # "6"
    (7) add 
    (8) done 
    (9) done 

Translating it to llvm with the llvmtcl::Tcl2LLVM command results in:

define i32 @test2(i32, i32, i32, i32, i32) {
block0:
  %5 = alloca [100 x i8*]                         ; <[100 x i8*]*> [#uses=10]
  %6 = alloca i32                                 ; <i32*> [#uses=20]
  store i32 0, i32* %6
  %7 = alloca i32                                 ; <i32*> [#uses=2]
  store i32 %0, i32* %7
  %8 = alloca i32                                 ; <i32*> [#uses=1]
  store i32 %1, i32* %8
  %9 = alloca i32                                 ; <i32*> [#uses=1]
  store i32 %2, i32* %9
  %10 = alloca i32                                ; <i32*> [#uses=1]
  store i32 %3, i32* %10
  %11 = alloca i32                                ; <i32*> [#uses=1]
  store i32 %4, i32* %11
  %push = alloca i32                              ; <i32*> [#uses=2]
  store i32 4, i32* %push
  %push1 = load i32* %6                           ; <i32> [#uses=2]
  %push2 = getelementptr [100 x i8*]* %5, i32 0, i32 %push1 ; <i8**> [#uses=1]
  %12 = bitcast i32* %push to i8*                 ; <i8*> [#uses=1]
  store i8* %12, i8** %push2
  %push3 = add i32 %push1, 1                      ; <i32> [#uses=1]
  store i32 %push3, i32* %6
  %13 = load i32* %7                              ; <i32> [#uses=1]
  %push4 = alloca i32                             ; <i32*> [#uses=2]
  store i32 %13, i32* %push4
  %push5 = load i32* %6                           ; <i32> [#uses=2]
  %push6 = getelementptr [100 x i8*]* %5, i32 0, i32 %push5 ; <i8**> [#uses=1]
  %14 = bitcast i32* %push4 to i8*                ; <i8*> [#uses=1]
  store i8* %14, i8** %push6
  %push7 = add i32 %push5, 1                      ; <i32> [#uses=1]
  store i32 %push7, i32* %6
  %pop = load i32* %6                             ; <i32> [#uses=1]
  %pop8 = add i32 %pop, -1                        ; <i32> [#uses=2]
  store i32 %pop8, i32* %6
  %pop9 = getelementptr [100 x i8*]* %5, i32 0, i32 %pop8 ; <i8**> [#uses=1]
  %pop10 = load i8** %pop9                        ; <i8*> [#uses=1]
  %pop11 = bitcast i8* %pop10 to i32*             ; <i32*> [#uses=1]
  %pop12 = load i32* %pop11                       ; <i32> [#uses=1]
  %pop13 = load i32* %6                           ; <i32> [#uses=1]
  %pop14 = add i32 %pop13, -1                     ; <i32> [#uses=2]
  store i32 %pop14, i32* %6
  %pop15 = getelementptr [100 x i8*]* %5, i32 0, i32 %pop14 ; <i8**> [#uses=1]
  %pop16 = load i8** %pop15                       ; <i8*> [#uses=1]
  %pop17 = bitcast i8* %pop16 to i32*             ; <i32*> [#uses=1]
  %pop18 = load i32* %pop17                       ; <i32> [#uses=1]
  %15 = add i32 %pop18, %pop12                    ; <i32> [#uses=1]
  %push19 = alloca i32                            ; <i32*> [#uses=2]
  store i32 %15, i32* %push19
  %push20 = load i32* %6                          ; <i32> [#uses=2]
  %push21 = getelementptr [100 x i8*]* %5, i32 0, i32 %push20 ; <i8**> [#uses=1]
  %16 = bitcast i32* %push19 to i8*               ; <i8*> [#uses=1]
  store i8* %16, i8** %push21
  %push22 = add i32 %push20, 1                    ; <i32> [#uses=1]
  store i32 %push22, i32* %6
  %push23 = alloca i32                            ; <i32*> [#uses=2]
  store i32 6, i32* %push23
  %push24 = load i32* %6                          ; <i32> [#uses=2]
  %push25 = getelementptr [100 x i8*]* %5, i32 0, i32 %push24 ; <i8**> [#uses=1]
  %17 = bitcast i32* %push23 to i8*               ; <i8*> [#uses=1]
  store i8* %17, i8** %push25
  %push26 = add i32 %push24, 1                    ; <i32> [#uses=1]
  store i32 %push26, i32* %6
  %pop27 = load i32* %6                           ; <i32> [#uses=1]
  %pop28 = add i32 %pop27, -1                     ; <i32> [#uses=2]
  store i32 %pop28, i32* %6
  %pop29 = getelementptr [100 x i8*]* %5, i32 0, i32 %pop28 ; <i8**> [#uses=1]
  %pop30 = load i8** %pop29                       ; <i8*> [#uses=1]
  %pop31 = bitcast i8* %pop30 to i32*             ; <i32*> [#uses=1]
  %pop32 = load i32* %pop31                       ; <i32> [#uses=1]
  %pop33 = load i32* %6                           ; <i32> [#uses=1]
  %pop34 = add i32 %pop33, -1                     ; <i32> [#uses=2]
  store i32 %pop34, i32* %6
  %pop35 = getelementptr [100 x i8*]* %5, i32 0, i32 %pop34 ; <i8**> [#uses=1]
  %pop36 = load i8** %pop35                       ; <i8*> [#uses=1]
  %pop37 = bitcast i8* %pop36 to i32*             ; <i32*> [#uses=1]
  %pop38 = load i32* %pop37                       ; <i32> [#uses=1]
  %18 = add i32 %pop38, %pop32                    ; <i32> [#uses=1]
  %push39 = alloca i32                            ; <i32*> [#uses=2]
  store i32 %18, i32* %push39
  %push40 = load i32* %6                          ; <i32> [#uses=2]
  %push41 = getelementptr [100 x i8*]* %5, i32 0, i32 %push40 ; <i8**> [#uses=1]
  %19 = bitcast i32* %push39 to i8*               ; <i8*> [#uses=1]
  store i8* %19, i8** %push41
  %push42 = add i32 %push40, 1                    ; <i32> [#uses=1]
  store i32 %push42, i32* %6
  %top = load i32* %6                             ; <i32> [#uses=1]
  %top43 = add i32 %top, -1                       ; <i32> [#uses=1]
  %top44 = getelementptr [100 x i8*]* %5, i32 0, i32 %top43 ; <i8**> [#uses=1]
  %top45 = load i8** %top44                       ; <i8*> [#uses=1]
  %top46 = bitcast i8* %top45 to i32*             ; <i32*> [#uses=1]
  %top47 = load i32* %top46                       ; <i32> [#uses=1]
  ret i32 %top47
}

Note the 100 location stack being allocated at the beginning, the stack pushes and the stack pops. Running all this through the llvm optimizer results in:

define i32 @test2(i32, i32, i32, i32, i32) readonly {
block0:
  %5 = add i32 %0, 10                             ; <i32> [#uses=1]
  ret i32 %5
}

IIR Filter example

Now consider an IIR filter implemented in Tcl:

proc low_pass {x x1 x2 y1 y2 C0 C1 C2 C3 C4} {
    return [expr {$x*$C0 + $x1*$C1 + $x2*$C2 + $y1*$C3 + $y2*$C4}]
}

Converting and optimizing it with llvmtcl gives the following result:

define i32 @low_pass(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone {
block0:
  %10 = mul i32 %5, %0                            ; <i32> [#uses=1]
  %11 = mul i32 %6, %1                            ; <i32> [#uses=1]
  %12 = mul i32 %7, %2                            ; <i32> [#uses=1]
  %13 = mul i32 %8, %3                            ; <i32> [#uses=1]
  %14 = mul i32 %9, %4                            ; <i32> [#uses=1]
  %15 = add i32 %11, %10                          ; <i32> [#uses=1]
  %16 = add i32 %15, %12                          ; <i32> [#uses=1]
  %17 = add i32 %16, %13                          ; <i32> [#uses=1]
  %18 = add i32 %17, %14                          ; <i32> [#uses=1]
  ret i32 %18
}

Now also convert a driver function:

proc filter { } {
    set y 0
    set x1 0
    set x2 0
    set y1 0
    set y2 0
    for {set i 0} {$i < 1000} {incr i} {
        set y [low_pass $i $x1 $x2 $y1 $y2 1 3 -2 4 -5]
        # Messing with the result to stay within 32 bit
        if {$y > 1000 || $y < -1000} {
            set y 1
        } else {
            set y1 $y
        }
        set y2 $y1
        set y1 $y
        set x2 $x1
        set x1 [expr {$i}]
    }
    return $y
}

The result of low_pass is modified so the results stay well within 32 bit boundaries. The optimized result becomes:

define i32 @filter() readnone {
bb.nph:
  %0 = alloca [100 x i8*], align 8                ; <[100 x i8*]*> [#uses=2]
  %push2 = getelementptr [100 x i8*]* %0, i64 0, i64 0 ; <i8**> [#uses=1]
  %push410 = getelementptr [100 x i8*]* %0, i64 0, i64 1 ; <i8**> [#uses=1]
  %push408474 = alloca i32, align 4               ; <i32*> [#uses=2]
  store i32 1000, i32* %push408474
  %1 = bitcast i32* %push408474 to i8*            ; <i8*> [#uses=1]
  store i8* %1, i8** %push410
  %push424475 = alloca i32, align 4               ; <i32*> [#uses=2]
  store i32 -1, i32* %push424475
  %2 = bitcast i32* %push424475 to i8*            ; <i8*> [#uses=1]
  store i8* %2, i8** %push2
  br label %block89

block89:                                          ; preds = %block89, %bb.nph
  %.0461480 = phi i32 [ 0, %bb.nph ], [ %.0462, %block89 ] ; <i32> [#uses=1]
  %.0464479 = phi i32 [ 0, %bb.nph ], [ %.0465478, %block89 ] ; <i32> [#uses=1]
  %.0465478 = phi i32 [ 0, %bb.nph ], [ %indvar476, %block89 ] ; <i32> [#uses=2]
  %.0466477 = phi i32 [ 0, %bb.nph ], [ %storemerge472, %block89 ] ; <i32> [#uses=2]
  %indvar476 = phi i32 [ 0, %bb.nph ], [ %indvar.next, %block89 ] ; <i32> [#uses=4]
  %tmp721 = add i32 %indvar476, 1000              ; <i32> [#uses=1]
  %indvar.next = add i32 %indvar476, 1            ; <i32> [#uses=2]
  %tmp722 = shl i32 %.0466477, 2                  ; <i32> [#uses=2]
  %tmp723 = add i32 %tmp721, %tmp722              ; <i32> [#uses=1]
  %tmp724 = mul i32 %.0465478, 3                  ; <i32> [#uses=2]
  %tmp725 = add i32 %tmp723, %tmp724              ; <i32> [#uses=1]
  %tmp726 = mul i32 %.0461480, 5                  ; <i32> [#uses=1]
  %tmp727 = shl i32 %.0464479, 1                  ; <i32> [#uses=1]
  %tmp728 = add i32 %tmp726, %tmp727              ; <i32> [#uses=2]
  %tmp506.off = sub i32 %tmp725, %tmp728          ; <i32> [#uses=1]
  %not. = icmp ult i32 %tmp506.off, 2001          ; <i1> [#uses=2]
  %tmp706 = add i32 %indvar476, %tmp722           ; <i32> [#uses=1]
  %tmp708 = add i32 %tmp706, %tmp724              ; <i32> [#uses=1]
  %tmp506 = sub i32 %tmp708, %tmp728              ; <i32> [#uses=2]
  %storemerge472 = select i1 %not., i32 %tmp506, i32 1 ; <i32> [#uses=2]
  %.0462 = select i1 %not., i32 %tmp506, i32 %.0466477 ; <i32> [#uses=1]
  %exitcond = icmp eq i32 %indvar.next, 1000      ; <i1> [#uses=1]
  br i1 %exitcond, label %block256, label %block89

block256:                                         ; preds = %block89
  ret i32 %storemerge472
}

Some time data:

tcl [filter]: 1757.0 microseconds per iteration
llvm [filter]: 18.5 microseconds per iteration

Discussion

FB - 2010-05-27 04:29:35

Impressive. You may want to look at the Hindley-Milner algorithm for type inference:

http://www.codecommit.com/blog/scala/what-is-hindley-milner-and-why-is-it-cool

Another option is trace tree compiling à la TraceMonkey, using the bytecode stream as input:

http://www.ics.uci.edu/~franz/Site/pubs-pdf/ICS-TR-06-16.pdf


schlenk - 2010-05-27

I didn't like all the namespace imports and the long names so i defined a little namespace ensemble like this, maybe you think its a good idea to use that too?

proc create_llvm_ensemble {} {
  set map {}
  foreach cmd [info commands ::llvmtcl::LLVM*] {
    lappend map [string range $cmd 15 end] $cmd
  }
  namespace ensemble create -command ::llvm -map [dict create {*}$map]
}

Usage like this:

package require llvmtcl
create_llvm_ensemble

llvm LinkInJIT
llvm InitializeNativeTarget
...

jdc - 27-may-2010

Good proposal. I'll add it to the package.