Playing with parsing

Richard Suchenwirth 2008-03-17 - Once again, I had some waiting time, and wanted to experiment with Tcl - in this case some toys to

  • parse infix expressions like 1+(2*3)
  • convert them to some RPN-like notation, like {push 1} {push 2} {push 3} * +
  • execute the resulting "code", so that the result is in this case 7

What is not covered is the precedence of operators. The code below associates to the left if not parenthesized, so 2+3*4 gives 20... On the other hand, I hope it is nicely readable, with the e.g. tests/comments sprinkled in :^)

#-------------- Testing framework first, so we can test immediately
set verbose [expr {[lsearch $argv -v] >= 0}]
set N 0
set Nfail 0
proc e.g. {cmd -> expected} {
    incr ::N
    catch {uplevel 1 $cmd} res
    if {$res ne $expected} {
        puts [list ($::N) $cmd -> $res ***** expected $expected]
        incr ::Nfail
    } elseif $::verbose {puts [list ($::N) $cmd -> $res]}
}
#------------------------------------------------------- real stuff

proc typeof char {
    switch -- $char {
        0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - . {return digit}
        + - "-" - * - / - ( - ) {return op}
        default {error "bad character $char"}
    }
}
e.g. {typeof 7} -> digit
e.g. {typeof .} -> digit
e.g. {typeof *} -> op
e.g. {typeof ?} -> {bad character ?}

#----------------------------------------------------------------
proc tokenize str {
    set res ""
    set previous ""
    set parens 0
    foreach char [split $str ""] {
        if {$char eq "("} {incr parens; append res " "}
        if {$char eq ")"} {incr parens -1}
        if {$char eq " "} continue
        set type [typeof $char]
        if {$previous ne "" && $type ne $previous} {append res " "}
        set previous $type
        append res $char
        if {$char eq ")"} {append res " "}
    }
    if $parens {error "unmatched parens $parens"}
    string trim $res
}
e.g. {tokenize 2+3} -> {2 + 3}
e.g. {tokenize "2 + 3"} -> {2 + 3}
e.g. {tokenize "2 + (3)"} -> {2 + ( 3 )}
e.g. {tokenize 12+43} -> {12 + 43}
e.g. {tokenize (} -> {unmatched parens 1}
e.g. {tokenize )} -> {unmatched parens -1}


#----------------------------------------------------------------- Stack
proc pop {{_stack ""}} {
    if {$_stack eq ""} {set _stack ::S} ;# default: global stack
    upvar 1 $_stack stack
    K [lindex $stack end] [set stack [lrange $stack 0 end-1]]
}
proc K {a b} {set a}

proc push what {lappend ::S $what}

e.g. {push hello; set S} -> hello
e.g. {pop; set S} -> {}

#----------------------------------------------------------------- Parser
proc parse tokens {
    #puts [info level 0]
    set res {}
    set opstack {}
    set braces 0
    set recurs {}
    foreach token $tokens {
        if $braces {
            lappend recurs $token
            if {$token eq "("} {incr braces}
            if {$token eq ")"} {
                incr braces -1
                if {$braces == 0} {
                    eval lappend res [parse [lrange $recurs 0 end-1]]
                    set recurs {}
                }
            }
        } else {
            if [string is double -strict $token] {
                lappend res [list push $token] 
                if {[llength $opstack]} {lappend res [pop opstack]}
            } elseif {$token eq "("} {
                incr braces
            } elseif {[typeof $token] eq "op"} {
                lappend opstack $token
            }
        }
        #puts token:$token,braces:$braces,recurs:$recurs
    }
    while {[llength $opstack]} {lappend res [pop opstack]}
    set res
}
e.g. {parse "2 + 3"} -> {{push 2} {push 3} +}
e.g. {parse "2 + 3 + 4"} -> {{push 2} {push 3} + {push 4} +}
e.g. {parse "( 2 + 3 ) * 4"} -> {{push 2} {push 3} + {push 4} *}

#----------------------------------------------------------------
foreach op {+ - * /} {
    proc $op {} "set a \[pop\]; push \[expr {\[pop\] $op \$a}\]"
}
e.g. {info body +} -> {set a [pop]; push [expr {[pop] + $a}]}

#----------------------------------------------------------- RPN evaluator
proc rpn cmdlist {
    if $::verbose {puts $cmdlist}
    global S
    foreach cmd $cmdlist {eval $cmd}
    if {[llength $S] != 1} {error "bad stack: $S"}
    pop
}
e.g. {rpn {{push 6} {push 7} *}} -> 42

#----------------------------------------------------------------
proc i2r {expr} {rpn [parse [tokenize $expr]]}

e.g. {i2r 2+3}     -> 5
e.g. {i2r 2/3}     -> 0
e.g. {i2r 1/2.}    -> 0.5
e.g. {i2r 1./2}    -> 0.5
e.g. {i2r 2/0}     -> "divide by zero"
e.g. {i2r 2/3.}    -> 0.666666666667
e.g. {i2r (2+3)*4} -> 20
e.g. {i2r 2+3*4}   -> 20 ;# yes... left-associative, no precedence
e.g. {i2r 2+(3*4)}   -> 14
e.g. {i2r 2+3+4}   -> 9
e.g. {i2r (1+2)*(3+4)} -> 21
e.g. {i2r 2*3+4}   -> 10
e.g. {i2r (2*3)+4}   -> 10
e.g. {i2r (2*3)+(4)}   -> 10
e.g. {i2r 3*(3+4)}   -> 21
e.g. {i2r {3 + ( 4 * ( 5 + 6 ) )}} -> 47


#------------------------------ Final tally
incr N -1 ;# last call to e.g. will incr again
puts ---
e.g. {list $Nfail / $N = [format %.2f [expr 100.*$Nfail/$N]] %} -> 0
puts ""
foreach arg $argv {e.g. $arg -> ?}