parsing with coroutine

A simple arithmetic expression parser

Description

Here's a simple example of how to do lexing and parsing with coroutines. The lexer makes a lot of use of switch's -regexp option, and is run in a coroutine so that it is easier to connect to the parser (which is not a coroutine). This code was originally developed by DKF for Rosetta Code .

Code

package require Tcl 8.6

# Utilities to make the coroutine easier to use: running [provide foo bar]
# inside the lexer coroutine (see below) has the lexer yield
#     {foo bar}, {}, {foo bar}, {}...
# in a loop while the parser runs [pushback $lexerContextCommand] on it.
# The parser runs [next $lexerContextCommand] to have the lexer break out of
# the yield loop and go to the next token.
proc provide args {while {![yield $args]} {yield}}
proc next lexer {$lexer 1}
proc pushback lexer {$lexer 0} ;# *Simulate* pushpack.

# Lexical analyzer coroutine core
proc lexer {str} {
    yield [info coroutine]
    set symbols {+ PLUS - MINUS * MULT / DIV ( LPAR ) RPAR}
    set idx 0
    while 1 {
        switch -regexp -matchvar m -- $str {
            {^\s+} {
                # No special action for whitespace
            }
            {^([-+*/()])} {
                provide [dict get $symbols [lindex $m 1]] [lindex $m 1] $idx
            }
            {^(\d+)} {
                provide NUMBER [lindex $m 1] $idx
            }
            {^$} {
                provide EOT "EOT" $idx
                return
            }
            . {
                provide PARSE_ERROR [lindex $m 0] $idx
            }
        }
        # Trim the matched string
        set str [string range $str [string length [lindex $m 0]] end]
        incr idx [string length [lindex $m 0]]
    }
}

# Utility functions to help with making an LL(1) parser; ParseLoop handles
# EBNF looping constructs, ParseSeq handles sequence constructs.
proc ParseLoop {lexer def} {
    upvar 1 token token payload payload index index
    # Generate arguments for the switch statement below.
    foreach {a b} $def {
        if {$b ne "-"} {set b [list set c $b]}
        lappend m $a $b
    }
    lappend m default {pushback $lexer; break}
    while 1 {
        lassign [next $lexer] token payload index
        switch -- $token {*}$m
        # $c is now a script for parsing tokens of the matched type.
        # We run it in the caller's stack frame. We propagate non-zero
        # return codes in the script (e.g., in the case of [return]
        # being used) up the call stack.
        if {[set c [catch {uplevel 1 $c} res opt]]} {
            dict set opt -level [expr {[dict get $opt -level]+1}]
            return -options $opt $res
        }
    }
}
proc ParseSeq {lexer def} {
    upvar 1 token token payload payload index index
    foreach {t s} $def {
        lassign [next $lexer] token payload index
        switch -- $token $t {
            if {[set c [catch {uplevel 1 $s} res opt]]} {
                dict set opt -level [expr {[dict get $opt -level]+1}]
                return -options $opt $res
            }
        } EOT {
            throw SYNTAX "end of text at position $index"
        } default {
            throw SYNTAX "\"$payload\" at position $index"
        }
    }
}

# Main parser driver; contains "master" grammar that ensures that the whole
# text is matched and not just a prefix substring. Note also that the parser
# runs the lexer as a coroutine (with a fixed name in this basic demonstration
# code).
proc parse {str} {
    set lexer [coroutine l lexer $str]
    try {
        set parsed [parse.expr $lexer]
        ParseLoop $lexer {
            EOT {
                return $parsed
            }
        }
        throw SYNTAX "\"$payload\" at position $index"
    } trap SYNTAX msg {
        return -code error "syntax error: $msg"
    } finally {
        catch {rename $lexer ""}
    }
}

# Now the descriptions of how to match each production in the grammar...
proc parse.expr {lexer} {
    set expr [parse.term $lexer]
    ParseLoop $lexer {
        PLUS - MINUS {
            set expr [list $token $expr [parse.term $lexer]]
        }
    }
    return $expr
}
proc parse.term {lexer} {
    set term [parse.factor $lexer]
    ParseLoop $lexer {
        MULT - DIV {
            set term [list $token $term [parse.factor $lexer]]
        }
    }
    return $term
}
proc parse.factor {lexer} {
    ParseLoop $lexer {
        NUMBER {
            return $payload
        }
        MINUS {
            ParseSeq $lexer {
                NUMBER {return -$payload}
            }
        }
        LPAR {
            set result [parse.expr $lexer]
            ParseSeq $lexer {
                RPAR {return $result}
            }
            break
        }
        EOT {
            throw SYNTAX "end of text at position $index"
        }
    }
    throw SYNTAX "\"$payload\" at position $index"
}

# Demonstration code
puts [parse "1 - 2 - -3 * 4 + 5"]
puts [parse "1 - 2 - -3 * (4 + 5)"]

Which produces this output:

PLUS {MINUS {MINUS 1 2} {MULT -3 4}} 5
MINUS {MINUS 1 2} {MULT -3 {PLUS 4 5}}

Parsing expressions with typedlist

Description

FM This is an experimentation to parse with coroutine. It uses the typedlist package.

Code

#! /bin/sh
# this is a -*-Tcl-*- file

package require typedlist

typedlist create BNF::nombre

coroutine nombre apply {{} {
    while 1 {
        if {[regexp {[0-9]} [set n [yield]]]} {
            if {[info exist ::String($::Instruction)] && \
                    [[lindex $::String($::Instruction) end] type] eq "nombre"} {
                [lindex $::String($::Instruction) end] lappend $n
            } else {
                lappend ::String($::Instruction) [BNF::nombre $n]
            }
        } 
    }
}}

typedlist create BNF::mot
coroutine mot apply {{} {
    while 1 {
        set n [yield]
        if {[regexp {[a-zA-Z]} $n]} {
            if {[info exist ::String($::Instruction)]} {
                if {[[lindex $::String($::Instruction) end] type] eq "mot"} {
                    [lindex $::String($::Instruction) end] lappend $n
                } elseif {[[lindex $::String($::Instruction) end] type] eq "var"} {
                    [lindex $::String($::Instruction) end] lappend $n
                } else {
                    lappend ::String($::Instruction) [BNF::mot $n]
                }
            } else {
                lappend ::String($::Instruction) [BNF::mot $n]
            }
        } elseif {[regexp {[0-9]} $n]} {
            if {[info exist ::String($::Instruction)]} {
                if {[[lindex $::String($::Instruction) end] type] eq "mot"} {
                    [lindex $::String($::Instruction) end] lappend $n
                } elseif {[[lindex $::String($::Instruction) end] type] eq "var"} {
                    [lindex $::String($::Instruction) end] lappend $n
                } 
            }
        }
    }
}}

typedlist create BNF::égal
coroutine égal apply {{} {
    while 1 {
        if {[regexp {=} [set n [yield]]]} {
            lappend ::String($::Instruction) [BNF::égal $n]
        } 
    }
}}

typedlist create BNF::plus
coroutine plus apply {{} {
    while 1 {
        if {[regexp {\+} [set n [yield]]]} {
            lappend ::String($::Instruction) [BNF::plus $n]
        } 
    }
}}
typedlist create BNF::fois
coroutine fois apply {{} {
    while 1 {
        if {[regexp -- {\*} [set n [yield]]]} {
            lappend ::String($::Instruction) [BNF::fois $n]
        } 
    }
}}
typedlist create BNF::moins
coroutine moins apply {{} {
    while 1 {
        if {[regexp -- {-} [set n [yield]]]} {
            lappend ::String($::Instruction) [BNF::moins $n]
        } 
    }
}}
typedlist create BNF::rapport
coroutine rapport apply {{} {
    while 1 {
        if {[regexp -- {/} [set n [yield]]]} {
            lappend ::String($::Instruction) [BNF::rapport $n]
        } 
    }
}}
typedlist create BNF::ParenthèseOuverte
coroutine ParenthèseOuverte apply {{} {
    while 1 {
        if {[regexp -- {\(} [set n [yield]]]} {
            lappend ::String($::Instruction) [BNF::ParenthèseOuverte $n]
        } 
    }
}}
typedlist create BNF::ParenthèseFermée
coroutine ParenthèseFermée apply {{} {
    while 1 {
        if {[regexp -- {\)} [set n [yield]]]} {
            lappend ::String($::Instruction) [BNF::ParenthèseFermée $n]
        } 
    }
}}

typedlist create BNF::var
coroutine var apply {{} {
    while 1 {
        set n [yield]
        if {[regexp {\$} $n]} {
            lappend ::String($::Instruction) [BNF::var $n]
        } 
    }
}}
typedlist create BNF::FinDeInstruction
coroutine FinDeInstruction apply {{} {
    while 1 {
        set n [yield]
        if {[regexp {\;} $n] || [regexp {\n} $n]} {
            incr ::Instruction
        } 
    }
}}

typedlist create BNF::point
coroutine point apply {{} {
    while 1 {
        set n [yield]
        if {[regexp {\.} $n]} {
            if {[info exist ::String($::Instruction)] && \
                    [[lindex $::String($::Instruction) end] type] eq "nombre"} {
                [lindex $::String($::Instruction) end] lappend $n
            } else {
                lappend ::String($::Instruction) [BNF::nombre $n]
            }
        } 
    }
}}

typedlist create BNF::sousScript 
coroutine sousScript apply {{} {
    while 1 {
        set n [yield]
        if {[regexp {\[} $n]} {
            if {$::sousScript == 0} {
                lappend ::String($::Instruction) [BNF::sousScript $n]
            } else {
                [lindex $::String($::Instruction) end] lappend $n
            }
            incr ::sousScript 
        } elseif {[regexp {\]} $n]} {
            [lindex $::String($::Instruction) end] lappend $n
            incr ::sousScript -1
        } elseif {$::sousScript >= 1} {
            ToutVenant $n
        }
    }
}}

typedlist create BNF::FinDeMot 
coroutine FinDeMot apply {{} {
    while 1 {
        set n [yield]
        if {[regexp {[ \t]} $n]} {
            lappend ::String($::Instruction) [BNF::FinDeMot $n]
        }
    }
}}

typedlist create BNF::ToutVenant 
coroutine ToutVenant apply {{} {
    set i 0
    set N {}
    set ::SousExpression 0
    while 1 {
        set n [yield]
        if {$n eq " " && $i == 0} {
            lappend N " "
            incr i
        } elseif {$n eq "\{" && $i == 1} {
            lappend N \{
            incr i
        } elseif {$n eq "=" && $i == 2} {
            lappend N =
            incr i
        } elseif {$n eq "\}" && $i == 3} {
            lappend N \}
            incr i
        } elseif {$n eq "(" && $i == 4} {
            set ::SousExpression 1
            set ::Parenthèse 1
            [lindex $::String($::Instruction) end] lappend \[ e x p r { } \{ $n
            set N {}
            set i 5
        } elseif {$::SousExpression} {
            sousExpression $n
            if {!$::SousExpression} {
                set i 1                
            }
        } else {
            [lindex $::String($::Instruction) end] lappend {*}$N $n
            set N {}
        }
    }
}}

typedlist create BNF::sousExpression
coroutine sousExpression apply {{} {
    set i 0
    while 1 {
        set n [yield]
        if {$n eq ")" && ${::Parenthèse} == 1} {
            [lindex $::String($::Instruction) end] lappend $n \} \]
            set ::SousExpression 0
        } elseif {$n eq ")" && ${::Parenthèse} > 1} {
            [lindex $::String($::Instruction) end] lappend $n
            incr ::Parenthèse -1
        } elseif {$n eq "("} {
            [lindex $::String($::Instruction) end] lappend $n
            incr ::Parenthèse
        } else {
            [lindex $::String($::Instruction) end] lappend $n
        }
    }
}}

proc lexpr {sz} {
    catch {array unset ::String}
    array set ::String {}
    set ::Instruction 0
    set ::sousScript 0
    # lexer
    foreach c [split $sz ""] {
        sousScript $c
        if {$::sousScript} continue
        mot $c
        égal $c
        var $c
        nombre $c
        plus $c
        fois $c
        moins $c
        rapport $c
        point $c
        ParenthèseOuverte $c
        ParenthèseFermée $c
        FinDeInstruction $c
    }
    set Res []
    # interpreteur
    for {set i 0} {$i <= $::Instruction} {incr i} {
        set res []
        set var []
        set Silencieux 0

        if {![info exist ::String($i)]} continue
        if {[set range [lsearch $::String($i) *::égal:*]] ne {}} {
            foreach e [lrange $::String($i) 0 ${range}-1] {
                append var [$e get]
            }
            set ::String($i) [lrange $::String($i) ${range}+1 end]
            if {$range == 0} {
                set Silencieux 1
            }
        }
        foreach e $::String($i) {
            append res [join [$e get] ""]
        }
        if {$var ne {}} {
            set $var [expr $res]
        } elseif {$Silencieux} {
            eval subst $res
        } else {
            lappend Res [expr $res]
        }
    }
    return $Res
}


# Test

lexpr {
    =[wm withdraw .
      toplevel [set t .top_tmp_]
      wm geom $t +100+100
      wm attributes $t -alpha 0.0
      wm state $t zoomed
      update idletask
      lassign [split [winfo geometry $t] x+] W H X Y]
    w=[winfo width .]; h=[winfo height .]
    =[wm geom . +{=}(($W-$w)/2+$X)+{=}(($H-$h)/2+$Y)
      wm deiconify .
      destroy $t
      update idletask]
}

the lexpr proc here do :

if the line begin with v =, v is seen as a variable and so is set to the value calculate after the equal sign (variable is only one letter long for the moment)
if the line begin with =, the action isn't return as result of the command (else it will be appended to a list which will be return)
what is between [ and ] is a seen as a subscript to be evaluated
the construct {=}( ... ) is seen as an expression to be calculated in a subscript.

Category String Processing

Category Example

Category Parsing