Version 0 of Parser using recursive descent

Updated 2003-12-23 07:50:59

Arjen Markus (23 december 2003) I am fascinated and awed by the - for me - frequent task of reading input files of all kinds of complexity. Even though most input files I deal with have a simple structure, the program units to read them (in as robust a way as necessary) can be devastatingly contorted.

So, any tool I can muster that makes life easier is welcome. These currently include:

  • Fortran routines, as many of the files I deal with are written by Fortran programs and Fortran is surprisingly well fit for the job. (I intend to write another page about this)
  • Yacc/Lex parsers if I need to do the job in C - C itself is, IMHO, hopeless.
  • Designing the input as Tcl source, if I can get away with that

But I am interested too in the theoretical part of parsing. So, after some altercations with a very simple file that turned out to be tough to read directly in C, and reading a book about concepts of programming languages, I decided to have a go with the method of recursive descent.

The script below is not perfect - it does not handle empty lists of dependents for a rule, it does not check for ambiguities, it does not pay any attention to errors in the input - but I do consider it a proof of concept.

It takes an LL grammar and parses a list of lexemes (the stuff input is made of) based on that grammar. No code generation is necessary, it is all done in memory.

Enjoy and comment!


 # parser.tcl --
 #    Experiment with creating a parser based on a simple grammar
 #
 # Note:
 #    The type of parser is LL, so not as general as possible
 #

 # Parser --
 #    Namespace for the variables and procedures
 #
 namespace eval ::Parser {
    variable rule_dependents
    variable rule_code
    variable lexeme        ""
    variable prev_lexeme
    variable lexeme_list
    variable lexeme_count
    variable token
    variable end

    namespace export init define | rule getLexeme
 }

 # init --
 #    Initialise the parser by giving it a list of lexemes
 # Arguments:
 #    input      Input for the parser
 # Result:
 #    None
 # Side effects:
 #    Set the variables defining the state of the parser
 #
 proc ::Parser::init { input } {
    variable end
    variable lexeme_count
    variable lexeme_list

    set end           0
    set lexeme_count -1
    set lexeme_list  $input
    set lexeme       ""

    NextLexeme
 }

 # getLexeme --
 #    Get the lexeme that was last examined (for access in user-code)
 # Arguments:
 #    None
 # Result:
 #    Value of the lexeme (actually the previous one!)
 # Side effects:
 #    Store the rule
 #
 proc ::Parser::getLexeme {} {
    variable prev_lexeme
    return $prev_lexeme
 }

 # define --
 #    Define the first rule for an item
 # Arguments:
 #    item       Name of the item to be defined
 #    depends    Dependents for the rule
 #    code       (Optional) code to be run if the rule matches
 # Result:
 #    None
 # Side effects:
 #    Store the rule
 #
 proc ::Parser::define { item depends {code {}} } {
    variable rule_dependents
    variable rule_code
    variable last_item

    set last_item              $item
    set rule_dependents($item) [list $depends]
    set rule_code($item)       [list $code]
 }

 # | --
 #    Define alternatives for the first rule for an item
 # Arguments:
 #    depends    Dependents for the rule
 #    code       (Optional) code to be run if the rule matches
 # Result:
 #    None
 # Side effects:
 #    Append the new information to the rule
 #
 proc ::Parser::| { depends {code {}} } {
    variable rule_dependents
    variable rule_code
    variable last_item

    lappend rule_dependents($last_item) [list $depends]
    lappend rule_code($last_item)       [list $code]
 }

 # rule --
 #    Match the input to the given rule
 # Arguments:
 #    item       Root item that starts the parsing
 # Result:
 #    1 if matched, 0 if end of input, "error" if no match
 #
 proc ::Parser::rule { item } {
    variable rule_dependents
    variable rule_code
    variable lexeme
    variable end

    if { $end } { return 0 }

    #
    # Try all the rules in turn
    #
    puts "Rule: $item"
    set rule_count 0
    foreach dependents $rule_dependents($item) code $rule_code($item) {
       puts "   Dependents: $dependents"
       #
       # Work our way along the dependents
       #
       set retcode 1
       incr rule_count
       foreach dep $dependents {
          puts "      Dependent: $dep"
          #
          # By convention: upper-case names mean terminals
          #
          if { [string toupper $dep] != $dep } {
             set retcode [rule $dep]
             if { $retcode == 0 } {
                puts "==> did not work"
                break ;# Try the next rule
             } elseif { $retcode == "error" } {
                return "error"
             }
          } else {
             #
             # We are dealing with a terminal - does the token match?
             # If so, accept it
             #
             if { [getToken $lexeme] == $dep } {
                puts "$item: $dep = $lexeme"
                NextLexeme
             } else {
                #
                # No match - tell the caller
                #
                return 0
             }
          }
       }
       #
       # We have completed the list of dependents, so
       # the rule is satisfied
       #
       if { $retcode == 1 } {
          puts "      Completed"
          namespace eval :: $code
          return 1
       } elseif { $rule_count == [llength $rule_dependents($item)] } {
          #
          # This is a hack - my grammar should be expanded to include
          # empty rules ...
          #
          if { $dep == $item } {
             return 1
          } else {
             return 0
          }
       }
    }

    #
    # We have tried all the rules - no match. This means
    # an error in the input
    #
    return "error"
 }

 # NextLexeme --
 #    Get the next lexeme from the list
 # Arguments:
 #    None
 # Result:
 #    None
 # Side effects:
 #    Sets the variable "lexeme" and if the end of the input is
 #    reached, sets the variable "end"
 #
 proc ::Parser::NextLexeme {} {
    variable lexeme_count
    variable lexeme_list
    variable prev_lexeme
    variable lexeme
    variable end

    incr lexeme_count
    set  prev_lexeme $lexeme
    if { $lexeme_count < [llength $lexeme_list] } {
       set lexeme [lindex $lexeme_list $lexeme_count]
       puts "NextLexeme: $lexeme"
    } else {
       puts "NextLexeme: -- end --"
       set end 1
    }
 }

 # getToken --
 #    Identify the token to the given lexeme
 # Arguments:
 #    lexeme        Lexeme to be identified
 # Result:
 #    The symbolic name for the token
 # Note:
 #    Should become a user-definable procedure
 #    Right now:
 #    - if "*", return "MARKER"
 #    - if integer, return "INTEGER"
 #    - else return "STRING"
 #
 proc getToken { lexeme } {
    if { $lexeme == "*" } {
       puts "getToken: $lexeme = MARKER"
       return "MARKER"
    }
    if { [string is integer -strict $lexeme] } {
       puts "getToken: $lexeme = INTEGER"
       return "INTEGER"
    }

    puts "getToken: $lexeme = STRING"
    return "STRING"
 }

 # main --
 #   The grammar should parse this input:
 #   A 4       ; a list of attributes for "nodes"
 #   B 5
 #   C 2
 #   *         ; a separator
 #   A B 3     ; a list of connections between "nodes" with a weight
 #   A C 1
 #

 namespace import ::Parser::*

 define input     {nodes separator links}
 define nodes     {node nodes}
 define separator MARKER
 define node      {name weight} {
    set Node($name1) $weight
 }
 define name      STRING {
    set name1 [getLexeme]
 }
 define weight    INTEGER {
    set weight [getLexeme]
 }
 define links     {link links}
 define link      {name name2 weight} {
    set Link($name1,$name2) $weight
 }
 define name2     STRING {
    set name2 [getLexeme]
 }

 init {A 4 B 5 C 2 * A B 3 A C 1}
 puts "Result: [rule input]"

 parray Node
 parray Link

[ Category Mathematics

Category Concept Arts and crafts of Tcl-Tk programming

]