Lexing C


An important part of any Scripted Compiler is the ability to actually process the system language underlying the scripting language. In the case of Tcl this is the C Language.

The first step is always to separate the input stream into tokens, each representing one semantic atom. In compiler speak, this is lexing.

The following script lexes a string containing C source into a list of tokens. It assumes that the source is free of preprocessor directives like "#include", "#define", etc.
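
To show the intended use, here is a minimal sketch of driving the lexer from a script. The lexbase::lex call and the shape of its result are taken from the driver script at the bottom of this page; the input fragment and the surrounding code are made up.

# -*- tcl -*-
source lexbase.tcl
source clex.tcl

# Made-up C fragment to tokenize.
set code {int x = 10 + y; /* answer */}
set res  [lexbase::lex $code]

# The result is a two-element list: the list of token symbols, and the
# attribute information for symbols carrying extra data (e.g. identifiers
# and constants), itself a pair of index and value lists.
foreach {sym attr}  $res  break
foreach {aidx aval} $attr break

puts $sym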

Also note that the script is built upon the base package provided in Scripted Lexing. While this means that the code shown here is quite tailored to lexing for a compiler, the general principle is broad enough to allow for many variations. Examples:

  • Keep the whitespace as tokens. Might be required for a pretty-printer.
  • Treat comments as whitespace and remove them, as a true compiler would. Keeping the comments but dropping the other whitespace, as the script below does, is more something for a code analyzer looking for additional data (meta-data) in comments. See Source Navigator for a tool in this area.
  • Modify the definitions, convert the keywords and punctuation into single byte codes, and refrain from splitting/listifying the result. Sort of a special method for compressing C sources; a rough sketch of the idea follows this list.
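
The compression variation is only sketched here as a post-processing pass over the lexer's symbol list, since the in-lexer form would mean changing the definitions and the lexbase internals. The proc name and the on-the-fly byte assignment below are made up.

proc compress {symbols} {
    # Assign one byte code per distinct symbol, on the fly, and
    # concatenate the codes into a single string.
    set next 1
    set out  ""
    array set map {}
    foreach s $symbols {
        if {![info exists map($s)]} {
            set map($s) [format %c $next]
            incr next
        }
        append out $map($s)
    }
    # Return the code table together with the compressed token stream.
    return [list [array get map] $out]
}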

The next step will be parsing, i.e. adding structure to the token stream under control of a grammar. An existing tool for that is Yeti. See the C Language for grammar references.

I believe that the method used below can lex any system language in use today: Pascal, Modula, FORTRAN, C++, ... Again, this is something of interest to Source Navigator.
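
To illustrate the claim, here is a fragmentary, purely hypothetical definition block for Pascal, built with the same lexbase commands as the C definitions below (and glossing over details like Pascal's case-insensitive keywords). The symbol names are made up.

# -*- tcl -*-
package require lexbase

namespace eval paslex {
    namespace import ::lexbase::*

    DefStart

    DefP :=  ASSIGN ; DefP \; SEMICOLON ; DefP .. DOTDOT
    DefP (   LPAREN ; DefP )  RPAREN    ; DefP =  EQ

    DefK program ; DefK begin ; DefK end
    DefK var     ; DefK while ; DefK do

    DefI IDENT
    DefWS {[ \t\v\f\r\n]+}

    DefEnd
}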

Notes

The lexer base from Scripted Lexing is possibly not optimal, but fairly ok in my book so far. Example result:

 [andreask@pliers trans]$ ./driver -noraw -notoken tclIO.c
 __________________________________________________
 tclIO.c:
        242918 characters
        Lexing in 13446065 microseconds
               =  13.446065 seconds
               =  55.35227937 usec/char
 __________________________________________________

Not bad for a lexer written in a scripting language IMHO.

TODO

  • Read up on C syntax. I believe that I currently do not recognize all possible types of numbers.

clex.tcl (The code, finally :)

# -*- tcl -*-
# Lexing C

package require lexbase
package provide clex 2.0

namespace eval clex {
    # Define the lexer symbols for the language 'C', as an example.

    namespace import ::lexbase::*

    DefStart
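
    # The lexbase definition commands, as used here:
    #
    #   DefP   <text> <symbol>        - punctuation and operators
    #   DefK   <keyword>              - keywords
    #   DefM   <symbol> <begin> <end> - matcher driven by two callbacks
    #   DefI   <symbol>               - symbol used for identifiers
    #   DefWS  <regexp>               - whitespace, skipped by the lexer
    #   DefRxM <regexp> <symbol>      - regexp-based matcher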

    DefP (   LPAREN      ; DefP )  RPAREN    ; DefP ->  DEREF
    DefP <   LT          ; DefP <= LE        ; DefP ==  EQ
    DefP >   GT          ; DefP >= GE        ; DefP !=  NE
    DefP \[  LBRACKET    ; DefP \] RBRACKET  ; DefP =   ASSIGN
    DefP \{  LBRACE      ; DefP \} RBRACE    ; DefP *=  MUL_ASSIGN
    DefP .   DOT         ; DefP ,  COMMA     ; DefP /=  DIV_ASSIGN
    DefP ++  INCR_OP     ; DefP -- DECR_OP   ; DefP %=  REM_ASSIGN
    DefP &   ADDR_BITAND ; DefP *  MULT_STAR ; DefP +=  PLUS_ASSIGN
    DefP +   PLUS        ; DefP -  MINUS     ; DefP -=  MINUS_ASSIGN
    DefP ~   BITNOT      ; DefP !  LOGNOT    ; DefP <<= LSHIFT_ASSIGN
    DefP /   DIV         ; DefP %  REM       ; DefP >>= RSHIFT_ASSIGN
    DefP <<  LSHIFT      ; DefP >> RSHIFT    ; DefP &=  BITAND_ASSIGN
    DefP ^   BITEOR      ; DefP && LOGAND    ; DefP ^=  BITEOR_ASSIGN
    DefP |   BITOR       ; DefP || LOGOR     ; DefP |=  BITOR_ASSIGN
    DefP ?   QUERY       ; DefP :  COLON     ; DefP \;  SEMICOLON
    DefP ... ELLIPSIS    ; DefP ~= BITNOT_ASSIGN

    DefK typedef ; DefK extern   ; DefK static ; DefK auto ; DefK register
    DefK void    ; DefK char     ; DefK short  ; DefK int  ; DefK long
    DefK float   ; DefK double   ; DefK signed ; DefK unsigned
    DefK goto    ; DefK continue ; DefK break  ; DefK return
    DefK case    ; DefK default  ; DefK switch
    DefK struct  ; DefK union    ; DefK enum
    DefK while   ; DefK do       ; DefK for
    DefK const   ; DefK volatile
    DefK if      ; DefK else
    DefK sizeof
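
    # A matcher (DefM) is driven by two callbacks: the 'begin' command
    # returns the index where the next candidate token starts, or -1 if
    # there is none; the 'end' command returns the index of the token's
    # last character. See the procedures further below.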

    DefM COMMENT        ::clex::C_comment_begin   ::clex::C_comment_end
    DefM COMMENT        ::clex::C99_comment_begin ::clex::C99_comment_end
    DefM STRING_LITERAL ::clex::C_string_begin    ::clex::C_string_end
    DefM STRING_LITERAL ::clex::C_char_begin      ::clex::C_char_end  

    # Floats containing '.'s have to be matched early because the '.'
    # is later seen as punctuation.

    DefM CONSTANT ::clex::C_floatA_begin ::clex::C_floatA_end
    DefM CONSTANT ::clex::C_floatB_begin ::clex::C_floatB_end

    DefI IDENT
    DefWS {[ \t\v\f\r\n]+}
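
    # Integer constants: hexadecimal first, then plain decimal.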

    DefRxM {^0x[[:xdigit:]]+} CONSTANT
    DefRxM {^\d+}             CONSTANT

    DefEnd
}

proc ::clex::C_comment_begin {string start} {
    return [string first "/*" $string $start]
}

proc ::clex::C_comment_end {string start} {
    incr start 2 ; # Skip behind /*
    set  stop [string first "*/" $string $start]
    incr stop 1 ; # Skip to /
    return $stop
}

proc ::clex::C99_comment_begin {string start} {
    string first // $string $start
}

proc ::clex::C99_comment_end {string start} {
    # Match up to the end of the line; a backslash escapes the character
    # after it, so escaped newlines (line continuations) do not end the
    # comment.
    regexp -indices -start $start {//(?:\\.|[^\n\\])*(?:\n|$)} $string range
    lindex $range 1
}

proc ::clex::C_string_begin {string start} {
    return [string first "\"" $string $start]
}

proc ::clex::C_string_end {string start} {
    # The next vari-sized thing is a "-quoted string.
    # Finding its end is a bit more difficult, because we have
    # to accept \" as one character inside of the string. "

    set from $start
    while 1 {
        incr from
        set stop  [string first "\"" $string $from]

        # Note that we do not use [string first] to look for a \",
        # but simply check the preceding character. That is less
        # expensive than possibly running through the whole string.

        incr stop -1
        if {[string equal [string index $string $stop] "\\"]} {
            incr stop 2
            set from $stop
            continue
        }
        incr stop
        break
    }
    return $stop
}

proc ::clex::C_char_begin {string start} {
    return [string first "'" $string $start]
}

proc ::clex::C_char_end {string start} {
    # The next vari-sized thing is a '-quoted character constant.
    # Finding its end is a bit more difficult, because we have
    # to accept \' as one character inside of it. '

    set from $start
    while 1 {
        incr from
        set stop  [string first "'"   $string $from]

        # Note that we do not use [string first] to look for a \',
        # but simply check the preceding character. That is less
        # expensive than possibly running through the whole string.

        incr stop -1
        if {[string equal [string index $string $stop] "\\"]} {
            incr stop 2
            set from $stop
            continue
        }
        incr stop
        break
    }
    return $stop
}

# The float matchers locate a candidate via regexp in their 'begin'
# callback and remember the end index of the match in the caller's
# 'stash' array, from which the corresponding 'end' callback retrieves
# it later.

proc ::clex::C_floatA_begin {string start} {
    upvar stash stash
    if {[regexp -indices -start $start {\W([0-9]*\.[0-9]+([eEdD][+-]?[0-9]+)?)\W} $string -> match]} {
        #puts a==[string range $string [lindex $match 0] [lindex $match 1]]
        set stash(float-a) [lindex $match 1]
        return [lindex $match 0]
    }
    return -1
}

proc ::clex::C_floatA_end {string start} {
    upvar   stash stash
    return $stash(float-a)
}

proc ::clex::C_floatB_begin {string start} {
    upvar stash stash
    if {[regexp -indices -start $start {\W([0-9]+\.[0-9]*([eEdD][+-]?[0-9]+)?)\W} $string -> match]} {
        #puts b==[string range $string [lindex $match 0] [lindex $match 1]]
        set stash(float-b) [lindex $match 1]
        return [lindex $match 0]
    }
    return -1
}

proc ::clex::C_floatB_end {string start} {
    upvar   stash stash
    return $stash(float-b)
}

driver (the test application)

#!/usr/bin/env tclsh
# -*- tcl -*-

# Option processing: -notime, -notoken, and -noraw suppress the
# corresponding output sections below.

set time  1
set token 1
set raw   1
while {1} {
    switch -exact -- [lindex $argv 0] {
        -notime  {set time  0}
        -notoken {set token 0}
        -noraw   {set raw   0}
        default {break}
    }
    set argv [lrange $argv 1 end]
}


source lexbase.tcl
source clex.tcl

# Read file, lex it, time the execution to measure performance

# Note: [close] returns an empty string, so appending it to the data is a no-op.
set data [read [set fh [open [set fname [lindex $argv 0]]]]][close $fh]
set len  [string length $data]
set usec [lindex [time {set res [lexbase::lex $data]}] 0]

# The lexing result is a two-element list: the list of token symbols,
# and the attribute information, itself a pair of index and value lists.
foreach {sym attr}  $res  break
foreach {aidx aval} $attr break

if {$time} {
    # Write performance statistics.
    puts __________________________________________________
    puts "$fname:"
    puts "\t$len characters"
    puts "\tLexing in $usec microseconds"
    puts "\t       =  [expr {double($usec)/1000000}] seconds"
    puts "\t       =  [expr {double($usec)/$len}] usec/char"
}

if {$token} {
    # Generate tokenized listing of the input, using the lexing results as input.
    puts __________________________________________________
    set av 0
    foreach s $sym {
        # Symbols ending in '-' carry an attribute value (e.g. identifier
        # names and constants); print it alongside the symbol.
        switch -glob -- $s {
            *- {puts "$s <<[lindex $aval [lindex $aidx $av]]>>" ; incr av}
            *  {puts "$s"}
        }
    }
}

if {$raw} {
    # Dump the raw lexer result.
    puts __________________________________________________
    puts Symbols___________________________________________
    puts $sym
    puts ""
    puts Attribute-Indices_________________________________
    puts $aidx
    puts ""
    puts Attribute-Data____________________________________
    puts \{[join $aval "\} \{"]\}
    puts ""
    puts __________________________________________________
}

puts __________________________________________________