Scripted Lexing

[ Scripted Compiler :: Lexing ]


An important part of any Scripted Compiler is the ability to actually process the system language underlying the scripting language.

The text processing capabilities of (most?) scripting languages make this part easier as well. Tcl commands like string map, regsub, split, etc. are powerful tools. Use the first, for example, to detect the various types of tokens and insert marker characters to separate them. After this pre-tokenization phase we can listify the string into the token list via split. This takes care of the lexer. Easy, fast, ... (And not quite right. See below.)
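
For illustration, here is a minimal sketch of that pre-tokenization idea. The marker character and the mapped punctuation are arbitrary choices for the example; this is not the code of the module below.

# Surround punctuation with \000 markers, then turn whitespace runs
# into the same marker, then listify via split.

set code {if (a >= b) return a;}

set marked [string map "( \000(\000 ) \000)\000 ; \000;\000" $code]
regsub -all {\s+} $marked \000 marked

# Ignore the empty strings produced by adjacent markers.
set tokens [list]
foreach t [split $marked \000] {
    if {[string length $t]} {lappend tokens $t}
}
puts $tokens   ;# => if ( a >= b ) return a ;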

While in the case of Tcl this system language is C, the actual code for such a lexer can (and should) be split into language-independent and language-dependent parts.

Here we provide the language-independent base for a scripted lexer: one file containing the main data structures, the commands to initialize them, and the command to perform the lexing. A hypothetical usage sketch follows the listing.


Example applications of this base module:

See also

Purpose: To have a representation of finite automatons, so that we can construct them, manipulate them, and use them. The latter wherever state machines are in use, and lexers are a prominent case of that.

The whole thing is incomplete in the sense that we currently have only the core, i.e. the finite automaton representation, and interpreters (dexec, dacceptors), but no compiler. The interpreter takes a representation and executes it directly on some input. A compiler would transform the FA into other code which has the FA essentially hardwired into its structure and can therefore be optimized for it, making it faster. Flex, for example, is a compiler in that sense, taking a lexer definition and spitting out C code implementing a lexer for that definition.

Related pages:


lexbase.tcl

# -*- tcl -*-
# Lexing in general

package provide lexbase 2.0
namespace eval  lexbase {
    namespace export DefStart DefEnd DefWS DefI DefP DefK DefM DefRxM lex
}

# Five-stage lexer for C-like sources.
# 1. Recognize complex structures, i.e. "strings", 'c'haracters and
#    /* comments */.
# 2. Recognize punctuation.
# 3. Recognize keywords.
# 4. Recognize other constructs via regexes.
# 5. The remainder is seen as identifiers.
#
# Punctuation is defined in a general data structure mapping them to
# symbols. Ditto for keywords. The complex structures are defined via
# match commands.

proc ::lexbase::DefStart {} {
    variable punctuation [list]
    variable keywords    [list]
    variable matchers    [list]
    variable idsymbol    ""
    variable ws          ""
    variable rxmatchers  [list]
    return
}

proc ::lexbase::DefWS {wspattern} {
    variable ws $wspattern
    return
}

proc ::lexbase::DefI {replacement} {
    # I = Identifier.
    # Define the symbol to use for non-keyword identifiers.
    variable idsymbol $replacement
    return
}

proc ::lexbase::DefP {string {replacement {}}} {
    # P = Punctuation

    if {$replacement == {}} {
        set replacement $string
    }

    variable punctuation
    lappend  punctuation $string \000\001$replacement\000
    return
}

proc ::lexbase::DefK {string {replacement {}}} {
    # K = Keyword

    if {$replacement == {}} {
        set replacement [string toupper $string]
    }

    variable keywords
    lappend  keywords $string $replacement
    return
}

proc ::lexbase::DefM {symbol cmdb cmde} {
    # M = Matcher
    variable matchers
    lappend  matchers $cmdb $cmde $symbol
    return
}

proc ::lexbase::DefRxM {pattern {symbol {}}} {
    # RxM = Regex Matcher

    if {$symbol == {}} {
        set symbol $pattern
    }

    variable rxmatchers
    lappend  rxmatchers $pattern $symbol
    return
}

proc ::lexbase::DefEnd {} {
    variable punctuation
    variable keywords
    variable matchers

    # Sort punctuation alphabetically, and incidentally by length,
    # placing the longest first. That way the lexer will correctly
    # distinguish between sequences like '>' and '>='.
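    #
    # For example (hypothetical symbols): with both ">" and ">="
    # defined, the sorted map lists the pair for ">=" before the pair
    # for ">", so [string map] in sublex replaces ">=" as a whole
    # instead of splitting it into ">" followed by "=".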

    array set tmp $punctuation
    set res [list]
    foreach key [lsort -decreasing [array names tmp]] {
        lappend res $key $tmp($key)
    }
    set punctuation $res

    # The keywords we place into an array to allow an easy checking of
    # identifiers for keywordiness.

    unset tmp ; set tmp $keywords
    unset keywords
    array set keywords $tmp

    # Matchers are executed in the same order
    # they were defined in.
    return
}

proc ::lexbase::storeAttr {valvar value} {
    upvar $valvar valuelist ${valvar}_idx idxlist ${valvar}_cache cache

    if {[info exists cache($value)]} {
        set idx $cache($value)
    } else {
        set idx [llength $valuelist]
        lappend valuelist $value
        set cache($value) $idx
    }
    lappend idxlist $idx
    return
}

proc ::lexbase::initAttr {valvar} {
    upvar $valvar valuelist ${valvar}_idx idxlist ${valvar}_cache cache

    set valuelist [list]
    set idxlist   [list]
    array set cache {}
    return
}
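
# The two commands above maintain a simple value pool: each distinct
# value is stored once in the list <valvar>, <valvar>_idx gets one
# index appended per occurrence, and the array <valvar>_cache maps
# values to their index. A small illustration, using only the names
# created by initAttr/storeAttr for a pool called "pool":
#
#   initAttr  pool
#   storeAttr pool a ; storeAttr pool b ; storeAttr pool a
#
#   => pool     = {a b}
#      pool_idx = {0 1 0}
#      pool_cache(a) = 0, pool_cache(b) = 1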

proc ::lexbase::lex {string} {
    # Executes the matchers repeatedly until no match is found anymore.
    #
    # Returns a 2-element list. The first element is a list of string
    # fragments and symbols. Symbols have \001 as leading character
    # to distinguish them from the unprocessed string fragments. They
    # also have a '-' character as suffix. See the documentation of
    # the main lexer command for an explanation. The second element is
    # again a list of 2 elements: its second element is the list of
    # all unique matching entities, and its first element is a list of
    # indices into that list. In this way multiple occurrences of the
    # same entity are collapsed.

    variable matchers

    set res [list]

    initAttr pool

    set start 0
    set end [string length $string]

    # The calls to the matchers are optimized. Initially each matcher
    # gives a guess where the next occurrence of a pattern owned by it
    # is. In each iteration the nearest pattern is selected, completed
    # and processed. Afterward each matcher whose guess was before the
    # end of the current pattern has to reguess based upon the new
    # starting location. Why so "complicated"? Performance. Example:

    # Let us assume that we have 10 "strings" at the beginning, some
    # substantial uncommented code, followed by a /* comment */ at the
    # end.

    # A simple implementation querying each matcher in each iteration
    # will not only find the strings at the very beginning, it will
    # also search through the whole input every time to find the
    # comment at the end. This amounts to overall quadratic behaviour.

    # The chosen implementation on the other hand remembers the old
    # guesses and invalidates them only if the last best guess went
    # beyond them, indicating an overlap. In the example above this
    # means that the string guess is invalidated and renewed every
    # time, but the comment guess is made once and never touched
    # until the strings are exhausted.

    # Processing tcl/generic/tclIO.c (263252 characters) takes only
    # 10 seconds, 7 less than the 17 seconds the simple algorithm
    # used to do its work.

    # stash is a cache for use by all matchers. It allows them to store
    # data pertinent to the last match, possibly speeding them up. The
    # float matcher for example knows at the time of the 'begin' match
    # already where the pattern ends. There is no need to match it
    # again to refind it.

    array set loc {}
    array set stash {}

    foreach {cmdb cmde sym} $matchers {
        set loc($cmdb) [$cmdb $string $start]
        #puts "$cmd = $loc($cmdb)"
    }

    while {1} {
        # Run through each matcher and ask them for the nearest
        # occurrence of a string handled by them. We abort the
        # search early if we find a match directly at the beginning
        # of the string. We stop the whole processing if no match
        # is found.

        #puts "$start | $end"

        set nearest $end
        set mcmd {}
        set msym {}

        foreach {cmdb cmde sym} $matchers {
            set begin $loc($cmdb)
            #puts "\t$cmdb = $begin"

            if {($begin >= 0) && ($begin < $nearest)} {
                set nearest $begin
                set mcmd $cmde
                set msym $sym
            }
            if {$nearest == $start} {break}
        }

    #puts "... $nearest $mcmd"
    if {$nearest == $end} {
        #puts "___ done"
        incr nearest -1
        if {$start < $nearest} {
            sublex res [string range $string $start end]
        }
        break
    }

        # A match was found. We now ask the matcher to deliver the end
        # of the match. This is then used to slice the input into the
        # relevant pieces.

        set stop [$mcmd $string $nearest]

        # Note that the code below causes conversion to UTF-16 for easy
        # indexing into the string. It also copies only the relevant
        # portions of the string, and avoids copying down the
        # remainder of the string, something which would incur a heavy
        # performance penalty (quadratic behaviour).

        if {$nearest > 0} {
            incr nearest -1
            sublex res [string range $string $start $nearest]
            incr nearest
        }

        lappend res $msym-

        set value [string range $string $nearest $stop]

        storeAttr pool $value
        set  start $stop
        incr start

        #puts "... $nearest ... $stop ($value) $mcmd $msym"

        # And go looking for more matches.

        # Update invalidated guesses. Note that we do not renew
        # matchers which have already told us that the remainder of
        # the input does not contain anything they can match.

        foreach {cmdb cmde sym} $matchers {
            set begin $loc($cmdb)
            if {($begin >= 0) && ($begin < $start)} {
                set loc($cmdb) [$cmdb $string $start]
                #puts "$cmdb = $loc($cmdb) renew"
            }
        }
    }
    return [list $res [list $pool_idx $pool]]
}


proc ::lexbase::sublex {result string} {
    upvar pool pool pool_idx pool_idx pool_cache pool_cache $result res

    variable punctuation
    variable keywords
    variable idsymbol
    variable ws
    variable rxmatchers

    #puts stderr "----$ws"

    # The string we got is a fragment without any vari-sized entities contained in it.

    # We have to find punctuation, keywords, and
    # identifiers. Punctuation is [string map]-separated from the
    # rest. Then whitespace is normalized. At last we split and
    # iterate over the result, detecting keywords as we go.

    foreach item [split [string map $punctuation $string] \000] {
        #puts stderr "__ $item __"

        # Pass Punctuation symbols
        if {[string equal \001 [string index $item 0]]} {
            lappend res [string range $item 1 end]
            continue
        }

        regsub -all -- $ws $item \000 tmp

        # Note: It is faster to ignore empty elements after the split,
        # instead of collapsing the offending sequences.
        # set tmp [string map "\000\000\000 \000 \000\000 \000" $tmp]

        #puts stderr "________ $tmp __"

        foreach phrase [split $tmp \000] {
            if {[string length $phrase] == 0} {continue}
            
            # Recognize keywords
            if {[info exists keywords($phrase)]} {
                lappend res $keywords($phrase)
                continue
            }

            # Go through additional regexes to see if there are
            # special symbols which are neither keywords, nor
            # identifiers. Like numeric constants. Whatever is
            # matched, the phrase is added to the phrase pool.

            set found 0
            foreach {p sym} $rxmatchers {
                if {[regexp -- $p $phrase]} {
                    set found 1
                    lappend res ${sym}-
                    break
                }
            }
            if {!$found} {
                # Identifier. Handled like vari-sized entities in that
                # multiple occurrences are collapsed.
                    
                lappend res ${idsymbol}-
            }
            storeAttr pool $phrase
        }
    }
    return
}
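
To show how the pieces fit together, here is a minimal, hypothetical usage sketch. The symbols, patterns, and matcher commands are invented for this example and are not the actual C definitions referenced above. A begin-matcher is expected to return the index of the next occurrence of its entity (or -1 if there is none); an end-matcher returns the index of the entity's last character.

source lexbase.tcl   ;# the module above (provides package lexbase 2.0)

# Hypothetical matchers for double-quoted strings (no escape handling).
proc matchStrBegin {string start} {
    return [string first \" $string $start]
}
proc matchStrEnd {string start} {
    # start is the index of the opening quote; return the index of
    # the closing quote.
    return [string first \" $string [expr {$start + 1}]]
}

::lexbase::DefStart
::lexbase::DefWS  {[ \t\n]+}
::lexbase::DefI   IDENT
::lexbase::DefP   "("  LPAREN
::lexbase::DefP   ")"  RPAREN
::lexbase::DefP   ";"  SEMICOLON
::lexbase::DefP   ">"  GT
::lexbase::DefP   ">=" GE
::lexbase::DefK   if
::lexbase::DefK   return
::lexbase::DefRxM {^[0-9]+$} NUMBER
::lexbase::DefM   STRING matchStrBegin matchStrEnd
::lexbase::DefEnd

foreach {tokens attr} [::lexbase::lex {if (s >= "abc") return 1;}] break
foreach {indices values} $attr break

puts $tokens    ;# IF LPAREN IDENT- GE STRING- RPAREN RETURN NUMBER- SEMICOLON
puts $values    ;# s {"abc"} 1   (each distinct value stored once)
puts $indices   ;# 0 1 2         (one pool index per '-' suffixed symbol)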
