Parsing TOML files

Arjen Markus (6 June 2022) I made a start with a parser for TOML files - a type of configuration file that is supposed to be easy to read for humans and easy to parse with just about any programming language. While the first may be true, the second is not, once you understand the semantics. At least, the specification allows for tricky parsing problems (or it may be me trying to cut things short instead of applying the proper techniques). Anyway, here is my valiant attempt at a parser for such files. There are lots of things that still need to be taken care of and the code needs to be cleaned up, but it might be useful.

Quite possibly, a fully compliant parser needs to be set up with the proper parsing techniques from the start. (Note: I borrowed some code from the CSV parser in Tcllib to avoid writing long and convoluted code for parsing the comma-separated value strings.)
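For reference - the code below does not use the package itself, but inlines a stripped-down copy of its splitting logic - the Tcllib csv package exposes that logic as ::csv::split:

package require csv

# A quoted field may itself contain the separator character
puts [::csv::split {"a,b",c}]    ;# prints: {a,b} c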

Note also that, while superficially TOML files look like old-style INI files, the syntax is much richer (and better defined), which makes parsing them quite a bit more complicated. And the code below does not try to write out a dictionary in the form of a TOML file - there would be quite a bit of ambiguity: a TOML array is represented as a list-valued element in the returned dictionary, but there is no way of knowing whether such an element is meant to be a TOML array or a string containing several words. That is a problem typical for Tcl, I'd say.
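A small illustration of that ambiguity, with hypothetical values typed in directly rather than produced by the parser:

# Nothing tells us that "ports" came from a TOML array and "title" from
# a TOML string - both are plain Tcl strings in the dictionary
set example [dict create ports {8000 8001 8002} title "TOML Example"]

puts [llength [dict get $example ports]]    ;# 3 elements
puts [llength [dict get $example title]]    ;# 2 "elements" just the same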

Here is the example TOML file I used. If you examine the output you should see a few of the tricky bits in the TOML specification:

# This is a TOML document.

title = "TOML Example"
# Duplicate key:
##title = "TOML Example - double"

title2 = "Embedded # - no comment" # Line with a # in the string
title3 = "Embedded \" - again a complication" # Of course, a " in the comment may upset the parser ...

##keynovalue = # Empty value
##= value # Empty key

[owner]
name = "Tom Preston-Werner"
dob = 1979-05-27T07:32:00-08:00 # First class dates

[database]
server = "192.168.1.1"
ports = [ 8000, 8001, 8002 ]
connection_max = 5000
enabled = true

[servers]

  # Indentation (tabs and/or spaces) is allowed but not required
  [servers.alpha]
  ip = "10.0.0.1"
  dc = "eqdc10"

  [servers.beta]
  ip = "10.0.0.2"
  dc = "eqdc10"

[clients]
data = [ ["gamma", "delta"], [1, 2] ]

# Line breaks are OK when inside arrays
hosts = [
  "alpha",
  "omega"
]

And here is the code itself:

# toml.tcl --
#     Rudimentary start of a parser for TOML configuration files
#     See https://github.com/toml-lang/toml for more information on this type of file
#
#     TOML is supposed to be easy to parse, but there are tricky bits :(
#
#     For the moment:
#     - Tricky comments are ignored
#     - Tricky names are ignored, like 'a "bc"' = "xxx"
#
#     TODO:
#     - arrays of tables
#     - inline tables
#     - substitutions
#     - tricky bits, but they can wait
#
#     Not supported at the moment:
#     - arrays of tables
#     - nested arrays
#     - inline tables
#     - strings with a hash embedded
#     - comments with a quote character (") embedded
#     - arrays with ' (trickiness in the regular expressions)
#     - arrays with embedded comments
#

# From the Wiki, Donal's implementation
proc pdict {dict {pattern *}} {
   global outfile
   set longest [tcl::mathfunc::max 0 {*}[lmap key [dict keys $dict $pattern] {string length $key}]]
   dict for {key value} [dict filter $dict key $pattern] {
      puts $outfile [format "%-${longest}s = %s" $key $value]
   }
}

package provide tomlfile 0.1

namespace eval ::toml {
    variable infile
}

# stripComment --
#     Strip off the comment - watch out for embedded "#" characters
#
# Arguments:
#     line            Line to be examined
#
# Returns:
#     Line with the comment removed
#
proc ::toml::stripComment {line} {

    #
    # Is there a comment character?
    #
    set poshash [string first "#" $line]
    if { $poshash < 0 } {
        return $line
    }

    #
    # Comment character after the last quote character?
    #
    set poslastquote [string last "\"" $line]
    if { $poslastquote < $poshash } {
        return [string range $line 0 [expr {$poshash-1}]]
    }

    #
    # TODO
    #
    return ">>$line"
}

# stripQuotes --
#     Strip off the quoting characters (' and ")
#
# Arguments:
#     string          String to be examined
#
# Returns:
#     String with the outer quoting characters removed
#
proc ::toml::stripQuotes {string} {

    if { [string index $string 0] eq "'" && [string index $string end] eq "'" } {
        set string [string range $string 1 end-1]
    } elseif { [string index $string 0] eq "\"" && [string index $string end] eq "\"" } {
        set string [string range $string 1 end-1]
    }

    return $string
}

# loadAllLines --
#     Load all lines up to the end for literal strings
#
# Arguments:
#     string          The first part of the value
#
# Returns:
#     The complete (possibly multi-line) string with the triple quotes removed
#
proc ::toml::loadAllLines {string} {
    variable infile

    set quoting [string range $string 0 2]

    #
    # Do we need to load more?
    #
    if { [string range $string end-2 end] ne $quoting } {

        while { [gets $infile line] >= 0 } {
            append string "\n$line"

            if { [string range $line end-2 end] eq $quoting } {
                break
            }
        }
    }

    return [string range $string 3 end-3]
}

# makeList --
#     Turn a string of comma-separated values into a proper list
#
# Arguments:
#     string          String to be converted
#
# Returns:
#     List of values
#
proc ::toml::makeList {string} {
    set newlist {}

    if { [string first "\"" $string] < 0 && [string first "'" $string] < 0 } {
        set newlist [split $string ,]
    } else {
        #
        # Possibility of embedding commas, use the code from the CSV package
        #

        set sepChar ,
        set delChar "\""
        set sepRE \[\[.${sepChar}.]]
        set delRE \[\[.${delChar}.]]
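        #
        # Note: the [[.x.]] collating-element syntax makes the regular
        # expressions match the separator and delimiter characters
        # literally, even if they happen to be regexp metacharacters
        #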

        set line $string
        if { [string index $string end] eq "," } {
            set line [string range $string 0 end-1]
        }

        regsub -- "$sepRE${delRE}${delRE}$" $line $sepChar\0${delChar}${delChar}\0 line
        regsub -- "^${delRE}${delRE}$sepRE" $line \0${delChar}${delChar}\0$sepChar line
        regsub -all -- "(^${delRE}|${delRE}$)" $line \0 line

        set line [string map [list \
                $sepChar${delChar}${delChar}${delChar} $sepChar\0${delChar} \
                ${delChar}${delChar}${delChar}$sepChar ${delChar}\0$sepChar \
                ${delChar}${delChar}           ${delChar} \
                ${delChar}             \0 \
                ] $line]

        set end 0
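        #
        # Separators that occur between a pair of \0 markers are inside
        # a quoted field: turn them into \1 so the final split keeps them
        #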
        while {[regexp -indices -start $end -- {(\0)[^\0]*(\0)} $line \
                -> start end]} {
            set start [lindex $start 0]
            set end   [lindex $end 0]
            set range [string range $line $start $end]
            if {[string first $sepChar $range] >= 0} {
                set line [string replace $line $start $end \
                        [string map [list $sepChar \1] $range]]
            }
            incr end
        }
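        #
        # Remaining separators become field boundaries, protected
        # separators (\1) are restored and the \0 markers are dropped
        #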
        set line [string map [list $sepChar \0 \1 $sepChar \0 {} ] $line]

        set newlist [::split $line \0]
    }

    return $newlist
}

# loadArray --
#     Load all lines up to the end for arrays
#
# Arguments:
#     string          The first part of the value
#
# Returns:
#     The values of the array, wrapped as a single list element
#
proc ::toml::loadArray {string} {
    variable infile

    set quoting "\]"

    set firstPart [string range $string 1 end]
    if { [string index $firstPart end] eq $quoting } {
        set firstPart [string range $firstPart 0 end-1] ;# Strip the closing bracket of a single-line array
    }
    set arrayValues [makeList $firstPart]

    #
    # Do we need to load more?
    #
    if { [string index $string end] ne $quoting } {

        while { [gets $infile line] >= 0 } {
            set valuestring [string trim $line]
            if { [string index $valuestring end] eq "\]" } {
                set valuestring [string trim [string range $valuestring 0 end-1]] ;# Here we possibly strip off too much - nested arrays
            }

            set arrayValues [concat $arrayValues [makeList $valuestring]]

            if { [string index $line end] eq $quoting } {
                break
            }
        }
    }

    return [list $arrayValues]
}

# keyValuePair --
#     Split the line into a key-value pair
#
# Arguments:
#     line            Line to be examined
#
# Returns:
#     List of two elements, the "key" and the (possibly partial) "value"
#
proc ::toml::keyValuePair {line} {
    set poseq [string first "=" $line]
    set key   [string trim [string range $line 0 [expr {$poseq-1}]]]
    set value [string trim [string range $line [expr {$poseq+1}] end]]

    #
    # Get the actual key and check for emptiness
    #
    set key [stripQuotes [string map {" " ""} $key]]

    if { [string trim $key] eq "" } {
        return -code error "Syntax error in key/value: key is empty - $line"
    }

    #
    # Is this a value that may span several lines? If so, load all lines
    #
    if { [string range $value 0 2] eq "\"\"\"" || [string range $value 0 2] eq "'''" } {
        set value [loadAllLines $value]

    } elseif { [string index $value 0] eq "\[" } {
        set value [loadArray $value]
    } else {
        set value [stripQuotes $value]
    }

    if { $value eq "" } {
        return -code error "Syntax error in key/value: value is empty - $line"
    }

    return [list $key $value]
}

# tableName --
#     Extract the name of the table
#
# Arguments:
#     line            Line to be examined
#
# Returns:
#     Name of the new table
#
proc ::toml::tableName {line} {
    set posopen  [string first \[ $line]
    set posclose [string first \] $line]

    if { $posopen < 0 || $posclose < 0 || $posclose < $posopen } {
        return -code error "Syntax error in table name: $line"
    } else {
        return [split [string range $line [expr {$posopen + 1}] [expr {$posclose - 1}]] .]
    }
}

# tomlParse --
#     Parse the TOML file and return the contents as a dictionary
#
# Arguments:
#     tomlfile           The name of the TOML file
#
# Result:
#     A dictionary containing the contents of the TOML file
#
# Notes:
#     - If the TOML file contains syntax errors, then an error is raised.
#     - Not all valid TOML files are read correctly. There are a number of limitations.
#
proc ::toml::tomlParse {tomlfile} {
    variable infile

    global outfile ;# Debugging/testing

    set infile  [open $tomlfile]
    set outfile [open "report.out" w]

    set contents [dict create]
    set table ""

    while { [gets $::toml::infile line] >= 0 } {
        set line [::toml::stripComment $line]

        if { [string first = $line] >= 0 } {
            set keyvalue [::toml::keyValuePair $line]

            if { [dict exists $contents {*}$table [lindex $keyvalue 0]] } {
                return -code error "Duplicate key: $table [lindex $keyvalue 0]"
            } else {
                dict set contents {*}$table {*}$keyvalue
            }

            puts $outfile [join $keyvalue " --> "]
        } else {
            if { [string first \[ $line] >= 0 } {
                set table [::toml::tableName $line]
                puts $outfile "Table: $table"
            } else {
                if { [string trim $line] ne "" } {
                    return -code error "Unknown syntax: $line"
                }
                puts $outfile $line
            }
        }
    }

    return $contents
}

# test --
#     Quick test of the package's one public command

set contents [::toml::tomlParse "example.toml"]

puts $outfile "\nResult:"
puts $outfile $contents

puts $outfile [dict get $contents servers alpha]
puts $outfile [dict get $contents servers beta ip]

pdict $contents