Reading a word at a time

Reading a word at a time

Reading a line at a time from a channel is easy with gets:

# The two-argument form of gets stores the line in `line` and returns its
# length; the loop ends when it returns -1 at end of file.
while {[gets $chan line] >= 0} {
  # Do something with $line
}

but what about doing the same thing with individual words? The following snippet defines functions to allow for writing the same kind of loop, just for a word at a time instead of a line:

package require Tcl 8.6

namespace eval word {
    # Coroutine body that does all the work: reads $chan a line at a
    # time and hands the words of each line to the caller one by one.
    #
    # The first yield returns the coroutine's own name, so creating the
    # coroutine evaluates to the command the caller should invoke. Every
    # later resumption passes the name of a variable in the caller's
    # scope; the next word is stored there and its length is yielded.
    # Once the channel is exhausted the variable is set to the empty
    # string and -1 is returned, which also ends the coroutine.
    proc read_words {chan} {
        # Give upvar an explicit level so that a caller-supplied
        # variable name can never be mistaken for a level specifier.
        upvar 1 [yield [info coroutine]] word
        while {[::chan gets $chan line] >= 0} {
            # Split on arbitrary whitespace
            foreach w [regexp -all -inline {\S+} $line] {
                set word $w
                # Re-link on every resumption: the caller is free to
                # pass a different variable name each time.
                upvar 1 [yield [string length $w]] word
            }
        }
        # Like the two-argument form of `gets`, leave the caller's
        # variable empty when reporting end of file.
        set word {}
        return -1
    }

    # Return the name of a coroutine that reads a word at a time from
    # the given channel. The coro takes one argument - the name of the
    # variable to store the word in, and yields the length of the word
    # or -1 at end of file. Basically like the two-argument form of
    # `gets`. The channel is not closed by the coroutine.
    proc reader_from_chan {chan} {
        # Per-namespace counter used to give every reader a unique name
        variable counter
        coroutine reader[incr counter] read_words $chan
    }

    # Command-trace callback: close $chan when the traced command is
    # deleted. The trace machinery appends oldName, newName and op.
    proc close_chan {chan oldName newName op} {
        if {$op eq "delete"} {
            ::chan close $chan
        }
    }

    # Return the name of a coroutine that reads a word at a time from
    # the given file (and an optional encoding argument). The coro takes
    # one argument - the name of the variable to store the word in, and
    # yields the length of the word or -1 at end of file. Basically like
    # the two-argument form of `gets`. The file is closed automatically
    # when the coroutine finishes or is otherwise deleted.
    proc reader_from_file {file {encoding {}}} {
        set chan [open $file r]
        try {
            if {$encoding ne ""} {
                ::chan configure $chan -encoding $encoding
            }
            set coro [reader_from_chan $chan]
            # Close the file when the coroutine finishes
            trace add command $coro delete [list ::word::close_chan $chan]
            return $coro
        } on error {result options} {
            # Setup failed: do not leak the channel, then re-raise
            ::chan close $chan
            return -options $options $result
        }
    }

    namespace export reader_from_*
    namespace ensemble create
}

It works by creating coroutines that internally read a line at a time, split that line into individual words, and yield each word in turn to the caller until there is no more data left to read from the underlying channel.

Example usage:

# Create a reader for input.txt and print each word on its own line
set next_word [word reader_from_file input.txt]
while {[$next_word token] >= 0} {
    puts $token
}

HE 2021-12-30: The example above has 4 lines. With about 4 extra lines it is possible to achieve the same result with plain commands. This is useful for anyone who only needs the functionality in one place in a script, wants to avoid the overhead of coroutines, etc.

# Print every word (run of visible characters) of input.txt on its own line
set srcfid [open input.txt r]
# fconfigure $srcfid ...
# Let gets signal end of file (-1) itself; testing eof before reading
# would run the body once more on an empty read after the last line.
while {[gets $srcfid line] >= 0} {
        foreach el [regexp -inline -all -- {[[:graph:]]+} $line] {
                puts $el
        }
}
close $srcfid

The commented-out fconfigure line can be enabled if you want to control line-ending translation, encoding, and other channel options.