Version 6 of Markov

Updated 2002-10-10 10:10:18

This is a half-baked Markov chain toy which mangles text. Usage:

 ./markov.tcl [http://some.url.com/file...] [file]

AK: Any place with more info on how you are doing the markov chains ?

 # markov.tcl -- by David N. Welton <[email protected]>

 package require http

 # getdata -- get data either from the web, or file system.

 proc getdata {} {
    global argv
    set flnm [lindex $argv 0]
    if { $flnm == "" } {
        puts stderr "Please supply a filename!"
        exit 1
    }

    # If the file starts with http://, fetch it from the web.
    if { [ string compare -nocase -length 7 http:// $flnm ] } {
        set page [::http::geturl $flnm]
        set data [http::data $page]
        regsub -all {<[^>]*>} $data "" data
        regsub -all {&[^;]*} $data "" data
    } else {
        set fl [open $flnm r]
        set data [read $fl]
        close $fl
    }
    return $data
 }

 # Markov -- run markov chains of length 'chainlength' on data.

 proc Markov {data chainlength} {
    set idx 0
    set wd ""
    incr chainlength -1
    for {set i 0} {$i < $chainlength} {incr i} {lappend prev {}}
    set save ""
    array set hash {}

    # This loop could probably be rewritten using lists somehow,
    # making it faster.

    while { $idx < [string length $data] } {
        set chr [string index $data $idx]
        if { [string is alpha $chr] } {
            lappend prev $wd
            set prev [lrange $prev 1 end]
            set ws [string wordstart $data $idx]
            set we [string wordend $data $idx]
            set wd [string tolower [string range $data $ws [expr $we - 1]]]

            set key [concat $prev $wd]
            if { [info exists hash($key)] } {
                incr hash($key)
            } else {
                set hash($key) 1
            }
            set idx $we
        } elseif { $chr == "." || $chr == "!" || $chr == "?" || $chr == "," } {
            lappend prev $wd
            set prev [lrange $prev 1 end]
            set wd $chr
            set key [concat $prev $wd]
            if { [info exists hash($key)] } {
                incr hash($key)
            } else {
                set hash($key) 1
            }
        }
        if { $save == "" } {
            for {set i 0} {$i < $chainlength} {incr i} {
                if { [lindex $prev $i] == "" } {
                    break
                } else {
                    set save $prev
                }
            }
        }

        incr idx
    }

    foreach {k v} [array get hash] {
        lappend pairs [list $k $v]
    }
    set pairs [lsort -integer -index 1 $pairs]
    set l [llength $pairs]
    set i 0

    set wd $save
    set oput "$wd "
    while { $i < $l } {
        set possibles [array get hash "$wd *"]
        set tot 0
        foreach {k v} $possibles {
            incr tot $v
        }
        set walk [expr {$tot * rand()}]
        set tot 0
        foreach {k v} $possibles {
            incr tot $v
            if { $walk <= $tot } {
                set use [string range $k [expr [string last " " $k] + 1] end]
 #              puts "possibles were '$possibles', using '$use' because of walk $walk and $tot"
                break
            }
        }
        append oput "$use "
        lappend wd $use
        set wd [lrange $wd 1 end]
        incr i
    }
    return $oput
 }

 puts [Markov [getdata] 4]