This page is based on http://www.paulgraham.com/spam.html but it is still missing major pieces of functionality (like the code to build the frequency tables from the message corpora, the tables probably should not be recomputed every time, etc.) Enjoy! ''[DKF]'' Note that Graham's work is based, perhaps indirectly, on such earlier efforts as appear in [http://citeseer.nj.nec.com/397172.html] and [http://www-stat.stanford.edu/~tibs/ElemStatLearn]. ---- set WordRE {[-\w'$]+} proc countwords {table string} { global WordRE upvar 1 $table t set i 0 while {[regexp -indices -start $i $WordRE $string match]} { foreach {j i} $match {} set word [string range $string $j $i] if {[catch {incr t($word)}]} { set t($word) 1 } } } proc genprob {word} { upvar #0 goodTable good $badTable bad set g 0 catch { set g [expr {$good($word) * 2}] } set b 0 catch { set b $bad($word) } if {$g == 0 && $b == 0} { # Not seen before return .2 } if {$g+$b < 5} { # Not frequent enough return .0 } set bfreq [min 1. [expr {double($b)/$badCount}]] set gfreq [min 1. [expr {double($g)/$goodCount}]] return [max .01 [min .99 [expr {$bfreq / ($gfreq + $bfreq)}]]] } proc combine {probs} { set p1 1. set p2 1. foreach prob $probs { set p1 [expr {$p1 * $prob}] set p2 [expr {$p2 * (1. - $prob)}] } return [expr {$p1 / ($p1 + $p2)}] } proc min {x y} {expr {$x<$y ? $x : $y}} proc max {x y} {expr {$x>$y ? $x : $y}} proc isSpam {message} { global WordRE while {[regexp -indices -start $i $WordRE $message match]} { foreach {j i} $match {} set t([string range $string $j $i]) {} } foreach word [array names t] { set p [genprob $word] lappend magic [list [expr {abs($p-.5)}] $p] } foreach l [lrange [lsort -real -index 0 $magic] 0 15] { lappend interesting [lindex $l 1] } return [expr {[combine $interesting] > .9}] } ---- [[Nice work.]] There's something in PERL which uses Bayes here [http://www.ai.mit.edu/~jrennie/ifile/] - it interfaces to exmh, too.