This page is based on http://www.paulgraham.com/spam.html but it is still missing some functionality (like the code to build the frequency tables from the message corpora, etc.)
Enjoy!
Note that Graham's work is based, perhaps indirectly, on such earlier efforts as appear in [L1 ] and [L2 ].
switch $tcl_platform(platform) { unix - mac { set ConfigFile ~/.tclSpamFilter/config.tcl } windows { # This seems to be the right spot on Win98... set ConfigFile "c:/windows/application data/Tcl Spam Filter/config.tcl" } } proc extendTable {type string {direction 1}} { global WordRE access upvar #0 ${type}Table t ${type}Count c set i 0 set now [clock seconds] while {[regexp -indices -start $i $WordRE $string match]} { foreach {j i} $match {} set word [string range $string $j $i] if {[catch { if {[incr t($word) $direction] == 0} { unset t($word) } else { set access($word) $now } }]} then { set t($word) $direction set access($word) $now } } incr c $direction } proc generateProbability {word} { global goodTable goodCount badTable badCount set g 0 catch { set g [expr {$goodTable($word) * 2}] } set b 0 catch { set b $badTable($word) } if {$g == 0 && $b == 0} { # Not seen before return 0.2 } if {$g+$b < 5} { # Not frequent enough return 0.0 } set bfreq [min 1.0 [expr {double($b)/$badCount}]] set gfreq [min 1.0 [expr {double($g)/$goodCount}]] return [max 0.01 [min 0.99 [expr {$bfreq / ($gfreq + $bfreq)}]]] } proc combine {probs} { set p1 1.0 set p2 1.0 foreach prob $probs { set p1 [expr {$p1 * $prob}] set p2 [expr {$p2 * (1.0 - $prob)}] } return [expr {$p1 / ($p1 + $p2)}] } proc min {x y} {expr {$x<$y ? $x : $y}} proc max {x y} {expr {$x>$y ? $x : $y}} proc isSpam {message} { global WordRE reasons while {[regexp -indices -start $i $WordRE $message match]} { foreach {j i} $match {} set t([string range $string $j $i]) {} } foreach word [array names t] { set p [generateProbability $word] lappend magic [list [expr {abs($p-0.5)}] $p $word] } foreach l [lrange [lsort -decreasing -real -index 0 $magic] 0 15] { append reasons "[lindex $l 2] (score=[lindex $l 1]) " lappend interesting [lindex $l 1] } set score [combine $interesting] append reasons "=> Overall Score $score" return [expr {$score > 0.9}] } proc saveTables {} { global TableFile goodTable goodCount badTable badCount set f [open $TableFile w] puts $f [list \ [array get goodTable] $goodCount \ [array get badTable] $badCount \ [array set access]] close $f } proc loadTables {} { global TableFile goodTable goodCount badTable badCount access set list {} catch { set f [open $TableFile r] set list [read $f] close $f } array unset goodTable array unset badTable set done 0; # Flag because of catch! catch { if {[llength $list] == 5} { foreach {gt gc bt bc ac} $list {} if { !([llength $gt] & 1) && !([llength $bt] & 1) && !([llength $ac] & 1) && [string is integer -strict $gc] && [string is integer -strict $bc] } then { array set goodTable $gt set goodCount $gc array set badTable $bt set badCount $bc array set access $ac set done 1 } } } if {!$done} { array set goodTable {} set goodCount 0 array set badTable {} set badCount 0 array set access {} } } proc expireTables {} { global goodTable badTable access Expiry expires if {!$Expiry(Enabled)} { return } set expires [expr {[clock seconds]-$Expiry(Interval)}] foreach {word time} [array get access] { if {$time > $expires} { # Not expired yet! continue } set total 0 catch {incr total $goodTable($word)} catch {incr total $badTable($word)} if {$total > $Expiry(InhibitCount)} { # Too common anyway continue } catch {unset goodTable($word)} catch {unset badTable($word)} unset access($word) } } proc log {string infoMsg {optMsg {}}} { global Log if {!($Log(Enabled) || [string length $optMsg])} { return } set s [clock format [clock seconds]] if {[string length $string] && $Log(Subject)} { if {[regexp -line {^Subject:\s+(.*)} $string -> subject]} { append s ": subject=$subject" } else { append s ": no subject" } } else { append s ":" } if {[string length $string] && $Log(Source)} { if {[regexp -line {^(?:Sender|From):\s+(.*)} $string -> source]} { append s ": source=$source" } else { append s ": no source" } } else { append s ":" } append s $infoMsg if {[string length $optMsg]} { append s "\n$optMsg" } set fid [open $Log(File) a] puts $fid $s close $fid } # Basic functionality interfaces proc addSpam {{fid stdin}} { set message [read $fid] extendTable bad $message log $message "added as spam" saveTables } proc addNonspam {{fid stdin}} { set message [read $fid] extendTable good $message log $message "added as non-spam" saveTables } proc removeSpam {{fid stdin}} { set message [read $fid] extendTable bad $message -1 log $message "removed from spam" saveTables } proc removeNonspam {{fid stdin}} { set message [read $fid] extendTable good $message -1 log $message "removed as non-spam" saveTables } # Transfer message from one table to the other proc convertToSpam {{fid stdin}} { set message [read $fid] extendTable good $message -1 extendTable bad $message log $message "converted to spam" saveTables } proc convertToNonspam {{fid stdin}} { set message [read $fid] extendTable bad $message -1 extendTable good $message log $message "converted to non-spam" saveTables } # Filtering interfaces proc filterSpam {{fid stdin}} { global reasons set flag [isSpam [set message [read $fin]]] log $message "${reasons}: [expr {$flag ? {spam} : {non-spam}}]" exit $flag } proc aggressiveFilterSpam {{fid stdin}} { ## This procedure not only reports via the process exit code ## whether the message is spam, but also updates its internal ## database accordingly. Like that, it should be able to maintain ## the database in the face of slowly changing spam with ## absolutely no user intervention (except in the case of wholly ## new classes of spam.) global reasons set message [read $fid] set flag [isSpam $message] extendTable [expr {$flag ? "bad" : "good"}] $message log $message "${reasons}: [expr {$flag ? {spam} : {non-spam}}]: added" saveTables exit $flag } proc initialize {} { global WordRE ConfigFile set WordRE {[-\w'$]+} if {![file exists $ConfigFile]} { set dir [file dirname $ConfigFile] if {![file exists $dir]} { file mkdir $dir } set cfg {### Tcl Spam Filter Configuration File ### Where to load and save the tables of word frequencies set TableFile @@APPDIR@@/tables.db ### Rarely-used word expiry set Expiry(Enabled) 1 # If a word is not entered into the database for a month # and a half (measured in seconds) it should be removed. set Expiry(Interval) 3888000 # However, if the word has come up at least this number of # times, don't bother. set Expiry(InhibitCount) 10 ### Logging set Log(Enabled) 1 set Log(File) @@APPDIR@@/decisions.log # Log message subjects? set Log(Subject) 1 # Log message senders? set Log(Source) 1 } set fid [open $ConfigFile w] puts $fid [string map [list @@APPDIR@@ $dir] $cfg] close $fid } uplevel #0 {source $ConfigFile} } proc main {} { global argv errorInfo initialize loadTables if {[catch {[lindex $argv 0]} msg]} { set ei $errorInfo catch { log {} $msg $ei exit 2 } # Logging system is stuffed! :^/ puts stderr $ei puts stderr $errorInfo exit 3 } } main # This program must not be run multiple times simultaneously; when # installing it as a mail filter assistant, you *must* provide an # adequate level of locking yourself!
[Nice work.]
There's something in PERL which uses Bayes here [L3 ] - it interfaces to exmh, too.