Version 4 of Bayesian Spam Filtering

Updated 2002-08-17 13:28:03

This page is based on http://www.paulgraham.com/spam.html but it is still missing major pieces of functionality (like the code to build the frequency tables from the message corpora, the tables probably should not be recomputed every time, etc.)

Enjoy!

DKF

Note that Graham's work is based, perhaps indirectly, on such earlier efforts as appear in [L1 ] and [L2 ].


 set WordRE {[-\w'$]+}

 proc countwords {table string} {
    global WordRE
    upvar 1 $table t
    set i 0
    while {[regexp -indices -start $i $WordRE $string match]} {
       foreach {j i} $match {}
       set word [string range $string $j $i]
       if {[catch {incr t($word)}]} {
          set t($word) 1
       }
    }
 }

 proc genprob {word} {
    upvar #0 goodTable good $badTable bad
    set g 0
    catch {
       set g [expr {$good($word) * 2}]
    }
    set b 0
    catch {
       set b $bad($word)
    }
    if {$g == 0 && $b == 0} {
       # Not seen before
       return .2
    }
    if {$g+$b < 5} {
       # Not frequent enough
       return .0
    }
    set bfreq [min 1. [expr {double($b)/$badCount}]]
    set gfreq [min 1. [expr {double($g)/$goodCount}]]
    return [max .01 [min .99 [expr {$bfreq / ($gfreq + $bfreq)}]]]
 }

 proc combine {probs} {
    set p1 1.
    set p2 1.
    foreach prob $probs {
       set p1 [expr {$p1 * $prob}]
       set p2 [expr {$p2 * (1. - $prob)}]
    }
    return [expr {$p1 / ($p1 + $p2)}]
 }

 proc min {x y} {expr {$x<$y ? $x : $y}}
 proc max {x y} {expr {$x>$y ? $x : $y}}

 proc isSpam {message} {
    global WordRE
    while {[regexp -indices -start $i $WordRE $message match]} {
       foreach {j i} $match {}
       set t([string range $string $j $i]) {}
    }
    foreach word [array names t] {
       set p [genprob $word]
       lappend magic [list [expr {abs($p-.5)}] $p]
    }
    foreach l [lrange [lsort -real -index 0 $magic] 0 15] {
       lappend interesting [lindex $l 1]
    }
    return [expr {[combine $interesting] > .9}]
 }


[Nice work.]

There's something in PERL which uses Bayes here [L3 ] - it interfaces to exmh, too.