Version 2 of Bayesian Spam Filtering

Updated 2002-08-16 22:14:06

This page is based on http://www.paulgraham.com/spam.html but it is still missing major pieces of functionality (like the code to build the frequency tables from the message corpora, the tables probably should not be recomputed every time, etc.)

Enjoy!

DKF


 set WordRE {[-\w'$]+}

 proc countwords {table string} {
    global WordRE
    upvar 1 $table t
    set i 0
    while {[regexp -indices -start $i $WordRE $string match]} {
       foreach {j i} $match {}
       set word [string range $string $j $i]
       if {[catch {incr t($word)}]} {
          set t($word) 1
       }
    }
 }

 proc genprob {word} {
    upvar #0 goodTable good $badTable bad
    set g 0
    catch {
       set g [expr {$good($word) * 2}]
    }
    set b 0
    catch {
       set b $bad($word)
    }
    if {$g == 0 && $b == 0} {
       # Not seen before
       return .2
    }
    if {$g+$b < 5} {
       # Not frequent enough
       return .0
    }
    set bfreq [min 1. [expr {double($b)/$badCount}]]
    set gfreq [min 1. [expr {double($g)/$goodCount}]]
    return [max .01 [min .99 [expr {$bfreq / ($gfreq + $bfreq)}]]]
 }

 proc combine {probs} {
    set p1 1.
    set p2 1.
    foreach prob $probs {
       set p1 [expr {$p1 * $prob}]
       set p2 [expr {$p2 * (1. - $prob)}]
    }
    return [expr {$p1 / ($p1 + $p2)}]
 }

 proc min {x y} {expr {$x<$y ? $x : $y}}
 proc max {x y} {expr {$x>$y ? $x : $y}}

 proc isSpam {message} {
    global WordRE
    while {[regexp -indices -start $i $WordRE $message match]} {
       foreach {j i} $match {}
       set t([string range $string $j $i]) {}
    }
    foreach word [array names t] {
       set p [genprob $word]
       lappend magic [list [expr {abs($p-.5)}] $p]
    }
    foreach l [lrange [lsort -real -index 0 $magic] 0 15] {
       lappend interesting [lindex $l 1]
    }
    return [expr {[combine $interesting] > .9}]
 }


[Nice work.]