[JBR] - 20101022 fts.tcl - index a directory tree of random docs. I have been using swish-e and swish++ for a while but they are too slow and don't have very nice incremental index features. Here is a full text search system written in tcl using sqlite3's fts3 virtual tables. I'm a rank novice at sql and implemented the searchrank function as simply as possible so any suggestions are welcome. This entire program was written in 1 day, inspiration to wiki in 6 hours, enjoy. The search system in configured with a configuration file similar to swish++. There are several high level commands, here are some example command lines: ./fts index myindex.conf ./fts search myindex.conf "search terms" A little debugging output has been included and is selectable with a final arg to "fts index". The command gives a short summary if run with no args. john@tdc : ./fts fts: unknown subcommand "" fts index - index a set of directories indicated in verbosity is a comma separated list of the message types insert, update, unindexed , unchanged,exclude or a unique prefix of them. fts search - seach the index for query fts excludes - display the exclude patterns from fts filters - display the filter patterns from fts docs - display a table of documents in the index. fts.conf is the default config file. ------------------------------------- fts.conf: set tmp /var/tmp stopwords stopwords filter *.bz2 bunzip2 -c %f > @%F filter *.gz gunzip -c %f > @%F filter *.Z uncompress -c %f > @%F filter *.doc catdoc %f filter *.xls xls2csv %f filter *.ppt catppt %f filter *.prn pdftotext %f - filter *.pdf pdftotext %f - filter *.PS ps2ascii %f filter *.ps ps2ascii %f filter *.eps ps2ascii %f filter *.txt cat %f filter *.htm lynx -dump %f filter *.html lynx -dump %f filter */page/* cat %f exclude *.dxf exclude *.exe exclude *.gif *.jpg *.png .jpeg exclude *.mpeg *.mov .avi exclude *.mf1 *.mf2 exclude *.sym exclude *.zip exclude *~ exclude *.files/ index-path /data/cgijohn/paige/file index-path /data/cgijohn/paige/page index-path /data/wdocs/john/idoc-db index-path /data/wdocs/dfabricant/www-docs --------------------------------- fts.tcl: --------------------------------- ====== #!/home/john/bin/tclkit8.6 # load /home/john/lib/libtclsqlite3.so set verb QUIET proc verb { type message } { if { [regexp $::verb $type] } { puts "[format %10.10s $type] : $message" } } proc database { database } { set ::database $database } set excludes {} proc exclude { args } { foreach pattern $args { lappend ::excludes $pattern if { [regexp {^\*\.[a-z0-9]+$} $pattern] } { lappend ::excludes [string toupper $pattern] } } } proc exclude? { file } { foreach pattern $::excludes { if { [string match $pattern $file] } { return 1 } } return 0 } proc filter { pattern args } { lappend ::filters $pattern $args if { [regexp {^\*\.[a-z0-9]+$} $pattern] } { lappend ::filters [string toupper $pattern] $args } } proc filter? { file } { foreach { pattern action } $::filters { if { [string match $pattern $file] } { if { [string first @%F $action] == -1 } { return [list [list [string map [list %f $file] $action]]] } else { set f $file set F [string map [list [string range $pattern 1 end] {}] $::tmp/[file tail $file]] set actions [list [string map [list %f $f @%F $F] $action] $F] if { [set action [filter? $F]] eq {} } { return {} } else { return [list $actions {*}$action] } } } } return {} } proc filter! { actions } { set tmpfiles {} foreach action [lrange $actions 0 end-1] { foreach { action tmpfile } $action {} try { exec -ignorestderr {*}$action } on error message { puts $message } lappend tmpfiles $tmpfile } set indx {} set body {} try { set body [exec -ignorestderr {*}[lindex [lindex $actions end] 0]] } on error message { puts $message } finally { if { $tmpfiles ne {} } { exec rm -f $tmpfiles } } return $body } proc stopwords { file } { foreach word [split [read [set fp [open $file]]]] { set ::stops($word) 1 } close $fp } proc index-file { file } { if { ![llength [set actions [filter? $file]]] } { verb unindexed $file return } try { set xtime [file mtime $file] } on error message { puts $message return } db eval { select mtime from documents where file = $file } { if { $mtime >= $xtime } { verb unchanged $file return } } set body [filter! $actions] set body [regsub -all {\m[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?\M} $body { }] set body [regsub -all {[,<>?\\~!@#$%^&*()_+]} $body { }] foreach word [split $body] { if { [regexp {[[:lower:]]} $word] } { if { [string length $word] <= 3 } { continue } set word [string tolower $word] } if { [info exists ::stops($word)] } { continue } append indx $word " " } if { [db eval { select rowid from documents where file = @file }] eq {} } { verb insert $file db eval { begin transaction ; insert into documents ( file, mtime ) values ( $file, $xtime ) ; insert into searchtext ( docid, body ) values ( last_insert_rowid(), $indx ) ; commit transaction } } else { verb update $file db eval { begin transaction ; update documents set mtime = $xtime where file = $file ; update searchtext set body = @body where docid = ( select rowid from documents where file = $file ) ; commit transaction } } } proc index-path { path } { lappend ::paths $path } proc index-path! { path } { foreach dir [glob -type d -directory $path -nocomplain *] { if { [exclude? $dir/] } { continue } index-path! $dir } foreach file [glob -type f -directory $path -nocomplain *] { if { [exclude? $file] } { continue } index-file $file } } set command [lindex $argv 0] set config [lindex $argv 1] set query [lrange $argv 2 end] if { $config eq {} } { puts "fts: no config file" exit 1 } else { try { source $::config } on error message { puts "fts: $message"; exit 1 } } proc searchrank { matchinfo } { binary scan $matchinfo iiiii nphrase ncol 1 2 3 return $1 } switch $command { docs - index - search { sqlite3 db $database db enable_load_extension 1 db function searchrank searchrank catch { db eval { create virtual table searchtext using fts3(tokenize=porter, body text); } } catch { db eval { create table documents ( file, mtime ) } } catch { db eval { create index on documents ( file ) } } } default { puts "fts: unknown subcommand \"$command\"" puts { fts index - index a set of directories indicated in verbosity is a comma separated list of the message types insert, update, unindexed , unchanged,exclude or a unique prefix of them. fts search - seach the index for query fts excludes - display the exclude patterns from fts filters - display the filter patterns from fts docs - display a table of documents in the index. fts.conf is the default config file. } exit 1 } } switch $command { excludes { foreach exclude $::excludes { puts $exclude } } filters { foreach { pattern action } $::filters { puts "[format %8.8s $pattern] : $action" } } index { if { [llength $argv] == 3 } { set ::verb ^(([string map { , .*)|( } [lindex $argv 2]].*))$ } set ::itime [file mtime $database] foreach path $::paths { index-path! $path } } search { db eval { select docid, searchrank(matchinfo(searchtext)) as rank from searchtext where body match @query order by searchrank(matchinfo(searchtext)) desc; } { db eval { select rowid, file from documents where rowid = $docid ; } { puts "$rank $file" } } } docs { puts "rowid mtime file" puts "----- ----- ----" db eval { select rowid, file, mtime from documents } { puts "$rowid $mtime $file" } } } ====== <>Uncategorized