Indexing and Searching the Wikit with Xapian

schlenk 2006-06-01: These two scripts provide a simple search facility for the wiki based on the Xapian package. They could easily be extended into a website providing a better full-text search for the Tcl'ers Wiki.


The indexer

#!/usr/bin/env tclsh
#
# Index the Tcl'ers Wiki
#
# (c) 2006 Michael Schlenker <[email protected]
#
# Index the Tcl'ers wiki with the xapian fulltext searching package.
#

package require Tcl 8.4
package require xapian 0.9.6
package require Mk4tcl
package require logger

set version 0.1

# Create the "windex" logger service and import its commands with a log_
# prefix (log_error, log_notice, log_info, log_debug are used below).
set log [logger::init windex]
logger::import -prefix log_ windex
${log}::setlevel notice

# path to the indexing dir
set indexpath db
# path to the uncompressed wikit metakit file
# (from http://mini.net/cgi-bin/wikit.gz )
#
set wikifile wikit
set dbfile [file join $indexpath xapindex]
# prefix prepended to the page number to build the URL stored with each document
set urlprefix https://wiki.tcl-lang.org/

# upper bound on the length of a term accepted by indexTextBlock
set MAX_PROB_TERM_LENGTH 64

# Make sure the index directory exists before anything tries to write to it.
if {![file isdirectory $indexpath]} {
    if {[file exists $indexpath]} {
        # Something that is not a directory is in the way; the index cannot
        # be created, so stop here instead of failing later during indexing.
        log_error "Cannot create index cache dir"
        exit 1
    } else {
        file mkdir $indexpath
    }
}
        
# indexWiki --
#
#   Index every row of the wk.pages view of a wikit Metakit datafile into
#   the Xapian database at $::dbfile and log a summary at notice level.
#
# Arguments:
#   file    path of the (uncompressed) wikit Metakit datafile
#
# Results:
#   Whatever log_notice returns; the interesting effect is the written index.
#
proc indexWiki {file} {
    ::mk::file open wk $file -readonly
    set start [clock seconds]
    set db [openIndexDatabase $::dbfile]
    set count 0
    mk::loop i wk.pages {
        # indexOneWikiPage returns 1 for an indexed page, 0 for a skipped one
        incr count [indexOneWikiPage wk $db $i]
    }
    closeIndexDatabase $db
    set stop [clock seconds]
    # The datafile is no longer needed once all pages have been processed.
    ::mk::file close wk
    set diff [expr {$stop - $start}]
    # [clock seconds] has one-second resolution, so a quick run gives an
    # elapsed time of 0; avoid dividing by zero in that case.
    if {$diff > 0} {
        set persec [expr {$count / $diff}]
    } else {
        set persec $count
    }
    log_notice "Indexed $count documents in $diff seconds ($persec doc/sec)"
}

# indexOneWikiPage --
#
#   Add a single wiki page to the Xapian index.
#
# Arguments:
#   wiki    handle of the open wikit datafile (not used by this proc)
#   db      command name of the writable Xapian database
#   cur     Metakit cursor of the page row, e.g. wk.pages!42
#
# Results:
#   1 if the page was indexed, 0 if it was skipped because its text is empty.
#
proc indexOneWikiPage {wiki db cur} {
    set fields [mk::get $cur name page]
    set title [lindex $fields 0]
    set body [lindex $fields 1]
    if {[string length $body] == 0} {
        return 0
    }
    log_info "Processing \"$title\""
    xapian::Document doc
    # Drop the leading "wk.pages!" (9 characters) to get the page number.
    set pageId [string range $cur 9 end]
    # The result list needs the page URL and its title.
    doc set_data [list $::urlprefix$pageId $title]
    indexTextBlock doc $body 0
    $db add_document doc
    doc -delete
    return 1
}

# indexTextBlock --
#
#   Split a block of text into alphanumeric terms and add them to a
#   Xapian document: each lower-cased term as a posting with a running
#   position, plus its stemmed form (via the global estem stemmer) as a
#   plain term.
#
# Arguments:
#   doc     command name of the xapian::Document to fill
#   text    text to index
#   pos     position assigned to the first term (default 0)
#
# Results:
#   The position following the last posting that was added.
#
proc indexTextBlock {doc text {pos 0}} {
    # A term is one or more alphanumerics, with optional trailing
    # + and/or - (e.g. C++).  But "hyphen-ated" should generate
    # "hyphen" not "hyphen-".
    # set re {([[:alnum:]]+(?:[-+]*(?![[:alnum:]]))?)}
    set re {([[:alnum:]]+)}
    set j 0
    while {[regexp -indices -start $j $re $text -> word]} {
        set i [lindex $word 0]
        set j [lindex $word 1]
        # Both indices are inclusive, so the term length is j - i + 1.
        if {($j - $i + 1) <= $::MAX_PROB_TERM_LENGTH} {
            set term [string tolower [string range $text $i $j]]
            set sterm [estem stem_word $term]
            log_debug "Indexing $term"
            $doc add_posting $term $pos
            incr pos
            $doc add_term $sterm
        }
        # Resume scanning right after the term just handled.
        incr j
    }
    return $pos
}

# openIndexDatabase --
#
#   Create (or overwrite) the writable Xapian database stored at the given
#   path and set up the English stemmer under the command name "estem".
#
# Arguments:
#   file    path of the Xapian database
#
# Results:
#   The command name of the database object.
#
proc openIndexDatabase {file} {
    set handle xapiandb
    set mode $::xapian::DB_CREATE_OR_OVERWRITE
    xapian::WritableDatabase $handle $file $mode
    xapian::Stem estem "english"
    return $handle
}

# closeIndexDatabase --
#
#   Dispose of a database object returned by openIndexDatabase.
#
# Arguments:
#   db      command name of the database object
#
# Results:
#   Whatever the object's -delete method returns.
#
proc closeIndexDatabase {db} {
    return [$db -delete]
}

# Script entry point: index the wikit datafile named by $wikifile.
indexWiki $wikifile

The simple command line searching utility

 #!/usr/bin/env tclsh
 #
 # Search the Wiki
 #
 # (c) 2006 Michael Schlenker <[email protected]
 #

 package require Tcl 8.4
 package require xapian 0.9.6
 package require logger

 # Create the "wsearch" logger service and import its commands with a
 # log_ prefix (log_info, log_debug and log_error are used below).
 set log [logger::init wsearch]
 logger::import -prefix log_ wsearch

 log_info "Using Xapian API version [package present xapian]"
 # Location of the Xapian index; same directory and file name the indexer
 # script uses.
 set indexpath db
 set dbfile [file join $indexpath xapindex]
 # NOTE(review): baseurl and MAX_PROB_TERM_LENGTH are not referenced by the
 # rest of this script as shown here.
 set baseurl https://wiki.tcl-lang.org/
 set MAX_PROB_TERM_LENGTH 64



 # openIndexDatabase --
 #
 #   Open the Xapian database stored at the given path and set up the
 #   English stemmer under the command name "estem".
 #
 # Arguments:
 #   file    path of the Xapian database
 #
 # Results:
 #   The command name of the database object.
 #
 proc openIndexDatabase {file} {
    set handle xapiandb
    xapian::Database $handle $file
    xapian::Stem estem "english"
    return $handle
 }

 # closeIndexDatabase --
 #
 #   Dispose of a database object returned by openIndexDatabase.
 #
 # Arguments:
 #   db      command name of the database object
 #
 # Results:
 #   Whatever the object's -delete method returns.
 #
 proc closeIndexDatabase {db} {
    return [$db -delete]
 }



 # Main flow: parse the command line as a Xapian query, run it against the
 # index and print up to 100 hits, one per line.
 if {[llength $argv] == 0} {
  log_error "Empty commandline"
  exit 1
 }

 set db [openIndexDatabase $dbfile]
 xapian::Enquire enquire $db
 log_info "Commandline is $argv"
 log_debug "Building query"
 # The query parser uses the database and the stemmer created by
 # openIndexDatabase (command name "estem").
 xapian::QueryParser qparse
 set qp qparse
 $qp set_database $db
 $qp set_stemmer estem
 # All command line words are joined into a single query string.
 set query [$qp parse_query [join $argv]]

 log_debug "Performing query [$query get_description]'"

 enquire set_query $query
 # Fetch the first 100 matches.
 set matches [enquire get_mset 0 100]
 log_info "[$matches get_matches_estimated] results found"

 for {set i [$matches begin]} {![$i equals [$matches end]]} {$i next} {
        xapian::Document document [$i get_document]
        # The document data is the {url title} list stored by the indexer.
        puts [format {ID %s %s%% [%s]} \
             [$i get_docid] [$i get_percent] [document get_data]]
        # Release the per-hit wrapper explicitly, the same way the indexer
        # releases its document object after use.
        document -delete
 }

 closeIndexDatabase $db

 exit 0

Example usage

  • First get the wikit.gz file and uncompress it, then run the indexer on the resulting wikit file.
  • Start searching:
 ./search.tcl schlenk AND xapian
 [Do Jun 01 23:40:45 CEST 2006] [wsearch] [info] 'Using Xapian API version 0.9.6'
 [Do Jun 01 23:40:45 CEST 2006] [wsearch] [info] 'Commandline is schlenk AND xapian'
 [Do Jun 01 23:40:45 CEST 2006] [wsearch] [debug] 'Building query'
 [Do Jun 01 23:40:45 CEST 2006] [wsearch] [debug] 'Performing query Xapian::Query((schlenk:(pos=1) AND xapian:(pos=2)))''
 [Do Jun 01 23:40:45 CEST 2006] [wsearch] [info] '2 results found'
 ID 6873 100% [https://wiki.tcl-lang.org/13173 Xapian]
 ID 8077 96% [https://wiki.tcl-lang.org/15637 {Package feature map}]