Version 1 of Indexing and Searching the Wikit with Xapian

Updated 2006-06-01 21:39:15

schlenk 2006-06-01: These two scripts provide a simple search facility for the wiki based on the xapian package. It could easily be extended into a website providing a better fulltext search for the Tcl'ers wiki.


The indexer

 #!/usr/bin/env tclsh
 #
 # Index the Tcl'ers Wiki
 #
 # (c) 2006 Michael Schlenker <[email protected]
 #
 # Index the Tcl'ers wiki with the xapian fulltext searching package.
 #

 package require Tcl 8.4
 package require xapian 0.9.6
 package require Mk4tcl
 package require logger

 # Version of this indexer script.
 set version 0.1
 # Create the "windex" logger service and import its logging commands
 # with a log_ prefix (log_error, log_notice, log_info, log_debug ...).
 set log [logger::init windex]
 logger::import -prefix log_ windex
 # Only emit messages of level "notice" and above.
 ${log}::setlevel notice

 # path to the indexing dir
 set indexpath db
 # path to the uncompressed wikit metakit file
 # (from http://mini.net/cgi-bin/wikit.gz )
 #
 set wikifile wikit
 # Location of the Xapian database inside the indexing dir.
 set dbfile [file join $indexpath xapindex]
 # Prefix put in front of a page number to build the page url that is
 # stored with every indexed document.
 set urlprefix http://wiki.tcl.tk/

 # Upper bound on term length, checked by indexTextBlock before a word
 # is added to the index.
 set MAX_PROB_TERM_LENGTH 64

 # Make sure the index directory exists before anything is written to it.
 if {![file isdirectory $indexpath]} {
    if {[file exists $indexpath]} {
        # Something that is not a directory occupies the path, so the
        # index cannot be created: stop here instead of carrying on
        # without a usable index directory.
        log_error "Cannot create index cache dir"
        exit 1
    } else {
        file mkdir $indexpath
    }
 }

 # indexWiki --
 #
 #   Index every page of a wikit Metakit datafile into the Xapian
 #   database at $::dbfile and log how many documents were indexed
 #   and how long it took.
 #
 # Arguments:
 #   file   path of the wikit Metakit datafile
 #
 # Results:
 #   None of interest; a summary is written with log_notice.
 proc indexWiki {file} {
    ::mk::file open wk $file -readonly
    set start [clock seconds]
    set db [openIndexDatabase $::dbfile]
    set count 0
    mk::loop i wk.pages {
        # indexOneWikiPage returns 1 for an indexed page, 0 for a skipped one
        incr count [indexOneWikiPage wk $db $i]
    }
    closeIndexDatabase $db
    set stop [clock seconds]
    # The datafile is no longer needed; release the Metakit handle.
    ::mk::file close wk
    set diff [expr {$stop-$start}]
    # clock seconds has one-second resolution, so a quick run yields an
    # elapsed time of 0; use at least one second for the rate so the
    # division cannot fail with "divide by zero".
    set persec [expr {$count / ($diff > 0 ? $diff : 1)}]
    log_notice "Indexed $count documents in $diff seconds ($persec doc/sec)"
 }

 # indexOneWikiPage --
 #
 #   Add a single wiki page to the index.  Pages without content are
 #   skipped.
 #
 # Arguments:
 #   wiki   tag of the open wikit datafile (not used by the body)
 #   db     command name of the Xapian database receiving the page
 #   cur    Metakit cursor of the page row, of the form wk.pages!N
 #
 # Results:
 #   1 if the page was indexed, 0 if it was skipped.
 proc indexOneWikiPage {wiki db cur} {
    set row [mk::get $cur name page]
    set name [lindex $row 0]
    set page [lindex $row 1]
    if {[string length $page] == 0} {
        return 0
    }
    log_info "Processing \"$name\""
    xapian::Document doc
    # The page number is whatever follows the 9-character "wk.pages!" prefix.
    set idx [string range $cur 9 end]
    # Keep url and title with the document so result lists can show them.
    doc set_data [list $::urlprefix$idx $name]
    indexTextBlock doc $page 0
    $db add_document doc
    doc -delete
    return 1
 }

 # indexTextBlock --
 #
 #   Split a block of text into alphanumeric words and add them to a
 #   Xapian document: every lower-cased word as a positional posting
 #   and its stemmed form as a plain term.  Words longer than
 #   ::MAX_PROB_TERM_LENGTH characters are ignored.
 #
 # Arguments:
 #   doc    command name of the document object to fill
 #   text   text to index
 #   pos    position given to the first indexed word (default 0)
 #
 # Results:
 #   The position following the last posting added.
 #
 # Side effects:
 #   Calls the global stemmer command "estem" and log_debug.
 proc indexTextBlock {doc text {pos 0}} {
    # A term is one or more alphanumerics.  A variant that also keeps
    # trailing + and/or - (e.g. C++), while still turning "hyphen-ated"
    # into "hyphen" rather than "hyphen-", would be:
    # set re {([[:alnum:]]+(?:[-+]*(?![[:alnum:]]))?)}
    set re {([[:alnum:]]+)}
    set next 0
    while {[regexp -indices -start $next $re $text -> word]} {
        foreach {first last} $word break
        # Both indices are inclusive, so the word length is last-first+1.
        if {($last - $first + 1) <= $::MAX_PROB_TERM_LENGTH} {
            set term [string tolower [string range $text $first $last]]
            set sterm [estem stem_word $term]
            log_debug "Indexing $term"
            $doc add_posting $term $pos
            incr pos
            $doc add_term $sterm
        }
        # Resume the scan right behind the word just handled.
        set next [expr {$last + 1}]
    }
    return $pos
 }

 # openIndexDatabase --
 #
 #   Create the writable Xapian database object for the given path
 #   (opened with DB_CREATE_OR_OVERWRITE) and the English stemmer used
 #   while indexing.
 #
 # Arguments:
 #   file   path of the Xapian database
 #
 # Results:
 #   The name of the database command, "xapiandb".  The stemmer is
 #   created as the command "estem".
 proc openIndexDatabase {file} {
    set dbCmd xapiandb
    set openMode $::xapian::DB_CREATE_OR_OVERWRITE
    xapian::WritableDatabase $dbCmd $file $openMode
    xapian::Stem estem "english"
    return $dbCmd
 }

 # closeIndexDatabase --
 #
 #   Dispose of a database object obtained from openIndexDatabase by
 #   invoking its -delete method.
 #
 # Arguments:
 #   db   command name of the database object
 #
 # Results:
 #   Whatever the object's -delete method returns.
 proc closeIndexDatabase {db} {
    return [$db -delete]
 }

 # Run the indexer on the configured wikit datafile.
 indexWiki $wikifile

The simple command line searching utility

 #!/usr/bin/env tclsh
 #
 # Search the Wiki
 #
 # (c) 2006 Michael Schlenker <[email protected]
 #

 package require Tcl 8.4
 package require xapian 0.9.6
 package require logger

 # Create the "wsearch" logger service and import its logging commands
 # with a log_ prefix (log_error, log_info, log_debug ...).
 set log [logger::init wsearch]
 logger::import -prefix log_ wsearch

 log_info "Using Xapian API version [package present xapian]"
 # Directory holding the index and the Xapian database inside it; these
 # have to match the values used by the indexer.
 set indexpath db
 set dbfile [file join $indexpath xapindex]
 # NOTE(review): baseurl and MAX_PROB_TERM_LENGTH are not referenced by
 # the rest of the search script as shown here.
 set baseurl http://wiki.tcl.tk/
 set MAX_PROB_TERM_LENGTH 64



 # openIndexDatabase --
 #
 #   Create the xapian::Database object for the index at the given
 #   path and the English stemmer used for query parsing.
 #
 # Arguments:
 #   file   path of the Xapian database
 #
 # Results:
 #   The name of the database command, "xapiandb".  The stemmer is
 #   created as the command "estem".
 proc openIndexDatabase {file} {
    set dbCmd xapiandb
    xapian::Database $dbCmd $file
    xapian::Stem estem "english"
    return $dbCmd
 }

 # closeIndexDatabase --
 #
 #   Dispose of a database object obtained from openIndexDatabase by
 #   invoking its -delete method.
 #
 # Arguments:
 #   db   command name of the database object
 #
 # Results:
 #   Whatever the object's -delete method returns.
 proc closeIndexDatabase {db} {
    return [$db -delete]
 }



 # Refuse to run without any search terms.
 if {[llength $argv] == 0} {
  log_error "Empty commandline"
  exit 1
 }
 # Open the index and turn the command line words into a query.
 set db [openIndexDatabase $dbfile]
 xapian::Enquire enquire $db
 log_info "Commandline is $argv"
 log_debug "Building query"
 xapian::QueryParser qparse
 qparse set_database $db
 qparse set_stemmer estem
 set query [qparse parse_query [join $argv]]
 log_debug "Performing query [$query get_description]"
 # Ask for the first 100 matches.
 enquire set_query $query
 set matches [enquire get_mset 0 100]
 log_info "[$matches get_matches_estimated] results found"
 # One line per hit: document id, match percentage and the data stored
 # with the document.
 for {set i [$matches begin]} {![$i equals [$matches end]]} {$i next} {
        xapian::Document document [$i get_document]
        puts [format {ID %s %s%% [%s]} \
             [$i get_docid] [$i get_percent] [document get_data]]
        # Release the per-hit document object before the next iteration
        # creates another one under the same command name.
        document -delete
 }
 closeIndexDatabase $db
 exit 0

Example usage

  • First get the wikit.gz file and uncompress it to a file named wikit in the current directory, then run the indexer.
  • Start searching:
 ./search.tcl schlenk AND xapian

 schlenk@chronos:~/devel/wikisearch> ./gsearch.tcl schlenk and xapian
 Do Jun 01 23:36:02 CEST 2006 wsearch info 'Using Xapian API version 0.9.6'
 Do Jun 01 23:36:02 CEST 2006 wsearch info 'Commandline is schlenk and xapian'
 Do Jun 01 23:36:02 CEST 2006 wsearch debug 'Building query'
 Do Jun 01 23:36:02 CEST 2006 wsearch debug 'Performing query Xapian::Query((schlenk:(pos=1) OR and:(pos=2) OR xapian:(pos=3)))''
 Do Jun 01 23:36:02 CEST 2006 wsearch info '6330 results found'
 ID 6873 100% [L1 ]
 ID 8077 96% [L2 ]
 ID 34 47% [L3 ]
 ID 2296 46% [L4 ]


Category Wikit