Search Engine in Javascript

if false {

A colleage of mine publishes technical sheets not only on our Intranet but also on CD-ROM.

Last friday, she asked me how to realise the search page on CD-ROM. I answered: maybe there's the ASP, but no little dwarf to work out the ASP, so no way ...

But, next day was saturday ... maybe the wheather ... I found a quick'n'dirty solution in Javascript. It is a single file search.js to be sourced. Here the source:

 }

 #! /usr/bin/tclsh
 proc echo args {puts $args}

 proc cat {file} {
     # return contents of $file
     set port [open $file]
     set contents [read $port]
     close $port
     set contents
 }

 if {$argv ne ""} then {
     set startFile [lindex $argv 0]
 } else {
     set startFile [lindex [glob *.htm*] 0]
 }

 proc leadFragments word {
     lappend result $word
     while {[string length $word] > 1} {
         set word [string range $word 0 end-1]
         if {[lsearch $result $word] < 0} then {
             lappend result $word
         }
     }
     set result
 }

 proc trailFragments word {
     lappend result $word
     while {[string length $word] > 1} {
         set word [string range $word 1 end]
         lappend result $word
     }
     set result
 }

 proc fragments word {
     set result {}
     foreach lead [leadFragments $word] {
         foreach frag [trailFragments $lead] {
             lappend result $frag
         }
     }
     set result
 }

 proc docFragments doc {
     set contents [regsub -all -- {[^[:alnum:]]+} $doc " "]
     set result {}
     foreach word [split $contents " "] {
         eval lappend result [fragments $word]
     }
     lsort -unique $result
 }

 proc relPathFromTo {fromDir toDir} {
     # return path string relative from $fromDir to $toDir.
     # $fromDir is assumed to be a directory (not a file).
     set from [file normalize $fromDir]
     set to [file normalize $toDir]
     if {$::tcl_platform(platform) eq "windows"} {
         set driveMap {
             a: A: b: B: c: C: d: D: e: E: f: F: g:
             G: h: H: i: I: j: J: k: K: l: L: m: M:
             n: N: o: O: p: P: q: Q: r: R: s: S: t:
             T: u: U: v: V: w: W: x: X: y: Y: z: Z:
         }
         regexp {^[a-zA-Z]:} [pwd] drive
         if {![regexp {^[a-zA-Z]:} $from]} {
             set from $drive$from
         }
         set from [string map $driveMap $from]
         if {![regexp {^[a-zA-Z]:} $to]} {
             set to $drive$to
         }
         set to [string map $driveMap $to]
     }
     set fromList [file split $from]
     set fromLength [llength $fromList]
     set toList [file split $to]
     set toLength [llength $toList]
     set commonList {}
     foreach path1 $fromList path2 $toList {
         if {$path1 ne $path2} {
             break
         } else {
             lappend commonList $path1
         }
     }
     set commonLength [llength $commonList]
     set fromList1 [lrange $fromList $commonLength end]
     set toList1 [lrange $toList $commonLength end]
     set resultList {}
     foreach i $fromList1 {
         lappend resultList ..
     }
     eval lappend resultList $toList1
     if {$resultList ne {}} {
         eval file join $resultList
     }
 }

 array set database {}
 set titles {}

 proc parseFile file {
     variable startFile
     variable links
     variable database
     variable titles
     if {$startFile eq $file} then {
         set links {}
         array unset database
         array set database {}
     }
     set startDir [file dirname $startFile]
     set file [file normalize $file]
     set myPath [relPathFromTo [file normalize $startDir] $file]
     if {[lsearch $links $myPath] >= 0} then {
         return
     } else {
         lappend links [string map [list \\ \\\\ \" \\\"] $myPath]
     }
     echo ... processing $myPath ...
     set myDir [file dirname $myPath]
     # words
     set words {}
     switch -- [string tolower [file extension $file]] {
         .htm - .html - .shtm - .shtml - .xhtm - .xhtml - .txt -
         .php - .php4 - .php5 {
             set contents [cat $file]
             # title
             if {[regexp -nocase {<title>[^<]+</title>} $contents title]} then {
                 set title [regsub -all { *</?title> *} $title ""]
             } else {
                 set title [file rootname [file tail $file]]
             }
             lappend titles [string map [list \\ \\\\ \" \\\"] $title]
             set contents1 [regsub -all -- {<.*?>} $contents ""]
             set contents2 [string map {
                 &auml;   ä
                 &ouml;   ö
                 &uuml;   ü
                 &szlig;  ß
                 &Auml;   Ä
                 &Ouml;   Ö
                 &Uuml;   Ü
             } $contents1]
             # set contents3 [regsub -all -- {[^[:alnum:]]+} $contents2 " "]
             eval lappend words [docFragments [string tolower $contents2]]
             # links verfolgen
             foreach src [regexp -nocase -inline -all\
                              {<a [^>]*?href=['"][^:?]+["']>} $contents] {
                 set href [regexp -inline {(?:href="[^\"]*"|href='[^']*')} $src]
                 set target\
                     [file normalize\
                          [file join  $myDir\
                               [string trim [string range $href 7 end-1]\
                                    '\"\\]]]
                 if {[file isfile $target]} then {
                     if {[catch {parseFile $target} err]} then {
                         puts stderr [list problems parsing $target, but don't panic ...]
                     }
                 } else {
                     puts stderr [list not processed: $target]
                 }
             }
         }
         default {
             echo ... skip non-html file $myPath ...
         }
     }
     foreach word $words {
         if {![info exists database($word)]
             ||
             [lsearch $database($word) $myPath] < 0} then {
             lappend database($word) $myPath
         }
     }
     set words
 }

 parseFile $startFile

 echo parsing is done, creating javascript database ...

 set src {var files = }
 append src {[} \n\t\" [join $links \",\n\t\"] \"\n {]} \n\n\
     {var titles = }\
     {[} \n\t\" [join $titles \",\n\t\"] \"\n {]} \n\n\
     {var database = }  \{
 foreach key [array names database] {
     append src \n\t\" $key \": " " \[
     set indices {}
     foreach target $database($key) {
         lappend indices [lsearch $links $target]
     }
     append src [join $indices ", "]
     append src \],
 }
 # remove trailing comma ...
 set src [string range $src 0 end-1]
 append src \n \}

 set out [open search.js w]
 puts $out $src

 puts $out {
     // from here on fixed javascript

     // arrayContainsElement (arr, el)
     // return true if el is element of arr

     function arrayContainsElement (arr, el)
     {
         for (var i in arr) if (arr[i] == el) return true
         return false
     }

     // commonElementsOf (arr1, arr2)
     // return new array containing elements which are common in array

     function commonElementsOf (arr1, arr2)
     {
         var result=[]
         for (var i in arr1)
         {
          var el = arr1[i]
          if (arrayContainsElement (arr2, el)) result .push(el)
      }
         return result
     }

     var formFields = unescape (location .search) .slice(1) .split("&")

     var queryList = []
     var lang = "de"

     for (var i in formFields)
     {
      var keyVal = formFields[i] .split("=")
      var key = keyVal[0]
      var val = keyVal[1]
      if (key == "query" || key == "q")
      {
          queryList = val .toLowerCase() .split("+")
          // if there should be the search form on position 0 ...
          if (document .forms .length && document .forms[0] [key])
          document .forms [0] [key] .value = val .split("+") .join(" ")
      }
      if (key == "lang") lang = val
     }

     // write some feedback to HTML

     var resultList = []

     for (var i in queryList)
     {
      if (i == 0)
         resultList = database[queryList[i]]
      else if (queryList[i] != "")
         resultList = commonElementsOf(resultList, database[queryList[i]])
     }

     if (resultList && resultList .length)
     {
         document .write( "<ol>")
         for (var i in resultList)
         {
          var result = resultList[i]
          document .write ("\n  <li><a href='", files[result], "'>",
                           titles[result], "</a></li>")
         }
         document .write( "\n</ol>")
     }
 }

 close $out

 if false {

Here the contents of the search page:

 <html>
  <head>
    <title>Suchseite</title>
  </head>
  <body>
    <h1>Suchseite</h1>
    <form>
      <input type="text" name="query" />
      <input type="submit" name="suche" />
    </form>
    <script type="text/javascript" src="search.js"></script>
  </body>
 </html>

Usage:

cd to the directory where the start page of your site resides.
start the script with the start page as argument, e.g. "tclsh makeSearch.tcl index.htm". The script takes this HTML as starting point for web-crawling, then it writes the found data to a file named search.js.
Put the search above, e.g. as an HTML named search.htm, into the same directory.
You can access this HTML not only by its own form but also from any other page by a reference like search.htm?query=My+Request+from+last+year

From now on, you can "google" your private site. Cool.