Version 14 of Search Engine in Javascript

Updated 2006-09-26 04:28:33

if false {

A colleage of mine publishes technical sheets not only on our Intranet but also on CD-ROM.

Last friday, she asked me how to realise the search page on CD-ROM. I answered: maybe there's the ASP, but no little dwarf to work out the ASP, so no way ...

But, next day was saturday ... maybe the wheather ... I found a quick'n'dirty solution in Javascript. It is a single file search.js to be sourced. Here the source:

 }


 proc echo args {puts $args}

 proc cat {file} {
     # return contents of $file
     set port [open $file]
     set contents [read $port]
     close $port
     set contents
 }

 if {$argv ne ""} then {
     set startFile [lindex $argv 0]
 } else {
     set startFile [lindex [glob *.htm*] 0]
 }

 proc leadFragments word {
     lappend result $word
     while {[string length $word] > 1} {
         set word [string range $word 0 end-1]
         if {[lsearch $result $word] < 0} then {
             lappend result $word
         }
     }
     set result
 }

 proc trailFragments word {
     lappend result $word
     while {[string length $word] > 1} {
         set word [string range $word 1 end]
         if {[lsearch $result $word] < 0} then {
             lappend result $word
         }
     }
     set result
 }

 proc fragments word {
     set result {}
     foreach lead [leadFragments $word] {
         foreach frag [trailFragments $lead] {
             if {[lsearch $result $frag] < 0} then {
                 lappend result $frag
             }
         }
     }
     set result
 }

 proc relPathFromTo {fromDir toDir} {
     # return path string relative from $fromDir to $toDir.
     # $fromDir is assumed to be a directory (not a file).
     set from [file normalize $fromDir]
     set to [file normalize $toDir]
     if {$::tcl_platform(platform) eq "windows"} {
         set driveMap {
             a: A: b: B: c: C: d: D: e: E: f: F: g:
             G: h: H: i: I: j: J: k: K: l: L: m: M:
             n: N: o: O: p: P: q: Q: r: R: s: S: t:
             T: u: U: v: V: w: W: x: X: y: Y: z: Z:
         }
         regexp {^[a-zA-Z]:} [pwd] drive
         if {![regexp {^[a-zA-Z]:} $from]} {
             set from $drive$from
         }
         set from [string map $driveMap $from]
         if {![regexp {^[a-zA-Z]:} $to]} {
             set to $drive$to
         }
         set to [string map $driveMap $to]
     }
     set fromList [file split $from]
     set fromLength [llength $fromList]
     set toList [file split $to]
     set toLength [llength $toList]
     set commonList {}
     foreach path1 $fromList path2 $toList {
         if {$path1 ne $path2} {
             break
         } else {
             lappend commonList $path1
         }
     }
     set commonLength [llength $commonList]
     set fromList1 [lrange $fromList $commonLength end]
     set toList1 [lrange $toList $commonLength end]
     set resultList {}
     foreach i $fromList1 {
         lappend resultList ..
     }
     eval lappend resultList $toList1
     if {$resultList ne {}} {
         eval file join $resultList
     }
 }

 array set database {}
 set titles {}

 proc parseFile file {
     variable startFile
     variable links
     variable database
     variable titles
     if {$startFile eq $file} then {
         set links {}
         array unset database
         array set database {}
     }
     set startDir [file dirname $startFile]
     set file [file normalize $file]
     set myPath [relPathFromTo [file normalize $startDir] $file]
     if {[lsearch $links $myPath] >= 0} then {
         return
     }
     echo processing $myPath
     lappend links [string map [list \\ \\\\ \" \\\"] $myPath]
     set myDir [file dirname $myPath]
     set contents [cat $file]
     # title
     if {[regexp -nocase {<title>[^<]+</title>} $contents title]} then {
         set title [regsub -all { *</?title> *} $title ""]
     } else {
         set title [file rootname [file tail $file]]
     }
     lappend titles [string map [list \\ \\\\ \" \\\"] $title]
     # words
     set contents1 [regsub -all -- {<.*?>} $contents ""]
     set contents2 [string map {
         &auml;  ä
         &ouml;  ö
         &uuml;  ü
         &szlig; ß
         &Auml;  Ä
         &Ouml;  Ö
         &Uuml;  Ü
     } $contents1]
     set contents3 [regsub -all -- {[^[:alnum:]]+} $contents2 " "]
     set words {}
     foreach word [split $contents3] {
         if {$word ne ""} then {
             foreach frag [fragments [string tolower $word]] {
                 if {[lsearch $words $frag] < 0} then {
                     lappend words $frag
                 }
             }
         }
     }
     array set database {}
     foreach word $words {
         if {![info exists database($word)]
             ||
             [lsearch $database($word) $myPath] < 0} then {
             lappend database($word) $myPath
         }
     }
     # links verfolgen
     foreach src [regexp -nocase -inline -all\
                      {<a [^>]*?href=['"][^:?]+["']>} $contents] {
         set href [regexp -inline {(?:href="[^\"]*"|href='[^']*')} $src]
         set target\
             [file normalize\
                  [file join  $myDir\
                       [string trim [string range $href 7 end-1] '\"\\]]]
         if {[file isfile $target]} then {
             parseFile $target
         } else {
             puts stdout [list not processed: $target]
         }
     }
     array get database
 }

 parseFile $startFile

 set src {var files = }
 append src {[} \n\t\" [join $links \",\n\t\"] \"\n {]} \n\n\
     {var titles = }\
     {[} \n\t\" [join $titles \",\n\t\"] \"\n {]} \n\n\
     {var database = }  \{
 foreach key [array names database] {
     append src \n\t\" $key \": " " \[
     set indices {}
     foreach target $database($key) {
         lappend indices [lsearch $links $target]
     }
     append src [join $indices ", "]
     append src \],
 }
 # remove trailing comma ...
 set src [string range $src 0 end-1]
 append src \n \}

 set out [open search.js w]
 puts $out $src

 puts $out {
     // from here on fixed javascript

     // arrayContainsElement (arr, el)
     // return true if el is element of arr

     function arrayContainsElement (arr, el)
     {
         for (var i in arr) if (arr[i] == el) return true
         return false
     }

     // commonElementsOf (arr1, arr2)
     // return new array containing elements which are common in array

     function commonElementsOf (arr1, arr2)
     {
         var result=[]
         for (var i in arr1)
         {
          var el = arr1[i]
          if (arrayContainsElement (arr2, el)) result .push (el)
      }
         return result
     }

     var formFields = decodeURI (location .search) .slice (1) .split("&")

     var queryList = []
     var lang = "de"

     for (var i in formFields)
     {
      var keyVal = formFields[i] .split("=")
      var key = keyVal[0]
      var val = keyVal[1]
      if (key == "query" || key == "q")
      {
          queryList = val .toLowerCase() .split("+")
          // if there should be the search form on position 0 ...
          if (document .forms .length && document .forms[0] [key])
              document .forms [0] [key] .value = val .replace ("+", " ")
      }
      if (key == "lang") lang = val
     }

     // write some feedback to HTML

     var resultList = []

     for (var i in queryList)
     {
      if (i == 0)
         resultList = database[queryList[i]]
      else if (queryList[i] != "")
         resultList = commonElementsOf(resultList, database[queryList[i]])
     }

     if (resultList && resultList .length)
     {
         document .write( "<ol>")
         for (var i in resultList)
         {
          var result = resultList[i]
          document .write ("\n<li><a href='", files[result], "'>",
                           titles[result], "</a></li>")
         }
         document .write( "</ol>")
     }
 }

 close $out

 if false {

Here the contents of the search page:

 <html>
  <head>
    <title>Suchseite</title>
  </head>
  <body>
    <h1>Suchseite</h1>
    <form>
      <input type="text" name="query" />
      <input type="submit" name="suche" />
    </form>
    <script type="text/javascript" src="search.js"></script>
  </body>
 </html>

Usage:

  1. cd to the directory where the start page of your site resides.
  2. start the script with the start page as argument, e.g. "tclsh makeSearch.tcl index.htm". The script takes this HTML as starting point for web-crawling, then it writes the found data to a file named search.js.
  3. Put the search above, e.g. as an HTML named search.htm, into the same directory.
  4. You can access this HTML not only by its own form but also from any other page by a reference like search.htm?query=My+Request+from+last+year

From now on, you can "google" your private site. Cool.

 }