Tcl chatroom snaphost history (2)


In late November, 2001, changes were made to this web site to create an ongoing log of the chatroom. The code below should no longer be used, due to its impact on the host computer. Instead, see grabchat for a tool that will return for you the previous day's chat log. If you want some other day's chat log, feel free to modify the tool.


MS

Here is a pure Tcl script that builds up the history of the chatroom. See also Larry Virden's ksh chatroom snaphost history. Note that Larry's script requires (ksh + lynx + sed + diff + egrep).

Notes:

  • the script will build daily snapshots of the chatroom activity
  • this may still be buggy ...
  • these scripts produces invalid html - it is missing the end-tags </BODY></HTML>. Most browsers are tolerant of this (I hope, at least netscape is ...)

A first version, which sometimes stutters and is slower, is at Tcl chatroom snaphost history; I left it there so that I can reconstruct my mistakes ... This version has the following improvements:

  • it does not need an auxiliary file
  • it does not stutter
  • it uses string ops instead of list ops, moving some processing from the script to [string]
  • it is better factored (findOverlap as new proc)
  • it has a default time setting (180 secs)
  • does not put time stamps - they proved to be a nuisance; there is now an independent robot that timestamps the chat every half hour
  • uses better regexps that reduce the cleanup time by a factor 10

 #! /usr/local/bin/tclsh
 # Author: Miguel Sofer < mailto:[email protected] >
 # Date:    Sept. 23, 2001
 # Version: 0.5 (remove redirects)
 # Adapted from Larry W. Virden's program (ksh+lynx+sed+diff+egrep)
 # Purpose: to scrape tcl'ers chat log pages and accumulate them
 # NOTE: This script is going to record private messages and memos to your
 #   id, so you should be careful to review the information before making it
 #   public.

 #############################################################
 # USAGE
 #############################################################
 #
 # First define the variables URL and chatPath in this section.
 #
 # Then call this program giving a time (in seconds) between checks,
 # it will run forever and update the history file periodically.
 #

 #
 #       Set URL to your chat information
 #
 set URL {http://purl.org/mini/cgi-bin/chat.cgi?action=chat&name=miguel&password=I_WONT_TELL_YOU&updatefrequency=45&color=800000&new_msg_on_top=0&ls=&pause=#end}
 # To get this URL, use your web browser to visit the
 # chat room, and then check the URL information for the dialog frame.
 # WARNING: The URL contains your chat password - so this file needs to be
 # protected appropriately.

 # 
 # Set the directory for history files histFile_date.htm
 # 
 set chatPath /SCRAPE

 #############################################################

 set len [llength $argv]
 if {$len != 1} {
     if {$len} {
        error "USAGE: $argv0 seconds_between_snapshots"
     } else {
        set argv 180
     }
 }
 set time [expr {1000 * [lindex $argv 0]}]

 package require http

 proc repeatRecord {time} {
     recordData
     after $time repeatRecord $time
 }


 #
 # Build a regexp to clean up; this is very dependent on the
 # particular formatting of the chat program
 # The cleaned-up data has to start with a "real data" line
 #

 # match headers, up to <BODY ...> tag
 set re1 {^.*<BO[^>]*>}

 # match entered/left lines
 set re2 {<B>[^ :]* h[^R]*R>\n}

 # match trailers, after <A ...> tag
 set re3 {<A NAME.*}

 # remove URL redirection (hopefully this text does not appear in messages)
 set re4 {chat2.cgi\?action=gotourl&url=}

 set re "${re1}|${re2}|${re3}|${re4}"

 #
 # getData gets and cleans up the data from the chat.
 #
 proc getChat {} {
     set token {}
     if {[catch {set token [::http::geturl $::URL -timeout 30000]}]\
            || ([::http::status $token] != "ok")\
            || ([::http::ncode $token] != "200")} {
        ::http::cleanup $token
        return {}
     }
     regsub -all $::re [::http::data $token] {} data
     ::http::cleanup $token
     set data
 }


 #
 # the workhorse: record the data
 #
 variable oldData {} 

 proc recordData {} {
     variable oldData

     set data [getChat]
     set dataLen [string length $data]
     if {$dataLen} {
        set new $data
     } else {
        set new "<B>******* [clock format [clock seconds]]: ... no connection</B>\n<BR>\n"
     }

     set date [clock format [clock seconds] -format %y%m%d]
     set histFile [file join $::chatPath histfile_$date.htm]

     #
     # If there is no history file, start one with the current data
     #
     if {![file isfile $histFile]} {
        set hist [open $histFile w]
        puts $hist {<HTML><HEAD><STYLE TYPE="text/css"> </STYLE></HEAD><BODY BGColor=#ffffff>}
        puts $hist "<B>******* [clock format [clock seconds]]: START RECORDING</B>\n<BR>\n<BR>\n"
        puts -nonewline $hist $new
        close $hist
        set oldData $data
        return
     } 

     #
     # If old data is not in memory, get it from the history file;
     # only get as many bytes as you need
     #
     if {![string length $oldData]} {
        set hist [open $histFile]
        set oldData [read $hist]
        set oldData [string range $oldData end-[string length $data] end]
        close $hist
     }

     #
     # Record the new messages
     #
     set hist [open $histFile a]
     if {$dataLen} {
        set pos [findOverlap $oldData $data]
        if {$pos} {
            puts -nonewline $hist [string range $data $pos end]
        } else {
            puts $hist "<B>....... possibly missing data</B>\n<BR>\n"
            puts -nonewline $hist $data
        }
        set oldData $data
     } else {
        puts $hist $new
     }
     close $hist
 }


 #
 # This proc finds the new messages; specialized from
 #        http://purl.org/mini/tcl/2184.html
 #
 proc findOverlap {str1 str2} {
     set first2 [string range $str2 0 [string first "\n" $str2]]
     set firstLen [string length $first2]

     while {1} {
         set index [string first $first2 $str1]
         if {$index == -1} {
             return 0
         }
         set str1 [string range $str1 $index end]
         set len [string length $str1]
         if {[string equal -length $len $str1 $str2]} {
             return $len
         }
         set str1 [string range $str1 $firstLen end]
     }
 }       set str1 [string range $str1 $firstLen end]

 repeatRecord $time
 vwait forever