#! /bin/ksh
 # Author: Larry W. Virden < mailto:lvirden@yahoo.com >
 # Date:    Sept. 19, 2001
 # Version: 1.4
 # Purpose: to scrape tcl'ers chat log pages and accumulate them
 # NOTE: This script is going to record private messages and memos to your
 #   id, so you should be careful to review the information before making it
 #   public.  Also, right now it does not toss away /help msgs, the
 #  WELCOME message, etc.
 #
 # USAGE: scrape NUMBER_OF_SECONDS

 # Treat any use of an unset variable as an error.
 set -u

 # Exactly one argument is required: the number of seconds to wait between
 # snapshots.  The usage text is a diagnostic, so it is written to stderr
 # (the original "2>&1" left it on stdout).
 if [ "$#" -ne 1 ] ; then
        echo "USAGE: $0 snapshot_time_in_seconds" >&2
        exit 1
 fi

 # pslock - try to take the per-user process lock kept in $HOME/.pslock.
 #
 # The lock file holds a single line, "<hostname> <pid>", naming its owner.
 # If a lock file already exists its owner is probed with "kill -0" (through
 # rsh when the owner is on another machine).  Any diagnostic output from the
 # probe is taken to mean that the owner is gone; the stale lock is then
 # replaced.  The probe's stderr is inspected rather than its exit status
 # because rsh does not pass the remote command's exit status back.
 #
 # Globals written: machine, psmachine, psprocess, pslockfile, pslockck
 # Returns: 0 when the lock now belongs to this process,
 #          1 when a live process holds it or the lock file cannot be written.
 pslock()
 {
 set -o nolog
 machine=$(hostname)
 pslockfile="$HOME/.pslock"

 if [ -f "$pslockfile" ] ; then
        psmachine=""
        psprocess=""
        read -r psmachine psprocess < "$pslockfile"

        # Collect the probe's stderr in a variable; no temporary file with a
        # predictable name in /tmp is needed, and no eval of file contents.
        if [ -z "$psprocess" ] ; then
                # Empty or malformed lock file: nothing to probe.
                pslockck="no process id recorded"
        elif [ "$psmachine" != "$machine" ] ; then
                # lock is remote
                pslockck=$(rsh -n "$psmachine" kill -0 "$psprocess" 2>&1 >/dev/null)
        else
                pslockck=$(kill -0 "$psprocess" 2>&1 >/dev/null)
        fi

        if [ -z "$pslockck" ] ; then
                # The probe was silent: the owner is still alive.
                return 1
        fi

        echo "Removing (probably) dead link $psmachine:$psprocess" >&2
        rm -f "$pslockfile"
 fi

 # Record ourselves as the owner; without a lock file there is no lock.
 echo "$machine $$" > "$pslockfile" || return 1
 chmod 600 "$pslockfile"
 return 0
 }
 # Take the process lock; when pslock reports failure (the lock is held by
 # a live process) give up with exit status 2.
 pslock || exit 2

 # From here on we own the lock: remove it before dying on a signal.  The
 # action is single-quoted, so $HOME is expanded when the trap fires; the
 # path is quoted inside the action so a HOME containing whitespace still
 # works, and -f keeps rm quiet if the lock file has already gone.
 trap 'echo "TRAP encountered";rm -f "$HOME/.pslock";exit 3' \
        HUP QUIT INT ABRT BUS SEGV SYS PIPE TERM \
        USR1 USR2

 # Directory in which the chat history files are stored.
 BASE=$HOME
 # Fixed name that is symlinked to the history file currently in use.
 current="${BASE}/chat.current"

 # Seconds to sleep between snapshots (the script's only argument).
 time=$1
 # When set to "1", a date line is written ahead of each appended batch.
 timestamp=0

 # scrape must be a non-interactive command that fetches a page of text.
 # It is expanded unquoted where it is used, so it is split into words there.
 scrape="/projects/intranet/bin/lynx -dump -nolist -nolog -nopause -nostatus"
 scrape="$scrape -noprint -dont_wrap_pre -hiddenlinks=ignore -noreferer -width=10240"

 # URL of your chat dialog.
 # To find it, visit the chat room with a web browser and look at the URL
 #  information for the dialog frame.
 # The script expects the chat room to be configured so that new messages
 #  appear at the bottom of the screen.
 # WARNING: the URL contains your chat password, so this file must be
 #  protected appropriately.
 URL=""

 # Main loop.  The outer loop runs once per calendar day and never ends;
 # the inner loop takes one snapshot of the chat page every $time seconds
 # and appends whatever is new to that day's history file.
 while true ; do
    date=$(date +%Y%m%d)
    history="$BASE/chat.$date"
    working1="$BASE/chat.scrape.$date"
    working2="$BASE/chat.working.$date"

    # Point the chat.current link at today's history file.  -f keeps rm
    # quiet on the very first run, when there is no old link to remove.
    rm -f "$current"
    ln -s "$history" "$current"

    # Create today's files.  They are readable by the owner only, since
    # the log can include private messages.
    touch "$history" "$working1" "$working2"
    chmod 600 "$history" "$working1" "$working2"

    # Looping once every N seconds
    while true ; do

      # Scrape the chat's page into a file.
      # $scrape is deliberately unquoted so it splits into command and
      # options.  The URL is passed as a single quoted word (a '?' in it
      # must not be treated as a glob) and is left out when it is empty.
      sdate=$(date)
      $scrape ${URL:+"$URL"} > "$working1"

      # diff the historical file and the new scraping, keeping only the
      # lines that are new in the scrape (diff marks them with "> ").
      # The deletion of 4 lines is because of the use of lynx, which adds
      # stuff to the output; if you don't use lynx you probably won't need
      # that sed segment.  Enter/leave notices are dropped as well.
      # grep -E replaces egrep, which newer GNU grep releases warn about
      # on every invocation.
      diff "$history" "$working1" | sed -n '/^>/p' | \
        sed -e 's!^> !!' | sed -e '1,4d' | \
        grep -E -v '^ *[^ ]+ has (entered|left) the chat' > "$working2"

      # Append the new lines to the history, with a date line in front
      # when timestamping is switched on.
      if [ -s "$working2" ] ; then
        if [ "$timestamp" = "1" ] ; then
                echo "$sdate" >> "$history"
        fi
        cat "$working2" >> "$history"
      fi

      # If the date has changed, leave the inner loop so that the outer
      # loop starts a new set of files.
      cdate=$(date +%Y%m%d)
      if [ "$cdate" != "$date" ] ; then
        break
      fi
      sleep "$time"
    done

 done

----
See also [Tcl chatroom snaphost history (2)] for a pure Tcl version that will also run on non-*nix platforms.

----
[AK]: I changed my version slightly.

Instead of using $HOME everywhere I use $BASE and define it at the beginning of the script. For me its value is $HOME/archives/chat, for example. This paves the way for a future redefinition via a command-line argument.

A second change is that I maintain a soft link, chat.current, which points to the history file the scraper is currently appending to. This makes tail'ing the log when crossing midnight easier.

----
[LV] AK's changes have been merged in above.  Also added above is a process
lock so that the script can run from a cron.