#! /bin/ksh
# Author: Larry W. Virden < mailto:[email protected] >
# Date: Sept. 19, 2001
# Version: 1.4
# Purpose: to scrape tcl'ers chat log pages and accumulate them
# NOTE: This script is going to record private messages and memos to your
# id, so you should be careful to review the information before making it
# public.  Also, right now it does not toss away /help msgs, the
# WELCOME message, etc.
#
# USAGE: scrape NUMBER_OF_SECONDS
#
# Exit status:
#   1  wrong number of arguments, or URL has not been configured
#   2  another live instance already holds $HOME/.pslock
#   3  terminated by one of the trapped signals

set -u

if [ "$#" -ne 1 ] ; then
  echo "USAGE: $0 snapshot_time_in_seconds" >&2
  exit 1
fi

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

BASE="$HOME"                  # chat history file storage directory
current=$BASE/chat.current    # soft link to the history file being appended to
time="$1"                     # seconds to sleep between two scrapes
timestamp="0"                 # "1": write the scrape time before each new batch

# Set scrape to some non-interactive command that can fetch a
# page of text.  It is kept as a plain string and word-split on use.
scrape="/projects/intranet/bin/lynx -dump -nolist -nolog -nopause -nostatus -noprint -dont_wrap_pre -hiddenlinks=ignore -noreferer -width=10240"

# Set URL to your chat information
# To get this URL, use your web browser to visit the
# chat room, and then check the URL information for the dialog frame.
# This script expects that the user has the chat room configured for
# new messages to appear at the bottom of the screen.
# WARNING: The URL contains your chat password - so this file needs to be
# protected appropriately.
URL=""

# Without a URL the fetch command would run with no page to fetch, so stop
# before taking the lock or touching any history file.
if [ -z "$URL" ] ; then
  echo "$0: URL is not set; edit this script and set URL to your chat page" >&2
  exit 1
fi

# ---------------------------------------------------------------------------
# pslock - take the single-instance lock $HOME/.pslock
#
# The lock file holds one line: "<hostname> <pid>".
# Globals:  machine, psmachine, psprocess, lock_err (all written)
# Returns:  0 when the lock was created, or a stale one was replaced
#           1 when the recorded process still answers to "kill -0"
#
# The owner is probed with "kill -0" (through rsh when the lock was taken on
# another host).  Any diagnostic output from the probe, rather than its exit
# status, marks the lock as stale; the original script made the same choice.
# NOTE(review): the check-then-create sequence is not atomic, so two
# instances started at the same instant could both acquire the lock.
# ---------------------------------------------------------------------------
pslock() {
  set -o nolog    # ksh option; presumably keeps this function out of the history file
  machine=$(hostname)

  if [ -f "$HOME/.pslock" ] ; then
    # Pre-set both fields so an empty lock file cannot trip "set -u".
    psmachine=""
    psprocess=""
    read -r psmachine psprocess < "$HOME/.pslock"

    if [ "$psmachine" != "$machine" ] ; then
      # lock is remote
      lock_err=$(rsh -n "$psmachine" kill -0 "$psprocess" 2>&1)
    else
      lock_err=$(kill -0 "$psprocess" 2>&1)
    fi

    if [ -z "$lock_err" ] ; then
      # Probe was silent: the owner is still running.
      return 1
    fi

    echo "Removing (probably) dead link $psmachine:$psprocess" >&2
    rm -f "$HOME/.pslock"
  fi

  echo "$machine $$" > "$HOME/.pslock"
  chmod 600 "$HOME/.pslock"
  return 0
}

pslock || exit 2

# The trap is installed only after the lock is ours, so a second instance
# that fails to get the lock can never delete the first one's lock file.
trap 'echo "TRAP encountered";rm -f "$HOME/.pslock";exit 3' \
  HUP QUIT INT ABRT BUS SEGV SYS PIPE TERM \
  USR1 USR2

# ---------------------------------------------------------------------------
# Main loop
# ---------------------------------------------------------------------------

# Looping forever; each pass of the outer loop handles one calendar day.
while true ; do
  date=$(date +%Y%m%d)
  history=$BASE/chat.$date
  working1=$BASE/chat.scrape.$date
  working2=$BASE/chat.working.$date

  # Re-point chat.current at today's history file.  -f keeps the very
  # first run quiet, when no link exists yet.
  rm -f "$current"
  ln -s "$history" "$current"

  # Create file with today's date
  touch "$history" "$working1" "$working2"
  chmod 600 "$history" "$working1" "$working2"

  # Looping once every N seconds
  while true ; do
    # Scrape the chat's page into a file
    sdate=$(date)
    $scrape "$URL" > "$working1"

    # diff the historical file and the new scraping, keeping only the
    # lines that are new in the scrape.
    # The deletion of 4 lines is because of my use of lynx; It will
    # be removed once I figure out how to tell lynx to stop adding stuff to
    # the output.  If you don't use lynx, then you probably won't need that
    # sed segment.
    diff "$history" "$working1" \
      | sed -n '/^>/p' \
      | sed -e 's!^> !!' -e '1,4d' \
      | grep -E -v '^ *[^ ]+ has (entered|left) the chat' > "$working2"

    # Manipulate the results so that the unique output is then appended
    # to the history
    if [ -s "$working2" ] ; then
      if [ "$timestamp" = "1" ] ; then
        echo "$sdate" >> "$history"
      fi
      cat "$working2" >> "$history"
    fi

    # If the date has changed, exit the inner loop so that the outer loop
    # starts a new set of files.
    cdate=$(date +%Y%m%d)
    if [ "$cdate" != "$date" ] ; then
      break
    fi
    sleep "$time"
  done
done
See also Tcl chatroom snaphost history (2) for a pure Tcl version that will also run on non-*nix platforms.
AK: I changed my version slightly.
Instead of using $HOME everywhere I use $BASE and define it at the beginning of the script. For me its value is $HOME/archives/chat, for example. This paves the way for a future redefinition via a command-line argument.
A second change is that I maintain a soft link, chat.current, which points to the history file the scraper is currently appending to. This makes tail'ing the log when crossing midnight easier.
LV: AK's changes have been merged in above. Also added above is a process lock so that the script can be run from cron.