#! /bin/ksh
# Author: Larry W. Virden < mailto:lvirden@yahoo.com >
# Date: Sept. 19, 2001
# Version: 1.4
# Purpose: to scrape tcl'ers chat log pages and accumulate them
# NOTE: This script is going to record private messages and memos to your
# id, so you should be careful to review the information before making it
# public.  Also, right now it does not toss away /help msgs, the
# WELCOME message, etc.
#
# USAGE: scrape NUMBER_OF_SECONDS
#
# Exit status:
#   1  wrong number of arguments
#   2  the lock in $HOME/.pslock is held by a process that still answers
#   3  terminated by one of the trapped signals

set -u

if [ "$#" -ne 1 ] ; then
  # Diagnostics go to stderr so they never end up in captured output.
  echo "USAGE: $0 snapshot_time_in_seconds" >&2
  exit 1
fi

# pslock - take the per-user lock stored in $HOME/.pslock.
#
# The lock file holds one line: "<hostname> <pid>".  If the file exists,
# the recorded process is probed with "kill -0" (through rsh when the
# recorded host differs from this one).  Any text on the probe's stderr
# is taken to mean the owner is gone; the stale lock is then replaced.
# The probe's exit status is deliberately not used, only its stderr.
#
# Globals written: machine, psmachine, psprocess, prefix, lockck
# Returns: 0 when this process now owns the lock,
#          1 when the recorded owner still answers the probe.
#
# NOTE(review): the check-then-create sequence is not atomic, so two
# instances started at the same instant could both take the lock.
pslock() {
  set -o nolog
  machine=$(hostname)

  if [ -f "$HOME/.pslock" ] ; then
    psmachine=""
    psprocess=""
    read -r psmachine psprocess < "$HOME/.pslock"

    if [ "$psmachine" != "$machine" ] ; then
      # lock is remote
      prefix="rsh -n $psmachine "
    else
      prefix=""
    fi

    # $prefix is intentionally unquoted: it must split into separate words
    # (or vanish entirely when empty).  stderr is captured in a variable,
    # which avoids both eval and a predictable temp file under /tmp.
    lockck=$($prefix kill -0 "$psprocess" 2>&1 >/dev/null)
    if [ -z "$lockck" ] ; then
      # The probe was silent: the owner is still running.
      return 1
    fi

    echo "Removing (probably) dead link $psmachine:$psprocess" >&2
    rm -f "$HOME/.pslock"
  fi

  echo "$machine $$" > "$HOME/.pslock"
  chmod 600 "$HOME/.pslock"
  return 0
}

# Another live instance owns the lock: leave quietly.
pslock || exit 2

# Release the lock when the script is killed.  The trap is installed only
# after the lock is held, so it can never delete another instance's lock.
trap 'echo "TRAP encountered"; rm -f "$HOME/.pslock"; exit 3' \
  HUP QUIT INT ABRT BUS SEGV SYS PIPE TERM \
  USR1 USR2

BASE="$HOME"                  # chat history file storage directory
current=$BASE/chat.current    # soft link to the history file in use
time="$1"                     # seconds to sleep between two scrapes
timestamp="0"                 # "1": write a date line before each new batch

# Set scrape to some non-interactive command that can fetch a
# page of text
scrape="/projects/intranet/bin/lynx -dump -nolist -nolog -nopause -nostatus"
scrape="$scrape -noprint -dont_wrap_pre -hiddenlinks=ignore -noreferer"
scrape="$scrape -width=10240"

# Set URL to your chat information
# To get this URL, use your web browser to visit the
# chat room, and then check the URL information for the dialog frame.
# This script expects that the user has the chat room configured for
# new messages to appear at the bottom of the screen.
# WARNING: The URL contains your chat password - so this file needs to be
# protected appropriately.
URL=""

# The logs hold private messages, so create every file owner-only from the
# start instead of relying solely on the chmod that follows the touch.
umask 077

# Looping forever; each pass of the outer loop covers one calendar day.
while : ; do
  date=$(date +%Y%m%d)
  history=$BASE/chat.$date
  working1=$BASE/chat.scrape.$date
  working2=$BASE/chat.working.$date

  # Re-point the chat.current soft link at today's history file.
  # -f keeps the very first run quiet, when no link exists yet.
  rm -f "$current"
  ln -s "$history" "$current"

  # Create file with today's date
  touch "$history" "$working1" "$working2"
  chmod 600 "$history" "$working1" "$working2"

  # Looping once every N seconds
  while : ; do
    # Scrape the chat's page into a file.
    # $scrape is intentionally unquoted so it splits into command + options;
    # the URL is quoted because it may contain glob characters such as '?'.
    sdate=$(date)
    $scrape "$URL" > "$working1"

    # diff the historical file and the new scraping, keeping only the
    # lines that are new in the scrape (diff marks them with "> ").
    # The deletion of 4 lines is because of my use of lynx; it will
    # be removed once I figure out how to tell lynx to stop adding stuff to
    # the output.  If you don't use lynx, then you probably won't need that
    # sed segment.
    # The final filter drops "<name> has entered/left the chat" notices.
    diff "$history" "$working1" \
      | sed -n -e 's!^> !!p' \
      | sed -e '1,4d' \
      | grep -E -v '^ *[^ ]+ has (entered|left) the chat' > "$working2"

    # Manipulate the results so that the unique output is then appended
    # to the history
    if [ -s "$working2" ] ; then
      if [ "$timestamp" = "1" ] ; then
        echo "$sdate" >> "$history"
      fi
      cat "$working2" >> "$history"
    fi

    # If the date has changed, exit the inner loop so the outer loop
    # starts a new set of files.
    cdate=$(date +%Y%m%d)
    if [ "$cdate" != "$date" ] ; then
      break
    fi
    sleep "$time"
  done
done

# ----
# See also [Tcl chatroom snaphost history (2)] for a pure Tcl version that
# will also run on non-*nix platforms.
# ----
# [AK]: I changed my version slightly.  Instead of using $HOME everywhere I
# use $BASE and define it at the beginning of the script.  For me its value
# is $HOME/archives/chat, for example.  This paves the way for a future
# redefinition via a command-line argument.  A second change is that I
# maintain a soft link chat.current which points to the history file the
# scraper currently appends to.  This makes tail'ing the log when crossing
# midnight easier.
# ----
# [LV] AK's changes have been merged in above.  Also added above is a
# process lock so that the script can run from cron.