Version 1 of deadlinks.tcl - A Bookmark File Cleaner

Updated 2003-09-05 19:18:16

05 Sept 2003 Mike Tuxford: I was chatting on irc this morning and a friend said 'I need something to clean out all the dead links in my bookmark file' and so I wrote the following script.

It works well for me, as you can see from the command-line output example below, but I advise that you back up your real bookmark file first.

 burp:/tcl/s/bookmarks# ./deadlinks.tcl bookmarks.html
 Checking 1150 sites...
 Checking site  [1150]
  1024 Good sites
   126 Dead sites

 #!/usr/bin/tclsh
 #
 #################################
 # Mike Tuxford 2003
 #
 # Script: deadlinks.tcl
 # Syntax: deadlinks <bookmark filename>
 # Checks a netscape (mozilla) style bookmark file for dead links.
 #
 # Requires the http packages.
 # Does not overwrite the bookmark file but creates 2
 # new files (name.good name.dead)
 #
 # Tests sites by just retrieving the html headers and will
 # also leave local file/dir links untouched
 #
 # adjust the "timeout,seconds" values as desired.
 # The timeout value is how long we should try retrieving a
 # page header from a site before listing it as dead.
 #

 # Tunable option: how many seconds to wait for a header
 # before declaring a site dead.
 set opt(timeout,seconds) 10

 # Running counters: total links found, links that answered,
 # and links that did not.
 set data(c,sites) 0
 set data(c,good) 0
 set data(c,dead) 0

 proc main {} {
   # Walk every line of the bookmark file, probing each HREF link
   # and routing the line to the .good or .dead output channel.
   # Reads: data(bm,in), data(c,sites); writes: fd(good), fd(dead),
   # data(c,good), data(c,dead).
   global opt fd data
   set i 0
   puts "Checking $data(c,sites) sites..."
   puts -nonewline stdout "Checking site "; flush stdout
   foreach line $data(bm,in) {
     # Match case-insensitively, the same way the pre-count loop
     # does; the original case-sensitive *HREF=* glob let lowercase
     # href= links slip through to .good without being checked.
     if {![string match -nocase *href=* $line]} {
       # Non-link lines (folders, markup) pass through unchanged.
       puts $fd(good) $line
       continue
     }
     incr i
     # Progress display: print " [i]" then backspace over it so the
     # next number overwrites it in place (+3 covers space, [ and ]).
     puts -nonewline stdout \
       " \[$i\][string repeat \b [expr {[string length $i] + 3}]]"
     flush stdout
     if {![extractSite $line]} {
       # It's a local dir/file bookmark; keep it and count it good.
       puts $fd(good) $line
       incr data(c,good)
       continue
     }
     if {![verifySite]} {
       puts $fd(dead) $line
       incr data(c,dead)
     } else {
       puts $fd(good) $line
       incr data(c,good)
     }
   }
   puts -nonewline "\n"
   flush stdout
   return
 }

 proc verifySite {} {
   # Probe data(site) by requesting only the HTTP headers
   # (-validate 1 issues a HEAD request).
   # Returns 1 when the site answered, 0 on error or timeout.
   global opt data
   if {[catch {http::geturl $data(site) -validate 1 \
       -timeout [expr {1000 * $opt(timeout,seconds)}]} tok]} {
     # geturl itself failed (bad URL, DNS failure, connect refused)
     return 0
   }
   upvar #0 $tok state
   # Anything other than a timeout counts as alive, matching the
   # original behavior.
   set alive [expr {$state(status) ne "timeout"}]
   # Always release the token; the original leaked the state array
   # for every site that timed out.
   http::cleanup $tok
   return $alive
 }

 proc extractSite {line} {
   # Pull the URL out of an HREF="..." attribute into data(site).
   # Returns 1 when a remote URL was found, 0 for local file: links
   # or lines where no quoted HREF value could be parsed.
   #
   # The original offset arithmetic (first "HREF" + 13) silently
   # dropped the scheme and only worked for exactly "http://"; it
   # also ran list commands (lrange) on raw HTML, which can throw
   # on unbalanced braces.  A case-insensitive regexp handles both.
   global data
   if {![regexp -nocase {href="([^"]*)"} $line -> url]} {
     # No parseable HREF value: pass the line through untouched.
     return 0
   }
   if {[string match -nocase file:* $url]} {
     # Local file/dir bookmark: leave it alone.
     return 0
   }
   set data(site) $url
   return 1
 }

 ###########################
 # get the show started here
 #
 # The http package ships with the standard Tcl distribution; bail
 # out with the caught load error if it cannot be found.
 if {[catch {package require http} loadErr]} {
   puts $loadErr
   puts "Need the http package"
   exit
 }

 # The first argument is the bookmark file; no argument or anything
 # starting with -h prints the usage line instead.
 if {[llength $argv] == 0 || [string match -h* [lindex $argv 0]]} {
   puts "Syntax: deadlinks.tcl <bookmark filename>"
   exit
 } else {
   set opt(file,in) [lindex $argv 0]
 }

 # Slurp the whole bookmark file into a list of lines.  On open
 # failure, catch stores the error message in fd(in), which is
 # printed before exiting.
 if {[catch {open $opt(file,in) r} fd(in)]} {
   puts $fd(in)
   exit
 } else {
   set data(bm,in) [split [read $fd(in)] \n]
   close $fd(in)
 }

 # The two output files sit next to the input: <name>.good holds
 # kept lines, <name>.dead holds lines whose link did not answer.
 set opt(file,good) "$opt(file,in).good"
 set opt(file,dead) "$opt(file,in).dead"
 foreach kind {good dead} {
   if {[catch {open $opt(file,$kind) w+} fd($kind)]} {
     # On open failure, catch leaves the error message in fd($kind).
     puts $fd($kind)
     exit
   }
 }

 # Pre-count the links so main can announce "Checking N sites...".
 foreach bmLine $data(bm,in) {
   if {[string match -nocase *href=* $bmLine]} {
     incr data(c,sites)
   }
 }

 # Run the scan, then close each output channel and print its tally.
 main
 foreach kind {good dead} label {{Good sites} {Dead sites}} {
   close $fd($kind)
   puts [format "%5d %s" $data(c,$kind) $label]
 }