US You need a sitemap file, listing all URLs of your site?
This Tcl program generates a sitemap.txt file, which is also suitable for the Google sitemap programme. It's very simple and probably needs a lot of improvement, but it works. Suggestions, improvements and bug fixes welcome. If you don't want to watch it working, just remove the puts to stdout.
To validate all pages in your sitemap file use the Sitemap Validator.
Of course, it is public domain, do whatever you want with it, but don't blame me if it doesn't do what it should.
#!/usr/bin/env tclsh package require http proc get_urls {url} { global urls global inv_urls global exc_urls global urlptr # get protocol and site if {![regexp {^(http://[^/]+)(/.*)?$} $url --> psite x]} { # reject incomplete urls return } puts "getting $url" # get page 'url' set p [::http::geturl $url] if {[set status [::http::ncode $p]] != 200} { # update visit counter and status lappend inv_urls $url ::http::cleanup $p return } set new_urls [list] # find all hrefs to same domain set re {<a\s.*?href="?([^" >]+)[" >]} ;# " foreach {href new_url} [regexp -all -inline -- $re [::http::data $p]] { lappend new_urls [lindex [split $new_url ?] 0] } # Uncomment the following lines if you need to reach a page # behind a query form via its action. # set re {<form\s.*?action="?([^" >]+)[" >]} ;# " # foreach {href new_url} [regexp -all -inline -- $re [::http::data $p]] { # lappend new_urls [lindex [split $new_url ?] 0] # } foreach new_url $new_urls { puts "found $new_url" # don't visit excluded urls set drop 0 foreach ex $exc_urls { if {[string match *${ex}* $new_url]} { puts "excluded $new_url" set drop 1 } } if {$drop} { continue } if {[string index $new_url 0] eq "/"} { puts "completing $new_url" set new_url "${psite}$new_url" } if {![string match ${psite}* $new_url]} { puts "dropped $new_url" continue } # insert into db (unique!) if {[lsearch -exact $urls $new_url] == -1} { lappend urls $new_url } } ::http::cleanup $p # select the first unvisited url from db if {[llength $urls] > $urlptr} { set next_url [lindex $urls $urlptr] incr urlptr # call geturls get_urls $next_url } return } # init_db urls.db set urls [list] set inv_urls [list] set urlptr 0 # A list of pages you don't want to have scanned set exc_urls {contact} get_urls [lindex $argv 0] # cleanup invalid urls foreach iu $inv_urls { set idx [lsearch -exact $urls $iu] set urls [lreplace $urls $idx $idx] } set fd [open sitemap.txt w] foreach url [lsort $urls] { puts $fd $url } close $fd