gopher-get

Difference between version 1 and 2 - Previous - Next
gopher-get.tcl is a tool to mirror gopher sites.  It depends on TclCurl, Tclx, and tcllib.  I have only tested this on Slackware64 15.0.

----


======
#!/usr/bin/tclsh
# gopher-get.tcl version 23 by Ben Collver
# Short script to mirror a gopher site.
# It works similar to `wget --mirror`.

package require TclCurl
package require Tclx
package require uri

proc curl_exit {ch code} {
    $ch cleanup
    exit $code
}

proc download_delay {} {
    # be a good bot and sleep between downloads (uses Tclx [sleep])
    sleep 1
}

# fetch a gopher document by URL
# if a directory index has been fetched,
# then return a file name for the contents

proc fetch {ch url} {
    set parts [gopher_uri_split $url]
    set parsed [dict create {*}$parts]

    set host [dict get $parsed fqdn]
    set type [dict get $parsed type]
    set path [dict get $parsed path]

    if {[string length $host] == 0 ||
        [string length $type] == 0 ||
        [string length $path] == 0
    } {
        puts "Could not parse URL $url"
        return ""
    }

    if {$type eq "0"} {
        if {[string index $path end] eq "/"} {
            set filename "index.gph"
            set file [format "%s/%s%s%s" $host $type $path $filename]
        } else {
            set file [format "%s/%s%s" $host $type $path]
        }
    } elseif {$type eq "1"} {
        if {[string index $path end] ne "/"} {
            set path "$path/"
        }
        set filename "index.gph"
        set file [format "%s/%s%s%s" $host $type $path $filename]
    } elseif {$type eq "9"} {
        if {[string index $path end] eq "/"} {
            puts "Bad binary filename $path"
            return ""
        } else {
            set file [format "%s/%s%s" $host $type $path]
        }
    } elseif {$type eq "I" || $type eq "g"} {
        if {[string index $path end] eq "/"} {
            puts "Bad image filename $path"
            return ""
        }
        set file [format "%s/%s%s" $host $type $path]
    } else {
        puts "Unknown item type $type"
        return ""
    }

    set dirname [file dirname $file]
    if {![file exists $dirname]} {
        puts "mkdir $dirname"
        file mkdir $dirname
    }
    if {![file exists $file]} {
        $ch configure -file $file -url $url
        puts "Downloading $url ..."
        set result [$ch perform]
        if {$result != 0} {
            puts "Curl error $result"
        }
        download_delay
    }
    if {$type eq "1"} {
        set retval $file
    } else {
        set retval ""
    }
    return $retval
}

# gopher_uri_pattern_orig returns a regular expression to parse gopher URL's
# the same pattern exists in $::uri::gopher::url
# this unused procedure is here to compare with gopher_uri_pattern

proc gopher_uri_pattern_orig {} {
    set escape $::uri::basic::escape
    set hostOrPort $::uri::basic::hostOrPort
    set search $::uri::http::search

    # make $xCharN the same as $::uri::basic::xCharN
    set xCharN {[a-zA-Z0-9$_.+!*'(,);/?:@&=-]}

    # make $xChar the same as $::uri::basic::xChar
    set xChar "(${xCharN}|${escape})"

    # make $url the same as $::uri::gopher::url
    set type $xChar
    set selector "$xChar*"
    set string $selector
    set schemepart  "//${hostOrPort}(/(${type}(${selector}(%09${search}(%09${string})?)?)?)?)?"
    set url "gopher:${schemepart}"

    return $::uri::gopher::url
}

# gopher_uri_pattern returns a regular expression to parse gopher URL's
# a similar regular expression can be found in $::uri::gopher::url
# this version is the same, except it adds a tilde as an acceptable
# character.
#
# Why didn't tcllib uri allow tilde in the first place?
# See link below for pedantic reasons against allowing tilde.
# https://jkorpela.fi/tilde.html

proc gopher_uri_pattern {} {
    set escape $::uri::basic::escape
    set hostOrPort $::uri::basic::hostOrPort
    set search $::uri::http::search

    # add tilde "~" to $xCharN
    set xCharN {[a-zA-Z0-9$_.+~!*'(,);/?:@&=-]}

    # add tilde "~" to $xChar
    set xChar "(${xCharN}|${escape})"

    # add tilde "~" to $url
    set type $xChar
    set selector "$xChar*"
    set string $selector
    set schemepart  "//${hostOrPort}(/(${type}(${selector}(%09${search}(%09${string})?)?)?)?)?"
    set url "gopher:${schemepart}"

    return $url
}

proc gopher_uri_split {url} {
    set pattern [gopher_uri_pattern]
    set parts [regexp -inline $pattern $url]
    # protect against relative path abuse
    set path [lindex $parts 11]
    set path [string map [list "/../" "/dotdot/"] $path]

    set retval [list                 \
        url       [lindex $parts 0]  \
        fqdn      [lindex $parts 1]  \
        fqdn2     [lindex $parts 2]  \
        host_dot  [lindex $parts 3]  \
        host      [lindex $parts 4]  \
        domain    [lindex $parts 5]  \
        unused1   [lindex $parts 6]  \
        unused2   [lindex $parts 7]  \
        req_abs   [lindex $parts 8]  \
        req_rel   [lindex $parts 9]  \
        type      [lindex $parts 10] \        path      [lindex $partsh 11]             \
        last_char [lindex $parts 12] \
        unused3   [lindex $parts 13] \
        unused4   [lindex $parts 14] \
        unused5   [lindex $parts 15] \
        unused6   [lindex $parts 16] \
    ]
}

# parse a gopher directory index for links

proc parse {filename} {
   set retval [list]

   set fh [open $filename]
   set text [read $fh]
   close $fh

   set text [string map [list "\r\n" "\n"] $text]
   set lines [split $text "\n"]
   foreach {line} $lines {
      set type [string index $line 0]
      set data [string range $line 1 end]
      set fields [split $data "\t"]
      lassign $fields label path server port
      if {$type eq "1"} {
          if {[string index $path end] ne "/"} {
              set path "$path/"
          }
      }
      if {$type eq "0" ||
          $type eq "1" ||
          $type eq "9" ||
          $type eq "I" ||
          $type eq "g"
      } {
          if {$port == 70} {
              set url [format "gopher://%s/%s%s" $server $type $path]
          } else {
              set url [format "gopher://%s:%d/%s%s" $server $port $type $path]
          }
          lappend retval $url
      } elseif {$type eq "h" || $type eq "i"} {
          # skip "h" and "i" types
          continue
      } elseif {$type eq "."} {
          break
      } else {
          puts "Unknown gopher type $type in $filename"
          continue
      }
   }
   return $retval
}

proc main {argv} {
    set ch [curl::init]

    set binary "error"
    set show_skipped false
    set url ""
    while {[llength $argv] > 0} {
        set remainder [lassign $argv arg]
        set argv $remainder
        if {$arg eq "--binary-skip"} {
            set binary "skip"
        } elseif {$arg eq "--binary-save"} {
            set binary "save"
        } elseif {$arg eq "--show-skipped"} {
            set show_skipped true
        } elseif {[regexp {^--} $arg]} {
            puts "\nUnknown option $arg\n"
            break
        } else {
            set url $arg
            break
        }
    }

    if {[string length $url] == 0} {
        puts {Usage: gopher-get.tcl [options] gopher-URI}
        puts ""
        puts "Options:"
        puts "--binary-skip (Skip item type 9 links)"
        puts "--binary-save (Save item type 9 links)"
        puts "--show-skipped (Report skipped links)"
        puts ""
        exit
    }

    set parts [gopher_uri_split $url]
    set parsed [dict create {*}$parts]
    set host [dict get $parsed fqdn]
    set path [dict get $parsed path]
    set path_len [string length $path]
    if {$path_len > 0} {
        incr path_len -1
    }

    set files_done [dict create]
    set uris [list $parsed]
    set urls_done [dict create]

    while {[llength $uris] > 0} {
        # for each url in the list, fetch a file
        # keep track of type 1 index files
        set files [list]
        foreach {uri} $uris {
            set type [dict get $uri type]
            set url [dict get $uri url]
            dict set urls_done $url 1
            set filename [fetch $ch $url]
            if {$type eq "1" &&
                [llength $filename] > 0 &&
                ![dict exists $files_done $filename]
            } {
                lappend files $filename
            }
        }

        # parse each type 1 index file for new urls
        set uris [list]
        foreach {filename} $files {
            dict set files_done $filename 1
            set links [parse $filename]
            foreach {link} $links {
                set parts [gopher_uri_split $link]
                set parsed [dict create {*}$parts]
                set link_host [dict get $parsed fqdn]
                set link_path [dict get $parsed path]
                set link_type [dict get $parsed type]
                set start [string range $link_path 0 $path_len]
                if {$link_host ne $host} {
                    if {$show_skipped} {
                        puts "Skipped link $link, $link_host != $host"
                    }
                    continue
                }
                if {$start ne $path} {
                    if {$show_skipped} {
                        puts "Skipped link $link, $start != $path"
                    }
                    continue
                }
                if {[dict exists $urls_done $link]} {
                    if {$show_skipped} {
                        puts "Skipped link $link, link already flagged done"
                    }
                    continue
                }
                if {$link_type eq "9"} {
                    if {$binary eq "error"} {
                        puts "Error: item type 9 (binary file)"
                        exit
                    } elseif {$binary eq "skip"} {
                        if {$show_skipped} {
                            puts "Skipped link $link, item type = 9"
                        }
                        continue
                    } else {
                        # save it
                    }
                }
                lappend uris $parsed
            }
        }

    }

    curl_exit $ch 0
}

main $::argv
======

<<categories>> Internet