Version 0 of gopher-get

Updated 2022-09-14 20:18:31 by pi31415

gopher-get.tcl is a tool to mirror gopher sites. It depends on TclCurl, Tclx, and tcllib. I have only tested this on Slackware64 15.0.


#!/usr/bin/tclsh

# gopher-get.tcl version 1 by Ben Collver
# Short script to mirror a gopher site.
# It works similar to `wget --mirror`.

package require TclCurl
package require Tclx
package require uri

proc curl_exit {ch code} {
    $ch cleanup
    exit $code
}

proc download_delay {} {
    # be a good bot and sleep between downloads
    sleep 1
}

# fetch a gopher document by URL
# if a directory index has been fetched,
# then return a file name for the contents

proc fetch {ch url} {
    set parts [gopher_uri_split $url]
    set parsed [dict create {*}$parts]

    set host [dict get $parsed fqdn]
    set type [dict get $parsed type]
    set path [dict get $parsed path]

    if {[string length $host] == 0 ||
        [string length $type] == 0 ||
        [string length $path] == 0
    } {
        puts "Could not parse URL $url"
        return ""
    }

    if {$type eq "0"} {
        if {[string index $path end] eq "/"} {
            set filename "index.gph"
            set file [format "%s/%s%s%s" $host $type $path $filename]
        } else {
            set file [format "%s/%s%s" $host $type $path]
        }
    } elseif {$type eq "1"} {
        if {[string index $path end] ne "/"} {
            set path "$path/"
        }
        set filename "index.gph"
        set file [format "%s/%s%s%s" $host $type $path $filename]
    } elseif {$type eq "I" || $type eq "g"} {
        if {[string index $path end] eq "/"} {
            puts "Bad image filename $path"
            return ""
        }
        set file [format "%s/%s%s" $host $type $path]
    } else {
        puts "Unknown item type $type"
        return ""
    }

    set dirname [file dirname $file]
    if {![file exists $dirname]} {
        puts "mkdir $dirname"
        file mkdir $dirname
    }
    if {![file exists $file]} {
        $ch configure -file $file -url $url
        puts "Downloading $url ..."
        set result [$ch perform]
        if {$result != 0} {
            puts "Curl error $result"
        }
        download_delay
    }
    if {$type eq "1"} {
        set retval $file
    } else {
        set retval ""
    }
    return $retval
}

# gopher_uri_pattern_orig returns a regular expression to parse gopher URL's
# the same pattern exists in $::uri::gopher::url
# this unused procedure is here to compare with gopher_uri_pattern

proc gopher_uri_pattern_orig {} {
    set escape $::uri::basic::escape
    set hostOrPort $::uri::basic::hostOrPort
    set search $::uri::http::search

    # make $xCharN the same as $::uri::basic::xCharN
    set xCharN {[a-zA-Z0-9$_.+!*'(,);/?:@&=-]}

    # make $xChar the same as $::uri::basic::xChar
    set xChar "(${xCharN}|${escape})"

    # make $url the same as $::uri::gopher::url
    set type $xChar
    set selector "$xChar*"
    set string $selector
    set schemepart  "//${hostOrPort}(/(${type}(${selector}(%09${search}(%09${string})?)?)?)?)?"
    set url "gopher:${schemepart}"

    return $::uri::gopher::url
}

# gopher_uri_pattern returns a regular expression to parse gopher URL's
# a similar regular expression can be found in $::uri::gopher::url
# this version is the same, except it adds a tilde as an acceptable
# character.
#
# Why didn't tcllib uri allow tilde in the first place?
# See link below for pedantic reasons against allowing tilde.
# https://jkorpela.fi/tilde.html

proc gopher_uri_pattern {} {
    set escape $::uri::basic::escape
    set hostOrPort $::uri::basic::hostOrPort
    set search $::uri::http::search

    # add tilde "~" to $xCharN
    set xCharN {[a-zA-Z0-9$_.+~!*'(,);/?:@&=-]}

    # add tilde "~" to $xChar
    set xChar "(${xCharN}|${escape})"

    # add tilde "~" to $url
    set type $xChar
    set selector "$xChar*"
    set string $selector
    set schemepart  "//${hostOrPort}(/(${type}(${selector}(%09${search}(%09${string})?)?)?)?)?"
    set url "gopher:${schemepart}"

    return $url
}

proc gopher_uri_split {url} {
    set pattern [gopher_uri_pattern]
    set parts [regexp -inline $pattern $url]
    set retval [list \
        url       [lindex $parts 0] \
        fqdn      [lindex $parts 1] \
        fqdn2     [lindex $parts 2] \
        host_dot  [lindex $parts 3] \
        host      [lindex $parts 4] \
        domain    [lindex $parts 5] \
        unused1   [lindex $parts 6] \
        unused2   [lindex $parts 7] \
        req_abs   [lindex $parts 8] \
        req_rel   [lindex $parts 9] \
        type      [lindex $parts 10] \
        path      [lindex $parts 11] \
        last_char [lindex $parts 12] \
        unused3   [lindex $parts 13] \
        unused4   [lindex $parts 14] \
        unused5   [lindex $parts 15] \
        unused6   [lindex $parts 16] \
    ]
}

# parse a gopher directory index for links

proc parse {filename} {
   set retval [list]

   set fh [open $filename]
   set text [read $fh]
   close $fh

   set text [string map [list "\r\n" "\n"] $text]
   set lines [split $text "\n"]
   foreach {line} $lines {
      set type [string index $line 0]
      set data [string range $line 1 end]
      set fields [split $data "\t"]
      lassign $fields label path server port
      if {$type eq "1"} {
          if {[string index $path end] ne "/"} {
              set path "$path/"
          }
      }
      if {$type eq "0" || $type eq "1" || $type eq "I" || $type eq "g"} {
          if {$port == 70} {
              set url [format "gopher://%s/%s%s" $server $type $path]
          } else {
              set url [format "gopher://%s:%d/%s%s" $server $port $type $path]
          }
          lappend retval $url
      } elseif {$type eq "h" || $type eq "i"} {
          # ignore "h" and "i" types
          continue
      } elseif {$type eq "."} {
          break
      } else {
          puts "Unknown gopher type $type in $filename"
          continue
      }
   }
   return $retval
}

proc main {argv} {
    set ch [curl::init]

    if {[llength $argv] == 0} {
        puts "Usage: gopher-get.tcl gopher-URI"
        exit
    }

    set url [lindex $argv 0]

    set parts [gopher_uri_split $url]
    set parsed [dict create {*}$parts]
    set host [dict get $parsed fqdn]
    set path [dict get $parsed path]
    set path_len [string length $path]
    if {$path_len > 0} {
        incr path_len -1
    }

    set files_done [dict create]
    set urls [list $url]
    set urls_done [dict create]

    while {[llength $urls] > 0} {
        set files [list]
        foreach {url} $urls {
            dict set urls_done $url 1
            set filename [fetch $ch $url]
            if {[llength $filename] > 0 &&
                ![dict exists $files_done $filename]
            } {
                lappend files $filename
            }
        }

        set urls [list]
        foreach {filename} $files {
            dict set files_done $filename 1
            set links [parse $filename]
            foreach {link} $links {
                set parts [gopher_uri_split $link]
                set parsed [dict create {*}$parts]
                set link_host [dict get $parsed fqdn]
                set link_path [dict get $parsed path]
                set start [string range $link_path 0 $path_len]
                if {$link_host eq $host             &&
                    $start eq $path                 &&
                    ![dict exists $urls_done $link]
                } {
                    lappend urls $link
                }
            }
        }
    }

    curl_exit $ch 0
}

main $::argv