vfs::http

A version of this code is now part of vfslib, which is part of the tclvfs project code.

It provides one with the ability to access directories and files via the http protocol.


package provide vfs::http 0.5

  package require vfs 1.0
  package require http

  # This works for basic operations, but has not been very debugged.

  namespace eval vfs::http {}

  proc vfs::http::Mount {dirurl local} {
    ::vfs::log "http-vfs: attempt to mount $dirurl at $local"
    if {[string index $dirurl end] != "/"} {
        append dirurl "/"
    }
    if {[string range $dirurl 0 6] == "http://"} {
        set rest [string range $dirurl 7 end]
    } else {
        set rest $dirurl
        set dirurl "http://${dirurl}"
    }
    
    if {![regexp {(([^:]*)(:([^@]*))?@)?([^/]*)(/(.*/)?([^/]*))?$} $rest \
      junk junk user junk pass host junk path file]} {
        return -code error "Sorry I didn't understand\
          the url address \"$dirurl\""
    }
    
    if {[string length $file]} {
        return -code error "Can only mount directories, not\
          files (perhaps you need a trailing '/' - I understood\
          a path '$path' and file '$file')"
    }
    
    if {![string length $user]} {
        set user anonymous
    }
    
    set token [::http::geturl $dirurl -validate 1]

    if {![catch {vfs::filesystem info $dirurl}]} {
        # unmount old mount
        ::vfs::log "ftp-vfs: unmounted old mount point at $dirurl"
        vfs::unmount $dirurl
    }
    ::vfs::log "http $host, $path mounted at $local"
    vfs::filesystem mount $local [list vfs::http::handler $dirurl $path]
    # Register command to unmount
    vfs::RegisterMount $local [list ::vfs::http::Unmount $dirurl]
    return $dirurl
  }

  proc vfs::http::Unmount {dirurl local} {
    vfs::filesystem unmount $local
  }

  proc vfs::http::handler {dirurl path cmd root relative actualpath args} {
    if {$cmd == "matchindirectory"} {
        eval [list $cmd $dirurl $relative $actualpath] $args
    } else {
        eval [list $cmd $dirurl $relative] $args
    }
  }

  # If we implement the commands below, we will have a perfect
  # virtual file system for remote http sites.

  proc vfs::http::stat {dirurl name} {
    ::vfs::log "stat $name"
    
    # get information on the type of this file.  We describe everything
    # as a file (not a directory) since with http, even directories
    # really behave as the index.html they contain.
    set state [::http::geturl "$dirurl$name" -validate 1]
    set mtime 0
    lappend res type file
    lappend res dev -1 uid -1 gid -1 nlink 1 depth 0 \
      atime $mtime ctime $mtime mtime $mtime mode 0777
    return $res
  }

  proc vfs::http::access {dirurl name mode} {
    ::vfs::log "access $name $mode"
    if {$mode & 2} {
        vfs::filesystem posixerror $::vfs::posix(EROFS)
    }
    if {$name == ""} { return 1 }
    set state [::http::geturl "$dirurl$name"]
    set info ""
    if {[string length $info]} {
        return 1
    } else {
        error "No such file"
    }
  }

  # We've chosen to implement these channels by using a memchan.
  # The alternative would be to use temporary files.
  proc vfs::http::open {dirurl name mode permissions} {
    ::vfs::log "open $name $mode $permissions"
    # return a list of two elements:
    # 1. first element is the Tcl channel name which has been opened
    # 2. second element (optional) is a command to evaluate when
    #    the channel is closed.
    switch -glob -- $mode {
        "" -
        "r" {
            set state [::http::geturl "$dirurl$name"]

            set filed [vfs::memchan]
            fconfigure $filed -translation binary
            puts -nonewline $filed [::http::data $state]

            fconfigure $filed -translation auto
            seek $filed 0
            return [list $filed]
        }
        "a" -
        "w*" {
            vfs::filesystem posixerror $::vfs::posix(EROFS)
        }
        default {
            return -code error "illegal access mode \"$mode\""
        }
    }
  }

  proc vfs::http::matchindirectory {dirurl path actualpath pattern type} {
    ::vfs::log "matchindirectory $path $pattern $type"
    set res [list]

    if {[string length $pattern]} {
        # need to match all files in a given remote http site.
        # need to match all files in a given remote http site.
    } else {
        # single file
        if {![catch {access $dirurl $path 0}]} {
            lappend res $path
        }
    }
    
    return $res
  }

  proc vfs::http::createdirectory {dirurl name} {
    ::vfs::log "createdirectory $name"
    vfs::filesystem posixerror $::vfs::posix(EROFS)
  }

  proc vfs::http::removedirectory {dirurl name recursive} {
    ::vfs::log "removedirectory $name"
    vfs::filesystem posixerror $::vfs::posix(EROFS)
  }

  proc vfs::http::deletefile {dirurl name} {
    ::vfs::log "deletefile $name"
    vfs::filesystem posixerror $::vfs::posix(EROFS)
  }

  proc vfs::http::fileattributes {dirurl path args} {
    ::vfs::log "fileattributes $args"
    switch -- [llength $args] {
        0 {
            # list strings
            return [list]
        }
        1 {
            # get value
            set index [lindex $args 0]
        }
        2 {
            # set value
            set index [lindex $args 0]
            set val [lindex $args 1]
            vfs::filesystem posixerror $::vfs::posix(EROFS)
        }
    }
  }

  proc vfs::http::utime {dirurl path actime mtime} {
    vfs::filesystem posixerror $::vfs::posix(EROFS)
  }

append the following if you are intrested in cding into a directory

  proc vfs::http::stat {dirurl name} {
    # Procedure redfined by Zarutian
    if {$name != {}} {
      set url "$dirurl$name"
    } else {
      set url $dirurl
    }
    ::vfs::log "http-vfs: stat url $url"
    set state [::http::geturl $url -validate 1]
    set mtime 0
    lappend res dev -1 
    lappend res uid -1
    lappend res gid -1
    lappend res nlink 1
    lappend res depth 0
    lappend res atime $mtime
    lappend res ctime $mtime
    lappend res mtime $mtime
    lappend res mode 0777
    if {[string index $url end] == "/"} {
      lappend res type directory
    } else {
      lappend res type file
    }
    return $res
  }

MHo 2009-06-28: Does someone know how to cross a proxy which needs auth with vfs::http? Does autoproxy help?


Zarutian 17. july 2005: The vfs::http implemention above loads the whole file into memory before giving the caller script access to it. I ask is this a deficiency?

I think it is. So an implemention of vfs::http that uses a dual sided memchan would be better because then the vfs could handle files serveral megabytes in size without the interpreter running out of memory. Also the vfs could handle streaming files, such as generated by CGI or a streaming audio/video. (A dual sided memchan is actualy two channels per memmory buffer: one in, one out.)

LV: Sounds like a great idea - are you working on that? It would probably be great to add to tclvfs once it is finished.

Zarutian 31. july 2005: Well, I got the dual sided membuffer somewhere on my personal page at this wiki (think it is under zarutian.memchan but I am not sure). And another thing I want to do is to keep the metadata that is returned in the http header accessable from the vfs.


RFox - 2010-11-12 12:56:12

If you enable the http package to handle https as shown elsewhere in the wiki, does that make the Http virtual filesystem able to mount https: URLs?

Thanks

EF No, but all it takes is to change the following procedure... This works by verifying with the http package that it supports and recognises the protocol found in the URL, and if so, go on as usual with the mounting. Verification works by unregistering the protocol: if that fails, the protocol is unknown; if that works, the protocol is known and we register back again. I would suggest to apply the change to the main CVS tree .

proc vfs::http::urlparse {url} {
    # Taken from http 2.5.3

    # Validate URL by parts.  We suck out user:pass if it exists as the
    # core http package does not automate HTTP Basic Auth yet.

    # Returns data in [array get] format.  The url, host and file keys are
    # guaranteed to exist.  proto, port, query, anchor, and user should be
    # checked with [info exists]. (user may contain password)

    # URLs have basically four parts.
    # First, before the colon, is the protocol scheme (e.g. http)
    # Second, for HTTP-like protocols, is the authority
    #        The authority is preceded by // and lasts up to (but not including)
    #        the following / and it identifies up to four parts, of which only one,
    #        the host, is required (if an authority is present at all). All other
    #        parts of the authority (user name, password, port number) are optional.
    # Third is the resource name, which is split into two parts at a ?
    #        The first part (from the single "/" up to "?") is the path, and the
    #        second part (from that "?" up to "#") is the query. *HOWEVER*, we do
    #        not need to separate them; we send the whole lot to the server.
    # Fourth is the fragment identifier, which is everything after the first
    #        "#" in the URL. The fragment identifier MUST NOT be sent to the server
    #        and indeed, we don't bother to validate it (it could be an error to
    #        pass it in here, but it's cheap to strip).
    #
    # An example of a URL that has all the parts:
    #   http://jschmoe:[email protected]:8000/foo/bar.tml?q=foo#changes
    # The "http" is the protocol, the user is "jschmoe", the password is
    # "xyzzy", the host is "www.bogus.net", the port is "8000", the path is
    # "/foo/bar.tml", the query is "q=foo", and the fragment is "changes".
    #
    # Note that the RE actually combines the user and password parts, as
    # recommended in RFC 3986. Indeed, that RFC states that putting passwords
    # in URLs is a Really Bad Idea, something with which I would agree utterly.
    # Also note that we do not currently support IPv6 addresses.
    #
    # From a validation perspective, we need to ensure that the parts of the
    # URL that are going to the server are correctly encoded.

    set URLmatcher {(?x)                # this is _expanded_ syntax
        ^
        (?: (\w+) : ) ?                        # <protocol scheme>
        (?: //
            (?:
                (
                    [^@/\#?]+                # <userinfo part of authority>
                ) @
            )?
            ( [^/:\#?]+ )                # <host part of authority>
            (?: : (\d+) )?                # <port part of authority>
        )?
        ( / [^\#?]* (?: \? [^\#?]* )?)?        # <path> (including query)
        (?: \# (.*) )?                        # <fragment> (aka anchor)
        $
    }

    # Phase one: parse
    if {![regexp -- $URLmatcher $url -> proto user host port srvurl anchor]} {
        unset $token
        return -code error "Unsupported URL: $url"
    }
    # Phase two: validate
    if {$host eq ""} {
        # Caller has to provide a host name; we do not have a "default host"
        # that would enable us to handle relative URLs.
        unset $token
        return -code error "Missing host part: $url"
        # Note that we don't check the hostname for validity here; if it's
        # invalid, we'll simply fail to resolve it later on.
    }
    if {$port ne "" && $port>65535} {
        unset $token
        return -code error "Invalid port number: $port"
    }
    # The user identification and resource identification parts of the URL can
    # have encoded characters in them; take care!
    if {$user ne ""} {
        # Check for validity according to RFC 3986, Appendix A
        set validityRE {(?xi)
            ^
            (?: [-\w.~!$&'()*+,;=:] | %[0-9a-f][0-9a-f] )+
            $
        }
        if {![regexp -- $validityRE $user]} {
            unset $token
            # Provide a better error message in this error case
            if {[regexp {(?i)%(?![0-9a-f][0-9a-f]).?.?} $user bad]} {
                return -code error \
                        "Illegal encoding character usage \"$bad\" in URL user"
            }
            return -code error "Illegal characters in URL user"
        }
    }
    if {$srvurl ne ""} {
        # Check for validity according to RFC 3986, Appendix A
        set validityRE {(?xi)
            ^
            # Path part (already must start with / character)
            (?:              [-\w.~!$&'()*+,;=:@/]  | %[0-9a-f][0-9a-f] )*
            # Query part (optional, permits ? characters)
            (?: \? (?: [-\w.~!$&'()*+,;=:@/?] | %[0-9a-f][0-9a-f] )* )?
            $
        }
        if {![regexp -- $validityRE $srvurl]} {
            unset $token
            # Provide a better error message in this error case
            if {[regexp {(?i)%(?![0-9a-f][0-9a-f])..} $srvurl bad]} {
                return -code error \
                        "Illegal encoding character usage \"$bad\" in URL path"
            }
            return -code error "Illegal characters in URL path"
        }
    } else {
        set srvurl /
    }
    if {$proto eq ""} {
        set proto http
    } else {
        set result(proto) $proto
    }

    # Here we vary from core http

    # vfs::http supports all the protocols that http knows about...
    if { [catch {::http::unregister $proto} spec] == 0 } {
        ::http::register $proto [lindex $spec 0] [lindex $spec 1]
    } else {
        return -code error "Unsupported URL type \"$proto\""
    }

    # OK, now reassemble into a full URL, with result containing the
    # parts that exist and will be returned to the user
    array set result {}
    set url ${proto}://
    if {$user ne ""} {
        set result(user) $user
        # vfs::http will do HTTP basic auth on their existence,
        # but we pass these through as they are innocuous
        append url $user
        append url @
    }
    append url $host
    set result(host) $host
    if {$port ne ""} {
        # don't bother with adding default port
        append url : $port
        set result(port) $port
    }
    append url $srvurl
    if {$anchor ne ""} {
        # XXX: Don't append see the anchor, as it is generally a client-side
        # XXX: item.  The user can add it back if they want.
        #append url \# $anchor
        set result(anchor) $anchor
    }

    set idx [string first ? $srvurl]
    if {$idx >= 0} {
        set query [string range [expr {$idx+1}] end]
        set file  [string range 0 [expr {$idx-1}]]
        set result(file) $file
        set result(query) $query
    } else {
        set result(file) $srvurl
    }

    set result(url) $url

    # return array format list of items
    return [array get result]
}

ZB 20110404 What about "-timeout" option? I think, that network-related commands should have such option.