webrobot - a package for web scraping

NEM 2008-01-17: Here's some basic code from a package I'm writing to support web scraping in a simple fashion, supporting best practices. At present it's just a simple snit wrapper around the http package, adding support for authenticating proxies, following redirects, and rudimentary cache and cookie support. One crucial part that I have in basic form and which I will add shortly will be support for robots.txt so that the package is well-behaved in not fetching URLs that are discouraged.

Example

webrobot web -useragent "Neil's Magic Web Robot/1.0 [email protected]" \
     -proxyhost myproxy.example.com -proxyport 8080 \
     -proxyuser neilm -proxypass sekret
set page [web fetch https://wiki.tcl-lang.org/webrobot -urlvar url]
puts "Fetched [string length $page] chars from $url"

The Code

# webrobot.tcl --
#
#       A package for implementing HTTP web robots to automatically retrieve
#       web pages. Provides support for checking robots.txt files, caching
#       retrievals, and negotiating authenticating proxies.
#
# Copyright (c) 2007 Neil Madden ([email protected]).
# All rights reserved.
#
# CVS: $Id: webrobot.tcl 13 2008-01-13 23:21:52Z nem $

package require Tcl     8.5
package require snit    2.2
package require http    2.5
package require base64  2.3

package provide webrobot    0.1

snit::type webrobot {
    option  -proxyuser  -default "" -configuremethod ChangeProxy
    option  -proxypass  -default "" -configuremethod ChangeProxy
    option  -proxyhost  -default "" -configuremethod ChangeOption
    option  -proxyport  -default "" -configuremethod ChangeOption
    option  -maxredirects -default 5
    option  -useragent  -default "TclWebRobot/[package provide webrobot]" \
                -configuremethod ChangeOption
    option  -cookievar  -default ""
    option  -referer    -default ""
    option  -urlvar     -default ""
    option  -query      -default "" -configuremethod SetQuery
    option  -cachevar   -default ""

    variable ProxyAuth ""

    proc throw {code message {info ""}} {
        return -code error -errorcode $code \
               -errorinfo $info -level 2 $message
    }

    method fetch {url args} {
        set tmp [array get options]
        $self configurelist $args

        set var [$self cget -urlvar]
        if {$var ne ""} {
            upvar 1 $var current
        }
        set current $url

        set max [$self cget -maxredirects]
        for {set i 0} {$i <= $max} {incr i} {
            lassign [$self geturl $current] code data meta
            switch -glob $code {
                200         { break }
                30[1237]    { set current [dict get $meta location] }
                default     { throw [list HTTP $code] \
                                "unknown status: $code" }
            }
        }
        if {$i == $max} {
            throw [list HTTP REDIRECT] "too many redirects"
        }
        $self configurelist $tmp
        return $data
    }

    method ChangeProxy {option value} {
        set options($option) $value
        if {[$self cget -proxyuser] ne ""} {
            set user [$self cget -proxyuser]
            set pass [$self cget -proxypass]
            set token [base64::encode $user:$pass]
            set ProxyAuth [concat Basic $token]
        } else {
            set ProxyAuth ""
        }
    }

    method ChangeOption {option value} {
        http::config $option $value
        set options($option) $value
    }

    proc EffectiveHostname host {
        if {[string first "." $host] == -1} {
            return $host.local
        } else {
            return $host
        }
    }

    proc MatchHost {pattern host} {
        if {$pattern eq $host} { return 1 }
    }

    typevariable encodemap [dict create]
    typevariable decodemap [dict create]
    typemethod InitEncodeMap {} {
        set d [dict create + { }]
        set e [dict create]
        for {set i 0} {$i < 256} {incr i} {
            set c [format %c $i]
            set x %[format %02x $i]
            if {![string match {[a-zA-Z0-9]} $c]} {
                dict set e $c $x
                dict set d $x $c
            }
        }
        set encodemap $e
        set decodemap $d
    }
    typeconstructor {
        $type InitEncodeMap
    }

    method {url encode} args {
        set ret ""
        foreach {name value} $args {
            if {[incr count] > 0} { append ret & }
            append ret [string map $encodemap $name] = \
                       [string map $encodemap $value]
        }
        return $ret
    }

    method {url decode} str {
        set ret [list]
        foreach part [split $str &] {
            lassign [split $part =] name value
            lappend ret [string map $decodemap $name] \
                        [string map $decodemap $value]
        }
        return $ret
    }


    method cookies {} {
        set cookies [dict create]
        if {[$self cget -cookievar] ne ""} {
            upvar #0 [$self cget -cookievar] cs
            if {![info exists cs]} { set cs [dict create] }
            set cookies $cs
        }
        if {[dict size $cookies] == 0} {
            return [list]
        }
        set ret [list]
        dict for {name value} $cookies {
            lappend ret [$self url encode $name $value]
        }
        return [list Cookies [join $ret ";"]]
    }

    method set-cookies headers {
        set var [$self cget -cookievar]
        if {$var eq ""} { return } ;# no point saving cookies
        upvar #0 $var cookies
        if {[dict exists $headers "set-cookie"]} {
            foreach cookie [split [dict get $headers "set-cookie"] ,] {
                set val [lindex [split $cookie ";"] 0]
                if {[regexp {([^=]+)=(.*)$} $val -> field content]} {
                    dict set cookies [$self url decode $field] \
                                     [$self url decode $content]
                }
            }
        }
    }

    method geturl url {
        if {[string length $options(-cachevar)]} {
            upvar #0 $options(-cachevar) cache
        } 
        if {![info exists cache]} {
            set cache [dict create]
        }
        set headers [$self cookies]
        if {[string length $ProxyAuth]} {
            lappend headers  Proxy-Authorization $ProxyAuth
        }
        set opts [list -headers $headers]
        if {[string length $options(-query)]} {
            lappend opts -query $options(-query)
        }
        set key [list $url $opts]
        if {[dict exists $cache $key]} {
            return [list 200 [dict get $cache $key] {}]
        }
        #puts "Actually fetching $url"
        set t [http::geturl $url {*}$opts]
        set c [http::ncode $t]
        set d [http::data $t]
        upvar #0 $t state
        set m [map-keys {string tolower} $state(meta)]
        http::cleanup $t

        dict set cache $key $d

        # See if any cookies have been set
        $self set-cookies $m
        return [list $c $d $m]
    }

    proc map-keys {f d} {
        set ret [dict create]
        dict for {k v} $d { dict set ret [invoke 1 $f $k] $v }
        return $ret
    }

    proc invoke {level f args} {
        if {[string is integer -strict $level]} { incr level }
        uplevel $level $f $args
    }

    method SetQuery {option value} {
        # Encode the query correctly
        set options(-query) [$self url encode {*}$value]
    }
}

sol 2008-03-18: Small improvement can be made to this robot to help redirection on relative urls. Just in the middle of fetch method of yours add this code

set current [uri::resolve $url $current]

on top of for, like this:

for {set i 0} {$i <= $max} {incr i} {

    set current [uri::resolve $url $current]

    lassign [$self geturl $current] code data meta
    switch -glob $code {
        200         { break }
    ......

And of course do not forget to include package require uri somewhere :)