NEM 2008-01-17: Here's some basic code from a package I'm writing to support web scraping in a simple fashion, following best practices. At present it's just a simple snit wrapper around the http package, adding support for authenticating proxies, following redirects, and rudimentary cache and cookie support. One crucial piece, which I have in basic form and will add shortly, is support for robots.txt, so that the package is well behaved and doesn't fetch URLs that are discouraged.
webrobot web -useragent "Neil's Magic Web Robot/1.0 [email protected]" \
    -proxyhost myproxy.example.com -proxyport 8080 \
    -proxyuser neilm -proxypass sekret
set page [web fetch https://wiki.tcl-lang.org/webrobot -urlvar url]
puts "Fetched [string length $page] chars from $url"
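The -cookievar and -cachevar options name global variables that hold the cookie jar and page cache, so that state can be shared between instances or kept across fetches. A small illustrative example (the variable names here are arbitrary):

webrobot jar -cookievar ::mycookies -cachevar ::mycache
set page [jar fetch http://example.com/]
# A second fetch with the same URL and request options is answered
# from ::mycache without touching the network.
set page [jar fetch http://example.com/]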
# webrobot.tcl --
#
#       A package for implementing HTTP web robots to automatically retrieve
#       web pages. Provides support for checking robots.txt files, caching
#       retrievals, and negotiating authenticating proxies.
#
# Copyright (c) 2007 Neil Madden ([email protected]).
# All rights reserved.
#
# CVS: $Id: webrobot.tcl 13 2008-01-13 23:21:52Z nem $

package require Tcl    8.5
package require snit   2.2
package require http   2.5
package require base64 2.3
package provide webrobot 0.1

snit::type webrobot {
    option -proxyuser    -default "" -configuremethod ChangeProxy
    option -proxypass    -default "" -configuremethod ChangeProxy
    option -proxyhost    -default "" -configuremethod ChangeOption
    option -proxyport    -default "" -configuremethod ChangeOption
    option -maxredirects -default 5
    option -useragent -default "TclWebRobot/[package provide webrobot]" \
        -configuremethod ChangeOption
    option -cookievar    -default ""
    option -referer      -default ""
    option -urlvar       -default ""
    option -query        -default "" -configuremethod SetQuery
    option -cachevar     -default ""

    variable ProxyAuth ""

    proc throw {code message {info ""}} {
        return -code error -errorcode $code \
            -errorinfo $info -level 2 $message
    }

    # fetch url ?option value...? --
    #
    #       Fetch a URL, following up to -maxredirects redirects. Options
    #       given here override the configured ones for this call only.
    #       If -urlvar is set, the named variable in the caller's scope
    #       receives the final URL after redirection.
    method fetch {url args} {
        set tmp [array get options]
        $self configurelist $args
        set var [$self cget -urlvar]
        if {$var ne ""} { upvar 1 $var current }
        set current $url
        set max [$self cget -maxredirects]
        for {set i 0} {$i <= $max} {incr i} {
            lassign [$self geturl $current] code data meta
            switch -glob $code {
                200      { break }
                30[1237] { set current [dict get $meta location] }
                default  {
                    throw [list HTTP $code] "unknown status: $code"
                }
            }
        }
        if {$i > $max} {
            throw [list HTTP REDIRECT] "too many redirects"
        }
        $self configurelist $tmp
        return $data
    }

    # Recompute the Proxy-Authorization header whenever the proxy
    # credentials change.
    method ChangeProxy {option value} {
        set options($option) $value
        if {[$self cget -proxyuser] ne ""} {
            set user [$self cget -proxyuser]
            set pass [$self cget -proxypass]
            set token [base64::encode $user:$pass]
            set ProxyAuth [concat Basic $token]
        } else {
            set ProxyAuth ""
        }
    }

    # Pass proxy host/port and user-agent settings through to the http
    # package.
    method ChangeOption {option value} {
        http::config $option $value
        set options($option) $value
    }

    # Effective host name as defined by RFC 2965: a host name containing
    # no dots has ".local" appended.
    proc EffectiveHostname host {
        if {[string first "." $host] == -1} {
            return $host.local
        } else {
            return $host
        }
    }

    # Exact host matching only, for now.
    proc MatchHost {pattern host} {
        if {$pattern eq $host} { return 1 }
        return 0
    }

    # Maps for URL (percent) encoding and decoding: every character other
    # than [a-zA-Z0-9] maps to its %xx form, and vice versa.
    typevariable encodemap [dict create]
    typevariable decodemap [dict create]
    typemethod InitEncodeMap {} {
        set d [dict create + { }]
        set e [dict create]
        for {set i 0} {$i < 256} {incr i} {
            set c [format %c $i]
            set x %[format %02x $i]
            if {![string match {[a-zA-Z0-9]} $c]} {
                dict set e $c $x
                dict set d $x $c
            }
        }
        set encodemap $e
        set decodemap $d
    }
    typeconstructor { $type InitEncodeMap }

    # url encode name value ?name value...? --
    #
    #       Encode a sequence of name/value pairs into
    #       application/x-www-form-urlencoded format.
    method {url encode} args {
        set ret ""
        foreach {name value} $args {
            if {[incr count] > 1} { append ret & }
            append ret [string map $encodemap $name] = \
                [string map $encodemap $value]
        }
        return $ret
    }

    # url decode str --
    #
    #       Inverse of [url encode]: decode a query string back into a
    #       flat list of name/value pairs.
    method {url decode} str {
        set ret [list]
        foreach part [split $str &] {
            lassign [split $part =] name value
            lappend ret [string map $decodemap $name] \
                [string map $decodemap $value]
        }
        return $ret
    }

    # Build a Cookie request header from the jar named by -cookievar
    # (returns an empty list if there is no jar, or it is empty).
    method cookies {} {
        set cookies [dict create]
        if {[$self cget -cookievar] ne ""} {
            upvar #0 [$self cget -cookievar] cs
            if {![info exists cs]} { set cs [dict create] }
            set cookies $cs
        }
        if {[dict size $cookies] == 0} { return [list] }
        set ret [list]
        dict for {name value} $cookies {
            lappend ret [$self url encode $name $value]
        }
        return [list Cookie [join $ret ";"]]
    }

    # Store any cookies from a response's Set-Cookie headers into the jar.
    method set-cookies headers {
        set var [$self cget -cookievar]
        if {$var eq ""} { return } ;# no point saving cookies
        upvar #0 $var cookies
        if {[dict exists $headers "set-cookie"]} {
            foreach cookie [split [dict get $headers "set-cookie"] ,] {
                set val [lindex [split $cookie ";"] 0]
                if {[regexp {([^=]+)=(.*)$} $val -> field content]} {
                    dict set cookies [string map $decodemap $field] \
                        [string map $decodemap $content]
                }
            }
        }
    }

    # geturl url --
    #
    #       Perform a single HTTP request (no redirect handling), going
    #       through the cache named by -cachevar if one is configured.
    #       Returns a three-element list: numeric status code, body data,
    #       and the response headers as a dict with lower-case keys.
    method geturl url {
        if {[string length $options(-cachevar)]} {
            upvar #0 $options(-cachevar) cache
        }
        if {![info exists cache]} { set cache [dict create] }
        set headers [$self cookies]
        if {[string length $ProxyAuth]} {
            lappend headers Proxy-Authorization $ProxyAuth
        }
        set opts [list -headers $headers]
        if {[string length $options(-query)]} {
            lappend opts -query $options(-query)
        }
        set key [list $url $opts]
        if {[dict exists $cache $key]} {
            return [list 200 [dict get $cache $key] {}]
        }
        #puts "Actually fetching $url"
        set t [http::geturl $url {*}$opts]
        set c [http::ncode $t]
        set d [http::data $t]
        upvar #0 $t state
        set m [map-keys {string tolower} $state(meta)]
        http::cleanup $t
        dict set cache $key $d
        # See if any cookies have been set
        $self set-cookies $m
        return [list $c $d $m]
    }

    # Apply a command prefix to every key of a dict.
    proc map-keys {f d} {
        set ret [dict create]
        dict for {k v} $d {
            dict set ret [invoke 1 $f $k] $v
        }
        return $ret
    }

    proc invoke {level f args} {
        if {[string is integer -strict $level]} { incr level }
        uplevel $level $f $args
    }

    method SetQuery {option value} {
        # Encode the query correctly
        set options(-query) [$self url encode {*}$value]
    }
}
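Since the robots.txt support mentioned above isn't included yet, here is a minimal sketch of what such a check might look like. This is not the package's implementation: the proc name and structure are mine, it only honours simple User-agent/Disallow records, and it ignores the many wrinkles of real robots.txt files.

package require http 2.5

# RobotsAllowed -- rough sketch only. Fetches <scheme>://<host>/robots.txt
# and returns 0 if a Disallow rule in a record matching the given agent
# (or "*") is a prefix of the URL's path, 1 otherwise.
proc RobotsAllowed {url {agent *}} {
    if {![regexp {^(https?://[^/]+)(/.*)?$} $url -> base path]} {
        return 1 ;# not an HTTP URL; nothing to check
    }
    if {$path eq ""} { set path / }
    if {[catch { http::geturl $base/robots.txt } t]} { return 1 }
    set code [http::ncode $t]
    set body [http::data $t]
    http::cleanup $t
    if {$code != 200} { return 1 } ;# no robots.txt: everything allowed
    set applies 0
    foreach line [split $body \n] {
        set line [string trim [regsub {#.*$} $line ""]]
        if {$line eq ""} { set applies 0; continue } ;# blank line ends record
        if {[regexp -nocase {^user-agent:\s*(.+)$} $line -> ua]} {
            if {$ua eq "*" || [string match -nocase *$ua* $agent]} {
                set applies 1
            }
        } elseif {$applies
                  && [regexp -nocase {^disallow:\s*(.*)$} $line -> dis]} {
            if {$dis ne "" && [string first $dis $path] == 0} { return 0 }
        }
    }
    return 1
}

With something like this in place, fetch could check RobotsAllowed $current [$self cget -useragent] before each retrieval and throw (or skip) when it returns 0.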
sol 2008-03-18: A small improvement can be made to this robot so that it handles redirects to relative URLs. In the middle of your fetch method, add this line
set current [uri::resolve $url $current]
at the top of the for loop, like this:
for {set i 0} {$i <= $max} {incr i} {
    set current [uri::resolve $url $current]
    lassign [$self geturl $current] code data meta
    switch -glob $code {
        200      { break }
        ...
And of course, don't forget to include package require uri somewhere :)
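For reference, uri::resolve comes from tcllib's uri package; it turns a possibly relative reference into an absolute URL against a base (results shown in the comments):

package require uri

# A relative Location: header is resolved against the base URL...
puts [uri::resolve http://example.com/a/b.html c.html]
;# -> http://example.com/a/c.html

# ...while an already-absolute one comes back unchanged.
puts [uri::resolve http://example.com/a/b.html http://other.example/x]
;# -> http://other.example/x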