# websearch.tcl --
#
#       Procedures to help automate searching of search engines.
#
# Michael A. Cleverly, 1 Jan 2002
#
# Distributed under the same terms as the Tcl core.

package require Tcl 8.2
package require http 2.3
package provide websearch 0.1

namespace eval ::websearch:: {
    variable search# 0
    variable subcommands
    variable defaults
    variable escapes
    variable entities

    # for each subcommand: min args, max args, whether a query
    # must already have been performed, and a usage string
    array set subcommands {
        all_results {0 1 t "?keys?"}
        cleanup     {0 0 f ""}
        configure   {1 2 f "key ?value?"}
        more_p      {0 0 t ""}
        next_result {0 1 t "?keys?"}
        curr_result {0 1 t "?keys?"}
        query       {1 1 f "query"}
        reset       {0 0 f ""}
        variables   {0 0 f ""}
    }

    array set defaults {
        limit     100
        timeout   120
        useragent "Tcl websearch"
        queried?  no
    }

    # These escapes borrowed from tcllib's htmlparse package
    array set escapes {
        lt < gt > amp & quot \" copy \xa9
        reg \xae ob \x7b cb \x7d nbsp \xa0
    }
    array set escapes {
        iexcl \xa1 cent \xa2 pound \xa3 curren \xa4 yen \xa5
        brvbar \xa6 sect \xa7 uml \xa8 copy \xa9 ordf \xaa
        laquo \xab not \xac shy \xad reg \xae hibar \xaf
        deg \xb0 plusmn \xb1 sup2 \xb2 sup3 \xb3 acute \xb4
        micro \xb5 para \xb6 middot \xb7 cedil \xb8 sup1 \xb9
        ordm \xba raquo \xbb frac14 \xbc frac12 \xbd frac34 \xbe
        iquest \xbf Agrave \xc0 Aacute \xc1 Acirc \xc2 Atilde \xc3
        Auml \xc4 Aring \xc5 AElig \xc6 Ccedil \xc7 Egrave \xc8
        Eacute \xc9 Ecirc \xca Euml \xcb Igrave \xcc Iacute \xcd
        Icirc \xce Iuml \xcf ETH \xd0 Ntilde \xd1 Ograve \xd2
        Oacute \xd3 Ocirc \xd4 Otilde \xd5 Ouml \xd6 times \xd7
        Oslash \xd8 Ugrave \xd9 Uacute \xda Ucirc \xdb Uuml \xdc
        Yacute \xdd THORN \xde szlig \xdf agrave \xe0 aacute \xe1
        acirc \xe2 atilde \xe3 auml \xe4 aring \xe5 aelig \xe6
        ccedil \xe7 egrave \xe8 eacute \xe9 ecirc \xea euml \xeb
        igrave \xec iacute \xed icirc \xee iuml \xef eth \xf0
        ntilde \xf1 ograve \xf2 oacute \xf3 ocirc \xf4 otilde \xf5
        ouml \xf6 divide \xf7 oslash \xf8 ugrave \xf9 uacute \xfa
        ucirc \xfb uuml \xfc yacute \xfd thorn \xfe yuml \xff
    }

    # Convert into a form that we can use with [string map]
    foreach key [array names escapes] {
        set entities(&$key\;) $escapes($key)
    }
}

proc ::websearch::new {{search_engine google}} {
    variable search#

    if {[catch {
        package require websearch::[string tolower $search_engine]
    }]} {
        error "Do not know how to search \"$search_engine\""
    } else {
        set command ::websearch::[incr search#]
        interp alias {} $command \
                     {} ::websearch::Search $search_engine [set search#]

        variable defaults
        variable [set search#]
        array set [set search#] [list search_engine $search_engine]
        array set [set search#] [array get defaults]
        array set [set search#] [array get ${search_engine}::defaults]

        return $command
    }
}

proc ::websearch::defaults {args} {
    variable defaults

    switch [llength $args] {
        0 { return [array get defaults] }
        1 { return $defaults([lindex $args 0]) }
        2 { return [set defaults([lindex $args 0]) [lindex $args 1]] }
    }

    error "Invalid syntax.  Should be: websearch::defaults ?key? ?value?"
}

proc ::websearch::Search {search_engine search# command args} {
    variable subcommands

    if {![info exists subcommands($command)]} {
        error "Unknown option \"$command\". Should be one of:\
            [join [lsort [array names subcommands]] ", "]."
    }

    foreach {min max queried_p parameters} $subcommands($command) break
    set explanation "::websearch::[set search#] $command $parameters"
    set argc [llength $args]

    if {$argc < $min} {
        error "Insufficient parameters. Should be:\
            [string trim $explanation]"
    }

    if {$argc > $max} {
        error "Too many parameters. Should be:\
            [string trim $explanation]"
    }

    if {[Boolean $queried_p]} {
        ::websearch::AssertQueryPerformed [set search#]
    }

    set cmd [list ::websearch::[string totitle $command] [set search#]]
    foreach arg $args {
        lappend cmd $arg
    }

    return [eval $cmd]
}

proc ::websearch::AssertQueryPerformed {search#} {
    variable [set search#]

    if {![info exists [set search#]]} {
        return -code error "Search not found!"
    }

    if {![Boolean [set [set search#](queried?)]]} {
        return -code error "No query has been performed yet!"
    }
}

proc ::websearch::Boolean {bool} {
    if {![string is boolean -strict $bool] &&
        ![string is digit -strict $bool]} {
        return -code error "Could not interpret \"$bool\" as\
            a boolean value."
    }

    # any non-zero number is true; otherwise accept the usual
    # boolean spellings (yes/no, true/false, on/off)
    switch -glob [string trimleft [string tolower $bool] 0] {
        "" - 0 - n* - of* - f* { return 0 }
        default                { return 1 }
    }
}

proc ::websearch::Cleanup {search#} {
    variable [set search#]

    Reset [set search#]
    interp alias {} ::websearch::[set search#] {}
    unset [set search#]
}

proc ::websearch::Configure {search# key args} {
    variable [set search#]
    upvar 0 [set search#] token

    # Query a value
    if {[llength $args] == 0} {
        if {[info exists token($key)]} {
            return $token($key)
        } else {
            return
        }
    }

    # Set a value; internal keys such as queried?, @pos and
    # #results contain non-word characters and cannot be set
    if {![string is wordchar -strict $key]} {
        error "$key can be queried, but not set!"
    }

    set token($key) [lindex $args 0]
}

proc ::websearch::More_p {search#} {
    variable [set search#]
    upvar 0 [set search#] token

    if {$token(@pos) < ($token(#results) - 1)} {
        return 1
    } else {
        return 0
    }
}

proc ::websearch::All_results {search# args} {
    variable [set search#]
    upvar 0 [set search#] token

    # save our current position
    set saved_pos $token(@pos)
    set token(@pos) -1

    set data {}
    while {[More_p [set search#]]} {
        if {[llength $args] == 0} {
            lappend data [Next_result [set search#]]
        } else {
            lappend data [Next_result [set search#] [lindex $args 0]]
        }
    }

    # restore our previous position
    set token(@pos) $saved_pos
    return $data
}

proc ::websearch::Curr_result {search# args} {
    variable [set search#]
    upvar 0 [set search#] token

    # If we haven't gotten our first result,
    # then there is no "current" result
    if {$token(@pos) == -1} {
        return
    }

    # we'll cheat by decrementing our position and then
    # calling Next_result, which will increment it back to
    # where we currently are
    incr token(@pos) -1

    if {[llength $args] == 0} {
        return [Next_result [set search#]]
    } else {
        return [Next_result [set search#] [lindex $args 0]]
    }
}

proc ::websearch::Next_result {search# args} {
    variable [set search#]
    upvar 0 [set search#] token
    upvar 0 ::websearch::$token(search_engine)::results results
    upvar 0 ::websearch::$token(search_engine)::variables variables

    # have we run out of results?
    if {$token(@pos) >= ($token(#results) - 1)} {
        return
    }

    incr token(@pos)
    set result [lindex $results([set search#]) $token(@pos)]

    switch [llength $args] {
        0       { }
        1       { set keys [lindex $args 0] }
        default { error "Should be impossible to get here." }
    }

    if {[info exists keys]} {
        foreach key $keys {
            set positions($key) [lsearch -exact $variables $key]
        }

        if {[llength $keys] == 1} {
            set data [lindex $result $positions([lindex $keys 0])]
        } else {
            foreach key $keys {
                lappend data [lindex $result $positions($key)]
            }
        }
    } else {
        foreach key $variables \
                val [lindex $results([set search#]) $token(@pos)] {
            set array($key) $val
        }
        set data [array get array]
    }

    return $data
}

proc ::websearch::Query {search# query} {
    variable [set search#]
    upvar 0 [set search#] token

    Reset [set search#]
    return [::websearch::$token(search_engine)::Query \
        [set search#] [UrlEncode $query]]
}

proc ::websearch::Reset {search#} {
    variable [set search#]
    upvar 0 [set search#] token

    if {[Boolean $token(queried?)]} {
        unset ::websearch::$token(search_engine)::results([set search#])
        unset token(query)
        unset token(@pos)
        unset token(#results)
        set token(queried?) no
    }
}

proc ::websearch::Variables {search#} {
    variable [set search#]
    upvar 0 [set search#] token

    return [join [set ::websearch::$token(search_engine)::variables]]
}

proc ::websearch::GetUrl {search# url {headers ""}} {
    variable [set search#]
    upvar 0 [set search#] token

    array set config_old [http::config]
    array set config_new {-accept text/html,text/plain}

    foreach key [list proxyhost proxyport useragent] {
        if {[info exists token($key)]} {
            set config_new(-$key) $token($key)
        }
    }

    foreach key [array names config_old] {
        if {[info exists config_new($key)] &&
            ![string equal $config_new($key) $config_old($key)]} {
            http::config $key $config_new($key)
        }
    }

    set timeout [expr {$token(timeout) * 1000}]
    set conn [http::geturl $url -headers $headers -timeout $timeout]
    set html [http::data $conn]
    http::cleanup $conn

    # restore the caller's http configuration
    foreach key [array names config_new] {
        http::config $key $config_old($key)
    }

    return $html
}

proc ::websearch::PostUrl {search# url formvars {headers ""}} {
    variable [set search#]
    upvar 0 [set search#] token

    array set config_old [http::config]
    array set config_new {-accept text/html,text/plain}

    foreach key [list proxyhost proxyport useragent] {
        if {[info exists token($key)]} {
            set config_new(-$key) $token($key)
        }
    }

    foreach key [array names config_old] {
        if {[info exists config_new($key)] &&
            ![string equal $config_new($key) $config_old($key)]} {
            http::config $key $config_new($key)
        }
    }

    set timeout [expr {$token(timeout) * 1000}]
    set conn [http::geturl $url -headers $headers \
                  -query $formvars \
                  -timeout $timeout]
    set html [http::data $conn]
    http::cleanup $conn

    # restore the caller's http configuration
    foreach key [array names config_new] {
        http::config $key $config_old($key)
    }

    return $html
}

proc ::websearch::UrlEncode {string} {
    set encoded ""
    foreach char [split $string ""] {
        if {[regexp {[A-Za-z0-9]} $char]} {
            append encoded $char
        } else {
            scan $char %c ascii
            # always emit two hex digits so characters below
            # \x10 encode correctly
            append encoded %[format %.2x $ascii]
        }
    }

    return $encoded
}

proc ::websearch::StripHtml {html} {
    variable entities

    regsub -all -- {<[^>]+>} $html "" text
    set text [string map [array get entities] $text]
    regsub -all -- {&[^ ]+;} $text "" text
    regsub -all -- {\s+} $text " " text

    return $text
}
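Before the engine-specific packages, here is a quick sketch (not part of the original distribution) of how a generated searcher command is meant to be driven. The query string is made up, and it assumes the websearch::google companion package below is on the auto_path:

package require websearch

# websearch::new returns a generated command name (e.g. ::websearch::1)
set s [websearch::new google]

# per-searcher settings; "limit" and "timeout" come from the
# defaults array above
$s configure limit 10
$s configure timeout 30

# query returns the number of results gathered
puts "[$s query {tcl uplevel}] hits"

# iterate; next_result accepts an optional list of keys to extract
while {[$s more_p]} {
    foreach {url title} [$s next_result {url title}] break
    puts "$title -- $url"
}

$s cleanup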
----

# websearch-google.tcl
#
#       Companion package to websearch that handles searching of Google
#
# Michael A. Cleverly, 1 Jan 2002
#
# Distributed under the same terms as the Tcl core.

package require Tcl 8.2
package require websearch
package provide websearch::google 0.1

namespace eval ::websearch::google {
    variable defaults
    variable results
    variable variables
    variable REs

    array set defaults {}
    array set results  {}

    set variables {
        raw_hit url title view_as_html_url
        description size cached_url similar_pages_url
    }

    # We will use regular expressions to "scrape" the data we want
    # out of the HTML that Google generates.  This is, inherently,
    # fragile; if Google were to change the layout of their web
    # pages, these REs would likely have to be modified.
    #
    # Also, since normal ``tight'' regular expressions can end up
    # looking a lot more like Perl than Tcl, we will stick with
    # expanded regular expressions, which allow us to include
    # comments documenting what the RE is doing.  Hopefully this
    # will make them more maintainable down the road if and when
    # Google changes their page layout.

    set REs(results) {(?xi) # expanded, case insensitive RE
        # previously we should have used [string map] to turn
        # <blockquote> and </blockquote> into NULLs \x00, so now
        # all we have to do is:
        \0([^\0]+)\0
    }

    set REs(hit_details) {(?xi) # expanded, case insensitive RE
        # first capture the url
        <a \s+ href=([^>]+)>

        # then the title (which is what the first url is linked to);
        # this assumes we've removed all <b> and </b> tags previously
        # with [string map] because otherwise this next piece will
        # fail if one of the query terms appears in the title of the
        # linked document (since google bolds query terms).
        ([^<]+) </a>

        # Google makes available a dynamic PDF to HTML converter;
        # obviously not all search results will be PDFs, so this
        # URL often is not present.
        (?:
            .* <a \s+
            href=([^>]+)>View.as.HTML</a>
        )?

        # Everything up to the "URL - size - Cached - Similar pages"
        # links (which appear in a green font) we'll capture as
        # the description
        (.+)

        # Skip the url (since we already have it, and the displayed
        # one might be truncated, or, occasionally, not even present)
        (?: \S+ \s-\s )?

        # Usually Google tells us the size of the document, but
        # not always
        (?: (\d+k) \s-\s )?

        # Usually Google has a cached version available (relative link)
        (?:
            .+ <a \s+
            href=(/search\?q=cache[^>]+)>Cached</a>
        )?

        # Almost always (but maybe not sometimes?) Google has a link
        # to "similar pages"
        (?:
            .+ <a \s+
            href=(/search[^>]+)>Similar \s pages</a>
        )?

        # With the -inline switch this RE (if it successfully
        # matches) will return a list containing:
        #
        #   pos 0: {the whole match}
        #   pos 1: {url}
        #   pos 2: {title}
        #   pos 3: {view as html link, if any}
        #   pos 4: {description}
        #   pos 5: {size, if any}
        #   pos 6: {cached link, if any}
        #   pos 7: {similar pages link, if any}
    }

    set REs(next_page_of_results) {(?xi) # expanded, case insensitive
        # capture the url for the next page of search results
        <a \s+ href=([^>]+)>
        <img \s+ [^>]+>
        <br>
        (?:<[^>]+>)?
        Next
        <
    }
}

proc ::websearch::google::Query {search# query} {
    variable REs
    variable results
    variable variables
    upvar 0 ::websearch::[set search#] token

    set url http://www.google.com/search?q=$query&num=30
    set html [::websearch::GetUrl [set search#] $url]

    array set token [list queried? yes \
                          @pos -1 \
                          "#results" 0 \
                          query $query]
    set results([set search#]) {}
    set skipped 0

    while {1} {
        if {[info exists hit_results]} {
            unset hit_results
        }

        # performance boost: change <blockquote> </blockquote>
        # to NULLs and remove <b> </b> tags
        set html [string map [list <blockquote> \x00 </blockquote> \x00 \
                                   <b> "" </b> ""] $html]

        if {![regexp $REs(results) $html => hit_results]} {
            # No results found
            break
        }

        # each "hit" is separated by a <br><br> tag
        set hit_results [string map [list <br><br> \x00] $hit_results]

        foreach hit [split $hit_results \x00] {
            set details [regexp -inline $REs(hit_details) $hit]
            if {[llength $details] == 0} {
                # this means there is some rare type of result that
                # our current regular expressions don't know how to
                # handle
                incr skipped
                continue
            }

            foreach $variables $details break
            foreach var {url view_as_html_url cached_url similar_pages_url} {
                set $var [string trim [set $var] '\"]
                if {[string match /* [set $var]]} {
                    set $var http://www.google.com[set $var]
                }
            }
            foreach var {title description} {
                set $var [::websearch::StripHtml [set $var]]
            }

            set data {}
            foreach var $variables {
                lappend data [set $var]
            }

            if {$token(#results) < $token(limit)} {
                lappend results([set search#]) $data
                incr token(#results)
            } else {
                break
            }
        }

        if {$token(#results) >= $token(limit)} {
            break
        }

        if {![regexp $REs(next_page_of_results) $html => next_page]} {
            break
        }

        if {$skipped >= $token(limit)} {
            break
        }

        set next_page [string trim $next_page '\"]
        if {[string match /* $next_page]} {
            set next_page http://www.google.com$next_page
        }

        set html [::websearch::GetUrl [set search#] $next_page]
    }

    return $token(#results)
}
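A hedged sketch of driving the Google searcher with the all_results and variables subcommands. The query text is illustrative, and of course results depend on Google's 2002-era page layout still matching the REs above:

set g [websearch::new google]
$g configure limit 25
$g query "expanded regular expressions"

# the per-hit fields scraped from Google's result page
puts [$g variables]
;# => raw_hit url title view_as_html_url description size ...

# grab every hit's url in one call; the current position
# is saved and restored around the iteration
foreach url [$g all_results url] {
    puts $url
}

$g cleanup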

----

# websearch-wiki.tcl
#
#       Companion package to websearch that handles searching of
#       the Tcl'ers Wiki
#
# Copyright (c) 2002 by Michael A. Cleverly.
#
# Distributed under the same terms as the Tcl core.

package require Tcl 8.2
package require websearch
package provide websearch::wiki 0.1

namespace eval ::websearch::wiki {
    variable defaults
    variable results
    variable variables

    array set defaults {}
    array set results  {}

    set variables {
        raw_hit last_modified url title
    }
}

proc ::websearch::wiki::Query {search# query} {
    variable results
    variable variables
    upvar 0 ::websearch::[set search#] token

    # each hit is an <li> item: a last-modified date, a run of
    # dots, and a link to the matching page
    set RE {<li>(\S+ \S+) [. ]+ <a href="([^"]+)">([^<]+)}
    set url http://mini.net/tcl/2.html?$query
    set html [::websearch::GetUrl [set search#] $url]

    array set token [list queried? yes \
                          @pos -1 \
                          "#results" 0 \
                          query $query]
    set results([set search#]) {}

    foreach {raw_hit last_modified url title} \
            [regexp -inline -all $RE $html] {
        incr token(#results)
        lappend results([set search#]) [list $raw_hit \
            $last_modified http://mini.net/tcl/$url $title]

        if {$token(#results) == $token(limit)} {
            break
        }
    }

    return $token(#results)
}
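A minimal sketch of a wiki search using the keyed-array form of next_result (no keys requested, so each result comes back as a key/value list). The query term is illustrative:

set w [websearch::new wiki]
$w query websearch

# with no keys, next_result returns [array get]-style data
# keyed by: raw_hit last_modified url title
while {[$w more_p]} {
    array set hit [$w next_result]
    puts "$hit(title) ($hit(last_modified)) $hit(url)"
}

$w cleanup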
----

# websearch-usenet.tcl
#
#       Companion package to websearch that handles searching of
#       Usenet via groups.google.com
#
# Michael A. Cleverly, 1 Jan 2002.
#
# Distributed under the same terms as the Tcl core.

package require Tcl 8.2
package require websearch
package provide websearch::usenet 0.1

namespace eval ::websearch::usenet {
    variable defaults
    variable results
    variable variables
    variable REs

    array set defaults {}
    array set results  {}

    set variables {
        raw_hit url title description newsgroup_url
        newsgroup date author thread_url thread_length
    }

    # We will use regular expressions to "scrape" the data we want
    # out of the HTML that Google generates.  This is, inherently,
    # fragile; if Google were to change the layout of their web
    # pages, these REs would likely have to be modified.
    #
    # Also, since normal ``tight'' regular expressions can end up
    # looking a lot more like Perl than Tcl, we will stick with
    # expanded regular expressions, which allow us to include
    # comments documenting what the RE is doing.  Hopefully this
    # will make them more maintainable down the road if and when
    # Google changes their page layout.

    set REs(results) {(?xi) # expanded, case insensitive RE
        # previously we should have used [string map] to turn
        # <blockquote> and </blockquote> into NULLs \x00, so now
        # all we have to do is:
        \0([^\0]+)\0
    }

    set REs(hit_details) {(?xi) # expanded, case insensitive RE
        # first capture the url
        <a \s+ href=([^>]+)>

        # then the title (which is what the first url is linked to);
        # this assumes we've removed all <b> and </b> tags
        # previously with [string map] because otherwise this next
        # piece will fail if one of the query terms appears in the
        # title of the linked document (since Google bolds query
        # terms).
        ([^<]+) </a>

        # Grab the description
        (.+)

        # Newsgroup posted to (the name is linked to the group's url)
        <a \s+ href=([^>]+)>
        ([^<]+) </a>

        # date posted on
        \s - \s (\d\d \s [A-Z][a-z][a-z] \s \d\d\d\d) \s+

        # author
        by (.+) \s - \s

        # thread
        <a \s+ href=([^>]*)>View \s+ Thread \s+ \((\d+) \s+ article

        # With the -inline switch this RE (if it successfully
        # matches) will return a list containing:
        #
        #   pos 0: {the whole match}
        #   pos 1: {url}
        #   pos 2: {title}
        #   pos 3: {description}
        #   pos 4: {newsgroup url}
        #   pos 5: {newsgroup}
        #   pos 6: {date}
        #   pos 7: {author}
        #   pos 8: {thread_url}
        #   pos 9: {thread_length}
    }

    set REs(next_page_of_results) {(?xi) # expanded, case insensitive
        # capture the url for the next page of search results
        <a \s+ href=([^>]+)>
        <img \s+ [^>]+>
        <a \s+ href=([^>]+)>
        <img \s+ [^>]+>
        <br>
        (?:<[^>]+>)?
        Next
        <
    }
}

proc ::websearch::usenet::Query {search# query} {
    variable REs
    variable results
    variable variables
    upvar 0 ::websearch::[set search#] token

    set url http://groups.google.com/groups?q=$query&num=30
    set html [::websearch::GetUrl [set search#] $url]

    array set token [list queried? yes \
                          @pos -1 \
                          "#results" 0 \
                          query $query]
    set results([set search#]) {}
    set skipped 0

    while {1} {
        if {[info exists hit_results]} {
            unset hit_results
        }

        # performance boost: change <blockquote> </blockquote>
        # to NULLs and remove <b> </b> tags
        set html [string map [list <blockquote> \x00 </blockquote> \x00 \
                                   <b> "" </b> ""] $html]

        if {![regexp $REs(results) $html => hit_results]} {
            # No results found
            break
        }

        # each "hit" is separated by a <br><br> tag
        set hit_results [string map [list <br><br> \x00] $hit_results]

        foreach hit [split $hit_results \x00] {
            set details [regexp -inline $REs(hit_details) $hit]
            if {[llength $details] == 0} {
                # this means there is some rare type of result that
                # our current regular expressions don't know how to
                # handle
                incr skipped
                continue
            }

            foreach $variables $details break
            foreach var {url newsgroup_url thread_url} {
                set $var [string trim [set $var] '\"]
                if {[string match /* [set $var]]} {
                    set $var http://groups.google.com[set $var]
                }
            }
            foreach var {title description} {
                set $var [::websearch::StripHtml [set $var]]
            }

            set data {}
            foreach var $variables {
                lappend data [set $var]
            }

            if {$token(#results) < $token(limit)} {
                lappend results([set search#]) $data
                incr token(#results)
            } else {
                break
            }
        }

        if {$token(#results) >= $token(limit)} {
            break
        }

        if {![regexp $REs(next_page_of_results) $html => next_page]} {
            break
        }

        if {$skipped >= $token(limit)} {
            break
        }

        set next_page [string trim $next_page '\"]
        if {[string match /* $next_page]} {
            set next_page http://groups.google.com$next_page
        }

        set html [::websearch::GetUrl [set search#] $next_page]
    }

    return $token(#results)
}
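Finally, a sketch showing the package-wide defaults, curr_result, and reset. Nothing here is from the original distribution; the query string is illustrative and the example assumes at least one hit comes back:

# raise the limit for every searcher created afterwards
websearch::defaults limit 200

set u [websearch::new usenet]
$u query "comp.lang.tcl regexp"

puts "first hit: [$u next_result title]"
puts "same hit:  [$u curr_result title]"  ;# curr_result does not advance

$u reset    ;# discard the results; the searcher can be reused
$u cleanup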