I needed a simple, easy to work with, asyncronous safe RSS parser. I tried TclRSS [L1 ], but it had heavy requirements, and I never actually figured out the correct combination of dependencies and their versions needed to make it work. I gave up on that and started writing my own RSS parser, using the subset of the TclRSS API that I had already written a program to. I found TAX: A Tiny API for XML (make sure you use the version at the bottom of the page, I just made that into a package and called it "tax 0.1") a small, good-enough Tcl XML parser and wrote an RSS parser from example RSS 2.0 feeds, and not from the specification (I was in a hurry). It works with my limited testing (digg.com, sourceforge, and cnn), but probably doesn't work for anything that is not exactly RSS 2.0 and pretty close to one of my examples.
#! /usr/bin/env tclsh package require tax namespace eval rss { namespace eval channels { } namespace eval items { } } proc rss::__replace_entities {text} { return [string map [list " " " " ">" ">" "<" "<" "&" "&"] $text] } proc rss::__strip_html {text} { # We replace entities here (i.e., twice) because HTML-inside-XML will have # the HTML entities escaped twice. return [__replace_entities [regsub -all -- {<[^>]*>} $text ""]] } proc rss::__tax_add_to_object {obj tag isClose isSelfClosing properties body} { upvar #0 $obj rssobj set channelid [namespace tail $obj] set tag [string tolower [string trim $tag]] if {$tag == "docstart"} { set rssobj(parent) [list] namespace eval ::rss::items::$channelid {} } if {$tag == "docstart" || $tag == "rss"} { return } if {[string index $tag 0] == "?"} { return } if {$isClose && !$isSelfClosing} { if {$tag == "item"} { # We close tag items twice, because we add a fake open with the tag id set rssobj(parent) [lrange $rssobj(parent) 0 end-1] } set rssobj(parent) [lrange $rssobj(parent) 0 end-1] return } lappend rssobj(parent) $tag set parent [lindex $rssobj(parent) end-1] switch -- $tag { "item" { if {$parent == "channel"} { set lastusedid [namespace tail [lindex [lsort -dictionary -decreasing [info procs ::rss::items::${channelid}::*]] 0]] if {$lastusedid == ""} { set lastusedid 0 } set id "::rss::items::${channelid}::[expr $lastusedid + 1]" proc $id [list command [list obj $obj] [list id $id]] { upvar #0 $obj rssobj switch -- [string tolower $command] { "title" { set idx [list $id title] } "link" { set idx [list $id link] } "description" { set idx [list $id description] } "date" { set idx [list $id pubdate] } } if {![info exists idx]} { return "" } if {![info exists rssobj($idx)]} { return "" } return $rssobj($idx) } lappend rssobj(items) $id lappend rssobj(parent) $id } } "title" { set rssobj([list $parent title]) [__strip_html [__replace_entities $body]] } "link" { set rssobj([list $parent link]) [__replace_entities $body] } "description" { set rssobj([list $parent description]) [__strip_html [__replace_entities $body]] } "pubdate" { catch { set body [clock scan $body] } set rssobj([list $parent pubdate]) $body } } if {$isClose} { # For self-closing tags if {$tag == "item"} { # We close tag items twice, because we add a fake open with the tag id set rssobj(parent) [lrange $rssobj(parent) 0 end-1] } set rssobj(parent) [lrange $rssobj(parent) 0 end-1] } return } # Return ID proc rss::parse {data} { set lastusedid [namespace tail [lindex [lsort -dictionary -decreasing [info vars ::rss::channels::*]] 0]] if {$lastusedid == ""} { set lastusedid 0 } set id "::rss::channels::[expr $lastusedid + 1]" upvar #0 $id rssobj tax::parse [list ::rss::__tax_add_to_object $id] $data proc $id [list command [list obj $id]] { upvar #0 $obj rssobj switch -- $command { "items" { set idx items } "description" { set idx [list channel description] } "link" { set idx [list channel link] } } if {![info exists idx]} { return "" } if {![info exists rssobj($idx)]} { return "" } return $rssobj($idx) } return $id } proc rss::cleanup {id} { if {[string match "::rss::channels::*" $id]} { set channelid [namespace tail $id] foreach proc [info procs ::rss::items::${channelid}::*] { rename $proc "" } unset -nocomplain $id } return 1 } package provide rss 0.1
You'll notice that it is slightly inconsisent towards the middle.. I should have used a namespace under ::rss for both channels and items, but I didn't realize this until I didn't feel like changing it. Feel free to edit the above, or use it in your own code.
LV Do you have any examples to demonstrate?
Roy Keene Sure, a small example:
#! /usr/bin/env tclsh package require rss package require http set token [http::geturl "http://www.digg.com/rss/index.xml"] set rssdata [http::data $token] http::cleanup $token set id [rss::parse $rssdata] foreach item [$id items] { puts "[clock format [$item date]]: [$item title]: [$item description] ([$item link])" }
A more complete example:
#! /usr/bin/env tclsh package require rss package require http package require Tk proc gui_bg_update_news {newsobj} { set rssfeeds [list {http://news.google.com/news?ned=us&topic=h&output=rss} {http://rss.cnn.com/rss/cnn_world.rss}] http::config -useragent {Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 OpenSSL/0.9.7e} if {![info exists ::rss_newsitems]} { set ::rss_newsitems [list] } foreach url $rssfeeds { catch { http::geturl $url -command gui_bg_update_news_data } } after 30000 [list gui_bg_update_news_text $newsobj] } proc gui_bg_update_news_text {newsobj} { if {[llength $::rss_newsitems] != 0} { $newsobj delete 0 end unset -nocomplain ::rss_newsitems_urls foreach item [lsort -dictionary -index 0 $::rss_newsitems] { set date [lindex $item 0] set title [lindex $item 1] set desc [lindex $item 2] set link [lindex $item 3] $newsobj insert end "$title" set ::rss_newsitems_urls($title) $link } set ::rss_newsitems [list] } after 120000 [list gui_bg_update_news $newsobj] } proc gui_bg_update_news_data {token} { if {[http::ncode $token] != "200"} { if {$::DEBUG} { puts "Error opening url: [http::ncode $token]" } http::cleanup $token return } set rssdata [http::data $token] if {[catch { set id [rss::parse $rssdata] set newsitems [list] foreach item [$id items] { set newitem [list [$item date] [$item title] [$item description] [$item link]] if {[lsearch $::rss_newsitems $newitem] == -1} { lappend ::rss_newsitems $newitem } } } err]} { if {[info exists $::DEBUG]} { puts "Error in RSS feed update: $err" puts "$::errorInfo" } } http::cleanup $token if {[info exists id]} { rss::cleanup $id } } proc load_rss_url {newsobj x y} { set idx [$newsobj nearest $y] set idxbbox [$newsobj bbox $idx] set idx_starty [expr [lindex $idxbbox 1]] set idx_endy [expr $idx_starty + [lindex $idxbbox 3]] if {$y < ($idx_starty - 4)} { return } if {$y > ($idx_endy + 4)} { return } set title [$newsobj get $idx] if {![info exists ::rss_newsitems_urls($title)]} { return } set url $::rss_newsitems_urls([$newsobj get $idx]) puts "Loading URL: $url" # XXX: TODO, Figure out how to actually load a URL across platforms } listbox .newsInfo -width 80 button .exit -text "Exit" -command exit pack .newsInfo -expand 1 -fill both pack .exit bind .newsInfo <Double-1> [list load_rss_url .newsInfo %x %y] after 1000 [list gui_bg_update_news .newsInfo]