TAX RSS

I needed a simple, easy to work with, asyncronous safe RSS parser. I tried TclRSS [L1 ], but it had heavy requirements, and I never actually figured out the correct combination of dependencies and their versions needed to make it work. I gave up on that and started writing my own RSS parser, using the subset of the TclRSS API that I had already written a program to. I found TAX: A Tiny API for XML (make sure you use the version at the bottom of the page, I just made that into a package and called it "tax 0.1") a small, good-enough Tcl XML parser and wrote an RSS parser from example RSS 2.0 feeds, and not from the specification (I was in a hurry). It works with my limited testing (digg.com, sourceforge, and cnn), but probably doesn't work for anything that is not exactly RSS 2.0 and pretty close to one of my examples.

 #! /usr/bin/env tclsh

 package require tax

 namespace eval rss {
        namespace eval channels {
        }
        namespace eval items {
        }
 }

 proc rss::__replace_entities {text} {
        return [string map [list "&nbsp;" " " "&gt;" ">" "&lt;" "<" "&amp;" "&"] $text]
 }

 proc rss::__strip_html {text} {
        # We replace entities here (i.e., twice) because HTML-inside-XML will have
        # the HTML entities escaped twice.
        return [__replace_entities [regsub -all -- {<[^>]*>} $text ""]]
 }

 proc rss::__tax_add_to_object {obj tag isClose isSelfClosing properties body} {
        upvar #0 $obj rssobj
        set channelid [namespace tail $obj]

        set tag [string tolower [string trim $tag]]
        if {$tag == "docstart"} {
                set rssobj(parent) [list] 

                namespace eval ::rss::items::$channelid {}
        }
        if {$tag == "docstart" || $tag == "rss"} {
                return
        }
        if {[string index $tag 0] == "?"} {
                return
        }

        if {$isClose && !$isSelfClosing} {
                if {$tag == "item"} {
                        # We close tag items twice, because we add a fake open with the tag id
                        set rssobj(parent) [lrange $rssobj(parent) 0 end-1]
                }
                set rssobj(parent) [lrange $rssobj(parent) 0 end-1]
                return
        }

        lappend rssobj(parent) $tag

        set parent [lindex $rssobj(parent) end-1]
        switch -- $tag {
                "item" {
                        if {$parent == "channel"} {
                                set lastusedid [namespace tail [lindex [lsort -dictionary -decreasing [info procs ::rss::items::${channelid}::*]] 0]]
                                if {$lastusedid == ""} {
                                        set lastusedid 0
                                }
                                set id "::rss::items::${channelid}::[expr $lastusedid + 1]"

                                proc $id [list command [list obj $obj] [list id $id]] {
                                        upvar #0 $obj rssobj
                                        switch -- [string tolower $command] {
                                                "title" {
                                                        set idx [list $id title]
                                                }
                                                "link" {
                                                        set idx [list $id link]
                                                }
                                                "description" {
                                                        set idx [list $id description]
                                                }
                                                "date" {
                                                        set idx [list $id pubdate]
                                                }
                                        }

                                        if {![info exists idx]} {
                                                return ""
                                        }
                                        if {![info exists rssobj($idx)]} {
                                                return ""
                                        }

                                        return $rssobj($idx)
                                }

                                lappend rssobj(items) $id

                                lappend rssobj(parent) $id
                        }
                }
                "title" {
                        set rssobj([list $parent title]) [__strip_html [__replace_entities $body]]
                }
                "link" {
                        set rssobj([list $parent link]) [__replace_entities $body]
                }
                "description" {
                        set rssobj([list $parent description]) [__strip_html [__replace_entities $body]]
                }
                "pubdate" {
                        catch {
                                set body [clock scan $body]
                        }
                        set rssobj([list $parent pubdate]) $body
                }
        }

        if {$isClose} {
                # For self-closing tags
                if {$tag == "item"} {
                        # We close tag items twice, because we add a fake open with the tag id
                        set rssobj(parent) [lrange $rssobj(parent) 0 end-1]
                }
                set rssobj(parent) [lrange $rssobj(parent) 0 end-1]
        }

        return
 }

 # Return ID
 proc rss::parse {data} {
        set lastusedid [namespace tail [lindex [lsort -dictionary -decreasing [info vars ::rss::channels::*]] 0]]
        if {$lastusedid == ""} {
                set lastusedid 0
        }
        set id "::rss::channels::[expr $lastusedid + 1]"

        upvar #0 $id rssobj

        tax::parse [list ::rss::__tax_add_to_object $id] $data

        proc $id [list command [list obj $id]] {
                upvar #0 $obj rssobj
                switch -- $command {
                        "items" {
                                set idx items
                        }
                        "description" {
                                set idx [list channel description]
                        }
                        "link" {
                                set idx [list channel link]
                        }
                }

                if {![info exists idx]} {
                        return ""
                }
                if {![info exists rssobj($idx)]} {
                        return ""
                }
                return $rssobj($idx)
        }

        return $id
 }

 proc rss::cleanup {id} {
        if {[string match "::rss::channels::*" $id]} {
                set channelid [namespace tail $id]
                foreach proc [info procs ::rss::items::${channelid}::*] {
                        rename $proc ""
                }
                unset -nocomplain $id
        }

        return 1
 }

 package provide rss 0.1

You'll notice that it is slightly inconsisent towards the middle.. I should have used a namespace under ::rss for both channels and items, but I didn't realize this until I didn't feel like changing it. Feel free to edit the above, or use it in your own code.


LV Do you have any examples to demonstrate?


Roy Keene Sure, a small example:

 #! /usr/bin/env tclsh

 package require rss
 package require http

 set token [http::geturl "http://www.digg.com/rss/index.xml"]
 set rssdata [http::data $token]
 http::cleanup $token

 set id [rss::parse $rssdata]
 foreach item [$id items] {
         puts "[clock format [$item date]]: [$item title]: [$item description] ([$item link])"
 }

A more complete example:

 #! /usr/bin/env tclsh

 package require rss
 package require http
 package require Tk

 proc gui_bg_update_news {newsobj} {
         set rssfeeds [list {http://news.google.com/news?ned=us&topic=h&output=rss} {http://rss.cnn.com/rss/cnn_world.rss}]

         http::config -useragent {Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 OpenSSL/0.9.7e}

         if {![info exists ::rss_newsitems]} {
                 set ::rss_newsitems [list]
         }
         foreach url $rssfeeds {
                 catch {
                         http::geturl $url -command gui_bg_update_news_data
                 }
         }

         after 30000 [list gui_bg_update_news_text $newsobj]
 }

 proc gui_bg_update_news_text {newsobj} {

         if {[llength $::rss_newsitems] != 0} {
                 $newsobj delete 0 end
                 unset -nocomplain ::rss_newsitems_urls
                 foreach item [lsort -dictionary -index 0 $::rss_newsitems] {
                         set date [lindex $item 0]
                         set title [lindex $item 1]
                         set desc [lindex $item 2]
                         set link [lindex $item 3]

                         $newsobj insert end "$title"
                         set ::rss_newsitems_urls($title) $link
                 }
                 set ::rss_newsitems [list]
         }

         after 120000 [list gui_bg_update_news $newsobj]
 }

 proc gui_bg_update_news_data {token} {
         if {[http::ncode $token] != "200"} {
                 if {$::DEBUG} {
                         puts "Error opening url: [http::ncode $token]"
                 }
                 http::cleanup $token
                 return
         }
         set rssdata [http::data $token]

         if {[catch {
                 set id [rss::parse $rssdata]
                 set newsitems [list]
                 foreach item [$id items] {
                         set newitem [list [$item date] [$item title] [$item description] [$item link]]
                         if {[lsearch $::rss_newsitems $newitem] == -1} {
                                 lappend ::rss_newsitems $newitem
                         }
                 }
         } err]} {
                 if {[info exists $::DEBUG]} {
                         puts "Error in RSS feed update: $err"
                         puts "$::errorInfo"
                 }
         }

         http::cleanup $token
         if {[info exists id]} {
                 rss::cleanup $id
         }
 }

 proc load_rss_url {newsobj x y} {
         set idx [$newsobj nearest $y]
         set idxbbox [$newsobj bbox $idx]

         set idx_starty [expr [lindex $idxbbox 1]]
         set idx_endy [expr $idx_starty + [lindex $idxbbox 3]]

         if {$y < ($idx_starty - 4)} {
                 return
         }
         if {$y > ($idx_endy + 4)} {
                 return
         }

         set title [$newsobj get $idx]

         if {![info exists ::rss_newsitems_urls($title)]} {
                 return
         }

         set url $::rss_newsitems_urls([$newsobj get $idx])

         puts "Loading URL: $url"
         # XXX: TODO, Figure out how to actually load a URL across platforms
 }

 listbox .newsInfo -width 80
 button .exit -text "Exit" -command exit

 pack .newsInfo -expand 1 -fill both
 pack .exit

 bind .newsInfo <Double-1> [list load_rss_url .newsInfo %x %y]
 after 1000 [list gui_bg_update_news .newsInfo]

Category Internet