Version 1 of TAX RSS

Updated 2006-11-14 06:04:01

I needed a simple, easy to work with, asyncronous safe RSS parser. I tried TclRSS [L1 ], but it had heavy requirements, and I never actually figured out the correct combination of dependencies and their versions needed to make it work. I gave up on that and started writing my own RSS parser, using the subset of the TclRSS API that I had already written a program to. I found TAX: A Tiny API for XML (make sure you use the version at the bottom of the page, I just made that into a package and called it "tax 0.1") a small, good-enough Tcl XML parser and wrote an RSS parser from example RSS 2.0 feeds, and not from the specification (I was in a hurry). It works with my limited testing (digg.com, sourceforge, and cnn), but probably doesn't work for anything that is not exactly RSS 2.0 and pretty close to one of my examples.

 #! /usr/bin/env tclsh

 package require tax

 namespace eval rss {
         namespace eval channels {
         }
         namespace eval items {
         }
 }

 proc rss::__replace_entities {text} {
         return [string map [list "&gt;" ">" "&lt;" "<" "&amp;" "&"] $text]
 }

 proc rss::__strip_html {text} {
         return [regsub -all -- {<[^>]*>} $text ""]
 }

 proc rss::__tax_add_to_object {obj tag isClose isSelfClosing properties body} {
         upvar #0 $obj rssobj
         set channelid [namespace tail $obj]

         set tag [string tolower [string trim $tag]]
         if {$tag == "docstart"} {
                 set rssobj(parent) [list] 

                 namespace eval ::rss::items::$channelid {}
         }
         if {$tag == "docstart" || $tag == "rss"} {
                 return
         }
         if {[string index $tag 0] == "?"} {
                 return
         }

         if {$isClose && !$isSelfClosing} {
                 if {$tag == "item"} {
                         # We close tag items twice, because we add a fake open with the tag id
                         set rssobj(parent) [lrange $rssobj(parent) 0 end-1]
                 }
                 set rssobj(parent) [lrange $rssobj(parent) 0 end-1]
                 return
         }

         lappend rssobj(parent) $tag

         set parent [lindex $rssobj(parent) end-1]
         switch -- $tag {
                 "item" {
                         if {$parent == "channel"} {
                                 set lastusedid [namespace tail [lindex [lsort -dictionary -decreasing [info procs ::rss::items::${channelid}::*]] 0]]
                                 if {$lastusedid == ""} {
                                         set lastusedid 0
                                 }
                                 set id "::rss::items::${channelid}::[expr $lastusedid + 1]"

                                 proc $id [list command [list obj $obj] [list id $id]] {
                                         upvar #0 $obj rssobj
                                         switch -- [string tolower $command] {
                                                 "title" {
                                                         set idx [list $id title]
                                                 }
                                                 "link" {
                                                         set idx [list $id link]
                                                 }
                                                 "description" {
                                                         set idx [list $id description]
                                                 }
                                                 "date" {
                                                         set idx [list $id pubdate]
                                                 }
                                         }

                                         if {![info exists idx]} {
                                                 return ""
                                         }
                                         if {![info exists rssobj($idx)]} {
                                                 return ""
                                         }

                                         return $rssobj($idx)
                                 }

                                 lappend rssobj(items) $id

                                 lappend rssobj(parent) $id
                         }
                 }
                 "title" {
                         set rssobj([list $parent title]) [__strip_html [__replace_entities $body]]
                 }
                 "link" {
                         set rssobj([list $parent link]) [__replace_entities $body]
                 }
                 "description" {
                         set rssobj([list $parent description]) [__strip_html [__replace_entities $body]]
                 }
                 "pubdate" {
                         catch {
                                 set body [clock scan $body]
                         }
                         set rssobj([list $parent pubdate]) $body
                 }
         }

         if {$isClose} {
                 # For self-closing tags
                 if {$tag == "item"} {
                         # We close tag items twice, because we add a fake open with the tag id
                         set rssobj(parent) [lrange $rssobj(parent) 0 end-1]
                 }
                 set rssobj(parent) [lrange $rssobj(parent) 0 end-1]
         }

         return
 }

 # Return ID
 proc rss::parse {data} {
         set lastusedid [namespace tail [lindex [lsort -dictionary -decreasing [info vars ::rss::channels::*]] 0]]
         if {$lastusedid == ""} {
                 set lastusedid 0
         }
         set id "::rss::channels::[expr $lastusedid + 1]"

         upvar #0 $id rssobj

         tax::parse [list ::rss::__tax_add_to_object $id] $data

         proc $id [list command [list obj $id]] {
                 upvar #0 $obj rssobj
                 switch -- $command {
                         "items" {
                                 set idx items
                         }
                         "description" {
                                 set idx [list channel description]
                         }
                         "link" {
                                 set idx [list channel link]
                         }
                 }

                 if {![info exists idx]} {
                         return ""
                 }
                 if {![info exists rssobj($idx)]} {
                         return ""
                 }
                 return $rssobj($idx)
         }

         return $id
 }

 proc rss::cleanup {id} {
         if {[string match "::rss::channels::*" $id]} {
                 set channelid [namespace tail $id]
                 foreach proc [info procs ::rss::items::${channelid}::*] {
                         rename $proc ""
                 }
                 unset -nocomplain $id
         }

         return 1
 }

 package provide rss 0.1

You'll notice that it is slightly inconsisent towards the middle.. I should have used a namespace under ::rss for both channels and items, but I didn't realize this until I didn't feel like changing it. Feel free to edit the above, or use it in your own code.