A little RSS reaper

Richard Suchenwirth 2005-02-09 - As a companion piece to RS's RSS, this midnight fun project script works without Tk - specify an RSS URL on the command line, and get on stdout a generated HTML page with the headlines linked locally to compacted versions of the pages they refer to, so that the full content behind the RSS feed is contained in a single file (about 23...110 KB in size). My use case is that I want to "reap" fresh news to download to the iPaq for offline reading, while avoiding fancy frames, ads, and links that I couldn't follow. Usage example:

C:\_Ricci\sep>tclsh rss2html.tcl > t.html http://news.bbc.co.uk/rss/newsonline_world_edition/front_page/rss091.xml

Result at [L1 ] - Sample screenshot:

Again: no warranties at all, but I'm basically happy with the output of this script on my few test cases (Spiegel online, Tagesspiegel, BBC...) - feel free to add criticisms or improvements :) A tiny bash script called feedme reaps all the feeds that I want, on demand, and ActiveSync takes care of transferring the HTML pages to the little thing:

 tclsh x:/tcl/rss2html.tcl > Spiegel.htm www.spiegel.de/schlagzeilen/rss/0,5291,,00.xml
 tclsh x:/tcl/rss2html.tcl > Tagesspiegel.htm www.tagesspiegel.de/feed/index.xml
 tclsh x:/tcl/rss2html.tcl > BBC.htm news.bbc.co.uk/rss/newsonline_world_edition/front_page/rss091.xml

 # Help text written to stderr by main when the argument count is wrong.
 set usage {
    usage: rss2html.tcl rss_url > htmlfile
 }
 package require http
 package require uri

 # main --
 #   Entry point: fetch the RSS feed named on the command line and write a
 #   single self-contained HTML page to stdout.
 #
 # Arguments:
 #   argv   command-line argument list; must hold exactly one element, the
 #          URL of the RSS feed.
 #
 # Results:
 #   Nothing is returned. On a wrong argument count the usage text goes to
 #   stderr and the process exits with status 1, so that calling shell
 #   scripts can detect the failure.
 proc main argv {
    if {[llength $argv] != 1} {
        puts stderr $::usage
        exit 1
    }
    set rss [lindex $argv 0]
    # readRSS yields a flat list of title/url/description triples
    set content [readRSS $rss]
    puts "<html><head/><body>From: $rss<br>"
    puts "Updated: [clock format [clock seconds] -format {%Y-%m-%d, %H:%M:%S}]<hr>"
    #-- pass 1: table of contents, each headline linked to a local anchor
    set n 0
    foreach {title - descr} $content {
        incr n
        puts "<br><a href=#$n>$title</a> $descr"
    }
    #-- pass 2: the pages themselves, anchored with the same numbers
    set n 0
    foreach {title url -} $content {
        incr n
        puts "<hr><a name=$n><h4>$title</h4></a>"
        puts [readHTML $title $url]
    }
    puts "<hr>Reaped by rss2html :)</body></html>"
 }
 # readRSS --
 #   Download an RSS feed and extract its items.
 #
 # Arguments:
 #   url   address of the RSS document.
 #
 # Results:
 #   A flat list {title link description ...} with three elements per
 #   </item> tag seen in the feed.
 proc readRSS url {
    set token [geturl_followRedirects $url]
    upvar #0 $token arr
    if {![info exists arr(body)]} {set arr(body) "<html>not found :(</html>"}
    set res {}
    # Start with defined values so an item that lacks one of the three tags
    # does not abort with "can't read ...: no such variable".
    set title ""
    set link  ""
    set descr ""
    foreach {tag content} [html2txt $arr(body)] {
        switch -- $tag {
            <description> {set descr $content}
            <title> {set title $content}
            <link>  {set link $content}
            </item> {lappend res $title $link $descr; set descr ""}
        }
    }
    # The body has been consumed; release the state array kept by http.
    ::http::cleanup $token
    return $res
 }
 # readHTML --
 #   Download the page behind a headline and boil it down to its longer
 #   text fragments.
 #
 # Arguments:
 #   title   headline of the item; fragments that merely repeat (part of)
 #           the headline are dropped.
 #   url     address of the page. If it has the form "...*http://...",
 #           only the part after the asterisk is fetched.
 #
 # Results:
 #   An HTML fragment made of the retained text pieces.
 proc readHTML {title url} {
    regexp {[*](http://.+)} $url -> url
    set res {}
    set token [geturl_followRedirects $url]
    upvar #0 $token arr
    if {![info exists arr(body)]} {
        ::http::cleanup $token
        return $res
    }
    foreach {tag content} [html2txt $arr(body)] {
        # despace already trims both ends
        set content [despace $content]
        if {[string length $content] < 20} continue
        # Literal substring test: page text must not be interpreted as a
        # glob pattern (it may contain *, ?, [ ] or backslashes).
        if {[string first $content $title] >= 0} continue
        # Skip script residue and fragments that carry URLs
        if {[regexp {userAgent|navigator.platform|http|\(\)} $content]} continue
        switch -glob -- [string tolower $tag] {
            <br* - <div* - </div> - <p> - </p> - </script> -
            <li> - </li> - </ul> {append res <p>$content\n}
            <i> - <b> - </a> - </b> - <!--* - </em>   {append res $content\n}
            default {#append res "\n<!-- [string trim $tag <>] - $content -->\n"}
        }
    }
    # The body has been consumed; release the state array kept by http.
    ::http::cleanup $token
    return $res
 }
 # html2txt --
 #   Split markup into alternating tags and the text that follows them.
 #
 # Arguments:
 #   html   HTML or XML source text.
 #
 # Results:
 #   A flat list {tag text tag text ...}. Pairs whose text contains
 #   "src=" are left out. A handful of character entities (German umlauts,
 #   sharp s, two typographic quotes, non-breaking space) are replaced in
 #   the text parts.
 proc html2txt {html} {
    # Replacement characters are written as \u escapes so the script does
    # not depend on the encoding of the source file.
    set entities [list \
        "&#220;" \u00dc  "&#223;" \u00df  "&#228;" \u00e4 \
        "&#246;" \u00f6  "&#252;" \u00fc  "&#132;" '  "&#147;" ' \
        "&auml;" \u00e4  "&ouml;" \u00f6  "&uuml;" \u00fc \
        "&szlig;" \u00df "&nbsp;" " "]
    set res {}
    set re {(<[^>]+>) *([^<>]*)}
    foreach {all tag content} [regexp -all -inline $re $html] {
       if {![regexp src= $content]} {
          # Map each text element on its own: mapping the string form of
          # the whole list would let "&nbsp;" -> " " split one element in
          # two and shift every following tag/text pair.
          lappend res $tag [string map $entities $content]
       }
    }
    return $res
 }
 # despace --
 #   Squeeze every run of whitespace in the argument down to one blank and
 #   strip leading and trailing whitespace from the result.
 proc despace string {
    regsub -all {\s+} $string " " squeezed
    return [string trim $squeezed]
 }

 #-- courtesy KPV's https://wiki.tcl-lang.org/11831
 # geturl_followRedirects --
 #   Like http::geturl, but follows HTTP redirects.
 #
 # Arguments:
 #   url    address to fetch.
 #   args   further options, passed unchanged to http::geturl on every hop.
 #
 # Results:
 #   The http token of the last request made: the first response that is
 #   not a redirect, a redirect without a Location header, or the response
 #   at which the redirect limit was reached. The caller owns that token
 #   and should release it with http::cleanup.
 proc geturl_followRedirects {url args} {
    # Upper bound so that a redirect cycle cannot loop forever
    set maxRedirects 10
    for {set hop 0} {1} {incr hop} {
        set token [eval [list ::http::geturl $url] $args]
        if {![string match {30[12378]} [::http::ncode $token]]} {return $token}
        if {$hop >= $maxRedirects} {return $token}

        # Header names are case-insensitive; servers may send "location"
        set location ""
        foreach {name value} [set ${token}(meta)] {
            if {[string equal -nocase $name Location]} {
                set location $value
                break
            }
        }
        if {$location eq ""} {return $token}

        # Parse afresh on every hop so no component of an earlier redirect
        # leaks into this one; a host-less target is taken relative to the
        # URL that was just requested.
        array unset base
        array set base [::uri::split $url]
        array unset target
        array set target [::uri::split $location]
        if {(![info exists target(host)] || $target(host) eq "")
                && [info exists base(host)]} {
            set target(host) $base(host)
        }
        # This intermediate response is no longer needed
        ::http::cleanup $token
        # problem w/ relative versus absolute paths
        set url [eval ::uri::join [array get target]]
    }
 }

 # Run the script with the command-line arguments supplied by tclsh.
 main $argv

metoto - 2009-12-30 11:48:11

Hi, that's good that you know how to write the bash shell script for rss2html. Can you write a similar script for a webpage, so that the URLs are printed live from any RSS feed?

Thanks

Category Internet

Arts and Crafts of Tcl-Tk Programming