Richard Suchenwirth 2005-02-09 - As a companion piece to RS's RSS, this midnight fun project script works without Tk: specify an RSS URL on the command line, and get on stdout a generated HTML page with the headlines linked locally to compacted versions of the pages they refer to, so the expanded feed (headlines plus referenced pages) is contained in a single file (about 23...110 KB in my tests). My use case is that I want to "reap" fresh news to download to the iPaq for offline reading, while avoiding fancy frames, ads, and links that I couldn't follow offline. Usage:

 tclsh rss2html.tcl rss_url > htmlfile
Result at [L1 ] - (sample screenshot omitted)
Again: no warranties at all, but I'm basically happy with the output of this script on my few test cases (Spiegel Online, Tagesspiegel, BBC...) - feel free to add criticism or improvements :) A tiny bash script called feedme reaps all the feeds I want, on demand, and ActiveSync takes care of transferring the HTML pages to the little thing:
 tclsh x:/tcl/rss2html.tcl > Spiegel.htm      www.spiegel.de/schlagzeilen/rss/0,5291,,00.xml
 tclsh x:/tcl/rss2html.tcl > Tagesspiegel.htm www.tagesspiegel.de/feed/index.xml
 tclsh x:/tcl/rss2html.tcl > BBC.htm          news.bbc.co.uk/rss/newsonline_world_edition/front_page/rss091.xml
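For a Tcl-only environment, the same reaping could be driven from a small Tcl script instead of bash - just a minimal sketch, not from the original page; the feed list and the x:/tcl location are taken from the commands above, and "feedme.tcl" is a hypothetical name:

 # feedme.tcl - reap all configured feeds in one go (hypothetical Tcl stand-in for the bash feedme)
 foreach {file url} {
     Spiegel.htm      www.spiegel.de/schlagzeilen/rss/0,5291,,00.xml
     Tagesspiegel.htm www.tagesspiegel.de/feed/index.xml
     BBC.htm          news.bbc.co.uk/rss/newsonline_world_edition/front_page/rss091.xml
 } {
     # run rss2html.tcl in a subprocess, redirecting its stdout to the target file
     exec [info nameofexecutable] x:/tcl/rss2html.tcl $url > $file
 }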
 set usage {
     usage: rss2html.tcl rss_url > htmlfile
 }
 package require http
 package require uri

 proc main argv {
     if {[llength $argv] != 1} {puts stderr $::usage; exit}
     set rss [lindex $argv 0] ;# other arguments ignored for now
     set content [readRSS $rss]
     set n 0
     puts "<html><head/><body>From: $rss<br>"
     puts "Updated: [clock format [clock seconds] -format {%Y-%m-%d, %H:%M:%S}]<hr>"
     #-- pass 1: table of contents
     foreach {title - descr} $content {
         incr n
         puts "<br><a href=#$n>$title</a> $descr"
     }
     #-- pass 2: the pages themselves
     set n 0
     foreach {title url -} $content {
         incr n
         puts "<hr><a name=$n><h4>$title</h4></a>"
         puts [readHTML $title $url]
     }
     puts "<hr>Reaped by rss2html :)</body></html>"
 }

 #-- fetch an RSS feed, return a flat list of {title link description ...}
 proc readRSS url {
     upvar #0 [geturl_followRedirects $url] arr
     if {![info exists arr(body)]} {set arr(body) "<html>not found :(</html>"}
     set res {}
     set title ""; set link ""; set descr ""
     foreach {tag content} [html2txt $arr(body)] {
         switch -- $tag {
             <description> {set descr $content}
             <title>       {set title $content}
             <link>        {set link $content}
             </item>       {lappend res $title $link $descr; set descr ""}
         }
     }
     set res
 }

 #-- fetch one referenced page and compact it to mostly-text HTML
 proc readHTML {title url} {
     regexp {[*](http://.+)} $url -> url ;# some feeds wrap the real URL; extract it
     set res {}
     upvar #0 [geturl_followRedirects $url] arr
     foreach {tag content} [html2txt $arr(body)] {
         set content [string trim [despace $content]]
         if {[string match *$content* $title]} continue ;# already contained in the title
         if {[string length $content] < 20} continue    ;# too short to be body text
         if {[regexp {userAgent|navigator.platform|http|\(\)} $content]} continue ;# script/link noise
         switch -glob -- [string tolower $tag] {
             <br* - <div* - </div> - <p> - </p> - </script> - <li> - </li> - </ul> {
                 append res <p>$content\n
             }
             <i> - <b> - </a> - </b> - <!--* - </em> {
                 append res $content\n
             }
             default {# append res "\n<!-- [string trim $tag <>] - $content -->\n"}
         }
     }
     set res
 }

 #-- split HTML into a flat list of {tag content ...} pairs
 proc html2txt {html} {
     set res {}
     set re {(<[^>]+>) *([^<>]*)}
     foreach {all tag content} [regexp -all -inline $re $html] {
         if {![regexp src= $content]} {
             lappend res $tag $content
         }
     }
     #-- decode some common entities (German umlauts, typographic quotes)
     string map {
         &Uuml; Ü  &szlig; ß  &auml; ä  &ouml; ö  &uuml; ü
         &#132; '  &#147; '  &quot; \"
     } $res
 }

 #-- collapse runs of whitespace into single blanks
 proc despace string {string trim [regsub -all {\s+} $string " "]}

 #-- courtesy KPV's https://wiki.tcl-lang.org/11831
 proc geturl_followRedirects {url args} {
     array set URI [::uri::split $url] ;# need host info from here
     while 1 {
         set token [eval [list http::geturl $url] $args]
         if {![string match {30[1237]} [::http::ncode $token]]} {return $token}
         array set meta [set ${token}(meta)]
         if {![info exists meta(Location)]} {return $token}
         array set uri [::uri::split $meta(Location)]
         unset meta
         if {$uri(host) == ""} {set uri(host) $URI(host)}
         # problem w/ relative versus absolute paths
         set url [eval ::uri::join [array get uri]]
     }
 }

 main $argv
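To get a feel for the tag/content pairs that html2txt produces, one can source the procs above into an interactive tclsh and feed in a small fragment - an illustrative sketch, not part of the original page; the sample string is made up:

 # sanity check for html2txt (assumes the procs above have been sourced)
 set sample {<html><body><p>Hello <b>world</b>, this is only a test.</p></body></html>}
 foreach {tag content} [html2txt $sample] {
     puts [format "%-8s %s" $tag $content]
 }

Each tag comes out paired with the text immediately following it, which is what readRSS and readHTML then switch on and filter.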
metoto - 2009-12-30 11:48:11
Hi, it's good that you know how to write the bash shell script for rss2html. Can you write a similar script for a webpage, so that the URLs are printed live from any RSS feed?
Thanks