if 0 {[Richard Suchenwirth] 2005-02-09 - As a companion piece to [RS's RSS], this script works without [Tk] - specify a [RSS] URL on the command line, and get on stdout a generated [HTML] page with the headlines linked locally to compacted versions of the pages they refer to. My use case is that I want to "reap" fresh news to download to the [iPaq] for offline reading, but avoiding fancy frames, ads, links that I couldn't follow. Usage example:
* C:\_Ricci\sep>tclsh rss2html.tcl > t.html http://news.bbc.co.uk/rss/newsonline_world_edition/front_page/rss091.xml

 Result at [http://mini.net/files/bbc.html]

 Again: no warranties at all, but I'm basically happy with the output of this script on my few test cases (Spiegel online, Tagesspiegel, BBC...) - feel free to add criticisms or improvements :)
}
set usage {
    usage: rss2html.tcl rss_url > htmlfile
}
package require http
package require uri

# Entry point: fetch the RSS feed named on the command line and write a
# self-contained HTML digest to stdout — first a linked table of contents,
# then the compacted text of each referenced page.
# NOTE(review): the HTML tags inside the puts strings below were stripped by
# the wiki rendering this script was recovered from; they have been
# reconstructed minimally (<li>/<a>/<hr>/<h3> anchors) — verify against the
# original output at http://mini.net/files/bbc.html if fidelity matters.
proc main argv {
    if {[llength $argv] != 1} {puts stderr $::usage; exit}
    set rss [lindex $argv 0] ;# other arguments ignored for now
    set content [readRSS $rss]
    set n 0
    puts "From: <b>$rss</b><br>"
    puts "Updated: [clock format [clock seconds] -format {%Y-%m-%d, %H:%M:%S}]<p>"
    #-- pass 1: table of contents (each headline links to its local anchor)
    foreach {title - descr} $content {
        incr n
        puts "<li><a href=\"#$n\">$title</a> $descr"
    }
    #-- pass 2: the pages themselves, each under a named anchor
    set n 0
    foreach {title url -} $content {
        incr n
        puts "<hr><h3><a name=\"$n\">$title</a></h3>"
        puts [readHTML $title $url]
    }
    puts "<hr>Reaped by rss2html :)"
}

# Fetch an RSS feed and return a flat list {title link descr title link descr ...},
# one triple per <item> element.
proc readRSS url {
    upvar #0 [geturl_followRedirects $url] arr
    if {![info exists arr(body)]} {set arr(body) "not found :("}
    set res {}
    # Initialize so a malformed feed (an </item> with missing children)
    # cannot raise "no such variable".
    set title ""; set link ""; set descr ""
    foreach {tag content} [html2txt $arr(body)] {
        # NOTE(review): the <description> and <title> patterns were eaten by
        # the wiki's tag stripping; restored here from the branch bodies.
        switch -- $tag {
            <description> {set descr $content}
            <title>       {set title $content}
            <link>        {set link $content}
            </item>       {lappend res $title $link $descr; set descr ""}
        }
    }
    set res
}

# Fetch one article page and return a compacted HTML rendition: the source
# URL followed by the paragraphs of body text, with navigation junk,
# scripts, and very short fragments filtered out.
proc readHTML {title url} {
    regexp {[*](http://.+)} $url -> url ;# unwrap wiki-style [*http://...] links
    set res $url<br>
    upvar #0 [geturl_followRedirects $url] arr
    foreach {tag content} [html2txt $arr(body)] {
        set content [string trim [despace $content]]
        if {$content eq $title} continue          ;# headline already shown
        if {[string length $content] < 20} continue ;# skip nav/menu fragments
        # Skip obvious script residue and bare links.
        if {[regexp {userAgent|navigator.platform|http|\(\)} $content]} continue
        switch -glob -- [string tolower $tag] {
            <br* - <div* - </div> - <p> - </p> -
            <li> - </li> - </ul> {append res <p>$content\n}
            <i> - <b> - </a> - </b> - <!--* - </em> {append res $content\n}
            default {#append res "\n<!-- [string trim $tag <>] - $content -->\n"}
        }
    }
    set res
}

# Tokenize raw HTML into a flat list {tag text tag text ...}, dropping
# content that looks like embedded markup (src= attributes), and decode a
# handful of common character entities.
proc html2txt {html} {
    set res {}
    set re {(<[^>]+>) *([^<>]*)}
    foreach {all tag content} [regexp -all -inline $re $html] {
        if {![regexp src= $content]} {
            lappend res $tag $content
        }
    }
    # NOTE(review): the entity keys below were rendered into literal
    # characters by the wiki (the map showed no-op pairs like "ä ä");
    # reconstructed as the usual German/Windows-1252 entities — confirm
    # against a live feed.
    string map {
        &Uuml;  Ü   &szlig; ß   &auml; ä   &ouml; ö   &uuml; ü
        &#132;  '   &#147;  '   &#228; ä   &#246; ö   &#252; ü   &#223; ß
        &nbsp; { }  &quot;  \"
    } $res
}

# Collapse runs of whitespace to single spaces and trim the ends.
proc despace string {string trim [regsub -all {\s+} $string " "]}

#-- courtesy KPV's http://wiki.tcl.tk/11831
# Like http::geturl, but transparently follows 301/302/303/307 redirects,
# resolving a relative Location against the original host.
proc geturl_followRedirects {url args} {
    array set URI [::uri::split $url] ;# Need host info from here
    while 1 {
        set token [eval [list http::geturl $url] $args]
        if {![string match {30[1237]} [::http::ncode $token]]} {return $token}
        array set meta [set ${token}(meta)]
        if {![info exists meta(Location)]} {
            return $token
        }
        array set uri [::uri::split $meta(Location)]
        unset meta
        if {$uri(host) == ""} {
            set uri(host) $URI(host)
        }
        # problem w/ relative versus absolute paths
        set url [eval ::uri::join [array get uri]]
    }
}

main $argv

if 0 {
----
[Category Internet] | [Arts and crafts of Tcl-Tk programming]
}