While [DG] was making an [RSS] reader/formatter for [tclhttpd] for dynamic page generation of news feeds, I bumped into some nasty problems with encodings and thought I'd share. [tDOM]'s [[dom parse]] command expects the xml string to have already been properly decoded and be in its "pure" state. The http package does encoding translations but only when the HTTP header for Content-Type contains a charset declaration. To make matters a bit more tricky, if the XML string contains an encoding value in the XML declaration, the XML parser will do a needless second conversion. Here's a working way to grab and parse XML with the http package: ====== # Lie like a senator for google to stop giving me a 403.. http::config -useragent {Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8) Gecko/20051111 Firefox/1.5} # Should the server uses this info, give it what we want to receive. http::config -accept "text/xml,application/xml,application/rss+xml,application/rdf+xml,application/atom+xml" # Returns the DOM object of the RSS feed. proc tmlrss::fetchXML {uri {recurse_limit 4}} { set token [http::geturl $uri] upvar #0 $token state if {[http::status $token] != "ok" || [http::ncode $token] != 200} { # was the error a redirect? If so, do it.. if {[http::ncode $token] == 302 && [incr recurse_limit -1] > 0} { array set meta $state(meta) set result [fetchXML $meta(Location) $recurse_limit] http::cleanup $token return $result } set err [http::code $token] http::cleanup $token return -code error $err } set xml [http::data $token] array set meta $state(meta) http::cleanup $token # Do we need to do encoding conversions or was it already done # in transit? if {[info exist meta(Content-Type)] && \ [regexp -nocase {charset\s*=\s*(\S+)} $meta(Content-Type)]} { # Socket channel encodings already performed! No Work to do # here. See section 5.2.2 of the html spec. Server set # encodings win. } else { # Read and perform charset mappings of the document rather than # hand off the literal string to [dom parse] as Tcl itself is # more encoding capable. set xml [encoding convertfrom [getXmlEncoding $xml] $xml] } # Strip the XML declaration, should it exist. The encoding # conversions have already been performed. Let's not let Expat # try to convert it a second time as the work has already been # performed. As the parser assumes utf-8 by default without a # declaration, and the fact that tdom is handing expat the utf-8 # string rep, this will work even though it doesn't look correct. return [dom parse -baseurl [uriBase $uri] [stripXmlDecl $xml]] } proc tmlrss::uriBase {uri} { array set info [uri::split $uri] set info(path) [file dirname $info(path)] return [eval uri::join [array get info]] } proc tmlrss::stripXmlDecl {xml} { if {![binary scan [string range $xml 0 3] "H8" firstBytes]} { # very short (< 4 Bytes) string return $xml } # If the entity has an XML Declaration, the first four characters # must be "" $xml] if {$closeIndex == -1} { error "Weird XML data or not XML data at all" } set xml [string range $xml [expr {$closeIndex+1}] end] } default { # no declaration. } } return $xml } proc tmlrss::getXmlEncoding {xml} { # The autodetection of the encoding follows # XML Recomendation, Appendix F if {![binary scan $xml "H8" firstBytes]} { # very short (< 4 Bytes) file return iso8859-1 } # If the entity has a XML Declaration, the first four characters # must be "" $xml] if {$closeIndex == -1} { error "Weird XML data or not XML data at all" } set xmlDeclaration [string range $xml 0 [expr {$closeIndex}]] # extract the encoding information set pattern {^[^>]+encoding=[\x20\x9\xd\xa]*["']([^ "']+)['"]} # emacs: " if {![regexp $pattern $xml - encStr]} { # Probably something like . # Without encoding declaration, pass-thru return iso8859-1 } else { set encoding [tDOM::IANAEncoding2TclEncoding $encStr] } } default { # pass-thru set encoding iso8859-1 } } return $encoding } ====== ---- [HaO] What happens, if the server sends an XML-Document in unicode starting with a BOM (byte order mark) ? Or is this not relevant in this context ? <> XML