Method used to encode general strings (containing arbitrary Unicode characters) as strings of characters allowed in internet domain names.
Anyone up for implementing this in Tcl?
Here is a TCL implementation. pi31415
punycode.tcl
# This is based on the following Python implementation. # http://pydoc.org/get.cgi/usr/local/lib/python2.4/encodings/punycode.py # # Also done in C, Perl, PHP, and Javascript. # http://www.ietf.org/rfc/rfc3492.txt # http://cpansearch.perl.org/src/MIYAGAWA/IDNA-Punycode-0.02/lib/IDNA/Punycode.pm # http://svn.php.net/viewvc/pear/packages/Net_IDNA/trunk/Net/IDNA.php?revision=300850&view=co # https://gist.github.com/1035853 namespace eval ::puny { set digits "abcdefghijklmnopqrstuvwxyz0123456789" } proc ::puny::toChars {nums} { set chars [list] foreach {char} $nums { lappend chars [format "%c" $char] } return [join $chars ""] } # Encoding proc ::puny::enumerate {sequence {start 0}} { set retval [list] set n $start foreach {elem} $sequence { lappend retval $n $elem incr n } return $retval } # 3.1 Basic code point segregation proc ::puny::segregate {str} { set base [list] set extended [list] foreach {c} [split $str ""] { scan $c "%c" {char} if {$char < 128} { lappend base $char } else { lappend extended $c } } set extended [lsort $extended] return [list $base $extended] } # Return the length of str, considering only characters below max. proc ::puny::selectiveLen {str max} { set res 0 foreach c [split $str ""] { scan $c "%c" {char} if {$char < $max} { incr res } } return $res } # Return a pair (index, pos), indicating the next occurrence of # char in str. index is the position of the character considering # only ordinals up to and including char, and pos is the position in # the full string. index/pos is the starting position in the full # string. proc ::puny::selectiveFind {str char index pos} { set l [string length $str] while {true} { incr pos if {$pos == $l} { return [list -1 -1] } set c [string index $str $pos] if {$c == $char} { incr index return [list $index $pos] } elseif {$c < $char} { incr index } } } # 3.2 Insertion unsort coding proc ::puny::insertionUnsort {str extended} { set oldchar 128 set result [list] set oldindex -1 foreach {c} $extended { set index -1 set pos -1 scan $c "%c" {char} set curlen [::puny::selectiveLen $str $char] set delta [expr {($curlen + 1) * ($char - $oldchar)}] while {true} { lassign [::puny::selectiveFind $str $c $index $pos] {index} {pos} if {$index == -1} { break } set delta [expr {$delta + $index - $oldindex}] lappend result [expr {$delta - 1}] set oldindex $index set delta 0 } set oldchar $char } return $result } # Punycode parameters: tmin = 1, tmax = 26, base = 36 proc ::puny::T {j bias} { set res [expr {36 * ($j + 1) - $bias}] if {$res < 1} { return 1 } if {$res > 26} { return 26 } return $res } # 3.3 Generalized variable-length integers proc ::puny::generateGeneralizedInteger {N bias} { set result [list] set j 0 while {true} { set t [::puny::T $j $bias] if {$N < $t} { lappend result [string index $::puny::digits $N] return $result } set pos [expr {$t + (($N - $t) % (36 - $t))}] lappend result [string index $::puny::digits $pos] set N [expr {int(($N - $t) / (36 - $t))}] incr j } } proc ::puny::adapt {delta first numchars} { if {$first} { set delta [expr {int($delta / 700)}] } else { set delta [expr {int($delta / 2)}] } incr delta [expr {int($delta / $numchars)}] # int((($base - $tmin) * $tmax) / 2) == 455 set divisions 0 while {$delta > 455} { # base - tmin set delta [expr {int($delta / 35)}] incr divisions 36 } set bias [expr {$divisions + int(36 * $delta / ($delta + 38))}] return $bias } # 3.4 Bias adaptation # Punycode parameters: initial bias = 72, damp = 700, skew = 38 proc ::puny::generateIntegers {baselen deltas} { set result [list] set bias 72 foreach {points delta} [::puny::enumerate $deltas] { set s [::puny::generateGeneralizedInteger $delta $bias] set result [concat $result $s] set bias [::puny::adapt \ $delta \ [expr {$points == 0}] \ [expr {$baselen + $points + 1}] \ ] } return $result } proc ::puny::encode {text} { lassign [::puny::segregate $text] {base} {extended} set deltas [::puny::insertionUnsort $text $extended] set extended [::puny::generateIntegers [llength $base] $deltas] if {[llength $base]} { return [format "%s-%s" [::puny::toChars $base] [join $extended ""]] } return [join $extended ""] } # Decoding proc ::puny::toNums {text} { set retval [list] foreach {c} [split $text ""] { lappend retval [scan $c "%c"] } return $retval } # 3.3 Generalized variable-length integers proc ::puny::decodeGeneralizedNumber {extended extpos bias errors} { set result 0 set w 1 set j 0 while {true} { set c [lindex $extended $extpos] if {[string length $c] == 0} { if {$errors == "strict"} { error "incomplete punicode string" } incr extpos return [list $extpos None] } scan $c "%c" {char} incr extpos if {65 <= $char && $char <= 90} { # A-Z set digit [expr {$char - 65}] } elseif {48 <= $char && $char <= 57} { # 0x30-26 set digit [expr {$char - 22}] } elseif {$errors == "strict"} { set pos [lindex $extended $extpos] error [format "Invalid extended code point '%s'" $pos] } else { return [list $extpos None] } set t [::puny::T $j $bias] set result [expr {$result + $digit * $w}] if {$digit < $t} { return [list $extpos $result] } set w [expr {$w * (36 - $t)}] incr j } } # 3.2 Insertion unsort coding proc ::puny::insertionSort {base extended errors} { set char 128 set pos -1 set bias 72 set extpos 0 while {$extpos < [llength $extended]} { lassign [::puny::decodeGeneralizedNumber \ $extended \ $extpos \ $bias \ $errors \ ] {newpos} {delta} if {$delta == "None"} { # There was an error in decoding. We can't continue because # synchronization is lost. return $base } set pos [expr {$pos + $delta + 1}] set char [expr {$char + int($pos / ([llength $base] + 1))}] if {$char > 1114111} { if {$errors == "strict"} { error [format "Invalid character U+%x" $char] } scan "?" "%c" {char} } set pos [expr {$pos % ([llength $base] + 1)}] set base [concat \ [lrange $base 0 "$pos-1"] \ $char \ [lrange $base $pos end] \ ] set bias [::puny::adapt \ $delta \ [expr {$extpos == 0}] \ [llength $base] \ ] set extpos $newpos } return $base } proc ::puny::decode {text errors} { set pos [string last "-" $text] if {$pos == -1} { set base [list] set code [string toupper $text] set extended [split $code ""] } else { set base [::puny::toNums [string range $text 0 "$pos-1"]] set code [string toupper [string range $text "$pos+1" end]] set extended [split $code ""] } return [::puny::toChars [::puny::insertionSort $base $extended $errors]] }
test.tcl
package require tcltest source punycode.tcl proc testme {str} { set code [::puny::encode $str] set text [::puny::decode $code ""] if {$str != $text} { error "Round-trip error" } return $code } ::tcltest::test emptyinput "Empty input" \ -body {testme ""} \ -result "" ::tcltest::test singleunichar "Single unicode character" \ -body {testme "ü"} \ -result "tda" ::tcltest::test nounichar "No unicode character" \ -body {testme "Goethe"} \ -result "Goethe-" ::tcltest::test midleunichar "Unicode character in the middle" \ -body {testme "Bücher"} \ -result "Bcher-kva" ::tcltest::test indashes "Text within dashes" \ -body {testme {-> $1.00 <-}} \ -result {-> $1.00 <--}
DKF: Here's a working punycode encoder, converted from the code in the RFC:
namespace eval punycode { variable digits [split "abcdefghijklmnopqrstuvwxyz0123456789" ""] # Bootstring parameters for Punycode variable base 36 variable tmin 1 variable tmax 26 variable skew 38 variable damp 700 variable initial_bias 72 variable initial_n 0x80 proc adapt {delta first numchars} { variable base variable tmin variable tmax variable damp variable skew set delta [expr {$delta / ($first ? $damp : 2)}] incr delta [expr {$delta / $numchars}] set k 0 while {$delta > ($base - $tmin) * $tmax / 2} { set delta [expr {$delta / ($base-$tmin)}] incr k $base } return [expr {$k + ($base-$tmin+1) * $delta / ($delta+$skew)}] } # Main encode function proc encode {input {case ""}} { variable digits variable tmin variable tmax variable base variable initial_n variable initial_bias set in [split $input ""] set output {} # Initialize the state: set n $initial_n set delta 0 set bias $initial_bias # Handle the basic code points: foreach ch $in { if {$ch < "\u0080"} { if {$case ne ""} { if {$case} { append output [string toupper $ch] } else { append output [string tolower $ch] } } else { append output $ch } } } set h [set b [string length $output]] # h is the number of code points that have been handled, b is the # number of basic code points. if {$b} { append output "-" } # Main encoding loop: while {$h < [llength $in]} { # All non-basic code points < n have been handled already. Find # the next larger one: for {set m inf; set j 0} {$j < [llength $in]} {incr j} { scan [lindex $in $j] "%c" ch if {$ch >= $n && $ch < $m} { set m $ch } } # Increase delta enough to advance the decoder's <n,i> state to # <m,0>, but guard against overflow: if {$m-$n > (0xffffffff-$delta)/($h+1)} { error "overflow in delta computation" } incr delta [expr {($m-$n) * ($h+1)}] set n $m for {set j 0} {$j < [llength $in]} {incr j} { scan [lindex $in $j] "%c" ch if {$ch < $n && ([incr delta] & 0xffffffff) == 0} { error "overflow in delta computation" } if {$ch == $n} { # Represent delta as a generalized variable-length # integer: for {set q $delta; set k $base} true {incr k $base} { set t [expr {min(max($k-$bias,$tmin),$tmax)}] if {$q < $t} break append output \ [lindex $digits [expr {$t + ($q-$t)%($base-$t)}]] set q [expr {($q-$t) / ($base-$t)}] } append output [lindex $digits $q] set bias [adapt $delta [expr {$h==$b}] [expr {$h+1}]] set delta 0 incr h } } incr delta incr n } return $output } }