Method used to encode general strings (containing arbitrary [Unicode] characters) as strings of characters allowed in internet domain names. * [RFC] 3490 * http://en.wikipedia.org/wiki/Punycode Anyone up for implementing this in Tcl? ---- Here is a TCL implementation. [pi31415] punycode.tcl ====== # This is based on the following Python implementation. # http://pydoc.org/get.cgi/usr/local/lib/python2.4/encodings/punycode.py # # Also done in C, Perl, PHP, and Javascript. # http://www.ietf.org/rfc/rfc3492.txt # http://cpansearch.perl.org/src/MIYAGAWA/IDNA-Punycode-0.02/lib/IDNA/Punycode.pm # http://svn.php.net/viewvc/pear/packages/Net_IDNA/trunk/Net/IDNA.php?revision=300850&view=co # https://gist.github.com/1035853 namespace eval ::puny { set digits "abcdefghijklmnopqrstuvwxyz0123456789" } proc ::puny::toChars {nums} { set chars [list] foreach {char} $nums { lappend chars [format "%c" $char] } return [join $chars ""] } # Encoding proc ::puny::enumerate {sequence {start 0}} { set retval [list] set n $start foreach {elem} $sequence { lappend retval $n $elem incr n } return $retval } # 3.1 Basic code point segregation proc ::puny::segregate {str} { set base [list] set extended [list] foreach {c} [split $str ""] { scan $c "%c" {char} if {$char < 128} { lappend base $char } else { lappend extended $c } } set extended [lsort $extended] return [list $base $extended] } # Return the length of str, considering only characters below max. proc ::puny::selectiveLen {str max} { set res 0 foreach c [split $str ""] { scan $c "%c" {char} if {$char < $max} { incr res } } return $res } # Return a pair (index, pos), indicating the next occurrence of # char in str. index is the position of the character considering # only ordinals up to and including char, and pos is the position in # the full string. index/pos is the starting position in the full # string. proc ::puny::selectiveFind {str char index pos} { set l [string length $str] while {true} { incr pos if {$pos == $l} { return [list -1 -1] } set c [string index $str $pos] if {$c == $char} { incr index return [list $index $pos] } elseif {$c < $char} { incr index } } } # 3.2 Insertion unsort coding proc ::puny::insertionUnsort {str extended} { set oldchar 128 set result [list] set oldindex -1 foreach {c} $extended { set index -1 set pos -1 scan $c "%c" {char} set curlen [::puny::selectiveLen $str $char] set delta [expr {($curlen + 1) * ($char - $oldchar)}] while {true} { lassign [::puny::selectiveFind $str $c $index $pos] {index} {pos} if {$index == -1} { break } set delta [expr {$delta + $index - $oldindex}] lappend result [expr {$delta - 1}] set oldindex $index set delta 0 } set oldchar $char } return $result } # Punycode parameters: tmin = 1, tmax = 26, base = 36 proc ::puny::T {j bias} { set res [expr {36 * ($j + 1) - $bias}] if {$res < 1} { return 1 } if {$res > 26} { return 26 } return $res } # 3.3 Generalized variable-length integers proc ::puny::generateGeneralizedInteger {N bias} { set result [list] set j 0 while {true} { set t [::puny::T $j $bias] if {$N < $t} { lappend result [string index $::puny::digits $N] return $result } set pos [expr {$t + (($N - $t) % (36 - $t))}] lappend result [string index $::puny::digits $pos] set N [expr {int(($N - $t) / (36 - $t))}] incr j } } proc ::puny::adapt {delta first numchars} { if {$first} { set delta [expr {int($delta / 700)}] } else { set delta [expr {int($delta / 2)}] } incr delta [expr {int($delta / $numchars)}] # int((($base - $tmin) * $tmax) / 2) == 455 set divisions 0 while {$delta > 455} { # base - tmin set delta [expr {int($delta / 35)}] incr divisions 36 } set bias [expr {$divisions + int(36 * $delta / ($delta + 38))}] return $bias } # 3.4 Bias adaptation # Punycode parameters: initial bias = 72, damp = 700, skew = 38 proc ::puny::generateIntegers {baselen deltas} { set result [list] set bias 72 foreach {points delta} [::puny::enumerate $deltas] { set s [::puny::generateGeneralizedInteger $delta $bias] set result [concat $result $s] set bias [::puny::adapt \ $delta \ [expr {$points == 0}] \ [expr {$baselen + $points + 1}] \ ] } return $result } proc ::puny::encode {text} { lassign [::puny::segregate $text] {base} {extended} set deltas [::puny::insertionUnsort $text $extended] set extended [::puny::generateIntegers [llength $base] $deltas] if {[llength $base]} { return [format "%s-%s" [::puny::toChars $base] [join $extended ""]] } return [join $extended ""] } # Decoding proc ::puny::toNums {text} { set retval [list] foreach {c} [split $text ""] { lappend retval [scan $c "%c"] } return $retval } # 3.3 Generalized variable-length integers proc ::puny::decodeGeneralizedNumber {extended extpos bias errors} { set result 0 set w 1 set j 0 while {true} { set c [lindex $extended $extpos] if {[string length $c] == 0} { if {$errors == "strict"} { error "incomplete punicode string" } incr extpos return [list $extpos None] } scan $c "%c" {char} incr extpos if {65 <= $char && $char <= 90} { # A-Z set digit [expr {$char - 65}] } elseif {48 <= $char && $char <= 57} { # 0x30-26 set digit [expr {$char - 22}] } elseif {$errors == "strict"} { set pos [lindex $extended $extpos] error [format "Invalid extended code point '%s'" $pos] } else { return [list $extpos None] } set t [::puny::T $j $bias] set result [expr {$result + $digit * $w}] if {$digit < $t} { return [list $extpos $result] } set w [expr {$w * (36 - $t)}] incr j } } # 3.2 Insertion unsort coding proc ::puny::insertionSort {base extended errors} { set char 128 set pos -1 set bias 72 set extpos 0 while {$extpos < [llength $extended]} { lassign [::puny::decodeGeneralizedNumber \ $extended \ $extpos \ $bias \ $errors \ ] {newpos} {delta} if {$delta == "None"} { # There was an error in decoding. We can't continue because # synchronization is lost. return $base } set pos [expr {$pos + $delta + 1}] set char [expr {$char + int($pos / ([llength $base] + 1))}] if {$char > 1114111} { if {$errors == "strict"} { error [format "Invalid character U+%x" $char] } scan "?" "%c" {char} } set pos [expr {$pos % ([llength $base] + 1)}] set base [concat \ [lrange $base 0 "$pos-1"] \ $char \ [lrange $base $pos end] \ ] set bias [::puny::adapt \ $delta \ [expr {$extpos == 0}] \ [llength $base] \ ] set extpos $newpos } return $base } proc ::puny::decode {text errors} { set pos [string last "-" $text] if {$pos == -1} { set base [list] set code [string toupper $text] set extended [split $code ""] } else { set base [::puny::toNums [string range $text 0 "$pos-1"]] set code [string toupper [string range $text "$pos+1" end]] set extended [split $code ""] } return [::puny::toChars [::puny::insertionSort $base $extended $errors]] } ====== ---- test.tcl ====== package require tcltest source punycode.tcl proc testme {str} { set code [::puny::encode $str] set text [::puny::decode $code ""] if {$str != $text} { error "Round-trip error" } return $code } ::tcltest::test emptyinput "Empty input" \ -body {testme ""} \ -result "" ::tcltest::test singleunichar "Single unicode character" \ -body {testme "ü"} \ -result "tda" ::tcltest::test nounichar "No unicode character" \ -body {testme "Goethe"} \ -result "Goethe-" ::tcltest::test midleunichar "Unicode character in the middle" \ -body {testme "Bücher"} \ -result "Bcher-kva" ::tcltest::test indashes "Text within dashes" \ -body {testme {-> $1.00 <-}} \ -result {-> $1.00 <--} ====== ---- !!!!!! %| [Category Local] | [Category Internet] |% !!!!!!