if 0 {[Richard Suchenwirth] 2003-06-18 - SYNOPSIS: package require i18n ?1.0? i18n::codepage encoding i18n::data ?name ?value?? i18n::from repname string i18n::language ?code? i18n::render string i18n::tell string i18n::to keyword string DESCRIPTION i18n is short for internationalization, "make software work with many languages". This package contains support routines and utilities for i18n work, mainly for conversion between 16-bit Unicode strings (where each character can be between \u0000 and \uFFFF) and 7-bit ASCII strings (where the range is limited to \x00..\x7F). ASCII strings are less expressive, but can conveniently be handled with keyboards and transmission channels. An ASCII string with the additional information what transliteration is used can be turned into correct Unicode, e.g. ("Moskva", ruslish) -> (Russian spelling of Moscow) Conversely, one can extract from a Unicode character both the name of the subset (writing system - use the ''tell'' command) and an ASCII representation of one or more characters (see the ''to'' command). i18n::codepage encoding Returns a list of 256 characters corresponding to the byte values \x00 to \xFF in the specified one-byte ''encoding''. Undefined characters will be expressed with a default character. Interesting test: ''ebcdic''!} namespace eval i18n {variable version 1.0} proc i18n::codepage encoding { set res {} for {set i 0} {$i<256} {incr i} { lappend res [encoding convertfrom $encoding [format %c $i]] } set res } if 0 {--------------------------------------------------------- i18n::data ?name ?value?? Called with no arguments, returns the names of available data tables (typically alternating lists). With one argument, returns the data table associated with ''name''. With two arguments, stores the given ''value'' persistently as data table ''name''. } proc i18n::data args { variable data switch [llength $args] { 0 {lsort [array names data]} 1 {subst -nocom -novar [join $data([lindex $args 0])]} 2 {set data([lindex $args 0]) [lindex $args 1]} default {error "usage: i18n::data ?name ?value??"} } } if 0 {--------------------------------------------------------- i18n::from repname string Converts the ASCII ''string'' into a Unicode string according to the representation rules specified by ''repname''. The following representations are included: * greeklish * hanglish - computes a Korean hangul from the input pronunciation * pinyin - takes one pinyin syllable, returns list of matching chars Add custom representations by just registering them as ''i18n::data'': i18n::data foolish {foo T bar c grill l} i18n::from foolish foobargrill ;# => Tcl } proc i18n::from {repname string} { variable data switch -- $repname { hanglish {from_hanglish $string} pinyin {from_pinyin $string} default {string map $data($repname) $string} } } proc i18n::from_hanglish string { set res "" foreach i [split $string] { foreach j [split $i -] {append res [hanglish2uc $j]} append res " " } set res } proc i18n::hanglish2uc hanglish { set L ""; set V "" ;# in case regexp doesn't hit set h2 [string map { NG Q YE X YAI F AI R YA V YO Y YU Z VI F } [string toupper $hanglish]] regexp {^([GNDLMBSQJCKTPH]+)?([ARVFEIXOYUZW]+)([GNDLMBSQJCKTPH]*)$} \ $h2 -> L V T ;# lead cons.-vowel-trail cons. if {$L==""} {set L Q} if {$V==""} {return $hanglish} set l [lsearch {G GG N D DD L M B BB S SS Q J JJ C K T P H} $L] if {$l<0} {return $hanglish} set v [lsearch {A R V F E EI X XI O OA OR OI Y U UE UEI UI Z W WI I} $V] if {$v<0} {return $hanglish} set t [lsearch {"" G GG GS N NJ NH D L LG LM LB LS LT LP LH \ M B BS S SS Q J C K T P H} $T] ;# trailing consonants if {$t<0} {return $hanglish} format %c [expr {$l*21*28 + $v*28 + $t + 0xAC00}] } proc i18n::from_pinyin string { #-- list of Chinese chars for which pinyin is string in gb2312 variable data set pos [lsearch $data(pinyin) $string] if {$pos >= 0} { set res {} set from [lindex $data(pinyin) [incr pos]] set to [lindex $data(pinyin) [incr pos 2]] while {$from<$to} { if {($from-1)%100>93} continue ;# skip 95..00 gap set b1 [format %c [expr {$from/100+32}]] set b2 [format %c [expr {$from%100+32}]] lappend res [encoding convertfrom gb2312 $b1$b2] incr from } set res } } if 0 {--------------------------------------------------------- i18n::language ?code? Called with no arguments, returns the list of ISO 639 language codes (e.g. ''en'' for English). With one argument, returns the English name of the language coded in ISO 639 as ''code''.} interp alias {} i18n::language {} i18n::_lfind $i18n::data(iso639) #-- Generic table searcher: proc i18n::_lfind {list {code ""}} { if {$code==""} { lsort [_keys $list] } else { set pos [lsearch $list $code] if {$pos>=0} {lindex $list [incr pos]} } } proc i18n::_keys list { set res {} foreach {key -} $list {lappend res $key} set res } if 0 {--------------------------------------------------------- i18n::render string Prepares the string for display on a Tk widget. For Arabic, this involves selecting the correct context glyph, and r2l; for Hebrew, it does only r2l conversion. This is a workaround as long as Tk cannot handle bidi automatically. } proc i18n::render string { if [regexp {^([\x0-\u04ff]*)([\u0621-\u064a ]+)(.*)$} $string -> a b c] { set string "$a[render_arab $b] $c" } set string } proc i18n::_srevert string { set res "" set i [string length $string] while {[incr i -1]>=0} {append res [string index $string $i]} set res } proc i18n::render_arab string { variable data set s2 [_srevert $string] foreach i [split $s2] { if [regexp {^[0-9][-.,0-9\u0660-\u0669]+$} $i] { regsub $i $s2 [_srevert $i] s2 } ;# re-revert decimal numbers } foreach {i j} { , \u060C ? \u061F \u0621 \uFE80} { regsub -all "\[$i\]" $s2 $j s2 } ;# special characters foreach i { \u0622 \u0623 \u0624 \u0625 \u0627 \u0629 \u062F \u0630 \u0631 \u0632 \u0648 \u0649 } { regsub -all $i $s2 $i, s2 } ;# joining right only foreach i { \u0626 \u0628 \u062A \u062B \u062C \u062D \u062E \u0633 \u0634 \u0635 \u0636 \u0637 \u0638 \u0639 \u063A \u0640 \u0641 \u0642 \u0643 \u0644 \u0645 \u0646 \u0647 \u064A \u064B \u064C \u064D \u064E \u064F \u0650 \u0651 \u0652 \u0670 \u0671 } { regsub -all $i $s2 ,$i, s2 } ;# joining both sides regsub -all ,, $s2 ,,,, s2 ;# pad for neighboring equals regsub -all {\\} $s2 "" s2 set res [string map $data(ar_join2) $s2] regsub -all , $res "" res ;# remove redundant commas set res } if 0 {--------------------------------------------------------- i18n::tell string Returns a descriptive string for the Unicode subsystem in which the first character is contained, e.g. ''ascii, hebrew, hiragana'' or ''cjkIdeograph''.} proc i18n::tell string { variable data scan $string %c uc foreach {name range} $data(tell) { foreach {from to} $range break if {$uc>=$from && $uc<=$to} {return $name} } } if 0 {--------------------------------------------------------- i18n::to keyword string Converts the Unicode ''string'' according to the rules specified by ''keyword'', which may be one of the representation names used in the i18n::from command, or * escaped - Replaces all non-ASCII characters with \u.... escaping } proc i18n::to {keyword string} { variable data switch -- $keyword { escaped {to_escaped $string} pinyin {to_pinyin $string} default {string map [_swap $data($keyword)] $string} } } #-------------- Internal works, but feel free to look :) proc i18n::to_escaped string { set res "" foreach char [split $string ""] { scan $char %c uc if {$uc>127} {set char [format {\u%04X} $uc]} append res $char } set res } proc i18n::to_pinyin string { set res "" foreach char [split $string ""] { set try [u2pinyin $char] if {$try!=""} {set char $try} append res $char } set res } proc i18n::u2pinyin char { #-- returns the gb2312 pinyin for char if applicable, else "" if {[tell $char] == "cjkIdeograph"} { set gb [encoding convertto gb2312 $char] foreach {b1 b2} [split $gb ""] break set gbd [expr {([scan $b1 %c]-32)*100+[scan $b2 %c]-32}] if {$gbd>=1601 && $gbd<=5589} { variable data _rangesearch $data(pinyin) $gbd } } } proc i18n::_rangesearch {list value} { #-- returns foo if value>=x and value=$lastval && $value<$val} {return $lastkey} set lastkey $key set lastval $val } } proc i18n::_swap list { set res {} foreach {a b} $list {lappend res $b $a} set res } #-------------------- Get the data from [The i18n package: data] source [file join [file dir [info script]] i18n_data.tcl] package provide i18n 1.0 ---- For three months I've seen no reactions to this page. Has anybody used it? -RS