if 0 {Richard Suchenwirth 2003-06-18 - SYNOPSIS:
package require i18n ?1.0? i18n::codepage encoding i18n::data ?name ?value?? i18n::from repname string i18n::language ?code? i18n::render string i18n::tell string i18n::to keyword string
DESCRIPTION
i18n is short for internationalization, "make software work with many languages". This package contains support routines and utilities for i18n work, mainly for conversion between 16-bit Unicode strings (where each character can be between \u0000 and \uFFFF) and 7-bit ASCII strings (where the range is limited to \x00..\x7F). ASCII strings are less expressive, but can conveniently be handled with keyboards and transmission channels. An ASCII string with the additional information what transliteration is used can be turned into correct Unicode, e.g.
("Moskva", ruslish) -> (Russian spelling of Moscow).
Conversely, one can extract from a Unicode character both the name of the subset (writing system - use the tell command) and an ASCII representation of one or more characters (see the to command).
i18n::codepage encoding
Returns a list of 256 characters corresponding to the byte values \x00 to \xFF in the specified one-byte encoding. Undefined characters will be expressed with a default character. Interesting test: ebcdic!}
namespace eval i18n {variable version 1.0} proc i18n::codepage encoding { set res {} for {set i 0} {$i<256} {incr i} { lappend res [encoding convertfrom $encoding [format %c $i]] } set res }
if 0 {---------------------------------------------------------
i18n::data ?name ?value??
Called with no arguments, returns the names of available data tables (typically alternating lists). With one argument, returns the data table associated with name. With two arguments, stores the given value persistently as data table name. }
proc i18n::data args { variable data switch [llength $args] { 0 {lsort [array names data]} 1 {subst -nocom -novar [join $data([lindex $args 0])]} 2 {set data([lindex $args 0]) [lindex $args 1]} default {error "usage: i18n::data ?name ?value??"} } }
if 0 {---------------------------------------------------------
i18n::from repname string
Converts the ASCII string into a Unicode string according to the representation rules specified by repname. The following representations are included:
Add custom representations by just registering them as i18n::data:
i18n::data foolish {foo T bar c grill l} i18n::from foolish foobargrill ;# => Tcl
}
proc i18n::from {repname string} { variable data switch -- $repname { hanglish {from_hanglish $string} pinyin {from_pinyin $string} default {string map $data($repname) $string} } } proc i18n::from_hanglish string { set res "" foreach i [split $string] { foreach j [split $i -] {append res [hanglish2uc $j]} append res " " } set res } proc i18n::hanglish2uc hanglish { set L ""; set V "" ;# in case regexp doesn't hit set h2 [string map { NG Q YE X YAI F AI R YA V YO Y YU Z VI F } [string toupper $hanglish]] regexp {^([GNDLMBSQJCKTPH]+)?([ARVFEIXOYUZW]+)([GNDLMBSQJCKTPH]*)$} \ $h2 -> L V T ;# lead cons.-vowel-trail cons. if {$L==""} {set L Q} if {$V==""} {return $hanglish} set l [lsearch {G GG N D DD L M B BB S SS Q J JJ C K T P H} $L] if {$l<0} {return $hanglish} set v [lsearch {A R V F E EI X XI O OA OR OI Y U UE UEI UI Z W WI I} $V] if {$v<0} {return $hanglish} set t [lsearch {"" G GG GS N NJ NH D L LG LM LB LS LT LP LH \ M B BS S SS Q J C K T P H} $T] ;# trailing consonants if {$t<0} {return $hanglish} format %c [expr {$l*21*28 + $v*28 + $t + 0xAC00}] } proc i18n::from_pinyin string { #-- list of Chinese chars for which pinyin is string in gb2312 variable data set pos [lsearch $data(pinyin) $string] if {$pos >= 0} { set res {} set from [lindex $data(pinyin) [incr pos]] set to [lindex $data(pinyin) [incr pos 2]] while {$from<$to} { if {($from-1)%100>93} continue ;# skip 95..00 gap set b1 [format %c [expr {$from/100+32}]] set b2 [format %c [expr {$from%100+32}]] lappend res [encoding convertfrom gb2312 $b1$b2] incr from } set res } }
if 0 {---------------------------------------------------------
i18n::language ?code?
Called with no arguments, returns the list of ISO 639 language codes (e.g. en for English). With one argument, returns the English name of the language coded in ISO 639 as code.}
interp alias {} i18n::language {} i18n::_lfind $i18n::data(iso639) #-- Generic table searcher: proc i18n::_lfind {list {code ""}} { if {$code==""} { lsort [_keys $list] } else { set pos [lsearch $list $code] if {$pos>=0} {lindex $list [incr pos]} } } proc i18n::_keys list { set res {} foreach {key -} $list {lappend res $key} set res }
if 0 {---------------------------------------------------------
i18n::render string
Prepares the string for display on a Tk widget. For Arabic, this involves selecting the correct context glyph, and r2l; for Hebrew, it does only r2l conversion. This is a workaround as long as Tk cannot handle bidi automatically. }
proc i18n::render string { if [regexp {^([\x0-\u04ff]*)([\u0621-\u064a ]+)(.*)$} $string -> a b c] { set string "$a[render_arab $b] $c" } set string } proc i18n::_srevert string { set res "" set i [string length $string] while {[incr i -1]>=0} {append res [string index $string $i]} set res } proc i18n::render_arab string { variable data set s2 [_srevert $string] foreach i [split $s2] { if [regexp {^[0-9][-.,0-9\u0660-\u0669]+$} $i] { regsub $i $s2 [_srevert $i] s2 } ;# re-revert decimal numbers } foreach {i j} { , \u060C ? \u061F \u0621 \uFE80} { regsub -all "\[$i\]" $s2 $j s2 } ;# special characters foreach i { \u0622 \u0623 \u0624 \u0625 \u0627 \u0629 \u062F \u0630 \u0631 \u0632 \u0648 \u0649 } { regsub -all $i $s2 $i, s2 } ;# joining right only foreach i { \u0626 \u0628 \u062A \u062B \u062C \u062D \u062E \u0633 \u0634 \u0635 \u0636 \u0637 \u0638 \u0639 \u063A \u0640 \u0641 \u0642 \u0643 \u0644 \u0645 \u0646 \u0647 \u064A \u064B \u064C \u064D \u064E \u064F \u0650 \u0651 \u0652 \u0670 \u0671 } { regsub -all $i $s2 ,$i, s2 } ;# joining both sides regsub -all ,, $s2 ,,,, s2 ;# pad for neighboring equals regsub -all {\\} $s2 "" s2 set res [string map $data(ar_join2) $s2] regsub -all , $res "" res ;# remove redundant commas set res }
if 0 {---------------------------------------------------------
i18n::tell string
Returns a descriptive string for the Unicode subsystem in which the first character is contained, e.g. ascii, hebrew, hiragana or cjkIdeograph.}
proc i18n::tell string { variable data scan $string %c uc foreach {name range} $data(tell) { foreach {from to} $range break if {$uc>=$from && $uc<=$to} {return $name} } }
if 0 {---------------------------------------------------------
i18n::to keyword string
Converts the Unicode string according to the rules specified by keyword, which may be one of the representation names used in the i18n::from command, or
}
proc i18n::to {keyword string} { variable data switch -- $keyword { escaped {to_escaped $string} pinyin {to_pinyin $string} default {string map [_swap $data($keyword)] $string} } }
#-------------- Internal works, but feel free to look :)
proc i18n::to_escaped string { set res "" foreach char [split $string ""] { scan $char %c uc if {$uc>127} {set char [format {\u%04X} $uc]} append res $char } set res } proc i18n::to_pinyin string { set res "" foreach char [split $string ""] { set try [u2pinyin $char] if {$try!=""} {set char $try} append res $char } set res } proc i18n::u2pinyin char { #-- returns the gb2312 pinyin for char if applicable, else "" if {[tell $char] == "cjkIdeograph"} { set gb [encoding convertto gb2312 $char] foreach {b1 b2} [split $gb ""] break set gbd [expr {([scan $b1 %c]-32)*100+[scan $b2 %c]-32}] if {$gbd>=1601 && $gbd<=5589} { variable data _rangesearch $data(pinyin) $gbd } } } proc i18n::_rangesearch {list value} { #-- returns foo if value>=x and value<y in {... foo x bar y ...} foreach {lastkey lastval} $list break foreach {key val} [lrange $list 2 end] { if {$value>=$lastval && $value<$val} {return $lastkey} set lastkey $key set lastval $val } } proc i18n::_swap list { set res {} foreach {a b} $list {lappend res $b $a} set res }
#-------------------- Get the data from The i18n package: data
source [file join [file dir [info script]] i18n_data.tcl] package provide i18n 1.0
For three months I've seen no reactions to this page. Has anybody used it? -RS
I (APN) have not used it directly but have found it very useful for my edification. Thanks! It's very likely I'll use some or all of it for some tools I'm writing. (What's the copyright on Wiki in terms of use?) - RS: Do what you want to do, just don't blame me, and don't have it patented ;)
What is needed for i18n is internationalization of the lsort -dictionary command. I believe this should be solved in the Tcl core, not in additional packages. The Unicode support Tcl boasts to have is not complete without it. Sorting Umlauts and French accents behind z is a pain in the butt. [[email protected]]
See lsort for replies to this.
LV I sure hope that this update, via Firefox, doesn't damage the code above. Note that I've submitted a feature request to http://tcl.sf.net/ (number 1601204) for the lsort enhancement. If you have suggestions or possible code to help with this, feel free to add a comment there for the maintainers.
WJP 2007-06-29 The two-letter ISO language codes fail to encode the great majority of the world's languages (including some that I use), so I've created a file containing the ISO639-3 three-letter codes, of which there are 7591, as a Tcl array. It is too large to put here on the wiki so I've put it at: http://billposer.org/Linguistics/Computation/iso639-3.txt