Replace ISO-8859-2 characters with similar ones without diacritic marks:

    # Returns a copy of $str in which every character listed below is
    # replaced by a plain ASCII equivalent; characters that are not listed
    # pass through unchanged.  The keys are literal non-ASCII characters,
    # so the script must be read in the encoding it was saved in.
    # "ß" is expanded to "ss" rather than the look-alike "B".
    string map {
        "Ą" "A" "Ł" "L" "Ľ" "L" "Ś" "S" "Š" "S" "Ş" "S" "Ť" "T" "Ź" "Z" "Ž" "Z" "Ż" "Z"
        "ą" "a" "ł" "l" "ľ" "l" "ś" "s" "š" "s" "ş" "s" "ť" "t" "ź" "z" "ž" "z" "ż" "z"
        "Ŕ" "R" "Á" "A" "Â" "A" "Ă" "A" "Ä" "A" "Ĺ" "L" "Ć" "C" "Ç" "C" "Č" "C"
        "É" "E" "Ę" "E" "Ë" "E" "Ě" "E" "Í" "I" "Î" "I" "Ď" "D" "Đ" "D" "Ń" "N" "Ň" "N"
        "Ó" "O" "Ô" "O" "Ő" "O" "Ö" "O" "×" "x" "Ř" "R" "Ů" "U" "Ú" "U" "Ű" "U" "Ü" "U"
        "Ý" "Y" "Ţ" "T" "ß" "ss"
        "ŕ" "r" "á" "a" "â" "a" "ă" "a" "ä" "a" "ĺ" "l" "ć" "c" "ç" "c" "č" "c"
        "é" "e" "ę" "e" "ë" "e" "ě" "e" "í" "i" "î" "i" "ď" "d" "đ" "d" "ń" "n" "ň" "n"
        "ó" "o" "ô" "o" "ő" "o" "ö" "o" "ř" "r" "ů" "u" "ú" "u" "ű" "u" "ü" "u"
        "ý" "y" "ţ" "t"
    } $str

The same idea written with Unicode escapes, covering Latin-1 Supplement, Latin Extended-A and a few Latin Extended-B characters:

    # The \uXX escapes are substituted when Tcl parses the braced text as a
    # list, so this works even though the map is brace-quoted.
    # NOTE(review): "\u8a" .. "\u9f" look like Windows-1252 byte positions
    # (S/Z with caron, Y with diaeresis) rather than real Unicode letters --
    # confirm they are wanted for your input.
    # Ligatures and special letters are expanded to letter sequences:
    # AE, ae, IJ, ij, OE, oe, ss (sharp s), th (thorn); long s maps to "s".
    string map {
        "\u8a" "S" "\u8e" "Z" "\u9a" "s" "\u9e" "z" "\u9f" "Y"
        "\uaa" "a" "\ub2" "2" "\ub3" "3" "\ub5" "u" "\ubf" "?"
        "\uc0" "A" "\uc1" "A" "\uc2" "A" "\uc3" "A" "\uc4" "A" "\uc5" "A" "\uc6" "AE" "\uc7" "C"
        "\uc8" "E" "\uc9" "E" "\uca" "E" "\ucb" "E" "\ucc" "I" "\ucd" "I" "\uce" "I" "\ucf" "I"
        "\ud0" "D" "\ud1" "N" "\ud2" "O" "\ud3" "O" "\ud4" "O" "\ud5" "O" "\ud6" "O" "\ud7" "x"
        "\ud8" "O" "\ud9" "U" "\uda" "U" "\udb" "U" "\udc" "U" "\udd" "Y" "\udf" "ss"
        "\ue0" "a" "\ue1" "a" "\ue2" "a" "\ue3" "a" "\ue4" "a" "\ue5" "a" "\ue6" "ae" "\ue7" "c"
        "\ue8" "e" "\ue9" "e" "\uea" "e" "\ueb" "e" "\uec" "i" "\ued" "i" "\uee" "i" "\uef" "i"
        "\uf1" "n" "\uf2" "o" "\uf3" "o" "\uf4" "o" "\uf5" "o" "\uf6" "o"
        "\uf9" "u" "\ufa" "u" "\ufb" "u" "\ufc" "u" "\ufd" "y" "\ufe" "th" "\uff" "y"
        "\u100" "A" "\u101" "a" "\u102" "A" "\u103" "a" "\u104" "A" "\u105" "a"
        "\u106" "C" "\u107" "c" "\u108" "C" "\u109" "c" "\u10a" "C" "\u10b" "c" "\u10c" "C" "\u10d" "c"
        "\u10e" "D" "\u10f" "d" "\u110" "D" "\u111" "d"
        "\u112" "E" "\u113" "e" "\u114" "E" "\u115" "e" "\u116" "E" "\u117" "e" "\u118" "E" "\u119" "e" "\u11a" "E" "\u11b" "e"
        "\u11c" "G" "\u11d" "g" "\u11e" "G" "\u11f" "g" "\u120" "G" "\u121" "g" "\u122" "G" "\u123" "g"
        "\u124" "H" "\u125" "h" "\u126" "H" "\u127" "h"
        "\u128" "I" "\u129" "i" "\u12a" "I" "\u12b" "i" "\u12c" "I" "\u12d" "i" "\u12e" "I" "\u12f" "i" "\u130" "I"
        "\u132" "IJ" "\u133" "ij" "\u134" "J" "\u135" "j"
        "\u136" "K" "\u137" "k" "\u138" "K"
        "\u139" "L" "\u13a" "l" "\u13b" "L" "\u13c" "l" "\u13d" "L" "\u13e" "l" "\u13f" "L" "\u140" "l" "\u141" "L" "\u142" "l"
        "\u143" "N" "\u144" "n" "\u145" "N" "\u146" "n" "\u147" "N" "\u148" "n" "\u149" "n" "\u14a" "N" "\u14b" "n"
        "\u14c" "O" "\u14d" "o" "\u14e" "O" "\u14f" "o" "\u150" "O" "\u151" "o" "\u152" "OE" "\u153" "oe"
        "\u154" "R" "\u155" "r" "\u156" "R" "\u157" "r" "\u158" "R" "\u159" "r"
        "\u15a" "S" "\u15b" "s" "\u15c" "S" "\u15d" "s" "\u15e" "S" "\u15f" "s" "\u160" "S" "\u161" "s"
        "\u162" "T" "\u163" "t" "\u164" "T" "\u165" "t" "\u166" "T" "\u167" "t"
        "\u168" "U" "\u169" "u" "\u16a" "U" "\u16b" "u" "\u16c" "U" "\u16d" "u" "\u16e" "U" "\u16f" "u"
        "\u170" "U" "\u171" "u" "\u172" "U" "\u173" "u"
        "\u174" "W" "\u175" "w" "\u176" "Y" "\u177" "y" "\u178" "Y"
        "\u179" "Z" "\u17a" "z" "\u17b" "Z" "\u17c" "z" "\u17d" "Z" "\u17e" "z" "\u17f" "s"
        "\u192" "f" "\u1fa" "A" "\u1fb" "a" "\u1fc" "AE" "\u1fd" "ae" "\u1fe" "O" "\u1ff" "o"
    } $str

Code which generates the Unicode list as HTML:

    # Writes unicode.html with one entry per code point from 0x80 to 0x1ff.
    # Each entry pairs the Tcl escape ("\uXX") with the HTML numeric
    # character reference ("&#xXX;") for the same code point.
    # Presumably the file is then viewed in a browser, so the reference
    # shows up as the glyph and the ASCII replacement can be typed in by hand.
    set fd [open "unicode.html" w]
    for {set i 128} {$i < 512} {incr i} {
        set hex [format %x $i]
        # The trailing \n plus puts' own newline leaves a blank line
        # between entries.
        puts $fd " \"\\u$hex\" \"&#x$hex;\"\n"
    }
    close $fd

----
[Lars H], 10 Oct 2005: The fully kosher way of doing this would be to normalise the characters to decomposed form, and then just [string map] all the combining chars (i.e., loose diacritics) to the empty string. Pity there doesn't seem to be any implementation of Unicode normalisations for Tcl (but that's just because no one has done it yet).

----
[[ [Category Characters] - [Category String Processing] ]]