remove diacritic

Replace ISO-8859-2 characters with their closest equivalents without diacritical marks:

    # Return $str with every accented ISO-8859-2 letter replaced by its
    # closest plain ASCII letter ("×" becomes "x", "ß" becomes "s").
    string map {
        "Ą" "A"  "Ł" "L"  "Ľ" "L"  "Ś" "S"  "Š" "S"  "Ş" "S"  "Ť" "T"  "Ź" "Z"
        "Ž" "Z"  "Ż" "Z"  "ą" "a"  "ł" "l"  "ľ" "l"  "ś" "s"  "š" "s"  "ş" "s"
        "ť" "t"  "ź" "z"  "ž" "z"  "ż" "z"  "Ŕ" "R"  "Á" "A"  "Â" "A"  "Ă" "A"
        "Ä" "A"  "Ĺ" "L"  "Ć" "C"  "Ç" "C"  "Č" "C"  "É" "E"  "Ę" "E"  "Ë" "E"
        "Ě" "E"  "Í" "I"  "Î" "I"  "Ď" "D"  "Đ" "D"  "Ń" "N"  "Ň" "N"  "Ó" "O"
        "Ô" "O"  "Ő" "O"  "Ö" "O"  "×" "x"  "Ř" "R"  "Ů" "U"  "Ú" "U"  "Ű" "U"
        "Ü" "U"  "Ý" "Y"  "Ţ" "T"  "ß" "s"  "ŕ" "r"  "á" "a"  "â" "a"  "ă" "a"
        "ä" "a"  "ĺ" "l"  "ć" "c"  "ç" "c"  "č" "c"  "é" "e"  "ę" "e"  "ë" "e"
        "ě" "e"  "í" "i"  "î" "i"  "ď" "d"  "đ" "d"  "ń" "n"  "ň" "n"  "ó" "o"
        "ô" "o"  "ő" "o"  "ö" "o"  "ř" "r"  "ů" "u"  "ú" "u"  "ű" "u"  "ü" "u"
        "ý" "y"  "ţ" "t"
    } $str

Unicode escapes for Latin-1 Supplement, Latin Extended-A and a few Latin Extended-B characters (pairs for use inside a string map):

    "\u8a"  "S"    "\u8e"  "Z"    "\u9a"  "s"    "\u9e"  "Z"    "\u9f"  "Y"
    "\uaa"  "a"    "\ub2"  "2"    "\ub3"  "3"    "\ub5"  "u"    "\ubf"  "?"
    "\uc0"  "A"    "\uc1"  "A"    "\uc2"  "A"    "\uc3"  "A"    "\uc4"  "A"
    "\uc5"  "A"    "\uc6"  "AE"   "\uc7"  "C"    "\uc8"  "E"    "\uc9"  "E"
    "\uca"  "E"    "\ucb"  "E"    "\ucc"  "I"    "\ucd"  "I"    "\uce"  "I"
    "\ucf"  "I"    "\ud0"  "D"    "\ud1"  "N"    "\ud2"  "O"    "\ud3"  "O"
    "\ud4"  "O"    "\ud5"  "O"    "\ud6"  "O"    "\ud7"  "x"    "\ud8"  "O"
    "\ud9"  "U"    "\uda"  "U"    "\udb"  "U"    "\udc"  "U"    "\udd"  "Y"
    "\udf"  "s"    "\ue0"  "a"    "\ue1"  "a"    "\ue2"  "a"    "\ue3"  "a"
    "\ue4"  "a"    "\ue5"  "a"    "\ue6"  "ae"   "\ue7"  "C"    "\ue8"  "e"
    "\ue9"  "e"    "\uea"  "e"    "\ueb"  "e"    "\uec"  "i"    "\ued"  "i"
    "\uee"  "i"    "\uef"  "i"    "\uf1"  "n"    "\uf2"  "o"    "\uf3"  "o"
    "\uf4"  "o"    "\uf5"  "o"    "\uf6"  "o"    "\uf9"  "u"    "\ufa"  "u"
    "\ufb"  "u"    "\ufc"  "u"    "\ufd"  "y"    "\ufe"  "b"    "\uff"  "y"
    "\u100" "A"    "\u101" "a"    "\u102" "A"    "\u103" "a"    "\u104" "A"
    "\u105" "a"    "\u106" "C"    "\u107" "c"    "\u108" "C"    "\u109" "c"
    "\u10a" "C"    "\u10b" "c"    "\u10c" "C"    "\u10d" "c"    "\u10e" "D"
    "\u10f" "d"    "\u110" "D"    "\u111" "d"    "\u112" "E"    "\u113" "e"
    "\u114" "E"    "\u115" "e"    "\u116" "E"    "\u117" "e"    "\u118" "E"
    "\u119" "e"    "\u11a" "E"    "\u11b" "e"    "\u11c" "G"    "\u11d" "g"
    "\u11e" "G"    "\u11f" "g"    "\u120" "G"    "\u121" "g"    "\u122" "G"
    "\u123" "g"    "\u124" "H"    "\u125" "h"    "\u126" "H"    "\u127" "h"
    "\u128" "I"    "\u129" "i"    "\u12a" "I"    "\u12b" "i"    "\u12c" "I"
    "\u12d" "i"    "\u12e" "I"    "\u12f" "i"    "\u130" "I"    "\u132" "IJ"
    "\u133" "ij"   "\u134" "j"    "\u135" "J"    "\u136" "K"    "\u137" "k"
    "\u138" "K"    "\u139" "L"    "\u13a" "l"    "\u13b" "L"    "\u13c" "l"
    "\u13d" "L"    "\u13e" "l"    "\u13f" "L"    "\u140" "l"    "\u141" "L"
    "\u142" "l"    "\u143" "N"    "\u144" "n"    "\u145" "N"    "\u146" "n"
    "\u147" "N"    "\u148" "n"    "\u149" "n"    "\u14a" "N"    "\u14b" "n"
    "\u14c" "O"    "\u14d" "o"    "\u14e" "O"    "\u14f" "o"    "\u150" "O"
    "\u151" "o"    "\u152" "CE"   "\u153" "ce"   "\u154" "R"    "\u155" "r"
    "\u156" "R"    "\u157" "r"    "\u158" "R"    "\u159" "r"    "\u15a" "S"
    "\u15b" "s"    "\u15c" "S"    "\u15d" "s"    "\u15e" "S"    "\u15f" "s"
    "\u160" "S"    "\u161" "s"    "\u162" "T"    "\u163" "t"    "\u164" "T"
    "\u165" "t"    "\u166" "T"    "\u167" "t"    "\u168" "U"    "\u169" "u"
    "\u16a" "U"    "\u16b" "u"    "\u16c" "U"    "\u16d" "u"    "\u16e" "U"
    "\u16f" "u"    "\u170" "U"    "\u171" "u"    "\u172" "U"    "\u173" "u"
    "\u174" "W"    "\u175" "w"    "\u176" "Y"    "\u177" "y"    "\u178" "Y"
    "\u179" "Z"    "\u17a" "z"    "\u17b" "Z"    "\u17c" "z"    "\u17d" "Z"
    "\u17e" "z"    "\u17f" "f"    "\u192" "f"    "\u1fa" "A"    "\u1fb" "a"
    "\u1fc" "AE"   "\u1fd" "ae"   "\u1fe" "O"    "\u1ff" "o"

Code which generates the Unicode list as HTML:

    # Write "unicode.html": one line per code point U+0080..U+01FF holding
    # the Tcl \u escape for that code point followed by the matching HTML
    # numeric character reference, so a browser shows the glyph beside it.
    set fd [open "unicode.html" w]
    for { set i 128 } { $i < 512 } { incr i } {
        # Lowercase hex without padding, e.g. 8a, 100, 1ff.
        set hex [format %x $i]
        puts $fd "  \"\\u$hex\" \"&#x$hex;\"<br>"
    }
    close $fd

Lars H, 10 Oct 2005: The fully kosher way of doing this would be to normalise the characters to decomposed form, and then just string map all the combining chars (i.e., loose diacritics) to the empty string. Pity there doesn't seem to be any implementation of Unicode normalisations for Tcl (but that's just because no one has done it yet).

VK 18-dec-2005 Totally agree: the non-kosher way often bites those who use Cyrillic character sets (I believe many charsets suffer from this problem). Therefore it's better to keep a Unicode library within reach; more to the point - you must have a *loaded* Unicode library within reach. A simplistic way of dealing with Unicode is often unacceptable for i18n.

wdb changed \udf and ß from B to s -- its regular transcription is "ss", and its pronunciation is a voiceless "s" in German.