Version 3 of remove diacritic

Updated 2005-10-10 14:21:29

Replace ISO-8859-2 characters with similar characters that have no diacritic mark.

    # Transliterate the accented ISO-8859-2 letters in $str to plain ASCII
    # look-alikes. The mapped copy is the result of the command; the variable
    # str itself is not modified. Keys are literal characters, so this source
    # must be read in an encoding that preserves them.
    string map {
        "Ą" "A"  "Ł" "L"  "Ľ" "L"  "Ś" "S"  "Š" "S"  "Ş" "S"  "Ť" "T"  "Ź" "Z"
        "Ž" "Z"  "Ż" "Z"  "ą" "a"  "ł" "l"  "ľ" "l"  "ś" "s"  "š" "s"  "ş" "s"
        "ť" "t"  "ź" "z"  "ž" "z"  "ż" "z"  "Ŕ" "R"  "Á" "A"  "Â" "A"  "Ă" "A"
        "Ä" "A"  "Ĺ" "L"  "Ć" "C"  "Ç" "C"  "Č" "C"  "É" "E"  "Ę" "E"  "Ë" "E"
        "Ě" "E"  "Í" "I"  "Î" "I"  "Ď" "D"  "Đ" "D"  "Ń" "N"  "Ň" "N"  "Ó" "O"
        "Ô" "O"  "Ő" "O"  "Ö" "O"  "×" "x"  "Ř" "R"  "Ů" "U"  "Ú" "U"  "Ű" "U"
        "Ü" "U"  "Ý" "Y"  "Ţ" "T"  "ß" "B"  "ŕ" "r"  "á" "a"  "â" "a"  "ă" "a"
        "ä" "a"  "ĺ" "l"  "ć" "c"  "ç" "c"  "č" "c"  "é" "e"  "ę" "e"  "ë" "e"
        "ě" "e"  "í" "i"  "î" "i"  "ď" "d"  "đ" "d"  "ń" "n"  "ň" "n"  "ó" "o"
        "ô" "o"  "ő" "o"  "ö" "o"  "ř" "r"  "ů" "u"  "ú" "u"  "ű" "u"  "ü" "u"
        "ý" "y"  "ţ" "t"
    } $str

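The same kind of map written with \u escapes, covering the Latin-1 Supplement and Latin Extended-A ranges (plus a few Latin Extended-B characters)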

    "\u8a"  "S"    "\u8e"  "Z"    "\u9a"  "s"    "\u9e"  "z"    "\u9f"  "Y"
    "\uaa"  "a"    "\ub2"  "2"    "\ub3"  "3"    "\ub5"  "u"    "\ubf"  "?"
    "\uc0"  "A"    "\uc1"  "A"    "\uc2"  "A"    "\uc3"  "A"    "\uc4"  "A"
    "\uc5"  "A"    "\uc6"  "AE"   "\uc7"  "C"    "\uc8"  "E"    "\uc9"  "E"
    "\uca"  "E"    "\ucb"  "E"    "\ucc"  "I"    "\ucd"  "I"    "\uce"  "I"
    "\ucf"  "I"    "\ud0"  "D"    "\ud1"  "N"    "\ud2"  "O"    "\ud3"  "O"
    "\ud4"  "O"    "\ud5"  "O"    "\ud6"  "O"    "\ud7"  "x"    "\ud8"  "O"
    "\ud9"  "U"    "\uda"  "U"    "\udb"  "U"    "\udc"  "U"    "\udd"  "Y"
    "\udf"  "B"    "\ue0"  "a"    "\ue1"  "a"    "\ue2"  "a"    "\ue3"  "a"
    "\ue4"  "a"    "\ue5"  "a"    "\ue6"  "ae"   "\ue7"  "c"    "\ue8"  "e"
    "\ue9"  "e"    "\uea"  "e"    "\ueb"  "e"    "\uec"  "i"    "\ued"  "i"
    "\uee"  "i"    "\uef"  "i"    "\uf1"  "n"    "\uf2"  "o"    "\uf3"  "o"
    "\uf4"  "o"    "\uf5"  "o"    "\uf6"  "o"    "\uf8"  "o"    "\uf9"  "u"
    "\ufa"  "u"    "\ufb"  "u"    "\ufc"  "u"    "\ufd"  "y"    "\ufe"  "b"
    "\uff"  "y"
    "\u100" "A"    "\u101" "a"    "\u102" "A"    "\u103" "a"    "\u104" "A"
    "\u105" "a"    "\u106" "C"    "\u107" "c"    "\u108" "C"    "\u109" "c"
    "\u10a" "C"    "\u10b" "c"    "\u10c" "C"    "\u10d" "c"    "\u10e" "D"
    "\u10f" "d"    "\u110" "D"    "\u111" "d"    "\u112" "E"    "\u113" "e"
    "\u114" "E"    "\u115" "e"    "\u116" "E"    "\u117" "e"    "\u118" "E"
    "\u119" "e"    "\u11a" "E"    "\u11b" "e"    "\u11c" "G"    "\u11d" "g"
    "\u11e" "G"    "\u11f" "g"    "\u120" "G"    "\u121" "g"    "\u122" "G"
    "\u123" "g"    "\u124" "H"    "\u125" "h"    "\u126" "H"    "\u127" "h"
    "\u128" "I"    "\u129" "i"    "\u12a" "I"    "\u12b" "i"    "\u12c" "I"
    "\u12d" "i"    "\u12e" "I"    "\u12f" "i"    "\u130" "I"    "\u132" "IJ"
    "\u133" "ij"   "\u134" "J"    "\u135" "j"    "\u136" "K"    "\u137" "k"
    "\u138" "k"    "\u139" "L"    "\u13a" "l"    "\u13b" "L"    "\u13c" "l"
    "\u13d" "L"    "\u13e" "l"    "\u13f" "L"    "\u140" "l"    "\u141" "L"
    "\u142" "l"    "\u143" "N"    "\u144" "n"    "\u145" "N"    "\u146" "n"
    "\u147" "N"    "\u148" "n"    "\u149" "n"    "\u14a" "N"    "\u14b" "n"
    "\u14c" "O"    "\u14d" "o"    "\u14e" "O"    "\u14f" "o"    "\u150" "O"
    "\u151" "o"    "\u152" "OE"   "\u153" "oe"   "\u154" "R"    "\u155" "r"
    "\u156" "R"    "\u157" "r"    "\u158" "R"    "\u159" "r"    "\u15a" "S"
    "\u15b" "s"    "\u15c" "S"    "\u15d" "s"    "\u15e" "S"    "\u15f" "s"
    "\u160" "S"    "\u161" "s"    "\u162" "T"    "\u163" "t"    "\u164" "T"
    "\u165" "t"    "\u166" "T"    "\u167" "t"    "\u168" "U"    "\u169" "u"
    "\u16a" "U"    "\u16b" "u"    "\u16c" "U"    "\u16d" "u"    "\u16e" "U"
    "\u16f" "u"    "\u170" "U"    "\u171" "u"    "\u172" "U"    "\u173" "u"
    "\u174" "W"    "\u175" "w"    "\u176" "Y"    "\u177" "y"    "\u178" "Y"
    "\u179" "Z"    "\u17a" "z"    "\u17b" "Z"    "\u17c" "z"    "\u17d" "Z"
    "\u17e" "z"    "\u17f" "s"    "\u192" "f"    "\u1fa" "A"    "\u1fb" "a"
    "\u1fc" "AE"   "\u1fd" "ae"   "\u1fe" "O"    "\u1ff" "o"

Code which generates the Unicode list as HTML

    # Write one line per code point 128..511 to unicode.html. Each line pairs
    # the Tcl escape text (\uXX, emitted literally) with the matching HTML
    # numeric character reference, so a browser renders the glyph next to
    # its escape.
    set fd [open "unicode.html" w]
    for { set i 128 } { $i < 512 } { incr i } {
        # Lower-case hex without padding, used by both the \u escape text
        # and the &#x...; reference.
        set hex [format %x $i]
        puts $fd "  \"\\u$hex\" \"&#x$hex;\"<br>"
    }
    close $fd

Lars H, 10 Oct 2005: The fully kosher way of doing this would be to normalise the characters to decomposed form, and then just string map all the combining chars (i.e., loose diacritics) to the empty string. Pity there doesn't seem to be any implementation of Unicode normalisations for Tcl (but that's just because no one has done it yet).


[ Category Characters ]