Version 1 of Pinyin, ASCII to Unicode Converter

Updated 2005-05-27 09:25:04

WJG (24th May, 2005) RS inspired me to produce this humble offering. It takes a line of ASCII-formatted pinyin and returns a string with the appropriate diacritics. At present, capital letters are not handled.

 # -------------------------------------------------------------
 # pinyin_covert.tcl
 # Written by William J Giddings, 2005
 # -------------------------------------------------------------
 # Usage:
 # pinyin:parse <string>
 # 
 # Args:
 # <string> any ascii string representing hanyu pinyin in the form 
 # <initial sound><final sound><tone>
 # e.g. chuang2
 # <ch><uang>2
 # 
 # Returns:
 # correctly formatted unicode string
 # 
 # Purpose:
 # --------
 # Convert linear 'ascii pinyin' Hanyu finals into correct forms 
 # complete with diacritics.
 # 
 # Notes:
 # ------
 # Use versions of ttf 2.76 or higher. These include the necessary 
 # diacritics to display all pinyin vowels plus their tone marks.
 # 
 # For further examples, see the CEDICT dictionary
 # http://www.mandarintools.com/cedict.html
 # 
 # -------------------------------------------------------------

 #-----------------------------------------------------------
 # pinyin:parse --
 #
 #   Swap numbered-tone ASCII pinyin finals for the same finals carrying
 #   the tone diacritic on the proper vowel.
 #
 # Args:
 #   str   text containing ASCII pinyin.  A trailing digit 1-4 selects
 #         the tone, 5 marks the neutral tone (the digit is dropped),
 #         and "u:" stands for u-umlaut.
 #
 # Returns:
 #   str with every recognised final+tone replaced.  All other
 #   characters, whitespace included, pass through unchanged.
 #
 # Notes:
 #   Keys are lower case only, so capitalised vowels are not recognised.
 #-----------------------------------------------------------
 proc pinyin:parse {str} {

    # The braced table below is parsed as a list by [string map], and the
    # list parser expands the \uXXXX escapes.  Every escape is written
    # with exactly four hex digits so that a following letter such as
    # "a" or "e" can never be swallowed as part of the code point; this
    # removes the need for separator spaces in the replacement values.
    #
    # [string map] takes the first key that matches at each position, so
    # the bare "u:" key (a prefix of every other u: key) is listed last.
    # Third tones use the caron forms (U+01CE, U+011B, U+01D0, U+01D2,
    # U+01D4, U+01DA).
    return [string map {

        a1 \u0101 a2 \u00e1 a3 \u01ce a4 \u00e0 a5 a
        ai1 \u0101i ai2 \u00e1i ai3 \u01cei ai4 \u00e0i ai5 ai
        ao1 \u0101o ao2 \u00e1o ao3 \u01ceo ao4 \u00e0o ao5 ao
        an1 \u0101n an2 \u00e1n an3 \u01cen an4 \u00e0n an5 an
        ang1 \u0101ng ang2 \u00e1ng ang3 \u01ceng ang4 \u00e0ng ang5 ang

        e1 \u0113 e2 \u00e9 e3 \u011b e4 \u00e8 e5 e
        ei1 \u0113i ei2 \u00e9i ei3 \u011bi ei4 \u00e8i ei5 ei
        en1 \u0113n en2 \u00e9n en3 \u011bn en4 \u00e8n en5 en
        eng1 \u0113ng eng2 \u00e9ng eng3 \u011bng eng4 \u00e8ng eng5 eng
        er1 \u0113r er2 \u00e9r er3 \u011br er4 \u00e8r er5 er

        o1 \u014d o2 \u00f3 o3 \u01d2 o4 \u00f2 o5 o
        ong1 \u014dng ong2 \u00f3ng ong3 \u01d2ng ong4 \u00f2ng ong5 ong
        ou1 \u014du ou2 \u00f3u ou3 \u01d2u ou4 \u00f2u ou5 ou

        i1 \u012b i2 \u00ed i3 \u01d0 i4 \u00ec i5 i
        ia1 i\u0101 ia2 i\u00e1 ia3 i\u01ce ia4 i\u00e0 ia5 ia
        iao1 i\u0101o iao2 i\u00e1o iao3 i\u01ceo iao4 i\u00e0o iao5 iao
        iu1 i\u016b iu2 i\u00fa iu3 i\u01d4 iu4 i\u00f9 iu5 iu
        ian1 i\u0101n ian2 i\u00e1n ian3 i\u01cen ian4 i\u00e0n ian5 ian
        in1 \u012bn in2 \u00edn in3 \u01d0n in4 \u00ecn in5 in
        iang1 i\u0101ng iang2 i\u00e1ng iang3 i\u01ceng iang4 i\u00e0ng iang5 iang
        ing1 \u012bng ing2 \u00edng ing3 \u01d0ng ing4 \u00ecng ing5 ing
        iong1 i\u014dng iong2 i\u00f3ng iong3 i\u01d2ng iong4 i\u00f2ng iong5 iong

        u1 \u016b u2 \u00fa u3 \u01d4 u4 \u00f9 u5 u
        ua1 u\u0101 ua2 u\u00e1 ua3 u\u01ce ua4 u\u00e0 ua5 ua
        uai1 u\u0101i uai2 u\u00e1i uai3 u\u01cei uai4 u\u00e0i uai5 uai
        uo1 u\u014d uo2 u\u00f3 uo3 u\u01d2 uo4 u\u00f2 uo5 uo
        ui1 u\u012b ui2 u\u00ed ui3 u\u01d0 ui4 u\u00ec ui5 ui
        un1 \u016bn un2 \u00fan un3 \u01d4n un4 \u00f9n un5 un
        uan1 u\u0101n uan2 u\u00e1n uan3 u\u01cen uan4 u\u00e0n uan5 uan
        uang1 u\u0101ng uang2 u\u00e1ng uang3 u\u01ceng uang4 u\u00e0ng uang5 uang
        ueng1 u\u0113ng ueng2 u\u00e9ng ueng3 u\u011bng ueng4 u\u00e8ng ueng5 ueng

        u:e1 \u00fc\u0113 u:e2 \u00fc\u00e9 u:e3 \u00fc\u011b u:e4 \u00fc\u00e8 u:e5 \u00fce
        u:an1 \u00fc\u0101n u:an2 \u00fc\u00e1n u:an3 \u00fc\u01cen u:an4 \u00fc\u00e0n u:an5 \u00fcan
        u:en1 \u00fc\u0113n u:en2 \u00fc\u00e9n u:en3 \u00fc\u011bn u:en4 \u00fc\u00e8n u:en5 \u00fcen
        u:1 \u01d6 u:2 \u01d8 u:3 \u01da u:4 \u01dc u:5 \u00fc
        u: \u00fc

    } $str]
 }

 #-----------------------------------------------------------
 # pinyin:convertstring --
 #
 #   Convert a whole line of text one whitespace-separated word at a
 #   time by passing each word to pinyin:parse.
 #
 # Args:
 #   str   arbitrary text; it is NOT required to be a well-formed Tcl
 #         list, so stray braces, quotes or backslashes are fine.
 #
 # Returns:
 #   the converted words joined by single spaces (leading, trailing and
 #   repeated whitespace in the input is collapsed).
 #-----------------------------------------------------------
 proc pinyin:convertstring {str} {
    set converted {}
    # Split on whitespace with a regexp rather than iterating over $str
    # as a list: list parsing would raise an error on unbalanced braces
    # or quotes and would merge a "quoted phrase" into one word.
    foreach word [regexp -all -inline {\S+} $str] {
        lappend converted [pinyin:parse $word]
    }
    # join yields plain text; returning the list itself could add brace
    # quoting around words containing characters special to Tcl lists.
    return [join $converted " "]
 }

 #-----------------------------------------------------------
 # demo --
 #
 #   Create a Tk text widget named .t and fill it with sample input
 #   run through pinyin:convertstring, one sample per line.
 #
 # Args:
 #   none
 #
 # Side effects:
 #   Loads Tk and creates and packs the widget .t in the main window.
 #-----------------------------------------------------------

 proc demo {} {
    package require Tk
    text .t -font {{Bitstream Cyberbit} 14}
    pack .t -fill both -expand 1

    # One entry per output line; an empty entry produces a blank
    # separator line between groups.
    set samples {
        {A few test words}
        {nu:3ren2}
        {chuang2}
        {ni3 hao3 ma?}
        {jiang1}
        {}
        {Test out for all logical final/tone combinations}
        {a1 a2 a3 a4 a}
        {ai1 ai2 ai3 ai4 ai}
        {ao1 ao2 ao3 ao4 ao}
        {an1 an2 an3 an4 an}
        {ang1 ang2 ang3 ang4 ang}
        {}
        {e1 e2 e3 e4 e}
        {ei1 ei2 ei3 ei4 ei}
        {en1 en2 en3 en4 en}
        {eng1 eng2 eng3 eng4 eng}
        {}
        {i1 i2 i3 i4 i}
        {ia1 ia2 ia3 ia4 ia}
        {iao1 iao2 iao3 iao4 iao}
        {iu1 iu2 iu3 iu4 iu}
        {ian1 ian2 ian3 ian4 ian}
        {in1 in2 in3 in4 in}
        {iang1 iang2 iang3 iang4 iang}
        {ing1 ing2 ing3 ing4 ing}
        {iong1 iong2 iong3 iong4 iong}
        {}
        {o1 o2 o3 o4 o}
        {ong1 ong2 ong3 ong4 ong}
        {ou1 ou2 ou3 ou4 ou}
        {}
        {u1 u2 u3 u4 u}
        {ua1 ua2 ua3 ua4 ua}
        {uai1 uai2 uai3 uai4 uai}
        {uo1 uo2 uo3 uo4 uo}
        {ui1 ui2 ui3 ui4 ui}
        {un1 un2 un3 un4 un}
        {uan1 uan2 uan3 uan4 uan}
        {uang1 uang2 uang3 uang4 uang}
        {uen1 uen2 uen3 uen4 uen}
        {ueng1 ueng2 ueng3 ueng4 ueng}
        {}
        {u:1 u:2 u:3 u:4 u:}
        {u:e1 u:e2 u:e3 u:e4 u:e}
        {u:an1 u:an2 u:an3 u:an4 u:an}
        {u:en1 u:en2 u:en3 u:en4 u:en}
        {}
        {Something more substantial}
        {Chinese}
        {Ding1 yun2 shi4 Ying1yu3 xi4 de xue2sheng. Ta1 cong1 Bei3jing1 lai2 zher4 xue3xi.}
        {Ying1yu3 xi4 you3 wu3ge zhong1guo2 liu2xue2sheng1. Ta1men dou1 zai4 xue2sheng su4she4 zhu4.}
        {}
        {Translation}
        {Ding1 Yun2 is a student in the English language department. She has come from Bei3jing1 to study here.}
        {The English language deparment has five foreign students. They all live in the students dormitory.}
    }

    foreach sample $samples {
        .t insert end [pinyin:convertstring $sample]\n
    }
 }

 # Run the demonstration immediately when this script is evaluated.
 demo