Pinyin, ASCII to Unicode Converter

WJG (24th May, 2005): RS inspired me to produce this humble offering. It takes a line of ASCII-formatted pinyin and spits out a string with the appropriate diacritics. At present it doesn't handle capitals: a tone-bearing vowel must be written in lower case to be converted.


# -------------------------------------------------------------
# pinyin_covert.tcl
# Written by William J Giddings, 2005
# -------------------------------------------------------------
# Usage:
# pinyin:parse <string>
# 
# Args:
# <string> any ascii string representing hanyu pinyin in the form 
# <initial sound><final sound><tone>
# e.g. chuang2
# <ch><uang>2
# 
# Returns:
# correctly formatted unicode string
# 
# Purpose:
# --------
# Convert linear 'ascii pinyin' Hanyu finals into correct forms 
# complete with diacritics.
# 
# Notes:
# ------
# Use versions of ttf 2.76 or higher. These include the necessary 
# diacritics to display all pinyin vowels plus their tone marks.
# 
# For further examples, see the CEDICT dictionary
# http://www.mandarintools.com/cedict.html 
# 
# -------------------------------------------------------------
#-----------------------------------------------------------
# pinyin:parse -- swap linear ascii input for correct finals
# including diacritics
#
# Args:
#   str   ASCII pinyin in which each syllable ends in a tone digit
#         1-5 (5 = neutral tone) and u-umlaut is written "u:",
#         e.g. "nu:3ren2" or "ni3 hao3".
#
# Returns:
#   str with every recognised <final><tone> sequence replaced by the
#   final carrying its tone mark. All other text, including spaces,
#   is returned unchanged.
#
# Notes:
#   * Keys are lower case and [string map] is case sensitive, so a
#     capital tone-bearing vowel (e.g. "An1") is left untouched.
#   * Escapes are written with four hex digits so that a following
#     a-f letter cannot be absorbed into the code point. That removes
#     the need for padding spaces in the replacements, and with it the
#     need to strip spaces from the result.
#   * Third-tone vowels use the caron forms (U+01CE, U+011B, U+01D0,
#     U+01D2, U+01D4), not the similar-looking breve forms.
#   * [string map] tries the keys in list order at each position, so
#     the bare "u:" key must stay after every longer "u:..." key.
#   * No comments may be placed inside the mapping itself: they would
#     be read as list elements.
#-----------------------------------------------------------
proc pinyin:parse {str} {
   return [string map {

      a1 \u0101 a2 \u00e1 a3 \u01ce a4 \u00e0 a5 a
      ai1 \u0101i ai2 \u00e1i ai3 \u01cei ai4 \u00e0i ai5 ai
      ao1 \u0101o ao2 \u00e1o ao3 \u01ceo ao4 \u00e0o ao5 ao
      an1 \u0101n an2 \u00e1n an3 \u01cen an4 \u00e0n an5 an
      ang1 \u0101ng ang2 \u00e1ng ang3 \u01ceng ang4 \u00e0ng ang5 ang

      e1 \u0113 e2 \u00e9 e3 \u011b e4 \u00e8 e5 e
      ei1 \u0113i ei2 \u00e9i ei3 \u011bi ei4 \u00e8i ei5 ei
      en1 \u0113n en2 \u00e9n en3 \u011bn en4 \u00e8n en5 en
      eng1 \u0113ng eng2 \u00e9ng eng3 \u011bng eng4 \u00e8ng eng5 eng
      er1 \u0113r er2 \u00e9r er3 \u011br er4 \u00e8r er5 er

      o1 \u014d o2 \u00f3 o3 \u01d2 o4 \u00f2 o5 o
      ong1 \u014dng ong2 \u00f3ng ong3 \u01d2ng ong4 \u00f2ng ong5 ong
      ou1 \u014du ou2 \u00f3u ou3 \u01d2u ou4 \u00f2u ou5 ou

      i1 \u012b i2 \u00ed i3 \u01d0 i4 \u00ec i5 i
      ia1 i\u0101 ia2 i\u00e1 ia3 i\u01ce ia4 i\u00e0 ia5 ia
      iao1 i\u0101o iao2 i\u00e1o iao3 i\u01ceo iao4 i\u00e0o iao5 iao
      iu1 i\u016b iu2 i\u00fa iu3 i\u01d4 iu4 i\u00f9 iu5 iu
      ian1 i\u0101n ian2 i\u00e1n ian3 i\u01cen ian4 i\u00e0n ian5 ian
      in1 \u012bn in2 \u00edn in3 \u01d0n in4 \u00ecn in5 in
      iang1 i\u0101ng iang2 i\u00e1ng iang3 i\u01ceng iang4 i\u00e0ng iang5 iang
      ing1 \u012bng ing2 \u00edng ing3 \u01d0ng ing4 \u00ecng ing5 ing
      iong1 i\u014dng iong2 i\u00f3ng iong3 i\u01d2ng iong4 i\u00f2ng iong5 iong

      u1 \u016b u2 \u00fa u3 \u01d4 u4 \u00f9 u5 u
      ua1 u\u0101 ua2 u\u00e1 ua3 u\u01ce ua4 u\u00e0 ua5 ua
      uai1 u\u0101i uai2 u\u00e1i uai3 u\u01cei uai4 u\u00e0i uai5 uai
      uo1 u\u014d uo2 u\u00f3 uo3 u\u01d2 uo4 u\u00f2 uo5 uo
      ui1 u\u012b ui2 u\u00ed ui3 u\u01d0 ui4 u\u00ec ui5 ui
      un1 \u016bn un2 \u00fan un3 \u01d4n un4 \u00f9n un5 un
      uan1 u\u0101n uan2 u\u00e1n uan3 u\u01cen uan4 u\u00e0n uan5 uan
      uang1 u\u0101ng uang2 u\u00e1ng uang3 u\u01ceng uang4 u\u00e0ng uang5 uang
      ueng1 u\u0113ng ueng2 u\u00e9ng ueng3 u\u011bng ueng4 u\u00e8ng ueng5 ueng

      u:1 \u01d6 u:2 \u01d8 u:3 \u01da u:4 \u01dc u:5 \u00fc
      u:e1 \u00fc\u0113 u:e2 \u00fc\u00e9 u:e3 \u00fc\u011b u:e4 \u00fc\u00e8 u:e5 \u00fce
      u:an1 \u00fc\u0101n u:an2 \u00fc\u00e1n u:an3 \u00fc\u01cen u:an4 \u00fc\u00e0n u:an5 \u00fcan
      u:en1 \u00fc\u0113n u:en2 \u00fc\u00e9n u:en3 \u00fc\u011bn u:en4 \u00fc\u00e8n u:en5 \u00fcen
      u: \u00fc

   } $str]
}

#-----------------------------------------------------------
# pinyin:convertstring -- convert a line of text word by word
#
# Args:
#   str   plain text whose words are separated by spaces
#
# Returns:
#   the words, each passed through pinyin:parse, joined again
#   with single spaces between them
#
# The text is split explicitly on the space character instead of
# being iterated as a Tcl list. Arbitrary prose is not guaranteed to
# be a well-formed list: an unbalanced brace or quote would raise an
# error, and building the result with lappend alone could add list
# quoting to the output. Splitting on " " also keeps runs of spaces
# intact, because each extra space yields an empty word.
#-----------------------------------------------------------
proc pinyin:convertstring {str} {
   set converted {}
   foreach word [split $str " "] {
       lappend converted [pinyin:parse $word]
   }
   return [join $converted " "]
}

#-----------------------------------------------------------
# demo -- show converted sample text in a Tk text widget
#
# Creates the text widget .t, packs it, and inserts every sample
# line below after passing it through pinyin:convertstring.
# An empty element in the sample list produces a blank line, which
# is how the groups of finals are separated in the display.
#
# The samples are kept in one list and inserted by a single loop so
# that group separators sit in the data, between the groups they
# divide, and adding a row is a one-line change.
#
# No comments may be placed inside the sample list: they would be
# read as list elements.
#-----------------------------------------------------------

proc demo {} {
   package require Tk
   text .t -font {{Bitstream Cyberbit} 14}
   pack .t -fill both -expand 1

   set samples {
       {A few test words}
       {nu:3ren2}
       {chuang2}
       {ni3 hao3 ma?}
       {jiang1}
       {}
       {Test out for all logical final/tone combinations}
       {a1 a2 a3 a4 a}
       {ai1 ai2 ai3 ai4 ai}
       {ao1 ao2 ao3 ao4 ao}
       {an1 an2 an3 an4 an}
       {ang1 ang2 ang3 ang4 ang}
       {}
       {e1 e2 e3 e4 e}
       {ei1 ei2 ei3 ei4 ei}
       {en1 en2 en3 en4 en}
       {eng1 eng2 eng3 eng4 eng}
       {}
       {i1 i2 i3 i4 i}
       {ia1 ia2 ia3 ia4 ia}
       {iao1 iao2 iao3 iao4 iao}
       {iu1 iu2 iu3 iu4 iu}
       {ian1 ian2 ian3 ian4 ian}
       {in1 in2 in3 in4 in}
       {iang1 iang2 iang3 iang4 iang}
       {ing1 ing2 ing3 ing4 ing}
       {iong1 iong2 iong3 iong4 iong}
       {}
       {o1 o2 o3 o4 o}
       {ong1 ong2 ong3 ong4 ong}
       {ou1 ou2 ou3 ou4 ou}
       {}
       {u1 u2 u3 u4 u}
       {ua1 ua2 ua3 ua4 ua}
       {uai1 uai2 uai3 uai4 uai}
       {uo1 uo2 uo3 uo4 uo}
       {ui1 ui2 ui3 ui4 ui}
       {un1 un2 un3 un4 un}
       {uan1 uan2 uan3 uan4 uan}
       {uang1 uang2 uang3 uang4 uang}
       {uen1 uen2 uen3 uen4 uen}
       {ueng1 ueng2 ueng3 ueng4 ueng}
       {}
       {u:1 u:2 u:3 u:4 u:}
       {u:e1 u:e2 u:e3 u:e4 u:e}
       {u:an1 u:an2 u:an3 u:an4 u:an}
       {u:en1 u:en2 u:en3 u:en4 u:en}
       {}
       {Something more substantial}
       {Chinese}
       {Ding1 yun2 shi4 Ying1yu3 xi4 de xue2sheng. Ta1 cong1 Bei3jing1 lai2 zher4 xue3xi.}
       {Ying1yu3 xi4 you3 wu3ge zhong1guo2 liu2xue2sheng1. Ta1men dou1 zai4 xue2sheng su4she4 zhu4.}
       {}
       {Translation}
       {Ding1 Yun2 is a student in the English language department. She has come from Bei3jing1 to study here.}
       {The English language deparment has five foreign students. They all live in the students dormitory.}
   }

   foreach line $samples {
       .t insert end [pinyin:convertstring $line]\n
   }
}

# Run the demonstration as soon as the script is loaded.
demo

WJG (24/JUN/11): Took another look at this sample script and reworked the mappings. The previous example builds every 'logical' tone-syllable combination. That approach is not wholly satisfying, because some of the possible combinations are not found in Chinese in practice. The following mapping should be used instead:

#---------------
#!/bin/sh
# the next line restarts using tclsh \
exec tclsh "$0" "$@"
#---------------

# pinyin_map -- key/value list for [string map]: each ASCII
# <final><tone digit> key is followed by the same final carrying its
# tone mark (tone 5 = neutral, no mark). U-umlaut is written "u:".
#
# [string map] tries the keys in list order at each position, so the
# bare "u:" key must stay after every longer "u:..." key. It converts
# a "u:" that is followed by a final handled elsewhere in the table
# (for example "u:an1" becomes u-umlaut followed by the result for
# "an1").
#
# No comments may be placed inside the list itself: they would be
# read as list elements.
set pinyin_map {
        a1   ā   a2   á   a3   ǎ   a4   à   a5   a
        ai1  āi  ai2  ái  ai3  ǎi  ai4  ài  ai5  ai
        ao1  āo  ao2  áo  ao3  ǎo  ao4  ào  ao5  ao
        an1  ān  an2  án  an3  ǎn  an4  àn  an5  an
        ang1 āng ang2 áng ang3 ǎng ang4 àng ang5 ang

        e1   ē   e2   é   e3   ě   e4   è   e5   e
        ei1  ēi  ei2  éi  ei3  ěi  ei4  èi  ei5  ei
        en1  ēn  en2  én  en3  ěn  en4  èn  en5  en
        eng1 ēng eng2 éng eng3 ěng eng4 èng eng5 eng
        er1  ēr  er2  ér  er3  ěr  er4  èr  er5  er

        i1    ī    i2    í    i3    ǐ    i4    ì    i5    i
        ia1   iā   ia2   iá   ia3   iǎ   ia4   ià   ia5   ia
        iao1  iāo  iao2  iáo  iao3  iǎo  iao4  iào  iao5  iao
        iu1   iū   iu2   iú   iu3   iǔ   iu4   iù   iu5   iu
        ian1  iān  ian2  ián  ian3  iǎn  ian4  iàn  ian5  ian
        in1   īn   in2   ín   in3   ǐn   in4   ìn   in5   in
        iang1 iāng iang2 iáng iang3 iǎng iang4 iàng iang5 iang
        ing1  īng  ing2  íng  ing3  ǐng  ing4  ìng  ing5  ing
        iong1 iōng iong2 ióng iong3 iǒng iong4 iòng iong5 iong

        o1   ō   o2   ó   o3   ǒ   o4   ò   o5   o
        ong1 ōng ong2 óng ong3 ǒng ong4 òng ong5 ong
        ou1  ōu  ou2  óu  ou3  ǒu  ou4  òu  ou5  ou

        u1    ū    u2    ú    u3    ǔ    u4    ù    u5    u
        ua1   uā   ua2   uá   ua3   uǎ   ua4   uà   ua5   ua
        uai1  uāi  uai2  uái  uai3  uǎi  uai4  uài  uai5  uai
        uo1   uō   uo2   uó   uo3   uǒ   uo4   uò   uo5   uo
        ui1   uī   ui2   uí   ui3   uǐ   ui4   uì   ui5   ui
        uan1  uān  uan2  uán  uan3  uǎn  uan4  uàn  uan5  uan
        uang1 uāng uang2 uáng uang3 uǎng uang4 uàng uang5 uang
        un1   ūn   un2   ún   un3   ǔn   un4   ùn   un5   un

        u:1  ǖ   u:2  ǘ   u:3  ǚ   u:4  ǜ   u:5  ü
        u:e1 üē  u:e2 üé  u:e3 üě  u:e4 üè  u:e5 üe
        u:   ü
}

 set testtext {
       {Chinese}
       {Ding1 yun2 shi4 Ying1yu3 xi4 de xue2sheng. Ta1 cong1 Bei3jing1 lai2 zher4 xue3xi.}
       {Ying1yu3 xi4 you3 wu3ge zhong1guo2 liu2xue2sheng1. Ta1men dou1 zai4 xue2sheng su4she4 zhu4.}
       {}
       {Translation}
       {Ding1 Yun2 is a student in the English language department. She has come from Bei3jing1 to study here.}
       {The English language deparment has five foreign students. They all live in the students dormitory.}
   }

# testtext is a list of lines. Convert and print it element by element,
# so that the braces and indentation of the list literal above do not
# appear in the output. An empty element prints as a blank line.
foreach line $testtext {
    puts [string map $pinyin_map $line]
}