Version 2 of Pinyin, ASCII to Unicode Converter

Updated 2008-03-25 20:13:59 by dkf

WJG (24th May, 2005): RS inspired me to produce this humble offering. It takes a line of ASCII-formatted pinyin and spits out a string with the appropriate diacritics. At present I haven't handled capitals.


# -------------------------------------------------------------
# pinyin_covert.tcl
# Written by William J Giddings, 2005
# -------------------------------------------------------------
# Usage:
# pinyin:parse <string>
# 
# Args:
# <string> any ascii string representing hanyu pinyin in the form 
# <initial sound><final sound><tone>
# e.g. chuang2
# <ch><uang>2
# 
# Returns:
# correctly formatted unicode string
# 
# Purpose:
# --------
# Convert linear 'ascii pinyin' Hanyu finals into correct forms 
# complete with diacritics.
# 
# Notes:
# ------
# Use versions of ttf 2.76 or higher. These include the necessary 
# diacritics to display all pinyin vowels plus their tone marks.
# 
# For further examples, see the CEDICT dictionary
# http://www.mandarintools.com/cedict.html 
# 
# -------------------------------------------------------------
#-----------------------------------------------------------
# swap linear ascii input for correct finals including diacritics
#-----------------------------------------------------------
proc pinyin:parse {str} {
   # Replace numbered ASCII pinyin finals in a string with their accented
   # Unicode forms.
   #
   # Args:
   # str  text containing lowercase pinyin finals followed by a tone
   #      digit 1-5 (e.g. chuang2, nu:3); "u:" stands for u-umlaut.
   #
   # Returns:
   # str with every recognised <final><tone> rewritten; tone 5 (neutral)
   # just drops the digit. All other characters, whitespace included,
   # are returned unchanged. Capital vowels are not in the table and are
   # therefore left as they are.
   #
   # Notes:
   # - string map tries the keys in list order at each position, so the
   #   bare "u:" catch-all has to stay after every longer "u:..." key.
   # - Escapes are written with four hex digits so that a following
   #   letter such as "a" or "e" is not read as part of the code point.
   # - Third-tone vowels use the caron (U+01CE, U+011B, U+01D0, U+01D2,
   #   U+01D4, U+01DA), and the tone mark of "ian" sits on the "a".
   # - No comments may appear inside the map: every word in it is data.
   return [string map {

      a1 \u0101 a2 \u00e1 a3 \u01ce a4 \u00e0 a5 a
      ai1 \u0101i ai2 \u00e1i ai3 \u01cei ai4 \u00e0i ai5 ai
      ao1 \u0101o ao2 \u00e1o ao3 \u01ceo ao4 \u00e0o ao5 ao
      an1 \u0101n an2 \u00e1n an3 \u01cen an4 \u00e0n an5 an
      ang1 \u0101ng ang2 \u00e1ng ang3 \u01ceng ang4 \u00e0ng ang5 ang

      e1 \u0113 e2 \u00e9 e3 \u011b e4 \u00e8 e5 e
      ei1 \u0113i ei2 \u00e9i ei3 \u011bi ei4 \u00e8i ei5 ei
      en1 \u0113n en2 \u00e9n en3 \u011bn en4 \u00e8n en5 en
      eng1 \u0113ng eng2 \u00e9ng eng3 \u011bng eng4 \u00e8ng eng5 eng
      er1 \u0113r er2 \u00e9r er3 \u011br er4 \u00e8r er5 er

      o1 \u014d o2 \u00f3 o3 \u01d2 o4 \u00f2 o5 o
      ong1 \u014dng ong2 \u00f3ng ong3 \u01d2ng ong4 \u00f2ng ong5 ong
      ou1 \u014du ou2 \u00f3u ou3 \u01d2u ou4 \u00f2u ou5 ou

      i1 \u012b i2 \u00ed i3 \u01d0 i4 \u00ec i5 i
      ia1 i\u0101 ia2 i\u00e1 ia3 i\u01ce ia4 i\u00e0 ia5 ia
      iao1 i\u0101o iao2 i\u00e1o iao3 i\u01ceo iao4 i\u00e0o iao5 iao
      iu1 i\u016b iu2 i\u00fa iu3 i\u01d4 iu4 i\u00f9 iu5 iu
      ian1 i\u0101n ian2 i\u00e1n ian3 i\u01cen ian4 i\u00e0n ian5 ian
      in1 \u012bn in2 \u00edn in3 \u01d0n in4 \u00ecn in5 in
      iang1 i\u0101ng iang2 i\u00e1ng iang3 i\u01ceng iang4 i\u00e0ng iang5 iang
      ing1 \u012bng ing2 \u00edng ing3 \u01d0ng ing4 \u00ecng ing5 ing
      iong1 i\u014dng iong2 i\u00f3ng iong3 i\u01d2ng iong4 i\u00f2ng iong5 iong

      u1 \u016b u2 \u00fa u3 \u01d4 u4 \u00f9 u5 u
      ua1 u\u0101 ua2 u\u00e1 ua3 u\u01ce ua4 u\u00e0 ua5 ua
      uai1 u\u0101i uai2 u\u00e1i uai3 u\u01cei uai4 u\u00e0i uai5 uai
      uo1 u\u014d uo2 u\u00f3 uo3 u\u01d2 uo4 u\u00f2 uo5 uo
      ui1 u\u012b ui2 u\u00ed ui3 u\u01d0 ui4 u\u00ec ui5 ui
      un1 \u016bn un2 \u00fan un3 \u01d4n un4 \u00f9n un5 un
      uan1 u\u0101n uan2 u\u00e1n uan3 u\u01cen uan4 u\u00e0n uan5 uan
      uang1 u\u0101ng uang2 u\u00e1ng uang3 u\u01ceng uang4 u\u00e0ng uang5 uang
      ueng1 u\u0113ng ueng2 u\u00e9ng ueng3 u\u011bng ueng4 u\u00e8ng ueng5 ueng

      u:1 \u01d6 u:2 \u01d8 u:3 \u01da u:4 \u01dc u:5 \u00fc
      u:e1 \u00fc\u0113 u:e2 \u00fc\u00e9 u:e3 \u00fc\u011b u:e4 \u00fc\u00e8 u:e5 \u00fce
      u:an1 \u00fc\u0101n u:an2 \u00fc\u00e1n u:an3 \u00fc\u01cen u:an4 \u00fc\u00e0n u:an5 \u00fcan
      u:en1 \u00fc\u0113n u:en2 \u00fc\u00e9n u:en3 \u00fc\u011bn u:en4 \u00fc\u00e8n u:en5 \u00fcen
      u: \u00fc

   } $str]
}

#-----------------------------------------------------------
# work through each word in a string, may not be needed.
#-----------------------------------------------------------
proc pinyin:convertstring {str} {
   # Convert each whitespace-separated word of a string with pinyin:parse.
   #
   # Args:
   # str  arbitrary text; words are runs of non-whitespace characters.
   #
   # Returns:
   # the converted words joined by single spaces (an empty string when
   # str holds no words).
   #
   # The text is tokenised with a regexp instead of being iterated as a
   # Tcl list, so unbalanced braces or quotes in ordinary prose do not
   # raise an error, and the result is a plain string, so words that
   # contain characters such as ";" or "$" are not wrapped in braces.
   set converted {}
   foreach word [regexp -all -inline {\S+} $str] {
      lappend converted [pinyin:parse $word]
   }
   return [join $converted " "]
}

#-----------------------------------------------------------
# demo
#-----------------------------------------------------------

proc demo {} {
   # Show a Tk text widget filled with converted sample input: a few
   # words, one row per final/tone group, and a short bilingual passage.
   package require Tk
   text .t -font {{Bitstream Cyberbit} 14}
   pack .t -fill both -expand 1

   # One element per output line, in display order; an empty element
   # yields an empty line. Every word in this list is data, so it must
   # not contain comments.
   set samples {
      {A few test words}
      {nu:3ren2}
      {chuang2}
      {ni3 hao3 ma?}
      {jiang1}
      {}
      {Test out for all logical final/tone combinations}
      {a1 a2 a3 a4 a}
      {ai1 ai2 ai3 ai4 ai}
      {ao1 ao2 ao3 ao4 ao}
      {an1 an2 an3 an4 an}
      {ang1 ang2 ang3 ang4 ang}
      {}
      {e1 e2 e3 e4 e}
      {ei1 ei2 ei3 ei4 ei}
      {en1 en2 en3 en4 en}
      {eng1 eng2 eng3 eng4 eng}
      {}
      {i1 i2 i3 i4 i}
      {ia1 ia2 ia3 ia4 ia}
      {iao1 iao2 iao3 iao4 iao}
      {iu1 iu2 iu3 iu4 iu}
      {ian1 ian2 ian3 ian4 ian}
      {in1 in2 in3 in4 in}
      {iang1 iang2 iang3 iang4 iang}
      {ing1 ing2 ing3 ing4 ing}
      {iong1 iong2 iong3 iong4 iong}
      {}
      {o1 o2 o3 o4 o}
      {ong1 ong2 ong3 ong4 ong}
      {}
      {ou1 ou2 ou3 ou4 ou}
      {u1 u2 u3 u4 u}
      {ua1 ua2 ua3 ua4 ua}
      {uai1 uai2 uai3 uai4 uai}
      {uo1 uo2 uo3 uo4 uo}
      {ui1 ui2 ui3 ui4 ui}
      {uan1 uan2 uan3 uan4 uan}
      {uang1 uang2 uang3 uang4 uang}
      {uen1 uen2 uen3 uen4 uen}
      {ueng1 ueng2 ueng3 ueng4 ueng}
      {}
      {u:1 u:2 u:3 u:4 u:}
      {u:e1 u:e2 u:e3 u:e4 u:e}
      {u:an1 u:an2 u:an3 u:an4 u:an}
      {u:en1 u:en2 u:en3 u:en4 u:en}
      {}
      {Something more substantial}
      {Chinese}
      {Ding1 yun2 shi4 Ying1yu3 xi4 de xue2sheng. Ta1 cong1 Bei3jing1 lai2 zher4 xue3xi.}
      {Ying1yu3 xi4 you3 wu3ge zhong1guo2 liu2xue2sheng1. Ta1men dou1 zai4 xue2sheng su4she4 zhu4.}
      {}
      {Translation}
      {Ding1 Yun2 is a student in the English language department. She has come from Bei3jing1 to study here.}
      {The English language deparment has five foreign students. They all live in the students dormitory.}
   }

   # Convert and append each sample as its own line of the widget.
   foreach sample $samples {
      .t insert end [pinyin:convertstring $sample]\n
   }
}

# Run the demo as soon as this script is evaluated.
demo