eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

scripts.py 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
#!/usr/bin/python
# Copyright (C) 2012-2016 Reece H. Dunn
#
# This file is part of ucd-tools.
#
# ucd-tools is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ucd-tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. import os
  19. import sys
  20. import ucd
  21. ucd_rootdir = sys.argv[1]
  22. ucd_version = sys.argv[2]
  23. unicode_chars = {}
  24. for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
  25. for codepoint in data['Range']:
  26. unicode_chars[codepoint] = data['Script']
  27. if '--with-csur' in sys.argv:
  28. for csur in ['Klingon']:
  29. for data in ucd.parse_ucd_data('data/csur', csur):
  30. for codepoint in data['CodePoint']:
  31. unicode_chars[codepoint] = data['Script']
  32. # This map is a combination of the information in the UnicodeData and Blocks
  33. # data files. It is intended to reduce the number of character tables that
  34. # need to be generated.
  35. script_sets = [
  36. (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'),
  37. (ucd.CodeRange('00D800..00F7FF'), 'Zzzz', 'Surrogates / Private Use Area'),
  38. (ucd.CodeRange('00F800..02FAFF'), None, 'Multiple Blocks'),
  39. (ucd.CodeRange('02FB00..0DFFFF'), 'Zzzz', 'Unassigned'),
  40. (ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'),
  41. (ucd.CodeRange('0E0200..10FFFF'), 'Zzzz', 'Unassigned'),
  42. ]
  43. # These scripts have many pages consisting of just this script:
  44. special_scripts = []
  45. script_tables = {}
  46. for codepoints, script, comment in script_sets:
  47. if not script:
  48. table = {}
  49. table_entry = None
  50. table_codepoint = None
  51. table_script = None
  52. for i, codepoint in enumerate(codepoints):
  53. try:
  54. script = unicode_chars[codepoint]
  55. except KeyError:
  56. script = 'Zzzz' # Unknown
  57. if (i % 256) == 0:
  58. if table_entry:
  59. if table_script in special_scripts:
  60. table[table_codepoint] = table_script
  61. elif table_script:
  62. special_scripts.append(table_script)
  63. table[table_codepoint] = table_script
  64. else:
  65. table[table_codepoint] = table_entry
  66. table_entry = []
  67. table_codepoint = codepoint
  68. table_script = script
  69. if script != table_script:
  70. table_script = None
  71. table_entry.append(script)
  72. if table_entry:
  73. if table_script in special_scripts:
  74. table[table_codepoint] = table_script
  75. else:
  76. table[table_codepoint] = table_entry
  77. script_tables['%s_%s' % (codepoints.first, codepoints.last)] = table
  78. if __name__ == '__main__':
  79. sys.stdout.write("""/* Unicode Scripts
  80. *
  81. * Copyright (C) 2012-2016 Reece H. Dunn
  82. *
  83. * This file is part of ucd-tools.
  84. *
  85. * ucd-tools is free software: you can redistribute it and/or modify
  86. * it under the terms of the GNU General Public License as published by
  87. * the Free Software Foundation, either version 3 of the License, or
  88. * (at your option) any later version.
  89. *
  90. * ucd-tools is distributed in the hope that it will be useful,
  91. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  92. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  93. * GNU General Public License for more details.
  94. *
  95. * You should have received a copy of the GNU General Public License
  96. * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  97. */
  98. /* NOTE: This file is automatically generated from the Scripts.txt file in
  99. * the Unicode Character database by the ucd-tools/tools/scripts.py script.
  100. */
  101. #include "ucd/ucd.h"
  102. #include <stddef.h>
  103. #define Adlm UCD_SCRIPT_Adlm
  104. #define Afak UCD_SCRIPT_Afak
  105. #define Aghb UCD_SCRIPT_Aghb
  106. #define Ahom UCD_SCRIPT_Ahom
  107. #define Arab UCD_SCRIPT_Arab
  108. #define Armi UCD_SCRIPT_Armi
  109. #define Armn UCD_SCRIPT_Armn
  110. #define Avst UCD_SCRIPT_Avst
  111. #define Bali UCD_SCRIPT_Bali
  112. #define Bamu UCD_SCRIPT_Bamu
  113. #define Bass UCD_SCRIPT_Bass
  114. #define Batk UCD_SCRIPT_Batk
  115. #define Beng UCD_SCRIPT_Beng
  116. #define Bhks UCD_SCRIPT_Bhks
  117. #define Blis UCD_SCRIPT_Blis
  118. #define Bopo UCD_SCRIPT_Bopo
  119. #define Brah UCD_SCRIPT_Brah
  120. #define Brai UCD_SCRIPT_Brai
  121. #define Bugi UCD_SCRIPT_Bugi
  122. #define Buhd UCD_SCRIPT_Buhd
  123. #define Cakm UCD_SCRIPT_Cakm
  124. #define Cans UCD_SCRIPT_Cans
  125. #define Cari UCD_SCRIPT_Cari
  126. #define Cham UCD_SCRIPT_Cham
  127. #define Cher UCD_SCRIPT_Cher
  128. #define Cirt UCD_SCRIPT_Cirt
  129. #define Copt UCD_SCRIPT_Copt
  130. #define Cprt UCD_SCRIPT_Cprt
  131. #define Cyrl UCD_SCRIPT_Cyrl
  132. #define Cyrs UCD_SCRIPT_Cyrs
  133. #define Deva UCD_SCRIPT_Deva
  134. #define Dsrt UCD_SCRIPT_Dsrt
  135. #define Dupl UCD_SCRIPT_Dupl
  136. #define Egyd UCD_SCRIPT_Egyd
  137. #define Egyh UCD_SCRIPT_Egyh
  138. #define Egyp UCD_SCRIPT_Egyp
  139. #define Elba UCD_SCRIPT_Elba
  140. #define Ethi UCD_SCRIPT_Ethi
  141. #define Geok UCD_SCRIPT_Geok
  142. #define Geor UCD_SCRIPT_Geor
  143. #define Glag UCD_SCRIPT_Glag
  144. #define Gonm UCD_SCRIPT_Gonm
  145. #define Goth UCD_SCRIPT_Goth
  146. #define Gran UCD_SCRIPT_Gran
  147. #define Grek UCD_SCRIPT_Grek
  148. #define Gujr UCD_SCRIPT_Gujr
  149. #define Guru UCD_SCRIPT_Guru
  150. #define Hang UCD_SCRIPT_Hang
  151. #define Hani UCD_SCRIPT_Hani
  152. #define Hano UCD_SCRIPT_Hano
  153. #define Hans UCD_SCRIPT_Hans
  154. #define Hant UCD_SCRIPT_Hant
  155. #define Hatr UCD_SCRIPT_Hatr
  156. #define Hebr UCD_SCRIPT_Hebr
  157. #define Hira UCD_SCRIPT_Hira
  158. #define Hluw UCD_SCRIPT_Hluw
  159. #define Hmng UCD_SCRIPT_Hmng
  160. #define Hrkt UCD_SCRIPT_Hrkt
  161. #define Hung UCD_SCRIPT_Hung
  162. #define Inds UCD_SCRIPT_Inds
  163. #define Ital UCD_SCRIPT_Ital
  164. #define Java UCD_SCRIPT_Java
  165. #define Jpan UCD_SCRIPT_Jpan
  166. #define Jurc UCD_SCRIPT_Jurc
  167. #define Kali UCD_SCRIPT_Kali
  168. #define Kana UCD_SCRIPT_Kana
  169. #define Khar UCD_SCRIPT_Khar
  170. #define Khmr UCD_SCRIPT_Khmr
  171. #define Khoj UCD_SCRIPT_Khoj
  172. #define Knda UCD_SCRIPT_Knda
  173. #define Kore UCD_SCRIPT_Kore
  174. #define Kpel UCD_SCRIPT_Kpel
  175. #define Kthi UCD_SCRIPT_Kthi
  176. #define Lana UCD_SCRIPT_Lana
  177. #define Laoo UCD_SCRIPT_Laoo
  178. #define Latf UCD_SCRIPT_Latf
  179. #define Latg UCD_SCRIPT_Latg
  180. #define Latn UCD_SCRIPT_Latn
  181. #define Lepc UCD_SCRIPT_Lepc
  182. #define Limb UCD_SCRIPT_Limb
  183. #define Lina UCD_SCRIPT_Lina
  184. #define Linb UCD_SCRIPT_Linb
  185. #define Lisu UCD_SCRIPT_Lisu
  186. #define Loma UCD_SCRIPT_Loma
  187. #define Lyci UCD_SCRIPT_Lyci
  188. #define Lydi UCD_SCRIPT_Lydi
  189. #define Mahj UCD_SCRIPT_Mahj
  190. #define Mand UCD_SCRIPT_Mand
  191. #define Mani UCD_SCRIPT_Mani
  192. #define Marc UCD_SCRIPT_Marc
  193. #define Maya UCD_SCRIPT_Maya
  194. #define Mend UCD_SCRIPT_Mend
  195. #define Merc UCD_SCRIPT_Merc
  196. #define Mero UCD_SCRIPT_Mero
  197. #define Mlym UCD_SCRIPT_Mlym
  198. #define Modi UCD_SCRIPT_Modi
  199. #define Mong UCD_SCRIPT_Mong
  200. #define Moon UCD_SCRIPT_Moon
  201. #define Mroo UCD_SCRIPT_Mroo
  202. #define Mtei UCD_SCRIPT_Mtei
  203. #define Mult UCD_SCRIPT_Mult
  204. #define Mymr UCD_SCRIPT_Mymr
  205. #define Narb UCD_SCRIPT_Narb
  206. #define Nbat UCD_SCRIPT_Nbat
  207. #define Newa UCD_SCRIPT_Newa
  208. #define Nkgb UCD_SCRIPT_Nkgb
  209. #define Nkoo UCD_SCRIPT_Nkoo
  210. #define Nshu UCD_SCRIPT_Nshu
  211. #define Ogam UCD_SCRIPT_Ogam
  212. #define Olck UCD_SCRIPT_Olck
  213. #define Orkh UCD_SCRIPT_Orkh
  214. #define Orya UCD_SCRIPT_Orya
  215. #define Osge UCD_SCRIPT_Osge
  216. #define Osma UCD_SCRIPT_Osma
  217. #define Palm UCD_SCRIPT_Palm
  218. #define Pauc UCD_SCRIPT_Pauc
  219. #define Perm UCD_SCRIPT_Perm
  220. #define Phag UCD_SCRIPT_Phag
  221. #define Phli UCD_SCRIPT_Phli
  222. #define Phlp UCD_SCRIPT_Phlp
  223. #define Phlv UCD_SCRIPT_Phlv
  224. #define Phnx UCD_SCRIPT_Phnx
  225. #define Plrd UCD_SCRIPT_Plrd
  226. #define Prti UCD_SCRIPT_Prti
  227. #define Qaak UCD_SCRIPT_Qaak
  228. #define Rjng UCD_SCRIPT_Rjng
  229. #define Roro UCD_SCRIPT_Roro
  230. #define Runr UCD_SCRIPT_Runr
  231. #define Samr UCD_SCRIPT_Samr
  232. #define Sara UCD_SCRIPT_Sara
  233. #define Sarb UCD_SCRIPT_Sarb
  234. #define Saur UCD_SCRIPT_Saur
  235. #define Sgnw UCD_SCRIPT_Sgnw
  236. #define Shaw UCD_SCRIPT_Shaw
  237. #define Shrd UCD_SCRIPT_Shrd
  238. #define Sidd UCD_SCRIPT_Sidd
  239. #define Sind UCD_SCRIPT_Sind
  240. #define Sinh UCD_SCRIPT_Sinh
  241. #define Sora UCD_SCRIPT_Sora
  242. #define Soyo UCD_SCRIPT_Soyo
  243. #define Sund UCD_SCRIPT_Sund
  244. #define Sylo UCD_SCRIPT_Sylo
  245. #define Syrc UCD_SCRIPT_Syrc
  246. #define Syre UCD_SCRIPT_Syre
  247. #define Syrj UCD_SCRIPT_Syrj
  248. #define Syrn UCD_SCRIPT_Syrn
  249. #define Tagb UCD_SCRIPT_Tagb
  250. #define Takr UCD_SCRIPT_Takr
  251. #define Tale UCD_SCRIPT_Tale
  252. #define Talu UCD_SCRIPT_Talu
  253. #define Taml UCD_SCRIPT_Taml
  254. #define Tang UCD_SCRIPT_Tang
  255. #define Tavt UCD_SCRIPT_Tavt
  256. #define Telu UCD_SCRIPT_Telu
  257. #define Teng UCD_SCRIPT_Teng
  258. #define Tfng UCD_SCRIPT_Tfng
  259. #define Tglg UCD_SCRIPT_Tglg
  260. #define Thaa UCD_SCRIPT_Thaa
  261. #define Thai UCD_SCRIPT_Thai
  262. #define Tibt UCD_SCRIPT_Tibt
  263. #define Tirh UCD_SCRIPT_Tirh
  264. #define Ugar UCD_SCRIPT_Ugar
  265. #define Vaii UCD_SCRIPT_Vaii
  266. #define Visp UCD_SCRIPT_Visp
  267. #define Wara UCD_SCRIPT_Wara
  268. #define Wole UCD_SCRIPT_Wole
  269. #define Xpeo UCD_SCRIPT_Xpeo
  270. #define Xsux UCD_SCRIPT_Xsux
  271. #define Yiii UCD_SCRIPT_Yiii
  272. #define Zanb UCD_SCRIPT_Zanb
  273. #define Zinh UCD_SCRIPT_Zinh
  274. #define Zmth UCD_SCRIPT_Zmth
  275. #define Zsym UCD_SCRIPT_Zsym
  276. #define Zxxx UCD_SCRIPT_Zxxx
  277. #define Zyyy UCD_SCRIPT_Zyyy
  278. #define Zzzz UCD_SCRIPT_Zzzz
  279. /* Unicode Character Data %s */
  280. """ % ucd_version)
  281. for script in special_scripts:
  282. sys.stdout.write('\n')
  283. sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % script)
  284. sys.stdout.write('{')
  285. for i in range(0, 256):
  286. if (i % 16) == 0:
  287. sys.stdout.write('\n\t/* %02X */' % i)
  288. sys.stdout.write(' %s,' % script)
  289. sys.stdout.write('\n};\n')
  290. for codepoints, script, comment in script_sets:
  291. if not script:
  292. tables = script_tables['%s_%s' % (codepoints.first, codepoints.last)]
  293. for codepoint in sorted(tables.keys()):
  294. table = tables[codepoint]
  295. if table in special_scripts:
  296. continue
  297. sys.stdout.write('\n')
  298. sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % codepoint)
  299. sys.stdout.write('{')
  300. for i, script in enumerate(table):
  301. if (i % 16) == 0:
  302. sys.stdout.write('\n\t/* %02X */' % i)
  303. sys.stdout.write(' %s,' % script)
  304. sys.stdout.write('\n};\n')
  305. for codepoints, script, comment in script_sets:
  306. if not script:
  307. table_index = '%s_%s' % (codepoints.first, codepoints.last)
  308. sys.stdout.write('\n')
  309. sys.stdout.write('static const uint8_t *scripts_%s[] =\n' % table_index)
  310. sys.stdout.write('{\n')
  311. for codepoint, table in sorted(script_tables[table_index].items()):
  312. if isinstance(table, str):
  313. sys.stdout.write('\tscripts_%s, /* %s */\n' % (table, codepoint))
  314. else:
  315. sys.stdout.write('\tscripts_%s,\n' % codepoint)
  316. sys.stdout.write('};\n')
  317. sys.stdout.write('\n')
  318. sys.stdout.write('ucd_script ucd_lookup_script(codepoint_t c)\n')
  319. sys.stdout.write('{\n')
  320. for codepoints, script, comment in script_sets:
  321. if script:
  322. sys.stdout.write('\tif (c <= 0x%s) return %s; /* %s : %s */\n' % (codepoints.last, script, codepoints, comment))
  323. else:
  324. sys.stdout.write('\tif (c <= 0x%s) /* %s */\n' % (codepoints.last, codepoints))
  325. sys.stdout.write('\t{\n')
  326. sys.stdout.write('\t\tconst uint8_t *table = scripts_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
  327. sys.stdout.write('\t\treturn (ucd_script)table[c % 256];\n')
  328. sys.stdout.write('\t}\n')
  329. sys.stdout.write('\treturn Zzzz; /* Invalid Unicode Codepoint */\n')
  330. sys.stdout.write('}\n')