eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

categories.py 8.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. #!/usr/bin/python
  2. # Copyright (C) 2012-2016 Reece H. Dunn
  3. #
  4. # This file is part of ucd-tools.
  5. #
  6. # ucd-tools is free software: you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation, either version 3 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # ucd-tools is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. import os
  19. import sys
  20. import ucd
  21. ucd_rootdir = sys.argv[1]
  22. ucd_version = sys.argv[2]
  23. unicode_chars = {}
  24. for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
  25. for codepoint in data['CodePoint']:
  26. unicode_chars[codepoint] = data['GeneralCategory']
  27. if '--with-csur' in sys.argv:
  28. for csur in ['Klingon']:
  29. for data in ucd.parse_ucd_data('data/csur', csur):
  30. for codepoint in data['CodePoint']:
  31. unicode_chars[codepoint] = data['GeneralCategory']
  32. # This map is a combination of the information in the UnicodeData and Blocks
  33. # data files. It is intended to reduce the number of character tables that
  34. # need to be generated.
  35. category_sets = [
  36. (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'),
  37. (ucd.CodeRange('00D800..00DFFF'), 'Cs', 'Surrogates'),
  38. (ucd.CodeRange('00E000..00F7FF'), 'Co', 'Private Use Area'),
  39. (ucd.CodeRange('00F800..02FAFF'), None, 'Multiple Blocks'),
  40. (ucd.CodeRange('02FB00..0DFFFF'), 'Cn', 'Unassigned'),
  41. (ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'),
  42. (ucd.CodeRange('0E0200..0EFFFF'), 'Cn', 'Unassigned'),
  43. (ucd.CodeRange('0F0000..0FFFFD'), 'Co', 'Plane 15 Private Use'),
  44. (ucd.CodeRange('0FFFFE..0FFFFF'), 'Cn', 'Plane 15 Private Use'),
  45. (ucd.CodeRange('100000..10FFFD'), 'Co', 'Plane 16 Private Use'),
  46. (ucd.CodeRange('10FFFE..10FFFF'), 'Cn', 'Plane 16 Private Use'),
  47. ]
  48. # These categories have many pages consisting of just this category:
  49. # Cn -- Unassigned
  50. # Lo -- CJK Ideographs
  51. special_categories = ['Cn', 'Co', 'Lo', 'Sm', 'So']
  52. category_tables = {}
  53. for codepoints, category, comment in category_sets:
  54. if not category:
  55. table = {}
  56. table_entry = None
  57. table_codepoint = None
  58. table_category = None
  59. for i, codepoint in enumerate(codepoints):
  60. try:
  61. category = unicode_chars[codepoint]
  62. except KeyError:
  63. category = 'Cn' # Unassigned
  64. if (i % 256) == 0:
  65. if table_entry:
  66. if table_category in special_categories:
  67. table[table_codepoint] = table_category
  68. elif table_category:
  69. raise Exception('%s only table not in the special_categories list.' % table_category)
  70. else:
  71. table[table_codepoint] = table_entry
  72. table_entry = []
  73. table_codepoint = codepoint
  74. table_category = category
  75. if category != table_category:
  76. table_category = None
  77. table_entry.append(category)
  78. if table_entry:
  79. if table_category in special_categories:
  80. table[table_codepoint] = table_category
  81. else:
  82. table[table_codepoint] = table_entry
  83. category_tables['%s_%s' % (codepoints.first, codepoints.last)] = table
  84. if __name__ == '__main__':
  85. sys.stdout.write("""/* Unicode General Categories
  86. *
  87. * Copyright (C) 2012-2016 Reece H. Dunn
  88. *
  89. * This file is part of ucd-tools.
  90. *
  91. * ucd-tools is free software: you can redistribute it and/or modify
  92. * it under the terms of the GNU General Public License as published by
  93. * the Free Software Foundation, either version 3 of the License, or
  94. * (at your option) any later version.
  95. *
  96. * ucd-tools is distributed in the hope that it will be useful,
  97. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  98. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  99. * GNU General Public License for more details.
  100. *
  101. * You should have received a copy of the GNU General Public License
  102. * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  103. */
  104. /* NOTE: This file is automatically generated from the UnicodeData.txt file in
  105. * the Unicode Character database by the ucd-tools/tools/categories.py script.
  106. */
  107. #include "ucd/ucd.h"
  108. #include <stddef.h>
  109. #define Cc UCD_CATEGORY_Cc
  110. #define Cf UCD_CATEGORY_Cf
  111. #define Cn UCD_CATEGORY_Cn
  112. #define Co UCD_CATEGORY_Co
  113. #define Cs UCD_CATEGORY_Cs
  114. #define Ll UCD_CATEGORY_Ll
  115. #define Lm UCD_CATEGORY_Lm
  116. #define Lo UCD_CATEGORY_Lo
  117. #define Lt UCD_CATEGORY_Lt
  118. #define Lu UCD_CATEGORY_Lu
  119. #define Mc UCD_CATEGORY_Mc
  120. #define Me UCD_CATEGORY_Me
  121. #define Mn UCD_CATEGORY_Mn
  122. #define Nd UCD_CATEGORY_Nd
  123. #define Nl UCD_CATEGORY_Nl
  124. #define No UCD_CATEGORY_No
  125. #define Pc UCD_CATEGORY_Pc
  126. #define Pd UCD_CATEGORY_Pd
  127. #define Pe UCD_CATEGORY_Pe
  128. #define Pf UCD_CATEGORY_Pf
  129. #define Pi UCD_CATEGORY_Pi
  130. #define Po UCD_CATEGORY_Po
  131. #define Ps UCD_CATEGORY_Ps
  132. #define Sc UCD_CATEGORY_Sc
  133. #define Sk UCD_CATEGORY_Sk
  134. #define Sm UCD_CATEGORY_Sm
  135. #define So UCD_CATEGORY_So
  136. #define Zl UCD_CATEGORY_Zl
  137. #define Zp UCD_CATEGORY_Zp
  138. #define Zs UCD_CATEGORY_Zs
  139. #define Ii UCD_CATEGORY_Ii
  140. /* Unicode Character Data %s */
  141. """ % ucd_version)
  142. for category in special_categories:
  143. sys.stdout.write('\n')
  144. sys.stdout.write('static const uint8_t categories_%s[256] =\n' % category)
  145. sys.stdout.write('{')
  146. for i in range(0, 256):
  147. if (i % 16) == 0:
  148. sys.stdout.write('\n\t/* %02X */' % i)
  149. sys.stdout.write(' %s,' % category)
  150. sys.stdout.write('\n};\n')
  151. for codepoints, category, comment in category_sets:
  152. if not category:
  153. tables = category_tables['%s_%s' % (codepoints.first, codepoints.last)]
  154. for codepoint in sorted(tables.keys()):
  155. table = tables[codepoint]
  156. if table in special_categories:
  157. continue
  158. sys.stdout.write('\n')
  159. sys.stdout.write('static const uint8_t categories_%s[256] =\n' % codepoint)
  160. sys.stdout.write('{')
  161. for i, category in enumerate(table):
  162. if (i % 16) == 0:
  163. sys.stdout.write('\n\t/* %02X */' % i)
  164. sys.stdout.write(' %s,' % category)
  165. sys.stdout.write('\n};\n')
  166. for codepoints, category, comment in category_sets:
  167. if not category:
  168. table_index = '%s_%s' % (codepoints.first, codepoints.last)
  169. sys.stdout.write('\n')
  170. sys.stdout.write('static const uint8_t *categories_%s[] =\n' % table_index)
  171. sys.stdout.write('{\n')
  172. for codepoint, table in sorted(category_tables[table_index].items()):
  173. if isinstance(table, str):
  174. sys.stdout.write('\tcategories_%s, /* %s */\n' % (table, codepoint))
  175. else:
  176. sys.stdout.write('\tcategories_%s,\n' % codepoint)
  177. sys.stdout.write('};\n')
  178. sys.stdout.write('\n')
  179. sys.stdout.write('ucd_category ucd_lookup_category(codepoint_t c)\n')
  180. sys.stdout.write('{\n')
  181. for codepoints, category, comment in category_sets:
  182. if category:
  183. sys.stdout.write('\tif (c <= 0x%s) return %s; /* %s : %s */\n' % (codepoints.last, category, codepoints, comment))
  184. else:
  185. sys.stdout.write('\tif (c <= 0x%s) /* %s */\n' % (codepoints.last, codepoints))
  186. sys.stdout.write('\t{\n')
  187. sys.stdout.write('\t\tconst uint8_t *table = categories_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
  188. sys.stdout.write('\t\treturn (ucd_category)table[c % 256];\n')
  189. sys.stdout.write('\t}\n')
  190. sys.stdout.write('\treturn Ii; /* Invalid Unicode Codepoint */\n')
  191. sys.stdout.write('}\n')
  192. sys.stdout.write("""
  193. ucd_category_group ucd_get_category_group_for_category(ucd_category c)
  194. {
  195. switch (c)
  196. {
  197. case Cc: case Cf: case Cn: case Co: case Cs:
  198. return UCD_CATEGORY_GROUP_C;
  199. case Ll: case Lm: case Lo: case Lt: case Lu:
  200. return UCD_CATEGORY_GROUP_L;
  201. case Mc: case Me: case Mn:
  202. return UCD_CATEGORY_GROUP_M;
  203. case Nd: case Nl: case No:
  204. return UCD_CATEGORY_GROUP_N;
  205. case Pc: case Pd: case Pe: case Pf: case Pi: case Po: case Ps:
  206. return UCD_CATEGORY_GROUP_P;
  207. case Sc: case Sk: case Sm: case So:
  208. return UCD_CATEGORY_GROUP_S;
  209. case Zl: case Zp: case Zs:
  210. return UCD_CATEGORY_GROUP_Z;
  211. case Ii:
  212. default:
  213. return UCD_CATEGORY_GROUP_I;
  214. }
  215. }
  216. ucd_category_group ucd_lookup_category_group(codepoint_t c)
  217. {
  218. return (ucd_category_group)ucd_get_category_group_for_category(ucd_lookup_category(c));
  219. }
  220. """)