eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

categories.py 6.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. #!/usr/bin/python
  2. # Copyright (C) 2012 Reece H. Dunn
  3. #
  4. # This file is part of ucd-tools.
  5. #
  6. # ucd-tools is free software: you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation, either version 3 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # ucd-tools is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. import os
  19. import sys
  20. import ucd
  21. ucd_rootdir = sys.argv[1]
  22. ucd_version = sys.argv[2]
  23. unicode_chars = {}
  24. for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
  25. for codepoint in data['CodePoint']:
  26. unicode_chars[codepoint] = data['GeneralCategory']
  27. for data in ucd.parse_ucd_data('supplemental', 'Klingon'):
  28. for codepoint in data['CodePoint']:
  29. unicode_chars[codepoint] = data['GeneralCategory']
  30. # This map is a combination of the information in the UnicodeData and Blocks
  31. # data files. It is intended to reduce the number of character tables that
  32. # need to be generated.
  33. category_sets = [
  34. (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'),
  35. (ucd.CodeRange('00D800..00DFFF'), 'Cs', 'Surrogates'),
  36. (ucd.CodeRange('00E000..00F7FF'), 'Co', 'Private Use Area'),
  37. (ucd.CodeRange('00F800..02FAFF'), None, 'Multiple Blocks'),
  38. (ucd.CodeRange('02FB00..0DFFFF'), 'Cn', 'Unassigned'),
  39. (ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'),
  40. (ucd.CodeRange('0E0200..0EFFFF'), 'Cn', 'Unassigned'),
  41. (ucd.CodeRange('0F0000..0FFFFD'), 'Co', 'Plane 15 Private Use'),
  42. (ucd.CodeRange('0FFFFE..0FFFFF'), 'Cn', 'Plane 15 Private Use'),
  43. (ucd.CodeRange('100000..10FFFD'), 'Co', 'Plane 16 Private Use'),
  44. (ucd.CodeRange('10FFFE..10FFFF'), 'Cn', 'Plane 16 Private Use'),
  45. ]
  46. # These categories have many pages consisting of just this category:
  47. # Cn -- Unassigned
  48. # Lo -- CJK Ideographs
  49. special_categories = ['Cn', 'Lo', 'Sm', 'So']
  50. category_tables = {}
  51. for codepoints, category, comment in category_sets:
  52. if not category:
  53. table = {}
  54. table_entry = None
  55. table_codepoint = None
  56. table_category = None
  57. for i, codepoint in enumerate(codepoints):
  58. try:
  59. category = unicode_chars[codepoint]
  60. except KeyError:
  61. category = 'Cn' # Unassigned
  62. if (i % 256) == 0:
  63. if table_entry:
  64. if table_category in special_categories:
  65. table[table_codepoint] = table_category
  66. elif table_category:
  67. raise Exception('%s only table not in the special_categories list.' % table_category)
  68. else:
  69. table[table_codepoint] = table_entry
  70. table_entry = []
  71. table_codepoint = codepoint
  72. table_category = category
  73. if category != table_category:
  74. table_category = None
  75. table_entry.append(category)
  76. if table_entry:
  77. if table_category in special_categories:
  78. table[table_codepoint] = table_category
  79. else:
  80. table[table_codepoint] = table_entry
  81. category_tables['%s_%s' % (codepoints.first, codepoints.last)] = table
  82. if __name__ == '__main__':
  83. sys.stdout.write("""/* Unicode General Categories
  84. *
  85. * Copyright (C) 2012 Reece H. Dunn
  86. *
  87. * This file is part of ucd-tools.
  88. *
  89. * ucd-tools is free software: you can redistribute it and/or modify
  90. * it under the terms of the GNU General Public License as published by
  91. * the Free Software Foundation, either version 3 of the License, or
  92. * (at your option) any later version.
  93. *
  94. * ucd-tools is distributed in the hope that it will be useful,
  95. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  96. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  97. * GNU General Public License for more details.
  98. *
  99. * You should have received a copy of the GNU General Public License
  100. * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  101. */
  102. // NOTE: This file is automatically generated from the UnicodeData.txt file in
  103. // the Unicode Character database by the ucd-tools/tools/categories.py script.
  104. #include "ucd/ucd.h"
  105. #include <stddef.h>
  106. using namespace ucd;
  107. // Unicode Character Data %s
  108. """ % ucd_version)
  109. for category in special_categories:
  110. sys.stdout.write('\n')
  111. sys.stdout.write('static const uint8_t categories_%s[256] =\n' % category)
  112. sys.stdout.write('{')
  113. for i in range(0, 256):
  114. if (i % 16) == 0:
  115. sys.stdout.write('\n\t/* %02X */' % i)
  116. sys.stdout.write(' %s,' % category)
  117. sys.stdout.write('\n};\n')
  118. for codepoints, category, comment in category_sets:
  119. if not category:
  120. tables = category_tables['%s_%s' % (codepoints.first, codepoints.last)]
  121. for codepoint in sorted(tables.keys()):
  122. table = tables[codepoint]
  123. if table in special_categories:
  124. continue
  125. sys.stdout.write('\n')
  126. sys.stdout.write('static const uint8_t categories_%s[256] =\n' % codepoint)
  127. sys.stdout.write('{')
  128. for i, category in enumerate(table):
  129. if (i % 16) == 0:
  130. sys.stdout.write('\n\t/* %02X */' % i)
  131. sys.stdout.write(' %s,' % category)
  132. sys.stdout.write('\n};\n')
  133. for codepoints, category, comment in category_sets:
  134. if not category:
  135. table_index = '%s_%s' % (codepoints.first, codepoints.last)
  136. sys.stdout.write('\n')
  137. sys.stdout.write('static const uint8_t *categories_%s[] =\n' % table_index)
  138. sys.stdout.write('{\n')
  139. for codepoint, table in sorted(category_tables[table_index].items()):
  140. if isinstance(table, str):
  141. sys.stdout.write('\tcategories_%s, // %s\n' % (table, codepoint))
  142. else:
  143. sys.stdout.write('\tcategories_%s,\n' % codepoint)
  144. sys.stdout.write('};\n')
  145. sys.stdout.write('\n')
  146. sys.stdout.write('ucd::category ucd::lookup_category(codepoint_t c)\n')
  147. sys.stdout.write('{\n')
  148. for codepoints, category, comment in category_sets:
  149. if category:
  150. sys.stdout.write('\tif (c <= 0x%s) return %s; // %s : %s\n' % (codepoints.last, category, codepoints, comment))
  151. else:
  152. sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints))
  153. sys.stdout.write('\t{\n')
  154. sys.stdout.write('\t\tconst uint8_t *table = categories_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
  155. sys.stdout.write('\t\treturn (ucd::category)table[c % 256];\n')
  156. sys.stdout.write('\t}\n')
  157. sys.stdout.write('\treturn Ii; // Invalid Unicode Codepoint\n')
  158. sys.stdout.write('}\n')
  159. sys.stdout.write("""
  160. ucd::category_group ucd::lookup_category_group(category c)
  161. {
  162. switch (c)
  163. {
  164. case Cc: case Cf: case Cn: case Co: case Cs:
  165. return C;
  166. case Ll: case Lm: case Lo: case Lt: case Lu:
  167. return L;
  168. case Mc: case Me: case Mn:
  169. return M;
  170. case Nd: case Nl: case No:
  171. return N;
  172. case Pc: case Pd: case Pe: case Pf: case Pi: case Po: case Ps:
  173. return P;
  174. case Sc: case Sk: case Sm: case So:
  175. return S;
  176. case Zl: case Zp: case Zs:
  177. return Z;
  178. case Ii:
  179. return I;
  180. }
  181. }
  182. ucd::category_group ucd::lookup_category_group(codepoint_t c)
  183. {
  184. return lookup_category_group(lookup_category(c));
  185. }
  186. """)