eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

categories.py 5.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. #!/usr/bin/python
  2. # Copyright (C) 2012 Reece H. Dunn
  3. #
  4. # This file is part of ucd-tools.
  5. #
  6. # ucd-tools is free software: you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation, either version 3 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # ucd-tools is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. import os
  19. import sys
  20. import ucd
  21. ucd_rootdir = sys.argv[1]
  22. unicode_chars = {}
  23. for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
  24. if isinstance(data['CodePoint'], ucd.CodePoint):
  25. unicode_chars[data['CodePoint']] = data['GeneralCategory']
  26. # This map is a combination of the information in the UnicodeData and Blocks
  27. # data files. It is intended to reduce the number of character tables that
  28. # need to be generated.
  29. category_sets = [
  30. (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'),
  31. (ucd.CodeRange('00D800..00DFFF'), 'Cs', 'Surrogates'),
  32. (ucd.CodeRange('00E000..00F8FF'), 'Co', 'Private Use Area'),
  33. (ucd.CodeRange('00F900..02FAFF'), None, 'Multiple Blocks'),
  34. (ucd.CodeRange('02FB00..0DFFFF'), 'Cn', 'Unassigned'),
  35. (ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'),
  36. (ucd.CodeRange('0E0200..0EFFFF'), 'Cn', 'Unassigned'),
  37. (ucd.CodeRange('0F0000..0FFFFD'), 'Co', 'Plane 15 Private Use'),
  38. (ucd.CodeRange('0FFFFE..0FFFFF'), 'Cn', 'Plane 15 Private Use'),
  39. (ucd.CodeRange('100000..10FFFD'), 'Co', 'Plane 16 Private Use'),
  40. (ucd.CodeRange('10FFFE..10FFFF'), 'Cn', 'Plane 16 Private Use'),
  41. ]
  42. category_tables = {}
  43. for codepoints, category, comment in category_sets:
  44. if not category:
  45. table = {}
  46. table_entry = None
  47. table_codepoint = None
  48. is_unassigned = True
  49. for i, codepoint in enumerate(codepoints):
  50. if (i % 256) == 0:
  51. if table_entry:
  52. if is_unassigned:
  53. table[table_codepoint] = None
  54. else:
  55. table[table_codepoint] = table_entry
  56. table_entry = []
  57. table_codepoint = codepoint
  58. is_unassigned = True
  59. try:
  60. category = unicode_chars[codepoint]
  61. is_unassigned = False
  62. except KeyError:
  63. category = 'Cn' # Unassigned
  64. table_entry.append(category)
  65. if table_entry:
  66. if is_unassigned:
  67. table[table_codepoint] = None
  68. else:
  69. table[table_codepoint] = table_entry
  70. category_tables['%s_%s' % (codepoints.first, codepoints.last)] = table
  71. if __name__ == '__main__':
  72. sys.stdout.write("""/* Unicode General Categories
  73. *
  74. * Copyright (C) 2012 Reece H. Dunn
  75. *
  76. * This file is part of ucd-tools.
  77. *
  78. * ucd-tools is free software: you can redistribute it and/or modify
  79. * it under the terms of the GNU General Public License as published by
  80. * the Free Software Foundation, either version 3 of the License, or
  81. * (at your option) any later version.
  82. *
  83. * ucd-tools is distributed in the hope that it will be useful,
  84. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  85. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  86. * GNU General Public License for more details.
  87. *
  88. * You should have received a copy of the GNU General Public License
  89. * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  90. */
  91. // NOTE: This file is automatically generated from the UnicodeData.txt file in
  92. // the Unicode Character database by the ucd-tools/tools/categories.py script.
  93. #include "ucd/ucd.h"
  94. #include <stddef.h>
  95. using namespace ucd;
  96. """)
  97. for codepoints, category, comment in category_sets:
  98. if not category:
  99. tables = category_tables['%s_%s' % (codepoints.first, codepoints.last)]
  100. for codepoint in sorted(tables.keys()):
  101. table = tables[codepoint]
  102. if not table:
  103. continue
  104. sys.stdout.write('\n')
  105. sys.stdout.write('static const ucd::category categories_%s[256] =\n' % codepoint)
  106. sys.stdout.write('{')
  107. for i, category in enumerate(table):
  108. if (i % 16) == 0:
  109. sys.stdout.write('\n\t/* %02X */' % i)
  110. sys.stdout.write(' %s,' % category)
  111. sys.stdout.write('\n};\n')
  112. for codepoints, category, comment in category_sets:
  113. if not category:
  114. table_index = '%s_%s' % (codepoints.first, codepoints.last)
  115. sys.stdout.write('\n')
  116. sys.stdout.write('static const ucd::category *categories_%s[] =\n' % table_index)
  117. sys.stdout.write('{\n')
  118. for codepoint, table in sorted(category_tables[table_index].items()):
  119. if table:
  120. sys.stdout.write('\tcategories_%s,\n' % codepoint)
  121. else:
  122. sys.stdout.write('\tNULL, // %s : Unassigned\n' % codepoint)
  123. sys.stdout.write('};\n')
  124. sys.stdout.write('\n')
  125. sys.stdout.write('ucd::category ucd::lookup_category(codepoint_t c)\n')
  126. sys.stdout.write('{\n')
  127. for codepoints, category, comment in category_sets:
  128. if category:
  129. sys.stdout.write('\tif (c <= 0x%s) return %s; // %s : %s\n' % (codepoints.last, category, codepoints, comment))
  130. else:
  131. sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints))
  132. sys.stdout.write('\t{\n')
  133. sys.stdout.write('\t\tconst ucd::category *table = categories_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
  134. sys.stdout.write('\t\treturn table ? table[c % 256] : Cn;\n')
  135. sys.stdout.write('\t}\n')
  136. sys.stdout.write('\treturn Ci;\n')
  137. sys.stdout.write('}\n')