eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

scripts.py 5.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. #!/usr/bin/python
  2. # Copyright (C) 2012, 2014 Reece H. Dunn
  3. #
  4. # This file is part of ucd-tools.
  5. #
  6. # ucd-tools is free software: you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation, either version 3 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # ucd-tools is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. import os
  19. import sys
  20. import ucd
  21. ucd_rootdir = sys.argv[1]
  22. ucd_version = sys.argv[2]
  23. unicode_chars = {}
  24. for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
  25. for codepoint in data['Range']:
  26. unicode_chars[codepoint] = data['Script']
  27. if '--with-csur' in sys.argv:
  28. for csur in ['Klingon']:
  29. for data in ucd.parse_ucd_data('data/csur', csur):
  30. for codepoint in data['CodePoint']:
  31. unicode_chars[codepoint] = data['Script']
# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that
# need to be generated.
#
# Each entry is a (code range, script, description) tuple:
#   - when the script is a name (e.g. 'Zzzz'), the generated lookup function
#     returns that script for the whole range and no tables are built for it;
#   - when the script is None, per-page lookup tables are built for the range
#     from the parsed script data.
# The description only appears in comments of the generated source.
# Entries are listed in ascending range order; the generated lookup function
# tests them in this order using "c <= <last codepoint of range>".
script_sets = [
    (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('00D800..00F7FF'), 'Zzzz', 'Surrogates / Private Use Area'),
    (ucd.CodeRange('00F800..02FAFF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('02FB00..0DFFFF'), 'Zzzz', 'Unassigned'),
    (ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('0E0200..10FFFF'), 'Zzzz', 'Unassigned'),
]
  43. # These scripts have many pages consisting of just this script:
  44. special_scripts = []
  45. script_tables = {}
  46. for codepoints, script, comment in script_sets:
  47. if not script:
  48. table = {}
  49. table_entry = None
  50. table_codepoint = None
  51. table_script = None
  52. for i, codepoint in enumerate(codepoints):
  53. try:
  54. script = unicode_chars[codepoint]
  55. except KeyError:
  56. script = 'Zzzz' # Unknown
  57. if (i % 256) == 0:
  58. if table_entry:
  59. if table_script in special_scripts:
  60. table[table_codepoint] = table_script
  61. elif table_script:
  62. special_scripts.append(table_script)
  63. table[table_codepoint] = table_script
  64. else:
  65. table[table_codepoint] = table_entry
  66. table_entry = []
  67. table_codepoint = codepoint
  68. table_script = script
  69. if script != table_script:
  70. table_script = None
  71. table_entry.append(script)
  72. if table_entry:
  73. if table_script in special_scripts:
  74. table[table_codepoint] = table_script
  75. else:
  76. table[table_codepoint] = table_entry
  77. script_tables['%s_%s' % (codepoints.first, codepoints.last)] = table
  78. if __name__ == '__main__':
  79. sys.stdout.write("""/* Unicode Scripts
  80. *
  81. * Copyright (C) 2012 Reece H. Dunn
  82. *
  83. * This file is part of ucd-tools.
  84. *
  85. * ucd-tools is free software: you can redistribute it and/or modify
  86. * it under the terms of the GNU General Public License as published by
  87. * the Free Software Foundation, either version 3 of the License, or
  88. * (at your option) any later version.
  89. *
  90. * ucd-tools is distributed in the hope that it will be useful,
  91. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  92. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  93. * GNU General Public License for more details.
  94. *
  95. * You should have received a copy of the GNU General Public License
  96. * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  97. */
  98. // NOTE: This file is automatically generated from the Scripts.txt file in
  99. // the Unicode Character database by the ucd-tools/tools/scripts.py script.
  100. #include "ucd/ucd.h"
  101. #include <stddef.h>
  102. using namespace ucd;
  103. // Unicode Character Data %s
  104. """ % ucd_version)
  105. for script in special_scripts:
  106. sys.stdout.write('\n')
  107. sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % script)
  108. sys.stdout.write('{')
  109. for i in range(0, 256):
  110. if (i % 16) == 0:
  111. sys.stdout.write('\n\t/* %02X */' % i)
  112. sys.stdout.write(' %s,' % script)
  113. sys.stdout.write('\n};\n')
  114. for codepoints, script, comment in script_sets:
  115. if not script:
  116. tables = script_tables['%s_%s' % (codepoints.first, codepoints.last)]
  117. for codepoint in sorted(tables.keys()):
  118. table = tables[codepoint]
  119. if table in special_scripts:
  120. continue
  121. sys.stdout.write('\n')
  122. sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % codepoint)
  123. sys.stdout.write('{')
  124. for i, script in enumerate(table):
  125. if (i % 16) == 0:
  126. sys.stdout.write('\n\t/* %02X */' % i)
  127. sys.stdout.write(' %s,' % script)
  128. sys.stdout.write('\n};\n')
  129. for codepoints, script, comment in script_sets:
  130. if not script:
  131. table_index = '%s_%s' % (codepoints.first, codepoints.last)
  132. sys.stdout.write('\n')
  133. sys.stdout.write('static const uint8_t *scripts_%s[] =\n' % table_index)
  134. sys.stdout.write('{\n')
  135. for codepoint, table in sorted(script_tables[table_index].items()):
  136. if isinstance(table, str):
  137. sys.stdout.write('\tscripts_%s, // %s\n' % (table, codepoint))
  138. else:
  139. sys.stdout.write('\tscripts_%s,\n' % codepoint)
  140. sys.stdout.write('};\n')
  141. sys.stdout.write('\n')
  142. sys.stdout.write('ucd::script ucd::lookup_script(codepoint_t c)\n')
  143. sys.stdout.write('{\n')
  144. for codepoints, script, comment in script_sets:
  145. if script:
  146. sys.stdout.write('\tif (c <= 0x%s) return %s; // %s : %s\n' % (codepoints.last, script, codepoints, comment))
  147. else:
  148. sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints))
  149. sys.stdout.write('\t{\n')
  150. sys.stdout.write('\t\tconst uint8_t *table = scripts_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
  151. sys.stdout.write('\t\treturn (ucd::script)table[c % 256];\n')
  152. sys.stdout.write('\t}\n')
  153. sys.stdout.write('\treturn Zzzz; // Invalid Unicode Codepoint\n')
  154. sys.stdout.write('}\n')