eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

scripts.py 6.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. #!/usr/bin/python
  2. # Copyright (C) 2012 Reece H. Dunn
  3. #
  4. # This file is part of ucd-tools.
  5. #
  6. # ucd-tools is free software: you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation, either version 3 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # ucd-tools is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. import os
  19. import sys
  20. import ucd
  21. ucd_rootdir = sys.argv[1]
  22. ucd_version = sys.argv[2]
  23. unicode_chars = {}
  24. for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
  25. for codepoint in data['Range']:
  26. unicode_chars[codepoint] = data['Script']
  27. # This map is a combination of the information in the UnicodeData and Blocks
  28. # data files. It is intended to reduce the number of character tables that
  29. # need to be generated.
  30. script_sets = [
  31. (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'),
  32. (ucd.CodeRange('00D800..00DFFF'), 'Zzzz', 'Surrogates'),
  33. (ucd.CodeRange('00E000..00F8FF'), 'Zzzz', 'Private Use Area'),
  34. (ucd.CodeRange('00F900..02FAFF'), None, 'Multiple Blocks'),
  35. (ucd.CodeRange('02FB00..0DFFFF'), 'Zzzz', 'Unassigned'),
  36. (ucd.CodeRange('0E0000..0E01FF'), None, 'Multiple Blocks'),
  37. (ucd.CodeRange('0E0200..0EFFFF'), 'Zzzz', 'Unassigned'),
  38. (ucd.CodeRange('0F0000..0FFFFD'), 'Zzzz', 'Plane 15 Private Use'),
  39. (ucd.CodeRange('0FFFFE..0FFFFF'), 'Zzzz', 'Plane 15 Private Use'),
  40. (ucd.CodeRange('100000..10FFFD'), 'Zzzz', 'Plane 16 Private Use'),
  41. (ucd.CodeRange('10FFFE..10FFFF'), 'Zzzz', 'Plane 16 Private Use'),
  42. ]
  43. # These scripts have many pages consisting of just this script:
  44. special_scripts = []
  45. script_tables = {}
  46. for codepoints, script, comment in script_sets:
  47. if not script:
  48. table = {}
  49. table_entry = None
  50. table_codepoint = None
  51. table_script = None
  52. for i, codepoint in enumerate(codepoints):
  53. try:
  54. script = unicode_chars[codepoint]
  55. except KeyError:
  56. script = 'Zzzz' # Unknown
  57. if (i % 256) == 0:
  58. if table_entry:
  59. if table_script in special_scripts:
  60. table[table_codepoint] = table_script
  61. elif table_script:
  62. special_scripts.append(table_script)
  63. table[table_codepoint] = table_script
  64. else:
  65. table[table_codepoint] = table_entry
  66. table_entry = []
  67. table_codepoint = codepoint
  68. table_script = script
  69. if script != table_script:
  70. table_script = None
  71. table_entry.append(script)
  72. if table_entry:
  73. if table_script in special_scripts:
  74. table[table_codepoint] = table_script
  75. else:
  76. table[table_codepoint] = table_entry
  77. script_tables['%s_%s' % (codepoints.first, codepoints.last)] = table
  78. if __name__ == '__main__':
  79. sys.stdout.write("""/* Unicode Scripts
  80. *
  81. * Copyright (C) 2012 Reece H. Dunn
  82. *
  83. * This file is part of ucd-tools.
  84. *
  85. * ucd-tools is free software: you can redistribute it and/or modify
  86. * it under the terms of the GNU General Public License as published by
  87. * the Free Software Foundation, either version 3 of the License, or
  88. * (at your option) any later version.
  89. *
  90. * ucd-tools is distributed in the hope that it will be useful,
  91. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  92. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  93. * GNU General Public License for more details.
  94. *
  95. * You should have received a copy of the GNU General Public License
  96. * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  97. */
  98. // NOTE: This file is automatically generated from the Scripts.txt file in
  99. // the Unicode Character database by the ucd-tools/tools/scripts.py script.
  100. #include "ucd/ucd.h"
  101. #include <stddef.h>
  102. using namespace ucd;
  103. // Unicode Character Data %s
  104. """ % ucd_version)
  105. for script in special_scripts:
  106. sys.stdout.write('\n')
  107. sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % script)
  108. sys.stdout.write('{')
  109. for i in range(0, 256):
  110. if (i % 16) == 0:
  111. sys.stdout.write('\n\t/* %02X */' % i)
  112. sys.stdout.write(' %s,' % script)
  113. sys.stdout.write('\n};\n')
  114. for codepoints, script, comment in script_sets:
  115. if not script:
  116. tables = script_tables['%s_%s' % (codepoints.first, codepoints.last)]
  117. for codepoint in sorted(tables.keys()):
  118. table = tables[codepoint]
  119. if table in special_scripts:
  120. continue
  121. sys.stdout.write('\n')
  122. sys.stdout.write('static const uint8_t scripts_%s[256] =\n' % codepoint)
  123. sys.stdout.write('{')
  124. for i, script in enumerate(table):
  125. if (i % 16) == 0:
  126. sys.stdout.write('\n\t/* %02X */' % i)
  127. sys.stdout.write(' %s,' % script)
  128. sys.stdout.write('\n};\n')
  129. for codepoints, script, comment in script_sets:
  130. if not script:
  131. table_index = '%s_%s' % (codepoints.first, codepoints.last)
  132. sys.stdout.write('\n')
  133. sys.stdout.write('static const uint8_t *scripts_%s[] =\n' % table_index)
  134. sys.stdout.write('{\n')
  135. for codepoint, table in sorted(script_tables[table_index].items()):
  136. if isinstance(table, str):
  137. sys.stdout.write('\tscripts_%s, // %s\n' % (table, codepoint))
  138. else:
  139. sys.stdout.write('\tscripts_%s,\n' % codepoint)
  140. sys.stdout.write('};\n')
  141. sys.stdout.write('\n')
  142. sys.stdout.write('ucd::script ucd::lookup_script(codepoint_t c)\n')
  143. sys.stdout.write('{\n')
  144. for codepoints, script, comment in script_sets:
  145. if script:
  146. sys.stdout.write('\tif (c <= 0x%s) return %s; // %s : %s\n' % (codepoints.last, script, codepoints, comment))
  147. else:
  148. sys.stdout.write('\tif (c <= 0x%s) // %s\n' % (codepoints.last, codepoints))
  149. sys.stdout.write('\t{\n')
  150. sys.stdout.write('\t\tconst uint8_t *table = scripts_%s_%s[(c - 0x%s) / 256];\n' % (codepoints.first, codepoints.last, codepoints.first))
  151. sys.stdout.write('\t\treturn (ucd::script)table[c % 256];\n')
  152. sys.stdout.write('\t}\n')
  153. sys.stdout.write('\treturn Zzzz; // Invalid Unicode Codepoint\n')
  154. sys.stdout.write('}\n')