eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.

printdata.py 4.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. #!/usr/bin/python
  2. # Copyright (C) 2012-2017 Reece H. Dunn
  3. #
  4. # This file is part of ucd-tools.
  5. #
  6. # ucd-tools is free software: you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation, either version 3 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # ucd-tools is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. import os
  19. import sys
  20. import ucd
  21. ucd_rootdir = sys.argv[1]
  22. csur_rootdir = 'data/csur'
  23. null = ucd.CodePoint('0000')
  24. unicode_chars = {}
  25. for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
  26. for codepoint in data['CodePoint']:
  27. unicode_chars[codepoint] = data
  28. for data in ucd.parse_ucd_data(ucd_rootdir, 'PropList'):
  29. for codepoint in data['Range']:
  30. try:
  31. unicode_chars[codepoint][data['Property']] = 1
  32. except KeyError:
  33. unicode_chars[codepoint] = {'CodePoint': codepoint}
  34. unicode_chars[codepoint][data['Property']] = 1
  35. for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
  36. for codepoint in data['Range']:
  37. unicode_chars[codepoint]['Script'] = data['Script']
  38. if '--with-csur' in sys.argv:
  39. for csur in ['Klingon']:
  40. for data in ucd.parse_ucd_data('data/csur', csur):
  41. for codepoint in data['CodePoint']:
  42. unicode_chars[codepoint] = data
  43. def iscntrl(data):
  44. return 1 if data.get('Name', '') == '<control>' else 0
  45. def isdigit(data):
  46. return 1 if data['CodePoint'].char() in '0123456789' else 0
  47. def isxdigit(data):
  48. return 1 if data['CodePoint'].char() in '0123456789ABCDEFabcdef' else 0
  49. def isspace(data):
  50. if data.get('White_Space', 0):
  51. dt = data.get('DecompositionType', '')
  52. return 1 if dt == None or not dt.startswith('<noBreak>') else 0
  53. else:
  54. return 0
  55. def isblank(data): # word separator
  56. if data.get('GeneralCategory', 'Cn') == 'Zs' or data['CodePoint'].char() == '\t':
  57. dt = data.get('DecompositionType', '')
  58. return 1 if dt == None or not dt.startswith('<noBreak>') else 0
  59. else:
  60. return 0
  61. def ispunct(data):
  62. if data.get('GeneralCategory', 'Cn')[0] in 'P':
  63. return 1
  64. else:
  65. return 0
  66. def isprint(data):
  67. if data.get('GeneralCategory', 'Cn')[0] in 'LMNPSZ': # not in 'CI'
  68. return 1
  69. else:
  70. return 0
  71. def isgraph(data):
  72. if data.get('GeneralCategory', 'Cn')[0] in 'LMNPS': # not in 'CZI'
  73. return 1
  74. else:
  75. return 0
  76. def isalnum(data):
  77. if data.get('GeneralCategory', 'Cn')[0] in 'LN':
  78. return 1
  79. else:
  80. return 0
  81. def isalpha(data):
  82. if data.get('GeneralCategory', 'Cn')[0] in 'L':
  83. return 1
  84. else:
  85. return 0
  86. def isupper(data):
  87. if data.get('GeneralCategory', 'Cn') == 'Lu':
  88. return 1
  89. elif data.get('Other_Uppercase', 0):
  90. return 1
  91. elif data.get('LowerCase', null) != null: # Some Lt characters have lowercase forms.
  92. return 1
  93. else:
  94. return 0
  95. def islower(data):
  96. if data.get('UpperCase', null) != null:
  97. return 1
  98. elif data.get('GeneralCategory', 'Cn') == 'Ll':
  99. return 1
  100. else:
  101. return 0
  102. if __name__ == '__main__':
  103. for codepoint in ucd.CodeRange('000000..10FFFF'):
  104. try:
  105. data = unicode_chars[codepoint]
  106. except KeyError:
  107. data = {'CodePoint': codepoint}
  108. script = data.get('Script', 'Zzzz')
  109. title = data.get('TitleCase', codepoint)
  110. upper = data.get('UpperCase', codepoint)
  111. lower = data.get('LowerCase', codepoint)
  112. if title == null: title = codepoint
  113. if upper == null: upper = codepoint
  114. if lower == null: lower = codepoint
  115. print('%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s' % (
  116. codepoint, script,
  117. data.get('GeneralCategory', 'Cn')[0], data.get('GeneralCategory', 'Cn'),
  118. upper, lower, title,
  119. isdigit(data), isxdigit(data),
  120. iscntrl(data), isspace(data), isblank(data), ispunct(data),
  121. isprint(data), isgraph(data), isalnum(data), isalpha(data), isupper(data), islower(data)))