eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

printdata.py 5.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. #!/usr/bin/python
  2. # Copyright (C) 2012-2017 Reece H. Dunn
  3. #
  4. # This file is part of ucd-tools.
  5. #
  6. # ucd-tools is free software: you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation, either version 3 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # ucd-tools is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. import os
  19. import sys
  20. import ucd
  21. ucd_rootdir = sys.argv[1]
  22. csur_rootdir = 'data/csur'
  23. null = ucd.CodePoint('0000')
  24. unicode_chars = {}
  25. for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
  26. for codepoint in data['CodePoint']:
  27. unicode_chars[codepoint] = data
  28. for propfile in ['PropList', 'DerivedCoreProperties']:
  29. for data in ucd.parse_ucd_data(ucd_rootdir, propfile):
  30. for codepoint in data['Range']:
  31. try:
  32. unicode_chars[codepoint][data['Property']] = 1
  33. except KeyError:
  34. unicode_chars[codepoint] = {'CodePoint': codepoint}
  35. unicode_chars[codepoint][data['Property']] = 1
  36. for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
  37. for codepoint in data['Range']:
  38. unicode_chars[codepoint]['Script'] = data['Script']
  39. if '--with-csur' in sys.argv:
  40. for csur in ['Klingon']:
  41. for data in ucd.parse_ucd_data('data/csur', csur):
  42. for codepoint in data['CodePoint']:
  43. unicode_chars[codepoint] = data
  44. def iscntrl(data):
  45. return 1 if data.get('Name', '') == '<control>' else 0
  46. def isdigit(data):
  47. return 1 if data['CodePoint'].char() in '0123456789' else 0
  48. def isxdigit(data):
  49. return 1 if data['CodePoint'].char() in '0123456789ABCDEFabcdef' else 0
  50. def isspace(data):
  51. if data.get('White_Space', 0):
  52. dt = data.get('DecompositionType', '')
  53. return 1 if dt == None or not dt.startswith('<noBreak>') else 0
  54. else:
  55. return 0
  56. def isblank(data): # word separator
  57. if data.get('GeneralCategory', 'Cn') == 'Zs' or data['CodePoint'].char() == '\t':
  58. dt = data.get('DecompositionType', '')
  59. return 1 if dt == None or not dt.startswith('<noBreak>') else 0
  60. else:
  61. return 0
  62. def ispunct(data):
  63. if data.get('GeneralCategory', 'Cn')[0] in 'P':
  64. return 1
  65. else:
  66. return 0
  67. def isprint(data):
  68. if data.get('GeneralCategory', 'Cn')[0] in 'LMNPSZ': # not in 'CI'
  69. return 1
  70. else:
  71. return 0
  72. def isgraph(data):
  73. if data.get('GeneralCategory', 'Cn')[0] in 'LMNPS': # not in 'CZI'
  74. return 1
  75. else:
  76. return 0
  77. def isalnum(data):
  78. if data.get('GeneralCategory', 'Cn')[0] in 'N':
  79. return 1
  80. else:
  81. return data.get('Alphabetic', 0)
  82. def isalpha(data):
  83. return data.get('Alphabetic', 0)
  84. def isupper(data):
  85. if data.get('Uppercase', 0):
  86. return 1
  87. elif data.get('LowerCase', null) != null: # Some Lt characters have lowercase forms.
  88. return 1
  89. else:
  90. return 0
  91. def islower(data):
  92. if data.get('Lowercase', 0):
  93. return 1
  94. elif data.get('UpperCase', null) != null:
  95. return 1
  96. else:
  97. return 0
  98. def decomposition_type(data, dtype):
  99. value = data.get('DecompositionType', None)
  100. if value and value.startswith(dtype):
  101. return value
  102. return None
  103. def properties(data):
  104. props = 0
  105. props += (2 ** 0) * data.get('White_Space', 0)
  106. props += (2 ** 1) * data.get('Bidi_Control', 0)
  107. props += (2 ** 2) * data.get('Join_Control', 0)
  108. props += (2 ** 3) * data.get('Dash', 0)
  109. props += (2 ** 4) * data.get('Hyphen', 0)
  110. props += (2 ** 5) * data.get('Quotation_Mark', 0)
  111. props += (2 ** 6) * data.get('Terminal_Punctuation', 0)
  112. props += (2 ** 7) * data.get('Other_Math', 0)
  113. props += (2 ** 8) * data.get('Hex_Digit', 0)
  114. props += (2 ** 9) * data.get('ASCII_Hex_Digit', 0)
  115. props += (2 ** 10) * data.get('Other_Alphabetic', 0)
  116. props += (2 ** 11) * data.get('Ideographic', 0)
  117. props += (2 ** 12) * data.get('Diacritic', 0)
  118. props += (2 ** 13) * data.get('Extender', 0)
  119. props += (2 ** 14) * data.get('Other_Lowercase', 0)
  120. props += (2 ** 15) * data.get('Other_Uppercase', 0)
  121. props += (2 ** 16) * data.get('Noncharacter_Code_Point', 0)
  122. props += (2 ** 17) * data.get('Other_Grapheme_Extend', 0)
  123. props += (2 ** 18) * data.get('IDS_Binary_Operator', 0)
  124. props += (2 ** 19) * data.get('IDS_Trinary_Operator', 0)
  125. props += (2 ** 20) * data.get('Radical', 0)
  126. props += (2 ** 21) * data.get('Unified_Ideograph', 0)
  127. props += (2 ** 22) * data.get('Other_Default_Ignorable_Code_Point', 0)
  128. props += (2 ** 23) * data.get('Deprecated', 0)
  129. props += (2 ** 24) * data.get('Soft_Dotted', 0)
  130. props += (2 ** 25) * data.get('Logical_Order_Exception', 0)
  131. props += (2 ** 26) * data.get('Other_ID_Start', 0)
  132. props += (2 ** 27) * data.get('Other_ID_Continue', 0)
  133. props += (2 ** 28) * data.get('Sentence_Terminal', 0)
  134. return props
  135. if __name__ == '__main__':
  136. for codepoint in ucd.CodeRange('000000..10FFFF'):
  137. try:
  138. data = unicode_chars[codepoint]
  139. except KeyError:
  140. data = {'CodePoint': codepoint}
  141. script = data.get('Script', 'Zzzz')
  142. title = data.get('TitleCase', codepoint)
  143. upper = data.get('UpperCase', codepoint)
  144. lower = data.get('LowerCase', codepoint)
  145. if title == null: title = codepoint
  146. if upper == null: upper = codepoint
  147. if lower == null: lower = codepoint
  148. print('%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %016x' % (
  149. codepoint, script,
  150. data.get('GeneralCategory', 'Cn')[0], data.get('GeneralCategory', 'Cn'),
  151. upper, lower, title,
  152. isdigit(data), isxdigit(data),
  153. iscntrl(data), isspace(data), isblank(data), ispunct(data),
  154. isprint(data), isgraph(data), isalnum(data), isalpha(data), isupper(data), islower(data),
  155. properties(data)))