eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

printdata.py 2.8KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. #!/usr/bin/python
  2. # Copyright (C) 2012-2017 Reece H. Dunn
  3. #
  4. # This file is part of ucd-tools.
  5. #
  6. # ucd-tools is free software: you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation, either version 3 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # ucd-tools is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. import os
  19. import sys
  20. import ucd
  21. ucd_rootdir = sys.argv[1]
  22. csur_rootdir = 'data/csur'
  23. null = ucd.CodePoint('0000')
  24. unicode_chars = {}
  25. for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
  26. for codepoint in data['CodePoint']:
  27. unicode_chars[codepoint] = data
  28. for data in ucd.parse_ucd_data(ucd_rootdir, 'PropList'):
  29. if data['Property'] in ['White_Space']:
  30. for codepoint in data['Range']:
  31. unicode_chars[codepoint][data['Property']] = 1
  32. for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
  33. for codepoint in data['Range']:
  34. unicode_chars[codepoint]['Script'] = data['Script']
  35. if '--with-csur' in sys.argv:
  36. for csur in ['Klingon']:
  37. for data in ucd.parse_ucd_data('data/csur', csur):
  38. for codepoint in data['CodePoint']:
  39. unicode_chars[codepoint] = data
  40. def isdigit(data):
  41. return 1 if data['CodePoint'].char() in '0123456789' else 0
  42. def isxdigit(data):
  43. return 1 if data['CodePoint'].char() in '0123456789ABCDEFabcdef' else 0
  44. def isspace(data):
  45. if data.get('White_Space', 0):
  46. dt = data.get('DecompositionType', '')
  47. return 1 if dt == None or not dt.startswith('<noBreak>') else 0
  48. else:
  49. return 0
  50. def isupper(data):
  51. if data.get('LowerCase', null) != null:
  52. return 1
  53. elif data.get('GeneralCategory', 'Cn') == 'Lu':
  54. return 1
  55. else:
  56. return 0
  57. def islower(data):
  58. if data.get('UpperCase', null) != null:
  59. return 1
  60. elif data.get('GeneralCategory', 'Cn') == 'Ll':
  61. return 1
  62. else:
  63. return 0
  64. if __name__ == '__main__':
  65. for codepoint in ucd.CodeRange('000000..10FFFF'):
  66. try:
  67. data = unicode_chars[codepoint]
  68. except KeyError:
  69. data = {'CodePoint': codepoint}
  70. script = data.get('Script', 'Zzzz')
  71. title = data.get('TitleCase', codepoint)
  72. upper = data.get('UpperCase', codepoint)
  73. lower = data.get('LowerCase', codepoint)
  74. if title == null: title = codepoint
  75. if upper == null: upper = codepoint
  76. if lower == null: lower = codepoint
  77. print('%s %s %s %s %s %s %s %s %s %s %s %s' % (
  78. codepoint, script,
  79. data.get('GeneralCategory', 'Cn')[0], data.get('GeneralCategory', 'Cn'),
  80. upper, lower, title,
  81. isdigit(data), isxdigit(data),
  82. isspace(data),
  83. isupper(data), islower(data)))