eSpeak NG is an open-source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

ucd.py 3.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. #!/usr/bin/python
  2. # Copyright (C) 2012 Reece H. Dunn
  3. #
  4. # This file is part of ucd-tools.
  5. #
  6. # ucd-tools is free software: you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation, either version 3 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # ucd-tools is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. import os
  19. import sys
  class CodePoint:
      """A single code point, held as a number in ``self.codepoint``.

      A ``str`` argument is parsed as hexadecimal (for example ``'0041'``);
      any other argument is stored unchanged.
      """

      def __init__(self, x):
          # Strings are parsed base-16; everything else is taken as-is.
          if isinstance(x, str):
              self.codepoint = int(x, 16)
          else:
              self.codepoint = x

      def __repr__(self):
          # Upper-case hexadecimal, zero-padded to at least six digits.
          return '%06X' % self.codepoint

      def __str__(self):
          return '%06X' % self.codepoint

      def __iter__(self):
          # Iterating a single code point produces exactly one item: itself.
          yield self

      def __hash__(self):
          # Hash on the numeric value so that equal code points hash equally.
          return self.codepoint

      def __eq__(self, other):
          # Defer to the other operand instead of failing with AttributeError
          # when it is not a CodePoint (e.g. None or a plain int).
          if not isinstance(other, CodePoint):
              return NotImplemented
          return self.codepoint == other.codepoint

      def __ne__(self, other):
          if not isinstance(other, CodePoint):
              return NotImplemented
          return self.codepoint != other.codepoint

      def __lt__(self, other):
          if not isinstance(other, CodePoint):
              return NotImplemented
          return self.codepoint < other.codepoint
  class CodeRange:
      """An inclusive run of code points parsed from ``'FIRST..LAST'`` text.

      Both halves of the text are handed to ``CodePoint`` and kept as
      ``self.first`` and ``self.last``.
      """

      def __init__(self, x):
          lower, upper = x.split('..')
          self.first = CodePoint(lower)
          self.last = CodePoint(upper)

      def __repr__(self):
          return '..'.join([str(self.first), str(self.last)])

      def __str__(self):
          return str(self.first) + '..' + str(self.last)

      def __iter__(self):
          # Walk every value from first to last, both ends included.
          value = self.first.codepoint
          stop = self.last.codepoint
          while value <= stop:
              yield CodePoint(value)
              value += 1

      def size(self):
          # Number of code points covered, counting both end points.
          return 1 + (self.last.codepoint - self.first.codepoint)
  def codepoint(x):
      """Convert a code point field into the matching object.

      * text containing ``'..'`` becomes a ``CodeRange``;
      * text containing a space becomes a list of ``CodePoint`` objects,
        one per whitespace-separated item;
      * an empty string becomes ``CodePoint('0000')``;
      * anything else becomes a single ``CodePoint``.
      """
      if '..' in x:
          result = CodeRange(x)
      elif ' ' in x:
          result = list(map(CodePoint, x.split()))
      else:
          # An empty field is represented by code point zero.
          result = CodePoint(x if x != '' else '0000')
      return result
  def string(x):
      """Return ``None`` for an empty field, otherwise the value unchanged."""
      return None if x == '' else x
  def boolean(x):
      """Return True when the field is exactly ``'Y'``, False otherwise."""
      return True if x == 'Y' else False
  # Column layout for each supported data file.  Every entry is a list of
  # (key, converter) pairs, one per ';'-separated column, in column order.
  data_items = {
      # Two-column files: a code point (or range) followed by one text value.
      'Blocks': [('Range', codepoint), ('Name', str)],
      'DerivedAge': [('Range', codepoint), ('Age', str)],
      'PropList': [('Range', codepoint), ('Property', str)],
      'Scripts': [('Range', codepoint), ('Script', str)],
      # NOTE(review): UAX #44 describes column 5 of UnicodeData.txt as the
      # combined decomposition type/mapping and columns 6-8 as the
      # decimal/digit/numeric values, so the four labels from
      # 'DecompositionType' to 'NumericValue' may not describe the columns
      # they are attached to -- confirm before relying on those names.
      'UnicodeData': [
          ('CodePoint', codepoint),
          ('Name', string),
          ('GeneralCategory', string),
          ('CanonicalCombiningClass', int),
          ('BidiClass', string),
          ('DecompositionType', string),
          ('DecompositionMapping', string),
          ('NumericType', string),
          ('NumericValue', string),
          ('BidiMirrored', boolean),
          ('UnicodeName', string),
          ('ISOComment', string),
          ('UpperCase', codepoint),
          ('LowerCase', codepoint),
          ('TitleCase', codepoint),
      ],
  }
  def parse_ucd_data(ucd_rootdir, dataset):
      """Yield one dict per data row of ``<ucd_rootdir>/<dataset>.txt``.

      ``dataset`` selects the column layout from ``data_items``.  For each
      line, everything from the first '#' onwards is dropped, the rest is
      split on ';' and each field has its whitespace collapsed.  Lines whose
      field count differs from the layout are skipped.

      A row whose second field ends in ', First>' is held back; the
      following row ending in ', Last>' is merged with it into a single
      'FIRST..LAST' row whose second field has the '<' and ', Last>'
      markers removed.
      """
      schema = data_items[dataset]
      pending_first = None
      path = os.path.join(ucd_rootdir, '%s.txt' % dataset)
      with open(path) as f:
          for raw in f:
              body = raw.replace('\n', '').split('#')[0]
              fields = [' '.join(part.split()) for part in body.split(';')]
              if len(fields) != len(schema):
                  # Blank, comment-only or otherwise non-matching line.
                  continue

              label = fields[1]
              if label.endswith(', First>'):
                  # Start of a range; wait for the matching ', Last>' row.
                  pending_first = fields
                  continue
              if label.endswith(', Last>'):
                  fields[0] = '%s..%s' % (pending_first[0], fields[0])
                  fields[1] = label.replace(', Last>', '').replace('<', '')
                  pending_first = None

              record = {}
              for (name, convert), text in zip(schema, fields):
                  if name:
                      record[name] = convert(text)
              yield record
  if __name__ == '__main__':
      # Command line use: <script> <ucd-rootdir> <dataset>
      # Prints every parsed entry of the chosen dataset, one per line.
      if len(sys.argv) < 3:
          sys.stderr.write('usage: %s <ucd-rootdir> <dataset>\n' % sys.argv[0])
          sys.exit(1)
      for entry in parse_ucd_data(sys.argv[1], sys.argv[2]):
          # Function-call form works on both Python 2 and Python 3; the bare
          # ``print entry`` statement is a syntax error on Python 3.
          print(entry)