eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

ucd.py 4.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
  1. #!/usr/bin/python
  2. # Copyright (C) 2012-2014 Reece H. Dunn
  3. #
  4. # This file is part of ucd-tools.
  5. #
  6. # ucd-tools is free software: you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation, either version 3 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # ucd-tools is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. import os
  19. import sys
  20. script_map = {}
  21. class CodePoint:
  22. def __init__(self, x):
  23. if isinstance(x, str):
  24. self.codepoint = int(x, 16)
  25. else:
  26. self.codepoint = x
  27. def __repr__(self):
  28. return '%06X' % self.codepoint
  29. def __str__(self):
  30. return '%06X' % self.codepoint
  31. def __iter__(self):
  32. yield self
  33. def __hash__(self):
  34. return self.codepoint
  35. def __eq__(self, other):
  36. return self.codepoint == other.codepoint
  37. def __ne__(self, other):
  38. return self.codepoint != other.codepoint
  39. def __lt__(self, other):
  40. return self.codepoint < other.codepoint
  41. class CodeRange:
  42. def __init__(self, x):
  43. f, l = x.split('..')
  44. self.first = CodePoint(f)
  45. self.last = CodePoint(l)
  46. def __repr__(self):
  47. return '%s..%s' % (self.first, self.last)
  48. def __str__(self):
  49. return '%s..%s' % (self.first, self.last)
  50. def __iter__(self):
  51. for c in range(self.first.codepoint, self.last.codepoint + 1):
  52. yield CodePoint(c)
  53. def size(self):
  54. return self.last.codepoint - self.first.codepoint + 1
  55. def codepoint(x):
  56. if '..' in x[0]:
  57. return CodeRange(x[0]), x[1:]
  58. if ' ' in x:
  59. return [CodePoint(c) for c in x[0].split()], x[1:]
  60. if x[0] == '':
  61. return CodePoint('0000'), x[1:]
  62. return CodePoint(x[0]), x[1:]
  63. def string(x):
  64. if x[0] == '':
  65. return None, x[1:]
  66. return x[0], x[1:]
  67. def integer(x):
  68. return int(x[0]), x[1:]
  69. def boolean(x):
  70. if x[0] == 'Y':
  71. return True, x[1:]
  72. return False, x[1:]
  73. def script(x):
  74. return script_map[x[0]], x[1:]
  75. def strlist(x):
  76. return x, []
  77. data_items = {
  78. # Unicode Character Data:
  79. 'Blocks': [
  80. ('Range', codepoint),
  81. ('Name', string)
  82. ],
  83. 'DerivedAge': [
  84. ('Range', codepoint),
  85. ('Age', string),
  86. ],
  87. 'PropList': [
  88. ('Range', codepoint),
  89. ('Property', string),
  90. ],
  91. 'PropertyValueAliases': [
  92. ('Property', string),
  93. ('Key', string),
  94. ('Value', string),
  95. ('Aliases', strlist),
  96. ],
  97. 'Scripts': [
  98. ('Range', codepoint),
  99. ('Script', script),
  100. ],
  101. 'UnicodeData': [
  102. ('CodePoint', codepoint),
  103. ('Name', string),
  104. ('GeneralCategory', string),
  105. ('CanonicalCombiningClass', integer),
  106. ('BidiClass', string),
  107. ('DecompositionType', string),
  108. ('DecompositionMapping', string),
  109. ('NumericType', string),
  110. ('NumericValue', string),
  111. ('BidiMirrored', boolean),
  112. ('UnicodeName', string),
  113. ('ISOComment', string),
  114. ('UpperCase', codepoint),
  115. ('LowerCase', codepoint),
  116. ('TitleCase', codepoint),
  117. ],
  118. # ConScript Unicode Registry Data:
  119. 'Klingon': [
  120. ('CodePoint', codepoint),
  121. ('Script', string),
  122. ('GeneralCategory', string),
  123. ('Name', string),
  124. ('Transliteration', string),
  125. ],
  126. }
  127. def parse_ucd_data(ucd_rootdir, dataset):
  128. keys = data_items[dataset]
  129. first = None
  130. with open(os.path.join(ucd_rootdir, '%s.txt' % dataset)) as f:
  131. for line in f:
  132. line = line.replace('\n', '').split('#')[0]
  133. linedata = [' '.join(x.split()) for x in line.split(';')]
  134. if len(linedata) > 1:
  135. if linedata[1].endswith(', First>'):
  136. first = linedata
  137. continue
  138. if linedata[1].endswith(', Last>'):
  139. linedata[0] = '%s..%s' % (first[0], linedata[0])
  140. linedata[1] = linedata[1].replace(', Last>', '').replace('<', '')
  141. first = None
  142. data = {}
  143. for key, typemap in keys:
  144. data[key], linedata = typemap(linedata)
  145. yield data
  146. def parse_property_mapping(ucd_rootdir, propname, reverse=False):
  147. ret = {}
  148. for data in parse_ucd_data(ucd_rootdir, 'PropertyValueAliases'):
  149. if data['Property'] == propname:
  150. if reverse:
  151. ret[data['Value']] = data['Key']
  152. else:
  153. ret[data['Key']] = data['Value']
  154. return ret
  155. if __name__ == '__main__':
  156. try:
  157. items = sys.argv[3].split(',')
  158. except:
  159. items = None
  160. script_map = parse_property_mapping(sys.argv[1], 'sc', reverse=True)
  161. for entry in parse_ucd_data(sys.argv[1], sys.argv[2]):
  162. if items:
  163. print(','.join([str(entry[item]) for item in items]))
  164. else:
  165. print(entry)
  166. else:
  167. script_map = parse_property_mapping('data/ucd', 'sc', reverse=True)