eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

ucd.py 4.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. #!/usr/bin/python
  2. # Copyright (C) 2012 Reece H. Dunn
  3. #
  4. # This file is part of ucd-tools.
  5. #
  6. # ucd-tools is free software: you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation, either version 3 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # ucd-tools is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. import os
  19. import sys
  20. import iana
  21. script_map = {
  22. # UCD script names not derivable from IANA script tags:
  23. 'Canadian_Aboriginal': 'Cans',
  24. 'Common': 'Zyyy',
  25. 'Egyptian_Hieroglyphs': 'Egyp',
  26. 'Inherited': 'Zyyy',
  27. 'Meetei_Mayek': 'Mtei',
  28. 'Nko': 'Nkoo',
  29. 'Phags_Pa': 'Phag',
  30. # Codes in http://www.unicode.org/iso15924/iso15924-codes.html not in IANA:
  31. 'Cuneiform': 'Xsux',
  32. }
  33. for ref, tag in iana.read_iana_subtags('data/language-subtag-registry').items():
  34. if tag['Type'] == 'Script':
  35. # Convert the IANA scipt tag descriptions to the UCD script names:
  36. desc = tag['Description']
  37. if ' (' in desc:
  38. desc = desc.split(' (')[0]
  39. desc = desc.replace(' ', '_')
  40. script_map[desc] = ref
  41. # Fix up incorrectly mapped script names:
  42. script_map['Cyrillic'] = 'Cyrl'
  43. class CodePoint:
  44. def __init__(self, x):
  45. if isinstance(x, str):
  46. self.codepoint = int(x, 16)
  47. else:
  48. self.codepoint = x
  49. def __repr__(self):
  50. return '%06X' % self.codepoint
  51. def __str__(self):
  52. return '%06X' % self.codepoint
  53. def __iter__(self):
  54. yield self
  55. def __hash__(self):
  56. return self.codepoint
  57. def __eq__(self, other):
  58. return self.codepoint == other.codepoint
  59. def __ne__(self, other):
  60. return self.codepoint != other.codepoint
  61. def __lt__(self, other):
  62. return self.codepoint < other.codepoint
  63. class CodeRange:
  64. def __init__(self, x):
  65. f, l = x.split('..')
  66. self.first = CodePoint(f)
  67. self.last = CodePoint(l)
  68. def __repr__(self):
  69. return '%s..%s' % (self.first, self.last)
  70. def __str__(self):
  71. return '%s..%s' % (self.first, self.last)
  72. def __iter__(self):
  73. for c in range(self.first.codepoint, self.last.codepoint + 1):
  74. yield CodePoint(c)
  75. def size(self):
  76. return self.last.codepoint - self.first.codepoint + 1
  77. def codepoint(x):
  78. if '..' in x:
  79. return CodeRange(x)
  80. if ' ' in x:
  81. return [CodePoint(c) for c in x.split()]
  82. if x == '':
  83. return CodePoint('0000')
  84. return CodePoint(x)
  85. def string(x):
  86. if x == '':
  87. return None
  88. return x
  89. def boolean(x):
  90. if x == 'Y':
  91. return True
  92. return False
  93. def script(x):
  94. return script_map[x]
  95. data_items = {
  96. 'Blocks': [
  97. ('Range', codepoint),
  98. ('Name', str)
  99. ],
  100. 'DerivedAge': [
  101. ('Range', codepoint),
  102. ('Age', str),
  103. ],
  104. 'PropList': [
  105. ('Range', codepoint),
  106. ('Property', str),
  107. ],
  108. 'Scripts': [
  109. ('Range', codepoint),
  110. ('Script', script),
  111. ],
  112. 'UnicodeData': [
  113. ('CodePoint', codepoint),
  114. ('Name', string),
  115. ('GeneralCategory', string),
  116. ('CanonicalCombiningClass', int),
  117. ('BidiClass', string),
  118. ('DecompositionType', string),
  119. ('DecompositionMapping', string),
  120. ('NumericType', string),
  121. ('NumericValue', string),
  122. ('BidiMirrored', boolean),
  123. ('UnicodeName', string),
  124. ('ISOComment', string),
  125. ('UpperCase', codepoint),
  126. ('LowerCase', codepoint),
  127. ('TitleCase', codepoint),
  128. ],
  129. # Supplemental Data:
  130. 'Klingon': [
  131. ('CodePoint', codepoint),
  132. ('Script', str),
  133. ('GeneralCategory', string),
  134. ('Name', string),
  135. ('Transliteration', string),
  136. ],
  137. }
  138. def parse_ucd_data(ucd_rootdir, dataset):
  139. keys = data_items[dataset]
  140. first = None
  141. with open(os.path.join(ucd_rootdir, '%s.txt' % dataset)) as f:
  142. for line in f:
  143. line = line.replace('\n', '').split('#')[0]
  144. linedata = [' '.join(x.split()) for x in line.split(';')]
  145. if len(linedata) == len(keys):
  146. if linedata[1].endswith(', First>'):
  147. first = linedata
  148. continue
  149. if linedata[1].endswith(', Last>'):
  150. linedata[0] = '%s..%s' % (first[0], linedata[0])
  151. linedata[1] = linedata[1].replace(', Last>', '').replace('<', '')
  152. first = None
  153. data = {}
  154. for keydata, value in zip(keys, linedata):
  155. key, typemap = keydata
  156. if key:
  157. data[key] = typemap(value)
  158. yield data
  159. if __name__ == '__main__':
  160. try:
  161. items = sys.argv[3].split(',')
  162. except:
  163. items = None
  164. for entry in parse_ucd_data(sys.argv[1], sys.argv[2]):
  165. if items:
  166. print ','.join([str(entry[item]) for item in items])
  167. else:
  168. print entry