eSpeak NG is an open-source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

ucd.py 2.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112
  1. #!/usr/bin/python
  2. # Copyright (C) 2012 Reece H. Dunn
  3. #
  4. # This file is part of ucd-tools.
  5. #
  6. # ucd-tools is free software: you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation, either version 3 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # ucd-tools is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  18. import os
  19. import sys
  20. class CodePoint:
  21. def __init__(self, x):
  22. self.codepoint = x
  23. def __repr__(self):
  24. return self.codepoint
  25. def __str__(self):
  26. return self.codepoint
  27. class CodeRange:
  28. def __init__(self, x):
  29. self.first, self.last = x.split('..')
  30. def __repr__(self):
  31. return '%s..%s' % (self.first, self.last)
  32. def __str__(self):
  33. return '%s..%s' % (self.first, self.last)
  34. def codepoint(x):
  35. if '..' in x:
  36. return CodeRange(x)
  37. if ' ' in x:
  38. return [CodePoint(c) for c in x.split()]
  39. return CodePoint(x)
  40. def string(x):
  41. if x == '':
  42. return None
  43. return x
  44. def boolean(x):
  45. if x == 'Y':
  46. return True
  47. return False
  48. data_items = {
  49. 'Blocks': [
  50. ('Range', codepoint),
  51. ('Name', str)
  52. ],
  53. 'DerivedAge': [
  54. ('Range', codepoint),
  55. ('Age', str),
  56. ],
  57. 'PropList': [
  58. ('Range', codepoint),
  59. ('Property', str),
  60. ],
  61. 'Scripts': [
  62. ('Range', codepoint),
  63. ('Script', str),
  64. ],
  65. 'UnicodeData': [
  66. ('CodePoint', codepoint),
  67. ('Name', string),
  68. ('GeneralCategory', string),
  69. ('CanonicalCombiningClass', int),
  70. ('BidiClass', string),
  71. ('DecompositionType', string),
  72. ('DecompositionMapping', string),
  73. ('NumericType', string),
  74. ('NumericValue', string),
  75. ('BidiMirrored', boolean),
  76. ('UnicodeName', string),
  77. ('ISOComment', string),
  78. ('UpperCase', codepoint),
  79. ('LowerCase', codepoint),
  80. ('TitleCase', codepoint),
  81. ],
  82. }
  83. def parse_ucd_data(ucd_rootdir, dataset):
  84. keys = data_items[dataset]
  85. with open(os.path.join(ucd_rootdir, '%s.txt' % dataset)) as f:
  86. for line in f:
  87. line = line.replace('\n', '').split('#')[0]
  88. linedata = [' '.join(x.split()) for x in line.split(';')]
  89. if len(linedata) == len(keys):
  90. data = {}
  91. for keydata, value in zip(keys, linedata):
  92. key, typemap = keydata
  93. if key:
  94. data[key] = typemap(value)
  95. yield data
  96. if __name__ == '__main__':
  97. for entry in parse_ucd_data(sys.argv[1], sys.argv[2]):
  98. print entry