eSpeak NG is an open source speech synthesizer that supports more than a hundred languages and accents.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

ucd.h 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. /* Unicode Character Database API
  2. *
  3. * Copyright (C) 2012 Reece H. Dunn
  4. *
  5. * This file is part of ucd-tools.
  6. *
  7. * ucd-tools is free software: you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation, either version 3 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * ucd-tools is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License
  18. * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  19. */
  20. #ifndef UNICODE_CHARACTER_DATA_H
  21. #define UNICODE_CHARACTER_DATA_H
  22. #include <stdint.h>
  23. /** @brief Unicode Character Database
  24. */
  25. namespace ucd
  26. {
  27. /** @brief Represents a Unicode codepoint as an unsigned 32-bit integer.
  28. */
  29. typedef uint32_t codepoint_t;
  30. /** @name Unicode General Category
  31. * @brief These functions query the General Category property of Unicode codepoints.
  32. */
  33. //@{
  34. /** @brief Unicode General Category Groups (the leading letter of each General Category value)
  35. * @see http://www.unicode.org/reports/tr44/
  36. */
  37. enum category_group
  38. {
  39. C, /**< @brief Other */
  40. I, /**< @brief Invalid (not a valid Unicode codepoint) */
  41. L, /**< @brief Letter */
  42. M, /**< @brief Mark */
  43. N, /**< @brief Number */
  44. P, /**< @brief Punctuation */
  45. S, /**< @brief Symbol */
  46. Z, /**< @brief Separator */
  47. };
  48. /** @brief Unicode General Category Values
  49. * @see http://www.unicode.org/reports/tr44/
  50. */
  51. enum category
  52. {
  53. Cc, /**< @brief Control Character */
  54. Cf, /**< @brief Format Control Character */
  55. Cn, /**< @brief Unassigned */
  56. Co, /**< @brief Private Use */
  57. Cs, /**< @brief Surrogate Code Point */
  58. Ii, /**< @brief Invalid Unicode Codepoint */
  59. Ll, /**< @brief Lower Case Letter */
  60. Lm, /**< @brief Modifier Letter */
  61. Lo, /**< @brief Other Letter */
  62. Lt, /**< @brief Title Case Letter */
  63. Lu, /**< @brief Upper Case Letter */
  64. Mc, /**< @brief Spacing Mark */
  65. Me, /**< @brief Enclosing Mark */
  66. Mn, /**< @brief Non-Spacing Mark */
  67. Nd, /**< @brief Decimal Digit */
  68. Nl, /**< @brief Letter-Like Number */
  69. No, /**< @brief Other Number */
  70. Pc, /**< @brief Connector Punctuation */
  71. Pd, /**< @brief Dash/Hyphen */
  72. Pe, /**< @brief Close Punctuation Mark */
  73. Pf, /**< @brief Final Quotation Mark */
  74. Pi, /**< @brief Initial Quotation Mark */
  75. Po, /**< @brief Other Punctuation */
  76. Ps, /**< @brief Open Punctuation Mark */
  77. Sc, /**< @brief Currency Symbol */
  78. Sk, /**< @brief Modifier Symbol */
  79. Sm, /**< @brief Math Symbol */
  80. So, /**< @brief Other Symbol */
  81. Zl, /**< @brief Line Separator */
  82. Zp, /**< @brief Paragraph Separator */
  83. Zs, /**< @brief Space Separator */
  84. };
  85. /** @brief Look up the General Category Group for a General Category.
  86. *
  87. * @param c The General Category to look up.
  88. * @return The General Category Group of the General Category.
  89. */
  90. category_group lookup_category_group(category c);
  91. /** @brief Look up the General Category Group for a Unicode codepoint.
  92. *
  93. * @param c The Unicode codepoint to look up.
  94. * @return The General Category Group of the Unicode codepoint.
  95. */
  96. category_group lookup_category_group(codepoint_t c);
  97. /** @brief Look up the General Category for a Unicode codepoint.
  98. *
  99. * @param c The Unicode codepoint to look up.
  100. * @return The General Category of the Unicode codepoint.
  101. */
  102. category lookup_category(codepoint_t c);
  103. //@}
  104. /** @name Unicode Script
  105. * @brief These functions query the Script property of Unicode codepoints.
  106. */
  107. //@{
  108. /** @brief Unicode Script
  109. * @see http://www.iana.org/assignments/language-subtag-registry
  110. * @see http://www.unicode.org/iso15924/iso15924-codes.html
  111. */
  112. enum script
  113. {
  114. Arab, /**< @brief Arabic Script */
  115. Armi, /**< @brief Imperial Aramaic Script */
  116. Armn, /**< @brief Armenian Script */
  117. Avst, /**< @brief Avestan Script */
  118. Bali, /**< @brief Balinese Script */
  119. Bamu, /**< @brief Bamum Script */
  120. Batk, /**< @brief Batak Script */
  121. Beng, /**< @brief Bengali Script */
  122. Bopo, /**< @brief Bopomofo Script */
  123. Brah, /**< @brief Brahmi Script */
  124. Brai, /**< @brief Braille Script */
  125. Bugi, /**< @brief Buginese Script */
  126. Buhd, /**< @brief Buhid Script */
  127. Cans, /**< @brief Unified Canadian Aboriginal Syllabics */
  128. Cari, /**< @brief Carian Script */
  129. Cakm, /**< @brief Chakma Script */
  130. Cham, /**< @brief Cham Script */
  131. Cher, /**< @brief Cherokee Script */
  132. Copt, /**< @brief Coptic Script */
  133. Cprt, /**< @brief Cypriot Script */
  134. Cyrl, /**< @brief Cyrillic Script */
  135. Deva, /**< @brief Devanagari Script */
  136. Dsrt, /**< @brief Deseret Script */
  137. Egyp, /**< @brief Egyptian Hieroglyphs */
  138. Ethi, /**< @brief Ethiopic Script */
  139. Geor, /**< @brief Georgian Script */
  140. Glag, /**< @brief Glagolitic Script */
  141. Goth, /**< @brief Gothic Script */
  142. Grek, /**< @brief Greek Script */
  143. Gujr, /**< @brief Gujarati Script */
  144. Guru, /**< @brief Gurmukhi Script */
  145. Hang, /**< @brief Hangul Script */
  146. Hano, /**< @brief Hanunoo Script */
  147. Hant, /**< @brief Han (Traditional) Script */
  148. Hebr, /**< @brief Hebrew Script */
  149. Hira, /**< @brief Hiragana Script */
  150. Ital, /**< @brief Old Italic Script */
  151. Java, /**< @brief Javanese Script */
  152. Kali, /**< @brief Kayah Li Script */
  153. Kana, /**< @brief Katakana Script */
  154. Khar, /**< @brief Kharoshthi Script */
  155. Khmr, /**< @brief Khmer Script */
  156. Knda, /**< @brief Kannada Script */
  157. Kthi, /**< @brief Kaithi Script */
  158. Lana, /**< @brief Tai Tham Script */
  159. Laoo, /**< @brief Lao Script */
  160. Latn, /**< @brief Latin Script */
  161. Lepc, /**< @brief Lepcha Script */
  162. Limb, /**< @brief Limbu Script */
  163. Linb, /**< @brief Linear B Script */
  164. Lisu, /**< @brief Lisu Script */
  165. Lyci, /**< @brief Lycian Script */
  166. Lydi, /**< @brief Lydian Script */
  167. Mand, /**< @brief Mandaic Script */
  168. Merc, /**< @brief Meroitic Cursive Script */
  169. Mero, /**< @brief Meroitic Hieroglyphs */
  170. Mlym, /**< @brief Malayalam Script */
  171. Mong, /**< @brief Mongolian Script */
  172. Mtei, /**< @brief Meitei Mayek Script */
  173. Mymr, /**< @brief Myanmar Script */
  174. Nkoo, /**< @brief N'Ko Script */
  175. Ogam, /**< @brief Ogham Script */
  176. Olck, /**< @brief Ol Chiki Script */
  177. Orkh, /**< @brief Old Turkic Script */
  178. Orya, /**< @brief Oriya Script */
  179. Osma, /**< @brief Osmanya Script */
  180. Phag, /**< @brief Phags-Pa Script */
  181. Phli, /**< @brief Inscriptional Pahlavi Script */
  182. Phnx, /**< @brief Phoenician Script */
  183. Plrd, /**< @brief Miao Script */
  184. Prti, /**< @brief Inscriptional Parthian Script */
  185. Rjng, /**< @brief Rejang Script */
  186. Runr, /**< @brief Runic Script */
  187. Samr, /**< @brief Samaritan Script */
  188. Sarb, /**< @brief Old South Arabian Script */
  189. Saur, /**< @brief Saurashtra Script */
  190. Shaw, /**< @brief Shavian Script */
  191. Shrd, /**< @brief Sharada Script */
  192. Sinh, /**< @brief Sinhala Script */
  193. Sora, /**< @brief Sora Sompeng Script */
  194. Sund, /**< @brief Sundanese Script */
  195. Sylo, /**< @brief Syloti Nagri Script */
  196. Syrn, /**< @brief Syriac (Eastern) Script */
  197. Tagb, /**< @brief Tagbanwa Script */
  198. Takr, /**< @brief Takri Script */
  199. Tale, /**< @brief Tai Le Script */
  200. Talu, /**< @brief New Tai Lue Script */
  201. Taml, /**< @brief Tamil Script */
  202. Tavt, /**< @brief Tai Viet Script */
  203. Telu, /**< @brief Telugu Script */
  204. Tfng, /**< @brief Tifinagh Script */
  205. Tglg, /**< @brief Tagalog Script */
  206. Thaa, /**< @brief Thaana Script */
  207. Thai, /**< @brief Thai Script */
  208. Tibt, /**< @brief Tibetan Script */
  209. Ugar, /**< @brief Ugaritic Script */
  210. Vaii, /**< @brief Vai Script */
  211. Xpeo, /**< @brief Old Persian Script */
  212. Xsux, /**< @brief Cuneiform Script */
  213. Yiii, /**< @brief Yi Script */
  214. Zyyy, /**< @brief Inherited Script (NOTE(review): ISO 15924 uses Zyyy for undetermined script and Zinh for inherited; confirm intended meaning) */
  215. Zzzz, /**< @brief Unknown Script */
  216. };
  217. /** @brief Look up the Script for a Unicode codepoint.
  218. *
  219. * @param c The Unicode codepoint to look up.
  220. * @return The Script of the Unicode codepoint.
  221. */
  222. script lookup_script(codepoint_t c);
  223. //@}
  224. /** @name ctype-style APIs
  225. * @brief These functions provide wctype compatible functions using the UCD data.
  226. */
  227. //@{
  228. /** @brief Is the codepoint an alphanumeric character? (wctype-style analogue of iswalnum)
  229. *
  230. * @param c The Unicode codepoint to check.
  231. * @return Non-zero if the codepoint is a letter or number, zero otherwise.
  232. */
  233. int isalnum(codepoint_t c);
  234. /** @brief Is the codepoint a letter? (wctype-style analogue of iswalpha)
  235. *
  236. * @param c The Unicode codepoint to check.
  237. * @return Non-zero if the codepoint is a letter, zero otherwise.
  238. */
  239. int isalpha(codepoint_t c);
  240. /** @brief Is the codepoint a control character? (wctype-style analogue of iswcntrl)
  241. *
  242. * @param c The Unicode codepoint to check.
  243. * @return Non-zero if the codepoint is a control character, zero otherwise.
  244. */
  245. int iscntrl(codepoint_t c);
  246. /** @brief Is the codepoint a numeric character? (wctype-style analogue of iswdigit)
  247. *
  248. * @param c The Unicode codepoint to check.
  249. * @return Non-zero if the codepoint is a number, zero otherwise (NOTE(review): confirm whether "number" means only Nd or any Number category).
  250. */
  251. int isdigit(codepoint_t c);
  252. /** @brief Does the codepoint have a displayable glyph? (wctype-style analogue of iswgraph)
  253. *
  254. * @param c The Unicode codepoint to check.
  255. * @return Non-zero if the codepoint has a displayable glyph, zero otherwise.
  256. */
  257. int isgraph(codepoint_t c);
  258. /** @brief Is the codepoint a lower-case letter? (wctype-style analogue of iswlower)
  259. *
  260. * @param c The Unicode codepoint to check.
  261. * @return Non-zero if the codepoint is a lower-case letter, zero otherwise.
  262. */
  263. int islower(codepoint_t c);
  264. /** @brief Is the codepoint a printable character? (wctype-style analogue of iswprint)
  265. *
  266. * @param c The Unicode codepoint to check.
  267. * @return Non-zero if the codepoint is a printable character, zero otherwise.
  268. */
  269. int isprint(codepoint_t c);
  270. /** @brief Is the codepoint a punctuation character? (wctype-style analogue of iswpunct)
  271. *
  272. * @param c The Unicode codepoint to check.
  273. * @return Non-zero if the codepoint is a punctuation character, zero otherwise.
  274. */
  275. int ispunct(codepoint_t c);
  276. /** @brief Is the codepoint a whitespace character? (wctype-style analogue of iswspace)
  277. *
  278. * @param c The Unicode codepoint to check.
  279. * @return Non-zero if the codepoint is a whitespace character, zero otherwise.
  280. */
  281. int isspace(codepoint_t c);
  282. /** @brief Is the codepoint an upper-case letter? (wctype-style analogue of iswupper)
  283. *
  284. * @param c The Unicode codepoint to check.
  285. * @return Non-zero if the codepoint is an upper-case letter, zero otherwise.
  286. */
  287. int isupper(codepoint_t c);
  288. //@}
  289. /** @name Case Conversion APIs
  290. * @brief These functions convert Unicode codepoints between lower, upper and title case.
  291. */
  292. //@{
  293. /** @brief Convert the Unicode codepoint to upper-case.
  294. *
  295. * This function only uses the simple case mapping present in the
  296. * UnicodeData file. The data in SpecialCasing is not used, as it requires
  297. * a Unicode codepoint to be mapped to multiple codepoints.
  298. *
  299. * @param c The Unicode codepoint to convert.
  300. * @return The upper-case Unicode codepoint for this codepoint, or
  301. * this codepoint if there is no upper-case codepoint.
  302. */
  303. codepoint_t toupper(codepoint_t c);
  304. /** @brief Convert the Unicode codepoint to lower-case.
  305. *
  306. * This function only uses the simple case mapping present in the
  307. * UnicodeData file. The data in SpecialCasing is not used, as it requires
  308. * a Unicode codepoint to be mapped to multiple codepoints.
  309. *
  310. * @param c The Unicode codepoint to convert.
  311. * @return The lower-case Unicode codepoint for this codepoint, or
  312. * this codepoint if there is no lower-case codepoint.
  313. */
  314. codepoint_t tolower(codepoint_t c);
  315. /** @brief Convert the Unicode codepoint to title-case.
  316. *
  317. * This function only uses the simple case mapping present in the
  318. * UnicodeData file. The data in SpecialCasing is not used, as it requires
  319. * a Unicode codepoint to be mapped to multiple codepoints.
  320. *
  321. * @param c The Unicode codepoint to convert.
  322. * @return The title-case Unicode codepoint for this codepoint, or
  323. * this codepoint if there is no title-case codepoint.
  324. */
  325. codepoint_t totitle(codepoint_t c);
  326. //@}
  327. }
  328. #endif